diff options
Diffstat (limited to 'lib/UTF16.ml')
-rw-r--r-- | lib/UTF16.ml | 27 |
1 files changed, 20 insertions, 7 deletions
diff --git a/lib/UTF16.ml b/lib/UTF16.ml index bdc48c7..e325011 100644 --- a/lib/UTF16.ml +++ b/lib/UTF16.ml @@ -10,6 +10,10 @@ exception LexError of Lexing.position * string let pp_pos out { Lexing.pos_lnum; pos_cnum; pos_bol; _ } = Format.fprintf out "line %d:%d" pos_lnum (pos_cnum - pos_bol) +module type Encoding = sig + val lexeme : Sedlexing.lexbuf -> string +end + (* The comment system is terrible. The same symbol can be used for : - starting a comment - inequality operation @@ -133,16 +137,18 @@ let rec skip_comment lexbuf = | any -> skip_comment lexbuf | _ -> raise Not_found -let rec token lexbuf = +(** Main lexer *) +let rec token : (module Encoding.S) -> Sedlexing.lexbuf -> token = + fun (module E : Encoding) lexbuf -> match%sedlex lexbuf with | 0Xfeff -> (* Ignore the BOM *) - token lexbuf + token (module E) lexbuf | '#', Star space, location -> - let ident = Idents.of_uchars (Sedlexing.lexeme lexbuf) in + let ident = E.lexeme lexbuf in LOCATION_START ident - | Plus digit -> INTEGER (Sedlexing.Utf8.lexeme lexbuf) + | Plus digit -> INTEGER (E.lexeme lexbuf) | '+' -> PLUS | '-' -> MINUS | "+=" -> INCR @@ -184,13 +190,13 @@ let rec token lexbuf = match Bucket.find is_expression lexbuf with | Some i when i <> 0 -> EXCLAMATION | _ -> skip_comment lexbuf) - | spaces -> token lexbuf + | spaces -> token (module E) lexbuf | '\'' -> LITERAL (wait_balance read_quoted_string lexbuf) | '"' -> LITERAL (wait_balance read_dquoted_string lexbuf) | '{' -> LITERAL (wait_balance (read_long_string 0) lexbuf) | _ -> let position = fst @@ Sedlexing.lexing_positions lexbuf in - let tok = Sedlexing.Utf16.lexeme lexbuf Little_endian false in + let tok = E.lexeme lexbuf in let msg = Format.asprintf "Unexpected character %S at %a" tok pp_pos position @@ -198,4 +204,11 @@ let rec token lexbuf = raise @@ LexError (position, msg) -let lexer buf = Sedlexing.with_tokenizer token buf +(** Tokenizer for menhir *) +let lexer : + (module Encoding) -> + Sedlexing.lexbuf -> + unit -> + token * Lexing.position * Lexing.position = + fun (module E : Encoding.S) lexbuf -> + Sedlexing.with_tokenizer (token (module E)) lexbuf |