aboutsummaryrefslogtreecommitdiff
path: root/lib/UTF16.ml
diff options
context:
space:
mode:
Diffstat (limited to 'lib/UTF16.ml')
-rw-r--r--lib/UTF16.ml27
1 files changed, 20 insertions, 7 deletions
diff --git a/lib/UTF16.ml b/lib/UTF16.ml
index bdc48c7..e325011 100644
--- a/lib/UTF16.ml
+++ b/lib/UTF16.ml
@@ -10,6 +10,10 @@ exception LexError of Lexing.position * string
let pp_pos out { Lexing.pos_lnum; pos_cnum; pos_bol; _ } =
Format.fprintf out "line %d:%d" pos_lnum (pos_cnum - pos_bol)
+module type Encoding = sig
+ val lexeme : Sedlexing.lexbuf -> string
+end
+
(* The comment system is terrible. The same symbol can be used for :
- starting a comment
- inequality operation
@@ -133,16 +137,18 @@ let rec skip_comment lexbuf =
| any -> skip_comment lexbuf
| _ -> raise Not_found
-let rec token lexbuf =
+(** Main lexer *)
+let rec token : (module Encoding.S) -> Sedlexing.lexbuf -> token =
+ fun (module E : Encoding) lexbuf ->
match%sedlex lexbuf with
| 0Xfeff ->
(* Ignore the BOM *)
- token lexbuf
+ token (module E) lexbuf
| '#', Star space, location ->
- let ident = Idents.of_uchars (Sedlexing.lexeme lexbuf) in
+ let ident = E.lexeme lexbuf in
LOCATION_START ident
- | Plus digit -> INTEGER (Sedlexing.Utf8.lexeme lexbuf)
+ | Plus digit -> INTEGER (E.lexeme lexbuf)
| '+' -> PLUS
| '-' -> MINUS
| "+=" -> INCR
@@ -184,13 +190,13 @@ let rec token lexbuf =
match Bucket.find is_expression lexbuf with
| Some i when i <> 0 -> EXCLAMATION
| _ -> skip_comment lexbuf)
- | spaces -> token lexbuf
+ | spaces -> token (module E) lexbuf
| '\'' -> LITERAL (wait_balance read_quoted_string lexbuf)
| '"' -> LITERAL (wait_balance read_dquoted_string lexbuf)
| '{' -> LITERAL (wait_balance (read_long_string 0) lexbuf)
| _ ->
let position = fst @@ Sedlexing.lexing_positions lexbuf in
- let tok = Sedlexing.Utf16.lexeme lexbuf Little_endian false in
+ let tok = E.lexeme lexbuf in
let msg =
Format.asprintf "Unexpected character %S at %a" tok pp_pos position
@@ -198,4 +204,11 @@ let rec token lexbuf =
raise @@ LexError (position, msg)
-let lexer buf = Sedlexing.with_tokenizer token buf
+(** Tokenizer for menhir *)
+let lexer :
+ (module Encoding) ->
+ Sedlexing.lexbuf ->
+ unit ->
+ token * Lexing.position * Lexing.position =
+ fun (module E : Encoding.S) lexbuf ->
+ Sedlexing.with_tokenizer (token (module E)) lexbuf