diff options
Diffstat (limited to 'lib/lexer.ml')
-rw-r--r-- | lib/lexer.ml | 119 |
1 files changed, 59 insertions, 60 deletions
diff --git a/lib/lexer.ml b/lib/lexer.ml index 1a2d788..a91bfdb 100644 --- a/lib/lexer.ml +++ b/lib/lexer.ml @@ -57,12 +57,14 @@ let incr_level lexbuf = | None -> Bucket.add is_expression lexbuf 1 | Some v -> Bucket.add is_expression lexbuf (v + 1) -let wait_balance rule lexbuf = - try[@warning "-52"] rule (Buffer.create 17) lexbuf +let wait_balance : (Buffer.t -> Lexbuf.t -> 'a) -> Lexbuf.t -> 'a = + fun rule lexbuf -> + try[@warning "-52"] + let token = rule (Buffer.create 256) lexbuf in + token with Failure "lexing: empty token" -> - let position, _ = Sedlexing.lexing_positions lexbuf in - let line = position.Lexing.pos_lnum - and content = Sedlexing.lexeme lexbuf |> Idents.of_uchars in + let position, _ = Lexbuf.positions lexbuf in + let line = position.Lexing.pos_lnum and content = Lexbuf.content lexbuf in (raise (UnclosedQuote { line; content }) [@warning "+52"]) let space = [%sedlex.regexp? ' ' | '\t'] @@ -76,82 +78,87 @@ let location_ident = [%sedlex.regexp? letters | digit] let location_prefix = [%sedlex.regexp? '!' | '$' | '#' | '^'] let location = [%sedlex.regexp? Opt location_prefix, Plus location_ident] -let rec read_long_string level buf lexbuf = +let rec read_long_string level buf buffer = + let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | '{' -> Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_long_string (level + 1) buf lexbuf + read_long_string (level + 1) buf buffer | '}' -> ( match level with | 0 -> Buffer.contents buf | _ -> Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_long_string (level - 1) buf lexbuf) + read_long_string (level - 1) buf buffer) | eol -> - Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_long_string level buf lexbuf + Buffer.add_string buf (Lexbuf.content buffer); + read_long_string level buf buffer | any -> - Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_long_string level buf lexbuf + Buffer.add_string buf (Lexbuf.content buffer); + read_long_string level buf buffer | _ -> raise Not_found -let rec read_dquoted_string buf lexbuf = +let rec read_dquoted_string buf buffer = + let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | "\"\"" -> Buffer.add_char buf '"'; - read_dquoted_string buf lexbuf + read_dquoted_string buf buffer | '"' -> Buffer.contents buf | any -> Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_dquoted_string buf lexbuf + read_dquoted_string buf buffer | _ -> raise Not_found -let rec read_quoted_string buf lexbuf = +let rec read_quoted_string buf buffer = + let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | "''" -> Buffer.add_char buf '\''; - read_quoted_string buf lexbuf + read_quoted_string buf buffer | '\'' -> Buffer.contents buf | eol -> - Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_quoted_string buf lexbuf + Buffer.add_string buf (Lexbuf.content buffer); + read_quoted_string buf buffer | any -> - Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_quoted_string buf lexbuf + Buffer.add_string buf (Lexbuf.content buffer); + read_quoted_string buf buffer | _ -> raise Not_found -let rec skip_comment lexbuf = +let rec skip_comment buffer = + let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | '{' -> - let _ = wait_balance (read_long_string 0) lexbuf in - skip_comment lexbuf + let _ = wait_balance (read_long_string 0) buffer in + skip_comment buffer | '\'' -> - let _ = wait_balance read_quoted_string lexbuf in - skip_comment lexbuf + let _ = wait_balance read_quoted_string buffer in + skip_comment buffer | '"' -> - let _ = wait_balance read_dquoted_string lexbuf in - skip_comment lexbuf + let _ = wait_balance read_dquoted_string buffer in + skip_comment buffer | eol -> (* Ugly hack used in order to put the eol in the front of the next parsing. *) Sedlexing.rollback lexbuf; COMMENT - | any -> skip_comment lexbuf + | any -> skip_comment buffer | _ -> raise Not_found (** Main lexer *) -let rec token : (module Encoding.S) -> Sedlexing.lexbuf -> token = - fun (module E : Encoding) lexbuf -> +let rec token : Lexbuf.t -> token = + fun buffer -> + let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | 0Xfeff -> - Sedlexing.start lexbuf; + Lexbuf.start buffer; (* Ignore the BOM *) - token (module E) lexbuf + token buffer | '#', Star space, location -> let _start_pos, end_pos = Sedlexing.lexing_positions lexbuf in (* Extract the location name *) - let ident = E.lexeme lexbuf in + let ident = Lexbuf.content buffer in let () = match Str.string_match location_name ident 0 with | false -> () @@ -166,7 +173,7 @@ let rec token : (module Encoding.S) -> Sedlexing.lexbuf -> token = | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> Bucket.remove is_expression lexbuf; LOCATION_END - | Plus digit -> INTEGER (E.lexeme lexbuf) + | Plus digit -> INTEGER (Lexbuf.content buffer) | '+' -> PLUS | '-' -> MINUS | "+=" -> INCR @@ -202,15 +209,15 @@ let rec token : (module Encoding.S) -> Sedlexing.lexbuf -> token = | '!' -> ( match Bucket.find is_expression lexbuf with | Some i when i <> 0 -> EXCLAMATION - | _ -> skip_comment lexbuf) - | spaces -> token (module E) lexbuf - | '\'' -> LITERAL (wait_balance read_quoted_string lexbuf) - | '"' -> LITERAL (wait_balance read_dquoted_string lexbuf) - | '{' -> LITERAL (wait_balance (read_long_string 0) lexbuf) + | _ -> skip_comment buffer) + | spaces -> token buffer + | '\'' -> LITERAL (wait_balance read_quoted_string buffer) + | '"' -> LITERAL (wait_balance read_dquoted_string buffer) + | '{' -> LITERAL (wait_balance (read_long_string 0) buffer) | eof -> raise EOF | _ -> let position = fst @@ Sedlexing.lexing_positions lexbuf in - let tok = E.lexeme lexbuf in + let tok = Lexbuf.content buffer in let msg = Format.asprintf "Unexpected character %S at %a" tok pp_pos position @@ -218,32 +225,24 @@ let rec token : (module Encoding.S) -> Sedlexing.lexbuf -> token = raise @@ LexError (position, msg) -let rec discard lexbuf = +let rec discard buffer = + let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | '\'' -> - ignore (wait_balance read_quoted_string lexbuf); - discard lexbuf + ignore (wait_balance read_quoted_string buffer); + discard buffer | '"' -> - ignore (wait_balance read_dquoted_string lexbuf); - discard lexbuf + ignore (wait_balance read_dquoted_string buffer); + discard buffer | '{' -> - ignore (wait_balance (read_long_string 0) lexbuf); - discard lexbuf + ignore (wait_balance (read_long_string 0) buffer); + discard buffer | eof -> raise EOF | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> Bucket.remove is_expression lexbuf; () | '!' -> - ignore @@ skip_comment lexbuf; - discard lexbuf - | any -> discard lexbuf + ignore @@ skip_comment buffer; + discard buffer + | any -> discard buffer | _ -> raise EOF - -(** Tokenizer for menhir *) -let lexer : - (module Encoding) -> - Sedlexing.lexbuf -> - unit -> - token * Lexing.position * Lexing.position = - fun (module E : Encoding.S) lexbuf -> - Sedlexing.with_tokenizer (token (module E)) lexbuf |