diff options
author | Chimrod <> | 2023-09-28 09:16:56 +0200 |
---|---|---|
committer | Chimrod <> | 2023-09-29 10:00:21 +0200 |
commit | 7fc4021d888b4f16f8fa87c0ea1df68d3806df64 (patch) | |
tree | ec06b304b6e32a086aaeb4a1e866a255a199bc1a /lib/UTF16.ml | |
parent | 5dc0c5defdd7ebb152a00e8b2895787b54931779 (diff) |
Renamed the lexer
Diffstat (limited to 'lib/UTF16.ml')
-rw-r--r-- | lib/UTF16.ml | 214 |
1 files changed, 0 insertions, 214 deletions
diff --git a/lib/UTF16.ml b/lib/UTF16.ml deleted file mode 100644 index e325011..0000000 --- a/lib/UTF16.ml +++ /dev/null @@ -1,214 +0,0 @@ -(** - Lexer using sedlex - *) - -open Tokens - -exception UnclosedQuote of { content : string; line : int } -exception LexError of Lexing.position * string - -let pp_pos out { Lexing.pos_lnum; pos_cnum; pos_bol; _ } = - Format.fprintf out "line %d:%d" pos_lnum (pos_cnum - pos_bol) - -module type Encoding = sig - val lexeme : Sedlexing.lexbuf -> string -end - -(* The comment system is terrible. The same symbol can be used for : - - starting a comment - - inequality operation - In order to manage this, I try to identify the context in a very basic way, - using a flag True False for determining the token to send. -*) -module Bucket = Ephemeron.K1.Bucket - -type bucket = (Sedlexing.lexbuf, int) Bucket.t - -let is_expression : bucket = Bucket.make () - -let incr_level lexbuf = - match Bucket.find is_expression lexbuf with - | None -> Bucket.add is_expression lexbuf 1 - | Some v -> Bucket.add is_expression lexbuf (v + 1) - -let decr_level lexbuf = - match Bucket.find is_expression lexbuf with - | None -> () - | Some v -> - if v > 1 then Bucket.add is_expression lexbuf (v - 1) - else Bucket.remove is_expression lexbuf - -let build_ident lexbuf = - let id = - Sedlexing.lexeme lexbuf |> Idents.of_uchars |> String.uppercase_ascii - in - try - let value = Hashtbl.find Idents.keyword_table id in - let _ = match value with IF | ELIF -> incr_level lexbuf | _ -> () in - value - with Not_found -> IDENT id - -let incr_level lexbuf = - match Bucket.find is_expression lexbuf with - | None -> Bucket.add is_expression lexbuf 1 - | Some v -> Bucket.add is_expression lexbuf (v + 1) - -let wait_balance rule lexbuf = - try[@warning "-52"] rule (Buffer.create 17) lexbuf - with Failure "lexing: empty token" -> - let position, _ = Sedlexing.lexing_positions lexbuf in - let line = position.Lexing.pos_lnum - and content = Sedlexing.lexeme lexbuf |> Idents.of_uchars in - (raise (UnclosedQuote { line; content }) [@warning "+52"]) - -let space = [%sedlex.regexp? ' ' | '\t'] -let eol = [%sedlex.regexp? '\r' | '\n' | "\r\n"] -let coma = [%sedlex.regexp? ','] -let digit = [%sedlex.regexp? '0' .. '9'] -let letters = [%sedlex.regexp? 'a' .. 'z' | 'A' .. 'Z' | '_'] -let spaces = [%sedlex.regexp? Plus space] -let ident = [%sedlex.regexp? ('$' | letters), Star (digit | letters)] -let location_ident = [%sedlex.regexp? letters | digit] -let location_prefix = [%sedlex.regexp? '!' | '$' | '#' | '^'] -let location = [%sedlex.regexp? Opt location_prefix, Plus location_ident] - -let rec read_long_string level buf lexbuf = - match%sedlex lexbuf with - | '{' -> - Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_long_string (level + 1) buf lexbuf - | '}' -> ( - match level with - | 0 -> Buffer.contents buf - | _ -> - Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_long_string (level - 1) buf lexbuf) - | eol -> - Sedlexing.new_line lexbuf; - Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_long_string level buf lexbuf - | any -> - Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_long_string level buf lexbuf - | _ -> raise Not_found - -let rec read_dquoted_string buf lexbuf = - match%sedlex lexbuf with - | "\"\"" -> - Buffer.add_char buf '"'; - read_dquoted_string buf lexbuf - | '"' -> Buffer.contents buf - | any -> - Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_dquoted_string buf lexbuf - | _ -> raise Not_found - -let rec read_quoted_string buf lexbuf = - match%sedlex lexbuf with - | "''" -> - Buffer.add_char buf '\''; - read_quoted_string buf lexbuf - | '\'' -> Buffer.contents buf - | eol -> - Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - Sedlexing.new_line lexbuf; - read_quoted_string buf lexbuf - | any -> - Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_quoted_string buf lexbuf - | _ -> raise Not_found - -let rec skip_comment lexbuf = - match%sedlex lexbuf with - | '{' -> - let _ = wait_balance (read_long_string 0) lexbuf in - skip_comment lexbuf - | '\'' -> - let _ = wait_balance read_quoted_string lexbuf in - skip_comment lexbuf - | '"' -> - let _ = wait_balance read_dquoted_string lexbuf in - skip_comment lexbuf - | eol -> - (* Ugly hack used in order to put the eol in the front of the next - parsing. *) - Sedlexing.rollback lexbuf; - COMMENT - | any -> skip_comment lexbuf - | _ -> raise Not_found - -(** Main lexer *) -let rec token : (module Encoding.S) -> Sedlexing.lexbuf -> token = - fun (module E : Encoding) lexbuf -> - match%sedlex lexbuf with - | 0Xfeff -> - (* Ignore the BOM *) - token (module E) lexbuf - | '#', Star space, location -> - let ident = E.lexeme lexbuf in - - LOCATION_START ident - | Plus digit -> INTEGER (E.lexeme lexbuf) - | '+' -> PLUS - | '-' -> MINUS - | "+=" -> INCR - | "-=" -> DECR - | "*=" -> MULT_EQUAL - | '/' -> DIV - | '*' -> STAR - | ':' -> - (* We are leaving the block, the comment will be handled again *) - decr_level lexbuf; - COLUMN - | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> LOCATION_END - | '[' -> L_BRACKET - | ']' -> R_BRACKET - | '(' -> - incr_level lexbuf; - L_PAREN - | ')' -> - decr_level lexbuf; - R_PAREN - | '<' -> LT - | '>' -> GT - | coma -> COMA - | eof -> - Bucket.remove is_expression lexbuf; - EOF - | '=' -> - incr_level lexbuf; - EQUAL - | ident -> build_ident lexbuf - | eol -> - Bucket.add is_expression lexbuf 0; - Sedlexing.new_line lexbuf; - EOL - | '&' -> - Bucket.add is_expression lexbuf 0; - AMPERSAND - | '!' -> ( - match Bucket.find is_expression lexbuf with - | Some i when i <> 0 -> EXCLAMATION - | _ -> skip_comment lexbuf) - | spaces -> token (module E) lexbuf - | '\'' -> LITERAL (wait_balance read_quoted_string lexbuf) - | '"' -> LITERAL (wait_balance read_dquoted_string lexbuf) - | '{' -> LITERAL (wait_balance (read_long_string 0) lexbuf) - | _ -> - let position = fst @@ Sedlexing.lexing_positions lexbuf in - let tok = E.lexeme lexbuf in - - let msg = - Format.asprintf "Unexpected character %S at %a" tok pp_pos position - in - - raise @@ LexError (position, msg) - -(** Tokenizer for menhir *) -let lexer : - (module Encoding) -> - Sedlexing.lexbuf -> - unit -> - token * Lexing.position * Lexing.position = - fun (module E : Encoding.S) lexbuf -> - Sedlexing.with_tokenizer (token (module E)) lexbuf |