diff options
Diffstat (limited to 'lib/UTF16.ml')
-rw-r--r-- | lib/UTF16.ml | 201 |
1 files changed, 201 insertions, 0 deletions
diff --git a/lib/UTF16.ml b/lib/UTF16.ml new file mode 100644 index 0000000..bdc48c7 --- /dev/null +++ b/lib/UTF16.ml @@ -0,0 +1,201 @@ +(** + Lexer using sedlex + *) + +open Tokens + +exception UnclosedQuote of { content : string; line : int } +exception LexError of Lexing.position * string + +let pp_pos out { Lexing.pos_lnum; pos_cnum; pos_bol; _ } = + Format.fprintf out "line %d:%d" pos_lnum (pos_cnum - pos_bol) + +(* The comment system is terrible. The same symbol can be used for : + - starting a comment + - inequality operation + In order to manage this, I try to identify the context in a very basic way, + using a flag True False for determining the token to send. +*) +module Bucket = Ephemeron.K1.Bucket + +type bucket = (Sedlexing.lexbuf, int) Bucket.t + +let is_expression : bucket = Bucket.make () + +let incr_level lexbuf = + match Bucket.find is_expression lexbuf with + | None -> Bucket.add is_expression lexbuf 1 + | Some v -> Bucket.add is_expression lexbuf (v + 1) + +let decr_level lexbuf = + match Bucket.find is_expression lexbuf with + | None -> () + | Some v -> + if v > 1 then Bucket.add is_expression lexbuf (v - 1) + else Bucket.remove is_expression lexbuf + +let build_ident lexbuf = + let id = + Sedlexing.lexeme lexbuf |> Idents.of_uchars |> String.uppercase_ascii + in + try + let value = Hashtbl.find Idents.keyword_table id in + let _ = match value with IF | ELIF -> incr_level lexbuf | _ -> () in + value + with Not_found -> IDENT id + +let incr_level lexbuf = + match Bucket.find is_expression lexbuf with + | None -> Bucket.add is_expression lexbuf 1 + | Some v -> Bucket.add is_expression lexbuf (v + 1) + +let wait_balance rule lexbuf = + try[@warning "-52"] rule (Buffer.create 17) lexbuf + with Failure "lexing: empty token" -> + let position, _ = Sedlexing.lexing_positions lexbuf in + let line = position.Lexing.pos_lnum + and content = Sedlexing.lexeme lexbuf |> Idents.of_uchars in + (raise (UnclosedQuote { line; content }) [@warning "+52"]) + +let space = [%sedlex.regexp? ' ' | '\t'] +let eol = [%sedlex.regexp? '\r' | '\n' | "\r\n"] +let coma = [%sedlex.regexp? ','] +let digit = [%sedlex.regexp? '0' .. '9'] +let letters = [%sedlex.regexp? 'a' .. 'z' | 'A' .. 'Z' | '_'] +let spaces = [%sedlex.regexp? Plus space] +let ident = [%sedlex.regexp? ('$' | letters), Star (digit | letters)] +let location_ident = [%sedlex.regexp? letters | digit] +let location_prefix = [%sedlex.regexp? '!' | '$' | '#' | '^'] +let location = [%sedlex.regexp? Opt location_prefix, Plus location_ident] + +let rec read_long_string level buf lexbuf = + match%sedlex lexbuf with + | '{' -> + Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); + read_long_string (level + 1) buf lexbuf + | '}' -> ( + match level with + | 0 -> Buffer.contents buf + | _ -> + Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); + read_long_string (level - 1) buf lexbuf) + | eol -> + Sedlexing.new_line lexbuf; + Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); + read_long_string level buf lexbuf + | any -> + Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); + read_long_string level buf lexbuf + | _ -> raise Not_found + +let rec read_dquoted_string buf lexbuf = + match%sedlex lexbuf with + | "\"\"" -> + Buffer.add_char buf '"'; + read_dquoted_string buf lexbuf + | '"' -> Buffer.contents buf + | any -> + Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); + read_dquoted_string buf lexbuf + | _ -> raise Not_found + +let rec read_quoted_string buf lexbuf = + match%sedlex lexbuf with + | "''" -> + Buffer.add_char buf '\''; + read_quoted_string buf lexbuf + | '\'' -> Buffer.contents buf + | eol -> + Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); + Sedlexing.new_line lexbuf; + read_quoted_string buf lexbuf + | any -> + Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); + read_quoted_string buf lexbuf + | _ -> raise Not_found + +let rec skip_comment lexbuf = + match%sedlex lexbuf with + | '{' -> + let _ = wait_balance (read_long_string 0) lexbuf in + skip_comment lexbuf + | '\'' -> + let _ = wait_balance read_quoted_string lexbuf in + skip_comment lexbuf + | '"' -> + let _ = wait_balance read_dquoted_string lexbuf in + skip_comment lexbuf + | eol -> + (* Ugly hack used in order to put the eol in the front of the next + parsing. *) + Sedlexing.rollback lexbuf; + COMMENT + | any -> skip_comment lexbuf + | _ -> raise Not_found + +let rec token lexbuf = + match%sedlex lexbuf with + | 0Xfeff -> + (* Ignore the BOM *) + token lexbuf + | '#', Star space, location -> + let ident = Idents.of_uchars (Sedlexing.lexeme lexbuf) in + + LOCATION_START ident + | Plus digit -> INTEGER (Sedlexing.Utf8.lexeme lexbuf) + | '+' -> PLUS + | '-' -> MINUS + | "+=" -> INCR + | "-=" -> DECR + | "*=" -> MULT_EQUAL + | '/' -> DIV + | '*' -> STAR + | ':' -> + (* We are leaving the block, the comment will be handled again *) + decr_level lexbuf; + COLUMN + | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> LOCATION_END + | '[' -> L_BRACKET + | ']' -> R_BRACKET + | '(' -> + incr_level lexbuf; + L_PAREN + | ')' -> + decr_level lexbuf; + R_PAREN + | '<' -> LT + | '>' -> GT + | coma -> COMA + | eof -> + Bucket.remove is_expression lexbuf; + EOF + | '=' -> + incr_level lexbuf; + EQUAL + | ident -> build_ident lexbuf + | eol -> + Bucket.add is_expression lexbuf 0; + Sedlexing.new_line lexbuf; + EOL + | '&' -> + Bucket.add is_expression lexbuf 0; + AMPERSAND + | '!' -> ( + match Bucket.find is_expression lexbuf with + | Some i when i <> 0 -> EXCLAMATION + | _ -> skip_comment lexbuf) + | spaces -> token lexbuf + | '\'' -> LITERAL (wait_balance read_quoted_string lexbuf) + | '"' -> LITERAL (wait_balance read_dquoted_string lexbuf) + | '{' -> LITERAL (wait_balance (read_long_string 0) lexbuf) + | _ -> + let position = fst @@ Sedlexing.lexing_positions lexbuf in + let tok = Sedlexing.Utf16.lexeme lexbuf Little_endian false in + + let msg = + Format.asprintf "Unexpected character %S at %a" tok pp_pos position + in + + raise @@ LexError (position, msg) + +let lexer buf = Sedlexing.with_tokenizer token buf |