1 files changed, 201 insertions, 0 deletions
diff --git a/lib/UTF16.ml b/lib/UTF16.ml
new file mode 100644
index 0000000..bdc48c7
--- /dev/null
+++ b/lib/UTF16.ml
@@ -0,0 +1,201 @@
+(** 
+    Lexer using sedlex
+ *)
+
+open Tokens
+
+exception UnclosedQuote of { content : string; line : int }
+exception LexError of Lexing.position * string
+
+let pp_pos out { Lexing.pos_lnum; pos_cnum; pos_bol; _ } =
+  Format.fprintf out "line %d:%d" pos_lnum (pos_cnum - pos_bol)
+
+(* The comment system is terrible. The same symbol can be used for :
+     - starting a comment
+     - inequality operation
+   In order to manage this, I try to identify the context in a very basic way,
+   using a flag True False for determining the token to send.
+*)
+module Bucket = Ephemeron.K1.Bucket
+
+type bucket = (Sedlexing.lexbuf, int) Bucket.t
+
+let is_expression : bucket = Bucket.make ()
+
+let incr_level lexbuf =
+  match Bucket.find is_expression lexbuf with
+  | None -> Bucket.add is_expression lexbuf 1
+  | Some v -> Bucket.add is_expression lexbuf (v + 1)
+
+let decr_level lexbuf =
+  match Bucket.find is_expression lexbuf with
+  | None -> ()
+  | Some v ->
+      if v > 1 then Bucket.add is_expression lexbuf (v - 1)
+      else Bucket.remove is_expression lexbuf
+
+let build_ident lexbuf =
+  let id =
+    Sedlexing.lexeme lexbuf |> Idents.of_uchars |> String.uppercase_ascii
+  in
+  try
+    let value = Hashtbl.find Idents.keyword_table id in
+    let _ = match value with IF | ELIF -> incr_level lexbuf | _ -> () in
+    value
+  with Not_found -> IDENT id
+
+let incr_level lexbuf =
+  match Bucket.find is_expression lexbuf with
+  | None -> Bucket.add is_expression lexbuf 1
+  | Some v -> Bucket.add is_expression lexbuf (v + 1)
+
+let wait_balance rule lexbuf =
+  try[@warning "-52"] rule (Buffer.create 17) lexbuf
+  with Failure "lexing: empty token" ->
+    let position, _ = Sedlexing.lexing_positions lexbuf in
+    let line = position.Lexing.pos_lnum
+    and content = Sedlexing.lexeme lexbuf |> Idents.of_uchars in
+    (raise (UnclosedQuote { line; content }) [@warning "+52"])
+
+let space = [%sedlex.regexp? ' ' | '\t']
+let eol = [%sedlex.regexp? '\r' | '\n' | "\r\n"]
+let coma = [%sedlex.regexp? ',']
+let digit = [%sedlex.regexp? '0' .. '9']
+let letters = [%sedlex.regexp? 'a' .. 'z' | 'A' .. 'Z' | '_']
+let spaces = [%sedlex.regexp? Plus space]
+let ident = [%sedlex.regexp? ('$' | letters), Star (digit | letters)]
+let location_ident = [%sedlex.regexp? letters | digit]
+let location_prefix = [%sedlex.regexp? '!' | '$' | '#' | '^']
+let location = [%sedlex.regexp? Opt location_prefix, Plus location_ident]
+
+let rec read_long_string level buf lexbuf =
+  match%sedlex lexbuf with
+  | '{' ->
+      Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
+      read_long_string (level + 1) buf lexbuf
+  | '}' -> (
+      match level with
+      | 0 -> Buffer.contents buf
+      | _ ->
+          Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
+          read_long_string (level - 1) buf lexbuf)
+  | eol ->
+      Sedlexing.new_line lexbuf;
+      Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
+      read_long_string level buf lexbuf
+  | any ->
+      Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
+      read_long_string level buf lexbuf
+  | _ -> raise Not_found
+
+let rec read_dquoted_string buf lexbuf =
+  match%sedlex lexbuf with
+  | "\"\"" ->
+      Buffer.add_char buf '"';
+      read_dquoted_string buf lexbuf
+  | '"' -> Buffer.contents buf
+  | any ->
+      Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
+      read_dquoted_string buf lexbuf
+  | _ -> raise Not_found
+
+let rec read_quoted_string buf lexbuf =
+  match%sedlex lexbuf with
+  | "''" ->
+      Buffer.add_char buf '\'';
+      read_quoted_string buf lexbuf
+  | '\'' -> Buffer.contents buf
+  | eol ->
+      Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
+      Sedlexing.new_line lexbuf;
+      read_quoted_string buf lexbuf
+  | any ->
+      Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
+      read_quoted_string buf lexbuf
+  | _ -> raise Not_found
+
+let rec skip_comment lexbuf =
+  match%sedlex lexbuf with
+  | '{' ->
+      let _ = wait_balance (read_long_string 0) lexbuf in
+      skip_comment lexbuf
+  | '\'' ->
+      let _ = wait_balance read_quoted_string lexbuf in
+      skip_comment lexbuf
+  | '"' ->
+      let _ = wait_balance read_dquoted_string lexbuf in
+      skip_comment lexbuf
+  | eol ->
+      (* Ugly hack used in order to put the eol in the front of the next
+         parsing. *)
+      Sedlexing.rollback lexbuf;
+      COMMENT
+  | any -> skip_comment lexbuf
+  | _ -> raise Not_found
+
+let rec token lexbuf =
+  match%sedlex lexbuf with
+  | 0Xfeff ->
+      (* Ignore the BOM *)
+      token lexbuf
+  | '#', Star space, location ->
+      let ident = Idents.of_uchars (Sedlexing.lexeme lexbuf) in
+
+      LOCATION_START ident
+  | Plus digit -> INTEGER (Sedlexing.Utf8.lexeme lexbuf)
+  | '+' -> PLUS
+  | '-' -> MINUS
+  | "+=" -> INCR
+  | "-=" -> DECR
+  | "*=" -> MULT_EQUAL
+  | '/' -> DIV
+  | '*' -> STAR
+  | ':' ->
+      (* We are leaving the block, the comment will be handled again *)
+      decr_level lexbuf;
+      COLUMN
+  | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> LOCATION_END
+  | '[' -> L_BRACKET
+  | ']' -> R_BRACKET
+  | '(' ->
+      incr_level lexbuf;
+      L_PAREN
+  | ')' ->
+      decr_level lexbuf;
+      R_PAREN
+  | '<' -> LT
+  | '>' -> GT
+  | coma -> COMA
+  | eof ->
+      Bucket.remove is_expression lexbuf;
+      EOF
+  | '=' ->
+      incr_level lexbuf;
+      EQUAL
+  | ident -> build_ident lexbuf
+  | eol ->
+      Bucket.add is_expression lexbuf 0;
+      Sedlexing.new_line lexbuf;
+      EOL
+  | '&' ->
+      Bucket.add is_expression lexbuf 0;
+      AMPERSAND
+  | '!' -> (
+      match Bucket.find is_expression lexbuf with
+      | Some i when i <> 0 -> EXCLAMATION
+      | _ -> skip_comment lexbuf)
+  | spaces -> token lexbuf
+  | '\'' -> LITERAL (wait_balance read_quoted_string lexbuf)
+  | '"' -> LITERAL (wait_balance read_dquoted_string lexbuf)
+  | '{' -> LITERAL (wait_balance (read_long_string 0) lexbuf)
+  | _ ->
+      let position = fst @@ Sedlexing.lexing_positions lexbuf in
+      let tok = Sedlexing.Utf16.lexeme lexbuf Little_endian false in
+
+      let msg =
+        Format.asprintf "Unexpected character %S at %a" tok pp_pos position
+      in
+
+      raise @@ LexError (position, msg)
+
+let lexer buf = Sedlexing.with_tokenizer token buf