Renamed the lexer

author: Chimrod <> 2023-09-28 09:16:56 +0200
committer: Chimrod <> 2023-09-29 10:00:21 +0200
commit: 7fc4021d888b4f16f8fa87c0ea1df68d3806df64 (patch)
tree: ec06b304b6e32a086aaeb4a1e866a255a199bc1a /lib/UTF16.ml
parent: 5dc0c5defdd7ebb152a00e8b2895787b54931779 (diff)
1 files changed, 0 insertions, 214 deletions
diff --git a/lib/UTF16.ml b/lib/UTF16.ml
deleted file mode 100644
index e325011..0000000
--- a/lib/UTF16.ml
+++ /dev/null
@@ -1,214 +0,0 @@
-(** 
-    Lexer using sedlex
- *)
-
-open Tokens
-
-exception UnclosedQuote of { content : string; line : int }
-exception LexError of Lexing.position * string
-
-let pp_pos out { Lexing.pos_lnum; pos_cnum; pos_bol; _ } =
-  Format.fprintf out "line %d:%d" pos_lnum (pos_cnum - pos_bol)
-
-module type Encoding = sig
-  val lexeme : Sedlexing.lexbuf -> string
-end
-
-(* The comment system is terrible. The same symbol can be used for :
-     - starting a comment
-     - inequality operation
-   In order to manage this, I try to identify the context in a very basic way,
-   using a flag True False for determining the token to send.
-*)
-module Bucket = Ephemeron.K1.Bucket
-
-type bucket = (Sedlexing.lexbuf, int) Bucket.t
-
-let is_expression : bucket = Bucket.make ()
-
-let incr_level lexbuf =
-  match Bucket.find is_expression lexbuf with
-  | None -> Bucket.add is_expression lexbuf 1
-  | Some v -> Bucket.add is_expression lexbuf (v + 1)
-
-let decr_level lexbuf =
-  match Bucket.find is_expression lexbuf with
-  | None -> ()
-  | Some v ->
-      if v > 1 then Bucket.add is_expression lexbuf (v - 1)
-      else Bucket.remove is_expression lexbuf
-
-let build_ident lexbuf =
-  let id =
-    Sedlexing.lexeme lexbuf |> Idents.of_uchars |> String.uppercase_ascii
-  in
-  try
-    let value = Hashtbl.find Idents.keyword_table id in
-    let _ = match value with IF | ELIF -> incr_level lexbuf | _ -> () in
-    value
-  with Not_found -> IDENT id
-
-let incr_level lexbuf =
-  match Bucket.find is_expression lexbuf with
-  | None -> Bucket.add is_expression lexbuf 1
-  | Some v -> Bucket.add is_expression lexbuf (v + 1)
-
-let wait_balance rule lexbuf =
-  try[@warning "-52"] rule (Buffer.create 17) lexbuf
-  with Failure "lexing: empty token" ->
-    let position, _ = Sedlexing.lexing_positions lexbuf in
-    let line = position.Lexing.pos_lnum
-    and content = Sedlexing.lexeme lexbuf |> Idents.of_uchars in
-    (raise (UnclosedQuote { line; content }) [@warning "+52"])
-
-let space = [%sedlex.regexp? ' ' | '\t']
-let eol = [%sedlex.regexp? '\r' | '\n' | "\r\n"]
-let coma = [%sedlex.regexp? ',']
-let digit = [%sedlex.regexp? '0' .. '9']
-let letters = [%sedlex.regexp? 'a' .. 'z' | 'A' .. 'Z' | '_']
-let spaces = [%sedlex.regexp? Plus space]
-let ident = [%sedlex.regexp? ('$' | letters), Star (digit | letters)]
-let location_ident = [%sedlex.regexp? letters | digit]
-let location_prefix = [%sedlex.regexp? '!' | '$' | '#' | '^']
-let location = [%sedlex.regexp? Opt location_prefix, Plus location_ident]
-
-let rec read_long_string level buf lexbuf =
-  match%sedlex lexbuf with
-  | '{' ->
-      Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
-      read_long_string (level + 1) buf lexbuf
-  | '}' -> (
-      match level with
-      | 0 -> Buffer.contents buf
-      | _ ->
-          Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
-          read_long_string (level - 1) buf lexbuf)
-  | eol ->
-      Sedlexing.new_line lexbuf;
-      Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
-      read_long_string level buf lexbuf
-  | any ->
-      Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
-      read_long_string level buf lexbuf
-  | _ -> raise Not_found
-
-let rec read_dquoted_string buf lexbuf =
-  match%sedlex lexbuf with
-  | "\"\"" ->
-      Buffer.add_char buf '"';
-      read_dquoted_string buf lexbuf
-  | '"' -> Buffer.contents buf
-  | any ->
-      Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
-      read_dquoted_string buf lexbuf
-  | _ -> raise Not_found
-
-let rec read_quoted_string buf lexbuf =
-  match%sedlex lexbuf with
-  | "''" ->
-      Buffer.add_char buf '\'';
-      read_quoted_string buf lexbuf
-  | '\'' -> Buffer.contents buf
-  | eol ->
-      Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
-      Sedlexing.new_line lexbuf;
-      read_quoted_string buf lexbuf
-  | any ->
-      Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
-      read_quoted_string buf lexbuf
-  | _ -> raise Not_found
-
-let rec skip_comment lexbuf =
-  match%sedlex lexbuf with
-  | '{' ->
-      let _ = wait_balance (read_long_string 0) lexbuf in
-      skip_comment lexbuf
-  | '\'' ->
-      let _ = wait_balance read_quoted_string lexbuf in
-      skip_comment lexbuf
-  | '"' ->
-      let _ = wait_balance read_dquoted_string lexbuf in
-      skip_comment lexbuf
-  | eol ->
-      (* Ugly hack used in order to put the eol in the front of the next
-         parsing. *)
-      Sedlexing.rollback lexbuf;
-      COMMENT
-  | any -> skip_comment lexbuf
-  | _ -> raise Not_found
-
-(** Main lexer *)
-let rec token : (module Encoding.S) -> Sedlexing.lexbuf -> token =
- fun (module E : Encoding) lexbuf ->
-  match%sedlex lexbuf with
-  | 0Xfeff ->
-      (* Ignore the BOM *)
-      token (module E) lexbuf
-  | '#', Star space, location ->
-      let ident = E.lexeme lexbuf in
-
-      LOCATION_START ident
-  | Plus digit -> INTEGER (E.lexeme lexbuf)
-  | '+' -> PLUS
-  | '-' -> MINUS
-  | "+=" -> INCR
-  | "-=" -> DECR
-  | "*=" -> MULT_EQUAL
-  | '/' -> DIV
-  | '*' -> STAR
-  | ':' ->
-      (* We are leaving the block, the comment will be handled again *)
-      decr_level lexbuf;
-      COLUMN
-  | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> LOCATION_END
-  | '[' -> L_BRACKET
-  | ']' -> R_BRACKET
-  | '(' ->
-      incr_level lexbuf;
-      L_PAREN
-  | ')' ->
-      decr_level lexbuf;
-      R_PAREN
-  | '<' -> LT
-  | '>' -> GT
-  | coma -> COMA
-  | eof ->
-      Bucket.remove is_expression lexbuf;
-      EOF
-  | '=' ->
-      incr_level lexbuf;
-      EQUAL
-  | ident -> build_ident lexbuf
-  | eol ->
-      Bucket.add is_expression lexbuf 0;
-      Sedlexing.new_line lexbuf;
-      EOL
-  | '&' ->
-      Bucket.add is_expression lexbuf 0;
-      AMPERSAND
-  | '!' -> (
-      match Bucket.find is_expression lexbuf with
-      | Some i when i <> 0 -> EXCLAMATION
-      | _ -> skip_comment lexbuf)
-  | spaces -> token (module E) lexbuf
-  | '\'' -> LITERAL (wait_balance read_quoted_string lexbuf)
-  | '"' -> LITERAL (wait_balance read_dquoted_string lexbuf)
-  | '{' -> LITERAL (wait_balance (read_long_string 0) lexbuf)
-  | _ ->
-      let position = fst @@ Sedlexing.lexing_positions lexbuf in
-      let tok = E.lexeme lexbuf in
-
-      let msg =
-        Format.asprintf "Unexpected character %S at %a" tok pp_pos position
-      in
-
-      raise @@ LexError (position, msg)
-
-(** Tokenizer for menhir *)
-let lexer :
-    (module Encoding) ->
-    Sedlexing.lexbuf ->
-    unit ->
-    token * Lexing.position * Lexing.position =
- fun (module E : Encoding.S) lexbuf ->
-  Sedlexing.with_tokenizer (token (module E)) lexbuf
author	Chimrod <>	2023-09-28 09:16:56 +0200
committer	Chimrod <>	2023-09-29 10:00:21 +0200
commit	7fc4021d888b4f16f8fa87c0ea1df68d3806df64 (patch)
tree	ec06b304b6e32a086aaeb4a1e866a255a199bc1a /lib/UTF16.ml
parent	5dc0c5defdd7ebb152a00e8b2895787b54931779 (diff)