diff options
| author | Chimrod <> | 2023-09-28 09:16:56 +0200 | 
|---|---|---|
| committer | Chimrod <> | 2023-09-29 10:00:21 +0200 | 
| commit | 7fc4021d888b4f16f8fa87c0ea1df68d3806df64 (patch) | |
| tree | ec06b304b6e32a086aaeb4a1e866a255a199bc1a /lib/UTF16.ml | |
| parent | 5dc0c5defdd7ebb152a00e8b2895787b54931779 (diff) | |
Renamed the lexer
Diffstat (limited to 'lib/UTF16.ml')
| -rw-r--r-- | lib/UTF16.ml | 214 | 
1 files changed, 0 insertions, 214 deletions
diff --git a/lib/UTF16.ml b/lib/UTF16.ml deleted file mode 100644 index e325011..0000000 --- a/lib/UTF16.ml +++ /dev/null @@ -1,214 +0,0 @@ -(**  -    Lexer using sedlex - *) - -open Tokens - -exception UnclosedQuote of { content : string; line : int } -exception LexError of Lexing.position * string - -let pp_pos out { Lexing.pos_lnum; pos_cnum; pos_bol; _ } = -  Format.fprintf out "line %d:%d" pos_lnum (pos_cnum - pos_bol) - -module type Encoding = sig -  val lexeme : Sedlexing.lexbuf -> string -end - -(* The comment system is terrible. The same symbol can be used for : -     - starting a comment -     - inequality operation -   In order to manage this, I try to identify the context in a very basic way, -   using a flag True False for determining the token to send. -*) -module Bucket = Ephemeron.K1.Bucket - -type bucket = (Sedlexing.lexbuf, int) Bucket.t - -let is_expression : bucket = Bucket.make () - -let incr_level lexbuf = -  match Bucket.find is_expression lexbuf with -  | None -> Bucket.add is_expression lexbuf 1 -  | Some v -> Bucket.add is_expression lexbuf (v + 1) - -let decr_level lexbuf = -  match Bucket.find is_expression lexbuf with -  | None -> () -  | Some v -> -      if v > 1 then Bucket.add is_expression lexbuf (v - 1) -      else Bucket.remove is_expression lexbuf - -let build_ident lexbuf = -  let id = -    Sedlexing.lexeme lexbuf |> Idents.of_uchars |> String.uppercase_ascii -  in -  try -    let value = Hashtbl.find Idents.keyword_table id in -    let _ = match value with IF | ELIF -> incr_level lexbuf | _ -> () in -    value -  with Not_found -> IDENT id - -let incr_level lexbuf = -  match Bucket.find is_expression lexbuf with -  | None -> Bucket.add is_expression lexbuf 1 -  | Some v -> Bucket.add is_expression lexbuf (v + 1) - -let wait_balance rule lexbuf = -  try[@warning "-52"] rule (Buffer.create 17) lexbuf -  with Failure "lexing: empty token" -> -    let position, _ = Sedlexing.lexing_positions lexbuf in -    let line = position.Lexing.pos_lnum -    and content = Sedlexing.lexeme lexbuf |> Idents.of_uchars in -    (raise (UnclosedQuote { line; content }) [@warning "+52"]) - -let space = [%sedlex.regexp? ' ' | '\t'] -let eol = [%sedlex.regexp? '\r' | '\n' | "\r\n"] -let coma = [%sedlex.regexp? ','] -let digit = [%sedlex.regexp? '0' .. '9'] -let letters = [%sedlex.regexp? 'a' .. 'z' | 'A' .. 'Z' | '_'] -let spaces = [%sedlex.regexp? Plus space] -let ident = [%sedlex.regexp? ('$' | letters), Star (digit | letters)] -let location_ident = [%sedlex.regexp? letters | digit] -let location_prefix = [%sedlex.regexp? '!' | '$' | '#' | '^'] -let location = [%sedlex.regexp? Opt location_prefix, Plus location_ident] - -let rec read_long_string level buf lexbuf = -  match%sedlex lexbuf with -  | '{' -> -      Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); -      read_long_string (level + 1) buf lexbuf -  | '}' -> ( -      match level with -      | 0 -> Buffer.contents buf -      | _ -> -          Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); -          read_long_string (level - 1) buf lexbuf) -  | eol -> -      Sedlexing.new_line lexbuf; -      Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); -      read_long_string level buf lexbuf -  | any -> -      Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); -      read_long_string level buf lexbuf -  | _ -> raise Not_found - -let rec read_dquoted_string buf lexbuf = -  match%sedlex lexbuf with -  | "\"\"" -> -      Buffer.add_char buf '"'; -      read_dquoted_string buf lexbuf -  | '"' -> Buffer.contents buf -  | any -> -      Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); -      read_dquoted_string buf lexbuf -  | _ -> raise Not_found - -let rec read_quoted_string buf lexbuf = -  match%sedlex lexbuf with -  | "''" -> -      Buffer.add_char buf '\''; -      read_quoted_string buf lexbuf -  | '\'' -> Buffer.contents buf -  | eol -> -      Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); -      Sedlexing.new_line lexbuf; -      read_quoted_string buf lexbuf -  | any -> -      Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); -      read_quoted_string buf lexbuf -  | _ -> raise Not_found - -let rec skip_comment lexbuf = -  match%sedlex lexbuf with -  | '{' -> -      let _ = wait_balance (read_long_string 0) lexbuf in -      skip_comment lexbuf -  | '\'' -> -      let _ = wait_balance read_quoted_string lexbuf in -      skip_comment lexbuf -  | '"' -> -      let _ = wait_balance read_dquoted_string lexbuf in -      skip_comment lexbuf -  | eol -> -      (* Ugly hack used in order to put the eol in the front of the next -         parsing. *) -      Sedlexing.rollback lexbuf; -      COMMENT -  | any -> skip_comment lexbuf -  | _ -> raise Not_found - -(** Main lexer *) -let rec token : (module Encoding.S) -> Sedlexing.lexbuf -> token = - fun (module E : Encoding) lexbuf -> -  match%sedlex lexbuf with -  | 0Xfeff -> -      (* Ignore the BOM *) -      token (module E) lexbuf -  | '#', Star space, location -> -      let ident = E.lexeme lexbuf in - -      LOCATION_START ident -  | Plus digit -> INTEGER (E.lexeme lexbuf) -  | '+' -> PLUS -  | '-' -> MINUS -  | "+=" -> INCR -  | "-=" -> DECR -  | "*=" -> MULT_EQUAL -  | '/' -> DIV -  | '*' -> STAR -  | ':' -> -      (* We are leaving the block, the comment will be handled again *) -      decr_level lexbuf; -      COLUMN -  | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> LOCATION_END -  | '[' -> L_BRACKET -  | ']' -> R_BRACKET -  | '(' -> -      incr_level lexbuf; -      L_PAREN -  | ')' -> -      decr_level lexbuf; -      R_PAREN -  | '<' -> LT -  | '>' -> GT -  | coma -> COMA -  | eof -> -      Bucket.remove is_expression lexbuf; -      EOF -  | '=' -> -      incr_level lexbuf; -      EQUAL -  | ident -> build_ident lexbuf -  | eol -> -      Bucket.add is_expression lexbuf 0; -      Sedlexing.new_line lexbuf; -      EOL -  | '&' -> -      Bucket.add is_expression lexbuf 0; -      AMPERSAND -  | '!' -> ( -      match Bucket.find is_expression lexbuf with -      | Some i when i <> 0 -> EXCLAMATION -      | _ -> skip_comment lexbuf) -  | spaces -> token (module E) lexbuf -  | '\'' -> LITERAL (wait_balance read_quoted_string lexbuf) -  | '"' -> LITERAL (wait_balance read_dquoted_string lexbuf) -  | '{' -> LITERAL (wait_balance (read_long_string 0) lexbuf) -  | _ -> -      let position = fst @@ Sedlexing.lexing_positions lexbuf in -      let tok = E.lexeme lexbuf in - -      let msg = -        Format.asprintf "Unexpected character %S at %a" tok pp_pos position -      in - -      raise @@ LexError (position, msg) - -(** Tokenizer for menhir *) -let lexer : -    (module Encoding) -> -    Sedlexing.lexbuf -> -    unit -> -    token * Lexing.position * Lexing.position = - fun (module E : Encoding.S) lexbuf -> -  Sedlexing.with_tokenizer (token (module E)) lexbuf  | 
