From 97ab5c9a21166f0bffee482210d69877fd6809fa Mon Sep 17 00:00:00 2001 From: Chimrod <> Date: Fri, 6 Oct 2023 08:35:56 +0200 Subject: Moved qparser and syntax in the library folder --- lib/qparser/lexer.ml | 212 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100644 lib/qparser/lexer.ml (limited to 'lib/qparser/lexer.ml') diff --git a/lib/qparser/lexer.ml b/lib/qparser/lexer.ml new file mode 100644 index 0000000..c643577 --- /dev/null +++ b/lib/qparser/lexer.ml @@ -0,0 +1,212 @@ +(** + Lexer using sedlex + *) + +open Tokens + +exception UnclosedQuote of { content : string; line : int } +exception LexError of Lexing.position * string +exception EOF + +let pp_pos out { Lexing.pos_lnum; pos_cnum; pos_bol; _ } = + Format.fprintf out "line %d:%d" pos_lnum (pos_cnum - pos_bol) + +(* Extract the location name from the pattern *) +let location_name = Str.regexp {|.* \(.*\)|} + +let build_ident buffer = + let id = Lexbuf.content buffer |> String.uppercase_ascii in + try + let value = Hashtbl.find Idents.keyword_table id in + let _ = + match value with IF | ELIF -> Lexbuf.incr_level buffer | _ -> () + in + value + with Not_found -> IDENT id + +let wait_balance : (Buffer.t -> Lexbuf.t -> 'a) -> Lexbuf.t -> 'a = + fun rule lexbuf -> + let _, position = Lexbuf.positions lexbuf in + Lexbuf.set_start_position lexbuf position; + + try[@warning "-52"] + let token = rule (Buffer.create 256) lexbuf in + token + with Failure "lexing: empty token" -> + let position, _ = Lexbuf.positions lexbuf in + let line = position.Lexing.pos_lnum and content = Lexbuf.content lexbuf in + (raise (UnclosedQuote { line; content }) [@warning "+52"]) + +let space = [%sedlex.regexp? ' ' | '\t'] +let eol = [%sedlex.regexp? '\r' | '\n' | "\r\n"] +let coma = [%sedlex.regexp? ','] +let digit = [%sedlex.regexp? '0' .. '9'] +let letters = [%sedlex.regexp? 'a' .. 'z' | 'A' .. 'Z' | '_'] +let spaces = [%sedlex.regexp? Plus space] +let ident = [%sedlex.regexp? ('$' | letters), Star (digit | letters)] +let location_ident = [%sedlex.regexp? letters | digit] +let location_prefix = [%sedlex.regexp? '!' | '$' | '#' | '^'] +let location = [%sedlex.regexp? Opt location_prefix, Plus location_ident] + +let rec read_long_string level buf buffer = + let lexbuf = Lexbuf.buffer buffer in + match%sedlex lexbuf with + | '{' -> + Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); + read_long_string (level + 1) buf buffer + | '}' -> ( + match level with + | 0 -> Buffer.contents buf + | _ -> + Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); + read_long_string (level - 1) buf buffer) + | eol -> + Buffer.add_string buf (Lexbuf.content buffer); + read_long_string level buf buffer + | any -> + Buffer.add_string buf (Lexbuf.content buffer); + read_long_string level buf buffer + | _ -> raise Not_found + +let rec read_dquoted_string buf buffer = + let lexbuf = Lexbuf.buffer buffer in + match%sedlex lexbuf with + | "\"\"" -> + Buffer.add_char buf '"'; + read_dquoted_string buf buffer + | '"' -> Buffer.contents buf + | any -> + Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); + read_dquoted_string buf buffer + | _ -> raise Not_found + +let rec read_quoted_string buf buffer = + let lexbuf = Lexbuf.buffer buffer in + match%sedlex lexbuf with + | "''" -> + Buffer.add_char buf '\''; + read_quoted_string buf buffer + | '\'' -> Buffer.contents buf + | eol -> + Buffer.add_string buf (Lexbuf.content buffer); + read_quoted_string buf buffer + | any -> + Buffer.add_string buf (Lexbuf.content buffer); + read_quoted_string buf buffer + | _ -> raise Not_found + +let rec skip_comment buffer = + let lexbuf = Lexbuf.buffer buffer in + match%sedlex lexbuf with + | '{' -> + let _ = wait_balance (read_long_string 0) buffer in + skip_comment buffer + | '\'' -> + let _ = wait_balance read_quoted_string buffer in + skip_comment buffer + | '"' -> + let _ = wait_balance read_dquoted_string buffer in + skip_comment buffer + | eol -> + (* Ugly hack used in order to put the eol in the front of the next + parsing. *) + Sedlexing.rollback lexbuf; + COMMENT + | any -> skip_comment buffer + | _ -> raise Not_found + +(** Main lexer *) +let rec token : Lexbuf.t -> token = + fun buffer -> + let lexbuf = Lexbuf.buffer buffer in + match%sedlex lexbuf with + | 0Xfeff -> + (* Ignore the BOM *) + token buffer + | '#', Star space, location -> + (* Extract the location name *) + let ident = Lexbuf.content buffer in + let () = + match Str.string_match location_name ident 0 with + | false -> () + | true -> Sedlexing.set_filename lexbuf (Str.matched_group 1 ident) + in + + (* Restart the line number (new location here) *) + Lexbuf.start buffer; + + LOCATION_START ident + | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> + Lexbuf.reset_level buffer; + LOCATION_END + | Plus digit -> INTEGER (Lexbuf.content buffer) + | '+' -> PLUS + | '-' -> MINUS + | "+=" -> INCR + | "-=" -> DECR + | "*=" -> MULT_EQUAL + | '/' -> DIV + | '*' -> STAR + | ':' -> + (* We are leaving the block, the comment will be handled again *) + Lexbuf.decr_level buffer; + + COLUMN + | '[' -> L_BRACKET + | ']' -> R_BRACKET + | '(' -> + Lexbuf.incr_level buffer; + L_PAREN + | ')' -> + Lexbuf.decr_level buffer; + R_PAREN + | '<' -> LT + | '>' -> GT + | coma -> COMA + | '=' -> + Lexbuf.incr_level buffer; + EQUAL + | ident -> build_ident buffer + | eol -> + Lexbuf.reset_level buffer; + EOL + | '&' -> + Lexbuf.reset_level buffer; + AMPERSAND + | '!' -> if Lexbuf.level buffer > 0 then EXCLAMATION else skip_comment buffer + | spaces -> token buffer + | '\'' -> LITERAL (wait_balance read_quoted_string buffer) + | '"' -> LITERAL (wait_balance read_dquoted_string buffer) + | '{' -> LITERAL (wait_balance (read_long_string 0) buffer) + | eof -> raise EOF + | _ -> + let position = fst @@ Sedlexing.lexing_positions lexbuf in + let tok = Lexbuf.content buffer in + + let msg = + Format.asprintf "Unexpected character %S at %a" tok pp_pos position + in + + raise @@ LexError (position, msg) + +let rec discard buffer = + let lexbuf = Lexbuf.buffer buffer in + match%sedlex lexbuf with + | '\'' -> + ignore (wait_balance read_quoted_string buffer); + discard buffer + | '"' -> + ignore (wait_balance read_dquoted_string buffer); + discard buffer + | '{' -> + ignore (wait_balance (read_long_string 0) buffer); + discard buffer + | eof -> raise EOF + | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> + Lexbuf.reset_level buffer; + () + | '!' -> + ignore @@ skip_comment buffer; + discard buffer + | any -> discard buffer + | _ -> raise EOF -- cgit v1.2.3