From dd060261e35fcb8a57f03b01dbe84ab772a2a199 Mon Sep 17 00:00:00 2001 From: Chimrod <> Date: Sat, 28 Oct 2023 16:47:23 +0200 Subject: Set up a context for parsing the literal strings --- lib/qparser/lex_state.ml | 154 +++++++++++++++++++++++++++++++++++++++++ lib/qparser/lex_state.mli | 14 ++++ lib/qparser/lexbuf.ml | 31 ++++++--- lib/qparser/lexbuf.mli | 24 +++++-- lib/qparser/lexer.ml | 120 ++++++++++++-------------------- lib/qparser/lexer.mli | 7 ++ lib/qparser/qsp_expression.mly | 4 +- test/syntax.ml | 12 ++++ 8 files changed, 274 insertions(+), 92 deletions(-) create mode 100644 lib/qparser/lex_state.ml create mode 100644 lib/qparser/lex_state.mli diff --git a/lib/qparser/lex_state.ml b/lib/qparser/lex_state.ml new file mode 100644 index 0000000..37400e7 --- /dev/null +++ b/lib/qparser/lex_state.ml @@ -0,0 +1,154 @@ +exception Out_of_context + +let space = [%sedlex.regexp? ' ' | '\t'] +let spaces = [%sedlex.regexp? Plus space] +let single_quote = [%sedlex.regexp? '\''] +let double_quote = [%sedlex.regexp? '"'] + +let leave_text end_wrapper buf buffer = + Lexbuf.leave_state buffer; + Lexbuf.enter_state buffer (Lexbuf.EndString end_wrapper); + Lexbuf.rollback buffer; + Tokens.LITERAL (Buffer.contents buf) + +let rec nestedQuotedStringWraper : Lexbuf.stringWraper = + let rec start_string f buffer = + let lexbuf = Lexbuf.buffer buffer in + match%sedlex lexbuf with + (* There is no more way to add start a quoted string here *) + | spaces -> start_string f buffer + | double_quote -> + Lexbuf.enter_state buffer (Lexbuf.String dQuotedStringWraper); + Tokens.TEXT_MARKER + | '{' -> + Lexbuf.enter_state buffer (Lexbuf.MString 0); + Tokens.TEXT_MARKER + | _ -> f buffer + in + + { + start_string; + wrap = (fun _ -> raise Out_of_context); + end_string = + (fun buffer -> + let lexbuf = Lexbuf.buffer buffer in + match%sedlex lexbuf with + | "''" -> + Lexbuf.leave_state buffer; + TEXT_MARKER + | _ -> raise Not_found); + } + +and quotedStringWraper : Lexbuf.stringWraper = + (* This function need to be recursirve in case of successive functions to + escape *) + let rec wrap f buf buffer = + let lexbuf = Lexbuf.buffer buffer in + match%sedlex lexbuf with + | "''" -> + Buffer.add_char buf '\''; + wrap f buf buffer + | single_quote -> leave_text quotedStringWraper buf buffer + | _ -> f buf buffer + in + + let rec start_string f buffer = + let lexbuf = Lexbuf.buffer buffer in + match%sedlex lexbuf with + | spaces -> start_string f buffer + | "''" -> + Lexbuf.enter_state buffer (Lexbuf.String nestedQuotedStringWraper); + Tokens.TEXT_MARKER + | double_quote -> + Lexbuf.enter_state buffer (Lexbuf.String dQuotedStringWraper); + Tokens.TEXT_MARKER + | '{' -> + Lexbuf.enter_state buffer (Lexbuf.MString 0); + Tokens.TEXT_MARKER + | _ -> f buffer + in + + { + start_string; + wrap; + end_string = + (fun buffer -> + let lexbuf = Lexbuf.buffer buffer in + match%sedlex lexbuf with + | single_quote -> + Lexbuf.leave_state buffer; + TEXT_MARKER + | _ -> raise Not_found); + } + +and dQuotedStringWraper : Lexbuf.stringWraper = + let rec wrap f buf buffer = + let lexbuf = Lexbuf.buffer buffer in + match%sedlex lexbuf with + | "\"\"" -> + Buffer.add_char buf '"'; + wrap f buf buffer + | double_quote -> leave_text dQuotedStringWraper buf buffer + | _ -> f buf buffer + in + + let rec start_string f buffer = + let lexbuf = Lexbuf.buffer buffer in + match%sedlex lexbuf with + | spaces -> start_string f buffer + | single_quote -> + Lexbuf.enter_state buffer (Lexbuf.String nestedQuotedStringWraper); + Tokens.TEXT_MARKER + | '{' -> + Lexbuf.enter_state buffer (Lexbuf.MString 0); + Tokens.TEXT_MARKER + | _ -> f buffer + in + + { + start_string; + wrap; + end_string = + (fun buffer -> + let lexbuf = Lexbuf.buffer buffer in + match%sedlex lexbuf with + | double_quote -> + Lexbuf.leave_state buffer; + TEXT_MARKER + | _ -> raise Not_found); + } + +let defaultWraper : Lexbuf.stringWraper = + let rec start_string f buffer = + let lexbuf = Lexbuf.buffer buffer in + match%sedlex lexbuf with + | spaces -> start_string f buffer + | single_quote -> + Lexbuf.enter_state buffer (Lexbuf.String quotedStringWraper); + Tokens.TEXT_MARKER + | double_quote -> + Lexbuf.enter_state buffer (Lexbuf.String dQuotedStringWraper); + Tokens.TEXT_MARKER + | '{' -> + Lexbuf.enter_state buffer (Lexbuf.MString 0); + Tokens.TEXT_MARKER + | _ -> f buffer + in + { + start_string; + wrap = (fun _f _buf _buffer -> raise Out_of_context); + end_string = (fun _buffer -> raise Out_of_context); + } + +let readLongStringWraper : Lexbuf.stringWraper = + { + defaultWraper with + end_string = + (fun buffer -> + let lexbuf = Lexbuf.buffer buffer in + match%sedlex lexbuf with + | "}" -> + Lexbuf.leave_state buffer; + TEXT_MARKER + | _ -> raise Not_found); + } diff --git a/lib/qparser/lex_state.mli b/lib/qparser/lex_state.mli new file mode 100644 index 0000000..1e69faf --- /dev/null +++ b/lib/qparser/lex_state.mli @@ -0,0 +1,14 @@ +(** This module keep a track of the different way to start, escape and end a + string in the lexer. + + Depending on how the string was started (a single quote or double quote), + we have differents caracters for every of thoses actions. +*) + +val defaultWraper : Lexbuf.stringWraper +(** The default string lexer. Used when we start the lexing. *) + +val quotedStringWraper : Lexbuf.stringWraper +val nestedQuotedStringWraper : Lexbuf.stringWraper +val dQuotedStringWraper : Lexbuf.stringWraper +val readLongStringWraper : Lexbuf.stringWraper diff --git a/lib/qparser/lexbuf.ml b/lib/qparser/lexbuf.ml index 9498f4a..2433ea5 100644 --- a/lib/qparser/lexbuf.ml +++ b/lib/qparser/lexbuf.ml @@ -1,11 +1,3 @@ -type state = - | Token - | String - | DString - | MString of int - | EndString - | Expression - type t = { buffer : Sedlexing.lexbuf; mutable start_p : Lexing.position option; @@ -13,6 +5,29 @@ type t = { reset_line : bool; } +and lexer = t -> Tokens.token +and buffer_builder = Buffer.t -> lexer + +and stringWraper = { + start_string : lexer -> lexer; + (** Start a new string. This function is used insed the token lexer, in + order to identify how to start a new string *) + wrap : buffer_builder -> buffer_builder; + (** function used to escape the character and add it to the buffer. This + function is used inside the string lexer. *) + end_string : lexer; + (** Function used to match the end of the string. This function is used + after the string lexer, in order to identify the end patten for a + string *) +} + +and state = + | Token of stringWraper + | String of stringWraper + | MString of int + | EndString of stringWraper + | Expression + let state : t -> state option = fun t -> Stack.top_opt t.state let enter_state : t -> state -> unit = fun t state -> Stack.push state t.state let leave_state : t -> unit = fun t -> ignore (Stack.pop_opt t.state) diff --git a/lib/qparser/lexbuf.mli b/lib/qparser/lexbuf.mli index 5fda8ff..dd13da4 100644 --- a/lib/qparser/lexbuf.mli +++ b/lib/qparser/lexbuf.mli @@ -43,12 +43,28 @@ val rollback : t -> unit using a stack for determining the token to send. *) +type lexer = t -> Tokens.token +type buffer_builder = Buffer.t -> lexer + +type stringWraper = { + start_string : lexer -> lexer; + (** Start a new string. This function is used insed the token lexer, in + order to identify how to start a new string *) + wrap : buffer_builder -> buffer_builder; + (** function used to escape the character and add it to the buffer. This + function is used inside the string lexer. *) + end_string : lexer; + (** Function used to match the end of the string. This function is used + after the string lexer, in order to identify the end patten for a + string *) +} + type state = - | Token (** Default state, parsing the tokens *) - | String (** String enclosed by [''] *) - | DString (** String enclosed by [""] *) + | Token of stringWraper (** Default state, parsing the tokens *) + | String of stringWraper (** String enclosed by [''] *) | MString of int (** String enclosed by [{}]*) - | EndString (** State raised just before closing the string *) + | EndString of stringWraper + (** State raised just before closing the string *) | Expression (** Expression where [!] is an operator *) val state : t -> state option diff --git a/lib/qparser/lexer.ml b/lib/qparser/lexer.ml index 7878299..114846c 100644 --- a/lib/qparser/lexer.ml +++ b/lib/qparser/lexer.ml @@ -53,7 +53,6 @@ let wait_balance : (Buffer.t -> Lexbuf.t -> 'a) -> Lexbuf.t -> 'a = with Not_found -> raise UnclosedQuote let space = [%sedlex.regexp? ' ' | '\t'] -let spaces = [%sedlex.regexp? Plus space] let eol = [%sedlex.regexp? '\r' | '\n' | "\r\n"] let coma = [%sedlex.regexp? ','] let digit = [%sedlex.regexp? '0' .. '9'] @@ -63,29 +62,6 @@ let location_ident = [%sedlex.regexp? letters | digit] let location_prefix = [%sedlex.regexp? '!' | '$' | '#' | '^'] let location = [%sedlex.regexp? Opt location_prefix, Plus location_ident] -(** Change the state when we are ending a string. Send the text marker to the - parser in order to tell the string is over. - - This can work because the state EndString is only raised when the same - token is fetched inside the appropriate sting method lexer. The - [Lexbuf.rollback] function is called in order to let the same token occur - again. - *) -let end_string : Lexbuf.t -> token = - fun buffer -> - let lexbuf = Lexbuf.buffer buffer in - match%sedlex lexbuf with - | '}' -> - Lexbuf.leave_state buffer; - TEXT_MARKER - | '\'' -> - Lexbuf.leave_state buffer; - TEXT_MARKER - | '"' -> - Lexbuf.leave_state buffer; - TEXT_MARKER - | _ -> raise Not_found - let rec read_long_string level buf buffer = let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with @@ -96,7 +72,8 @@ let rec read_long_string level buf buffer = match level with | 0 -> Lexbuf.leave_state buffer; - Lexbuf.enter_state buffer Lexbuf.EndString; + Lexbuf.enter_state buffer + (Lexbuf.EndString Lex_state.readLongStringWraper); Lexbuf.rollback buffer; LITERAL (Buffer.contents buf) | _ -> @@ -111,55 +88,47 @@ let rec read_long_string level buf buffer = read_long_string level buf buffer | _ -> raise Not_found -let rec read_dquoted_string buf buffer = +(** Read the text inside a ['] *) +let rec read_quoted_string f buf buffer = let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with - | "\"\"" -> - Buffer.add_char buf '"'; - read_dquoted_string buf buffer - | '"' -> - Lexbuf.leave_state buffer; - Lexbuf.enter_state buffer Lexbuf.EndString; - Lexbuf.rollback buffer; - LITERAL (Buffer.contents buf) - | any -> - Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_dquoted_string buf buffer - | _ -> raise Not_found - -let rec read_quoted_string buf buffer = - let lexbuf = Lexbuf.buffer buffer in - match%sedlex lexbuf with - | "''" -> - Buffer.add_char buf '\''; - read_quoted_string buf buffer - | '\'' -> - Lexbuf.leave_state buffer; - Lexbuf.enter_state buffer Lexbuf.EndString; - Lexbuf.rollback buffer; - LITERAL (Buffer.contents buf) | eol -> Buffer.add_string buf (Lexbuf.content buffer); - read_quoted_string buf buffer + (f (read_quoted_string f)) buf buffer | any -> Buffer.add_string buf (Lexbuf.content buffer); - read_quoted_string buf buffer - | _ -> raise Not_found + (f (read_quoted_string f)) buf buffer + | _ -> + let location, _ = Lexbuf.positions buffer in + let line = location.Lexing.pos_lnum + and file = location.Lexing.pos_fname in + Format.eprintf "read_quoted_string : %s:%d\n" file line; + raise Not_found let rec skip_comment buffer = let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | '{' -> let _ = wait_balance (read_long_string 0) buffer in - let _ = end_string buffer in + let _ = Lex_state.readLongStringWraper.end_string buffer in skip_comment buffer | '\'' -> - let _ = wait_balance read_quoted_string buffer in - let _ = end_string buffer in + let _ = + wait_balance + (Lex_state.quotedStringWraper.wrap + (read_quoted_string Lex_state.quotedStringWraper.wrap)) + buffer + in + let _ = Lex_state.quotedStringWraper.end_string buffer in skip_comment buffer | '"' -> - let _ = wait_balance read_dquoted_string buffer in - let _ = end_string buffer in + let _ = + wait_balance + (Lex_state.dQuotedStringWraper.wrap + (read_quoted_string Lex_state.dQuotedStringWraper.wrap)) + buffer + in + let _ = Lex_state.dQuotedStringWraper.end_string buffer in skip_comment buffer | eol -> (* Ugly hack used in order to put the eol in the front of the next @@ -170,13 +139,13 @@ let rec skip_comment buffer = | _ -> raise Not_found (** Main lexer *) -let rec token : Lexbuf.t -> token = +let rec parse_token : Lexbuf.t -> token = fun buffer -> let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | 0Xfeff -> (* Ignore the BOM *) - token buffer + parse_token buffer | '#', Star space, location -> (* Extract the location name *) let ident = Lexbuf.content buffer in @@ -215,7 +184,7 @@ let rec token : Lexbuf.t -> token = R_PAREN | ">>" -> Lexbuf.leave_state buffer; - token buffer + parse_token buffer | '<' -> LT | '>' -> GT | coma -> COMA @@ -235,16 +204,6 @@ let rec token : Lexbuf.t -> token = match Lexbuf.state buffer with | Some Lexbuf.Expression -> EXCLAMATION | _ -> skip_comment buffer) - | spaces -> token buffer - | '\'' -> - Lexbuf.enter_state buffer Lexbuf.String; - TEXT_MARKER - | '"' -> - Lexbuf.enter_state buffer Lexbuf.DString; - TEXT_MARKER - | '{' -> - Lexbuf.enter_state buffer (Lexbuf.MString 0); - TEXT_MARKER | '}' -> TEXT_MARKER | eof -> raise EOF | _ -> @@ -254,20 +213,27 @@ let rec token : Lexbuf.t -> token = let main buffer = match Lexbuf.state buffer with - | Some Lexbuf.String -> wait_balance read_quoted_string buffer - | Some Lexbuf.DString -> wait_balance read_dquoted_string buffer + | Some (Lexbuf.String w) -> + wait_balance (w.wrap @@ read_quoted_string w.wrap) buffer | Some (Lexbuf.MString level) -> wait_balance (read_long_string level) buffer - | Some Lexbuf.EndString -> end_string buffer - | _ -> token buffer + | Some (Lexbuf.EndString w) -> w.end_string buffer + | Some (Lexbuf.Token w) -> w.start_string parse_token buffer + | _ -> Lex_state.defaultWraper.start_string parse_token buffer let rec discard buffer = let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | '\'' -> - ignore (wait_balance read_quoted_string buffer); + ignore + (wait_balance + (read_quoted_string Lex_state.quotedStringWraper.wrap) + buffer); discard buffer | '"' -> - ignore (wait_balance read_dquoted_string buffer); + ignore + (wait_balance + (read_quoted_string Lex_state.quotedStringWraper.wrap) + buffer); discard buffer | '{' -> ignore (wait_balance (read_long_string 0) buffer); diff --git a/lib/qparser/lexer.mli b/lib/qparser/lexer.mli index 26d59cb..be10ddb 100644 --- a/lib/qparser/lexer.mli +++ b/lib/qparser/lexer.mli @@ -1,3 +1,10 @@ +(** Provide a lexer for the langage. The function [main] read the source and + identify the next to token to give to the parser. + + Personal note: parsing the QSP is really complicated. The language was + designed for regex and I have to twist the lexer in order to get something + working. *) + exception EOF exception UnclosedQuote exception LexError of string diff --git a/lib/qparser/qsp_expression.mly b/lib/qparser/qsp_expression.mly index 58da39e..b470a88 100644 --- a/lib/qparser/qsp_expression.mly +++ b/lib/qparser/qsp_expression.mly @@ -34,9 +34,7 @@ op = binary_operator expr2 = expression { Analyzer.Expression.boperator $loc op expr1 expr2 } - | TEXT_MARKER - v = LITERAL - TEXT_MARKER + | v = delimited(TEXT_MARKER, LITERAL, TEXT_MARKER) { Analyzer.Expression.literal $loc v } | i = INTEGER { Analyzer.Expression.integer $loc i } | v = variable { Analyzer.Expression.ident v } diff --git a/test/syntax.ml b/test/syntax.ml index 56fac8e..a49bd1c 100644 --- a/test/syntax.ml +++ b/test/syntax.ml @@ -784,6 +784,17 @@ let test_precedence8 () = Tree.Ast.Integer (_position, "1") ) )); ] +let nested_string () = + _test_instruction + {|'Delete'|} + [ + Tree.Ast.Expression + (Tree.Ast.Literal + ( _position, + {|Delete|} + )); + ] + let test = ( "Syntax", [ @@ -848,4 +859,5 @@ let test = Alcotest.test_case "inline if else if" `Quick test_mutiple_inline_ifs; Alcotest.test_case "Precedence7" `Quick test_precedence7; Alcotest.test_case "Precedence8" `Quick test_precedence8; + Alcotest.test_case "Nested string" `Quick nested_string; ] ) -- cgit v1.2.3