From 05f74bee05c0c56da593a5e89069711d5993e3b1 Mon Sep 17 00:00:00 2001 From: Chimrod <> Date: Mon, 30 Oct 2023 08:22:39 +0100 Subject: Managed the strings in strings --- lib/qparser/dune | 1 + lib/qparser/lex_state.ml | 108 +++++++++++++++++++---------- lib/qparser/lex_state.mli | 14 +++- lib/qparser/lexbuf.ml | 23 ++++++- lib/qparser/lexbuf.mli | 8 ++- lib/qparser/lexer.ml | 168 ++++++++++++++++++++++++++++++++-------------- 6 files changed, 228 insertions(+), 94 deletions(-) (limited to 'lib/qparser') diff --git a/lib/qparser/dune b/lib/qparser/dune index f62c90e..8297268 100644 --- a/lib/qparser/dune +++ b/lib/qparser/dune @@ -4,6 +4,7 @@ str menhirLib qsp_syntax + sedlex ) (preprocess (pps sedlex.ppx diff --git a/lib/qparser/lex_state.ml b/lib/qparser/lex_state.ml index 37400e7..3cf757d 100644 --- a/lib/qparser/lex_state.ml +++ b/lib/qparser/lex_state.ml @@ -1,5 +1,14 @@ +(** This module provide functions used to parse the strings. + + *) + exception Out_of_context +let pr_err buffer = + let location, _ = Lexbuf.positions buffer in + let line = location.Lexing.pos_lnum and file = location.Lexing.pos_fname in + Format.eprintf "Error found at : %s:%d\n" file line + let space = [%sedlex.regexp? ' ' | '\t'] let spaces = [%sedlex.regexp? Plus space] let single_quote = [%sedlex.regexp? '\''] @@ -12,23 +21,21 @@ let leave_text end_wrapper buf buffer = Tokens.LITERAL (Buffer.contents buf) let rec nestedQuotedStringWraper : Lexbuf.stringWraper = - let rec start_string f buffer = + let start_string f buffer = let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with (* There is no more way to add start a quoted string here *) - | spaces -> start_string f buffer - | double_quote -> - Lexbuf.enter_state buffer (Lexbuf.String dQuotedStringWraper); - Tokens.TEXT_MARKER - | '{' -> - Lexbuf.enter_state buffer (Lexbuf.MString 0); - Tokens.TEXT_MARKER | _ -> f buffer in { start_string; - wrap = (fun _ -> raise Out_of_context); + wrap = + (fun f ?(nested = false) buf buffer -> + let lexbuf = Lexbuf.buffer buffer in + match%sedlex lexbuf with + | "''" -> leave_text nestedQuotedStringWraper buf buffer + | _ -> f ~nested buf buffer); end_string = (fun buffer -> let lexbuf = Lexbuf.buffer buffer in @@ -39,38 +46,61 @@ let rec nestedQuotedStringWraper : Lexbuf.stringWraper = | _ -> raise Not_found); } -and quotedStringWraper : Lexbuf.stringWraper = - (* This function need to be recursirve in case of successive functions to - escape *) - let rec wrap f buf buffer = +and nestedDquotedStringWraper : Lexbuf.stringWraper = + let start_string f buffer = let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with - | "''" -> - Buffer.add_char buf '\''; - wrap f buf buffer - | single_quote -> leave_text quotedStringWraper buf buffer - | _ -> f buf buffer + (* There is no more way to add start a quoted string here *) + | _ -> f buffer in + let rec wrap (f : Lexbuf.buffer_builder) ?(nested = false) buf buffer = + let lexbuf = Lexbuf.buffer buffer in + match%sedlex lexbuf with + | double_quote -> + Buffer.add_char buf '"'; + wrap ~nested:true f buf buffer + | double_quote, double_quote -> + leave_text nestedDquotedStringWraper buf buffer + | _ -> f ~nested buf buffer + in + { + start_string; + wrap; + end_string = + (fun buffer -> + let lexbuf = Lexbuf.buffer buffer in + match%sedlex lexbuf with + | double_quote, double_quote -> + Lexbuf.leave_state buffer; + TEXT_MARKER + | _ -> raise Not_found); + } + +and quotedStringWraper : Lexbuf.stringWraper = let rec start_string f buffer = let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | spaces -> start_string f buffer - | "''" -> + | single_quote -> raise Out_of_context + | single_quote, single_quote -> Lexbuf.enter_state buffer (Lexbuf.String nestedQuotedStringWraper); Tokens.TEXT_MARKER - | double_quote -> - Lexbuf.enter_state buffer (Lexbuf.String dQuotedStringWraper); - Tokens.TEXT_MARKER - | '{' -> - Lexbuf.enter_state buffer (Lexbuf.MString 0); - Tokens.TEXT_MARKER | _ -> f buffer in + let rec quoted_wrap (f : Lexbuf.buffer_builder) ?(nested = false) buf buffer = + let lexbuf = Lexbuf.buffer buffer in + match%sedlex lexbuf with + | single_quote, single_quote -> + Buffer.add_char buf '\''; + quoted_wrap ~nested f buf buffer + | single_quote -> leave_text quotedStringWraper buf buffer + | _ -> f ~nested buf buffer + in { start_string; - wrap; + wrap = quoted_wrap; end_string = (fun buffer -> let lexbuf = Lexbuf.buffer buffer in @@ -78,29 +108,32 @@ and quotedStringWraper : Lexbuf.stringWraper = | single_quote -> Lexbuf.leave_state buffer; TEXT_MARKER - | _ -> raise Not_found); + | _ -> + pr_err buffer; + raise Not_found); } and dQuotedStringWraper : Lexbuf.stringWraper = - let rec wrap f buf buffer = + let rec wrap (f : Lexbuf.buffer_builder) ?(nested = false) buf buffer = let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with - | "\"\"" -> + | single_quote -> + Buffer.add_char buf '\''; + wrap ~nested:true f buf buffer + | double_quote, double_quote -> Buffer.add_char buf '"'; wrap f buf buffer | double_quote -> leave_text dQuotedStringWraper buf buffer - | _ -> f buf buffer + | _ -> f ~nested buf buffer in let rec start_string f buffer = let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | spaces -> start_string f buffer - | single_quote -> - Lexbuf.enter_state buffer (Lexbuf.String nestedQuotedStringWraper); - Tokens.TEXT_MARKER - | '{' -> - Lexbuf.enter_state buffer (Lexbuf.MString 0); + | double_quote -> raise Out_of_context + | double_quote, double_quote -> + Lexbuf.enter_state buffer (Lexbuf.String nestedDquotedStringWraper); Tokens.TEXT_MARKER | _ -> f buffer in @@ -136,7 +169,10 @@ let defaultWraper : Lexbuf.stringWraper = in { start_string; - wrap = (fun _f _buf _buffer -> raise Out_of_context); + wrap = + (fun _f ?nested _buf _buffer -> + ignore nested; + raise Out_of_context); end_string = (fun _buffer -> raise Out_of_context); } diff --git a/lib/qparser/lex_state.mli b/lib/qparser/lex_state.mli index 1e69faf..a3d1ed0 100644 --- a/lib/qparser/lex_state.mli +++ b/lib/qparser/lex_state.mli @@ -1,14 +1,24 @@ (** This module keep a track of the different way to start, escape and end a string in the lexer. + When a new string should be started ? + Which sequence identify the end of the string + How to handle the escaped characters inside this string + … + Depending on how the string was started (a single quote or double quote), we have differents caracters for every of thoses actions. -*) + + + The defaultWraper is used in any case, and other wrapper are stacked above + when needed. *) val defaultWraper : Lexbuf.stringWraper (** The default string lexer. Used when we start the lexing. *) val quotedStringWraper : Lexbuf.stringWraper -val nestedQuotedStringWraper : Lexbuf.stringWraper val dQuotedStringWraper : Lexbuf.stringWraper val readLongStringWraper : Lexbuf.stringWraper + +exception Out_of_context +(** This exception should not be raised in a normal situation. *) diff --git a/lib/qparser/lexbuf.ml b/lib/qparser/lexbuf.ml index 2433ea5..1d93f67 100644 --- a/lib/qparser/lexbuf.ml +++ b/lib/qparser/lexbuf.ml @@ -1,3 +1,5 @@ +open StdLabels + type t = { buffer : Sedlexing.lexbuf; mutable start_p : Lexing.position option; @@ -6,7 +8,7 @@ type t = { } and lexer = t -> Tokens.token -and buffer_builder = Buffer.t -> lexer +and buffer_builder = ?nested:bool -> Buffer.t -> lexer and stringWraper = { start_string : lexer -> lexer; @@ -22,15 +24,22 @@ and stringWraper = { } and state = - | Token of stringWraper + | Token | String of stringWraper | MString of int | EndString of stringWraper | Expression +let pp_state format = function + | Token -> Format.fprintf format "Token" + | String _ -> Format.fprintf format "String" + | MString _ -> Format.fprintf format "MString" + | EndString _ -> Format.fprintf format "EndString" + | Expression -> Format.fprintf format "Expression" + let state : t -> state option = fun t -> Stack.top_opt t.state let enter_state : t -> state -> unit = fun t state -> Stack.push state t.state -let leave_state : t -> unit = fun t -> ignore (Stack.pop_opt t.state) +let leave_state : t -> unit = fun t -> ignore @@ Stack.pop_opt t.state let buffer : t -> Sedlexing.lexbuf = fun t -> t.buffer let start : t -> unit = @@ -80,3 +89,11 @@ let tokenize : (t -> 'a) -> t -> unit -> 'a * Lexing.position * Lexing.position lexer let rollback : t -> unit = fun t -> Sedlexing.rollback t.buffer + +let overlay : t -> lexer -> lexer = + fun t lexer -> + let rev_list = Stack.fold (fun acc a -> a :: acc) [] t.state in + List.fold_left rev_list ~init:lexer ~f:(fun (acc : lexer) layer -> + match layer with + | String wraper | EndString wraper -> wraper.start_string acc + | _ -> acc) diff --git a/lib/qparser/lexbuf.mli b/lib/qparser/lexbuf.mli index dd13da4..ac3b262 100644 --- a/lib/qparser/lexbuf.mli +++ b/lib/qparser/lexbuf.mli @@ -44,7 +44,7 @@ val rollback : t -> unit *) type lexer = t -> Tokens.token -type buffer_builder = Buffer.t -> lexer +and buffer_builder = ?nested:bool -> Buffer.t -> t -> Tokens.token type stringWraper = { start_string : lexer -> lexer; @@ -60,13 +60,15 @@ type stringWraper = { } type state = - | Token of stringWraper (** Default state, parsing the tokens *) + | Token (** Default state, parsing the tokens *) | String of stringWraper (** String enclosed by [''] *) | MString of int (** String enclosed by [{}]*) | EndString of stringWraper (** State raised just before closing the string *) | Expression (** Expression where [!] is an operator *) +val pp_state : Format.formatter -> state -> unit + val state : t -> state option (** Get the current state for the lexer. @@ -77,3 +79,5 @@ val enter_state : t -> state -> unit val leave_state : t -> unit (** Leave the current state *) + +val overlay : t -> lexer -> lexer diff --git a/lib/qparser/lexer.ml b/lib/qparser/lexer.ml index 114846c..8f3645c 100644 --- a/lib/qparser/lexer.ml +++ b/lib/qparser/lexer.ml @@ -3,11 +3,17 @@ *) open Tokens +open StdLabels exception UnclosedQuote exception LexError of string exception EOF +let pr_err buffer = + let location, _ = Lexbuf.positions buffer in + let line = location.Lexing.pos_lnum and file = location.Lexing.pos_fname in + Format.eprintf "read_quoted_string : %s:%d\n" file line + (* Extract the location name from the pattern *) let location_name = Str.regexp {|.* \(.*\)|} @@ -62,12 +68,26 @@ let location_ident = [%sedlex.regexp? letters | digit] let location_prefix = [%sedlex.regexp? '!' | '$' | '#' | '^'] let location = [%sedlex.regexp? Opt location_prefix, Plus location_ident] -let rec read_long_string level buf buffer = +let rec read_long_string ?(nested = false) level buf buffer = let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with + | "<<" -> + if not nested then ( + match Buffer.length buf with + | 0 -> + Lexbuf.enter_state buffer Lexbuf.Token; + ENTER_EMBED + | _ -> + let result = Tokens.LITERAL (Buffer.contents buf) in + Buffer.reset buf; + Lexbuf.rollback buffer; + result) + else ( + Buffer.add_string buf (Lexbuf.content buffer); + read_long_string ~nested level buf buffer) | '{' -> Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_long_string (level + 1) buf buffer + read_long_string ~nested (level + 1) buf buffer | '}' -> ( match level with | 0 -> @@ -79,64 +99,87 @@ let rec read_long_string level buf buffer = | _ -> (* We have nested strings. Do not terminate end *) Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_long_string (level - 1) buf buffer) + read_long_string ~nested (level - 1) buf buffer) + | '\'' | '"' -> + Buffer.add_string buf (Lexbuf.content buffer); + read_long_string ~nested:true level buf buffer | eol -> Buffer.add_string buf (Lexbuf.content buffer); - read_long_string level buf buffer + read_long_string ~nested level buf buffer | any -> Buffer.add_string buf (Lexbuf.content buffer); - read_long_string level buf buffer - | _ -> raise Not_found + read_long_string ~nested level buf buffer + | _ -> + pr_err buffer; + raise Not_found (** Read the text inside a ['] *) -let rec read_quoted_string f buf buffer = +let rec read_quoted_string : Lexbuf.stringWraper -> Lexbuf.buffer_builder = + fun f ?(nested = false) buf buffer -> let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with - | eol -> + | "<<" -> + if not nested then ( + match Buffer.length buf with + | 0 -> + Lexbuf.enter_state buffer Lexbuf.Token; + ENTER_EMBED + | _ -> + let result = Tokens.LITERAL (Buffer.contents buf) in + Buffer.reset buf; + Lexbuf.rollback buffer; + result) + else ( + Buffer.add_string buf (Lexbuf.content buffer); + (f.wrap (read_quoted_string f)) buf buffer) + | eol | any -> Buffer.add_string buf (Lexbuf.content buffer); - (f (read_quoted_string f)) buf buffer - | any -> - Buffer.add_string buf (Lexbuf.content buffer); - (f (read_quoted_string f)) buf buffer + (f.wrap (read_quoted_string f)) buf buffer | _ -> - let location, _ = Lexbuf.positions buffer in - let line = location.Lexing.pos_lnum - and file = location.Lexing.pos_fname in - Format.eprintf "read_quoted_string : %s:%d\n" file line; + pr_err buffer; raise Not_found let rec skip_comment buffer = + (* Simplified way to skip the content of a string until the end marker. + (expect the string to be well formed) *) + let rec parse_until_end f = + let _ = wait_balance f buffer in + match Lexbuf.state buffer with + | Some Lexbuf.Token -> + Lexbuf.leave_state buffer; + parse_until_end f + | Some (Lexbuf.EndString _) -> () + | _ -> parse_until_end f + in let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | '{' -> - let _ = wait_balance (read_long_string 0) buffer in + parse_until_end (read_long_string 0); let _ = Lex_state.readLongStringWraper.end_string buffer in skip_comment buffer | '\'' -> - let _ = - wait_balance - (Lex_state.quotedStringWraper.wrap - (read_quoted_string Lex_state.quotedStringWraper.wrap)) - buffer - in + parse_until_end + (Lex_state.quotedStringWraper.wrap + (read_quoted_string Lex_state.quotedStringWraper)); let _ = Lex_state.quotedStringWraper.end_string buffer in skip_comment buffer | '"' -> - let _ = - wait_balance - (Lex_state.dQuotedStringWraper.wrap - (read_quoted_string Lex_state.dQuotedStringWraper.wrap)) - buffer - in + parse_until_end + (Lex_state.dQuotedStringWraper.wrap + (read_quoted_string Lex_state.dQuotedStringWraper)); let _ = Lex_state.dQuotedStringWraper.end_string buffer in skip_comment buffer | eol -> (* Ugly hack used in order to put the eol in the front of the next - parsing. *) + parsing. + This is required because the eol is also a part of the syntax, and do + cannot be discard with the end of the comment. *) Lexbuf.rollback buffer; COMMENT | any -> skip_comment buffer - | _ -> raise Not_found + | _ -> + pr_err buffer; + raise Not_found (** Main lexer *) let rec parse_token : Lexbuf.t -> token = @@ -183,19 +226,20 @@ let rec parse_token : Lexbuf.t -> token = Lexbuf.leave_state buffer; R_PAREN | ">>" -> + (* Leave the expression if we have any*) + leave_expression buffer; + (* Now leave the token mode and return to the string *) Lexbuf.leave_state buffer; - parse_token buffer + LEAVE_EMBED | '<' -> LT | '>' -> GT | coma -> COMA | '=' -> Lexbuf.enter_state buffer Lexbuf.Expression; - EQUAL | ident -> build_ident buffer | eol -> leave_expression buffer; - EOL | '&' -> leave_expression buffer; @@ -214,31 +258,53 @@ let rec parse_token : Lexbuf.t -> token = let main buffer = match Lexbuf.state buffer with | Some (Lexbuf.String w) -> - wait_balance (w.wrap @@ read_quoted_string w.wrap) buffer + wait_balance (w.wrap @@ read_quoted_string w) buffer | Some (Lexbuf.MString level) -> wait_balance (read_long_string level) buffer | Some (Lexbuf.EndString w) -> w.end_string buffer - | Some (Lexbuf.Token w) -> w.start_string parse_token buffer - | _ -> Lex_state.defaultWraper.start_string parse_token buffer + | _ -> + let parser = + parse_token |> Lex_state.defaultWraper.start_string + |> Lexbuf.overlay buffer + in + parser buffer let rec discard buffer = let lexbuf = Lexbuf.buffer buffer in + match%sedlex lexbuf with - | '\'' -> - ignore - (wait_balance - (read_quoted_string Lex_state.quotedStringWraper.wrap) - buffer); - discard buffer - | '"' -> - ignore - (wait_balance - (read_quoted_string Lex_state.quotedStringWraper.wrap) - buffer); - discard buffer + | '\'' -> ( + match Lexbuf.state buffer with + | Some (Lexbuf.String _) -> + (* If we are inside a string, close it. *) + Lexbuf.leave_state buffer; + discard buffer + | _ -> + (* Otherwise wait skip until the end of the starting one *) + ignore + (read_quoted_string Lex_state.quotedStringWraper (Buffer.create 16) + buffer); + discard buffer) + | '"' -> ( + match Lexbuf.state buffer with + | Some (Lexbuf.String _) -> + Lexbuf.leave_state buffer; + discard buffer + | _ -> + ignore + (read_quoted_string Lex_state.quotedStringWraper (Buffer.create 16) + buffer); + discard buffer) + | '}' -> ( + match Lexbuf.state buffer with + | Some (Lexbuf.MString _) -> + Lexbuf.leave_state buffer; + discard buffer + | _ -> + ignore (read_long_string 0 (Buffer.create 16) buffer); + discard buffer) | '{' -> - ignore (wait_balance (read_long_string 0) buffer); + ignore (read_long_string 0 (Buffer.create 16) buffer); discard buffer - | eof -> raise EOF | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> leave_expression buffer; () -- cgit v1.2.3