From dd060261e35fcb8a57f03b01dbe84ab772a2a199 Mon Sep 17 00:00:00 2001
From: Chimrod <>
Date: Sat, 28 Oct 2023 16:47:23 +0200
Subject: Set up a context for parsing the literal strings

---
 lib/qparser/lex_state.ml       | 154 +++++++++++++++++++++++++++++++++++++++++
 lib/qparser/lex_state.mli      |  14 ++++
 lib/qparser/lexbuf.ml          |  31 ++++++---
 lib/qparser/lexbuf.mli         |  24 +++++--
 lib/qparser/lexer.ml           | 120 ++++++++++++--------------------
 lib/qparser/lexer.mli          |   7 ++
 lib/qparser/qsp_expression.mly |   4 +-
 7 files changed, 262 insertions(+), 92 deletions(-)
 create mode 100644 lib/qparser/lex_state.ml
 create mode 100644 lib/qparser/lex_state.mli

(limited to 'lib')

diff --git a/lib/qparser/lex_state.ml b/lib/qparser/lex_state.ml
new file mode 100644
index 0000000..37400e7
--- /dev/null
+++ b/lib/qparser/lex_state.ml
@@ -0,0 +1,154 @@
+exception Out_of_context
+
+let space = [%sedlex.regexp? ' ' | '\t']
+let spaces = [%sedlex.regexp? Plus space]
+let single_quote = [%sedlex.regexp? '\'']
+let double_quote = [%sedlex.regexp? '"']
+
+let leave_text end_wrapper buf buffer =
+  Lexbuf.leave_state buffer;
+  Lexbuf.enter_state buffer (Lexbuf.EndString end_wrapper);
+  Lexbuf.rollback buffer;
+  Tokens.LITERAL (Buffer.contents buf)
+
+let rec nestedQuotedStringWraper : Lexbuf.stringWraper =
+  let rec start_string f buffer =
+    let lexbuf = Lexbuf.buffer buffer in
+    match%sedlex lexbuf with
+    (* There is no more way to add start a quoted string here *)
+    | spaces -> start_string f buffer
+    | double_quote ->
+        Lexbuf.enter_state buffer (Lexbuf.String dQuotedStringWraper);
+        Tokens.TEXT_MARKER
+    | '{' ->
+        Lexbuf.enter_state buffer (Lexbuf.MString 0);
+        Tokens.TEXT_MARKER
+    | _ -> f buffer
+  in
+
+  {
+    start_string;
+    wrap = (fun _ -> raise Out_of_context);
+    end_string =
+      (fun buffer ->
+        let lexbuf = Lexbuf.buffer buffer in
+        match%sedlex lexbuf with
+        | "''" ->
+            Lexbuf.leave_state buffer;
+            TEXT_MARKER
+        | _ -> raise Not_found);
+  }
+
+and quotedStringWraper : Lexbuf.stringWraper =
+  (* This function need to be recursirve in case of successive functions to
+     escape *)
+  let rec wrap f buf buffer =
+    let lexbuf = Lexbuf.buffer buffer in
+    match%sedlex lexbuf with
+    | "''" ->
+        Buffer.add_char buf '\'';
+        wrap f buf buffer
+    | single_quote -> leave_text quotedStringWraper buf buffer
+    | _ -> f buf buffer
+  in
+
+  let rec start_string f buffer =
+    let lexbuf = Lexbuf.buffer buffer in
+    match%sedlex lexbuf with
+    | spaces -> start_string f buffer
+    | "''" ->
+        Lexbuf.enter_state buffer (Lexbuf.String nestedQuotedStringWraper);
+        Tokens.TEXT_MARKER
+    | double_quote ->
+        Lexbuf.enter_state buffer (Lexbuf.String dQuotedStringWraper);
+        Tokens.TEXT_MARKER
+    | '{' ->
+        Lexbuf.enter_state buffer (Lexbuf.MString 0);
+        Tokens.TEXT_MARKER
+    | _ -> f buffer
+  in
+
+  {
+    start_string;
+    wrap;
+    end_string =
+      (fun buffer ->
+        let lexbuf = Lexbuf.buffer buffer in
+        match%sedlex lexbuf with
+        | single_quote ->
+            Lexbuf.leave_state buffer;
+            TEXT_MARKER
+        | _ -> raise Not_found);
+  }
+
+and dQuotedStringWraper : Lexbuf.stringWraper =
+  let rec wrap f buf buffer =
+    let lexbuf = Lexbuf.buffer buffer in
+    match%sedlex lexbuf with
+    | "\"\"" ->
+        Buffer.add_char buf '"';
+        wrap f buf buffer
+    | double_quote -> leave_text dQuotedStringWraper buf buffer
+    | _ -> f buf buffer
+  in
+
+  let rec start_string f buffer =
+    let lexbuf = Lexbuf.buffer buffer in
+    match%sedlex lexbuf with
+    | spaces -> start_string f buffer
+    | single_quote ->
+        Lexbuf.enter_state buffer (Lexbuf.String nestedQuotedStringWraper);
+        Tokens.TEXT_MARKER
+    | '{' ->
+        Lexbuf.enter_state buffer (Lexbuf.MString 0);
+        Tokens.TEXT_MARKER
+    | _ -> f buffer
+  in
+
+  {
+    start_string;
+    wrap;
+    end_string =
+      (fun buffer ->
+        let lexbuf = Lexbuf.buffer buffer in
+        match%sedlex lexbuf with
+        | double_quote ->
+            Lexbuf.leave_state buffer;
+            TEXT_MARKER
+        | _ -> raise Not_found);
+  }
+
+let defaultWraper : Lexbuf.stringWraper =
+  let rec start_string f buffer =
+    let lexbuf = Lexbuf.buffer buffer in
+    match%sedlex lexbuf with
+    | spaces -> start_string f buffer
+    | single_quote ->
+        Lexbuf.enter_state buffer (Lexbuf.String quotedStringWraper);
+        Tokens.TEXT_MARKER
+    | double_quote ->
+        Lexbuf.enter_state buffer (Lexbuf.String dQuotedStringWraper);
+        Tokens.TEXT_MARKER
+    | '{' ->
+        Lexbuf.enter_state buffer (Lexbuf.MString 0);
+        Tokens.TEXT_MARKER
+    | _ -> f buffer
+  in
+  {
+    start_string;
+    wrap = (fun _f _buf _buffer -> raise Out_of_context);
+    end_string = (fun _buffer -> raise Out_of_context);
+  }
+
+let readLongStringWraper : Lexbuf.stringWraper =
+  {
+    defaultWraper with
+    end_string =
+      (fun buffer ->
+        let lexbuf = Lexbuf.buffer buffer in
+        match%sedlex lexbuf with
+        | "}" ->
+            Lexbuf.leave_state buffer;
+            TEXT_MARKER
+        | _ -> raise Not_found);
+  }
diff --git a/lib/qparser/lex_state.mli b/lib/qparser/lex_state.mli
new file mode 100644
index 0000000..1e69faf
--- /dev/null
+++ b/lib/qparser/lex_state.mli
@@ -0,0 +1,14 @@
+(** This module keep a track of the different way to start, escape and end a
+    string in the lexer. 
+
+    Depending on how the string was started (a single quote or double quote),
+    we have differents caracters for every of thoses actions.
+*)
+
+val defaultWraper : Lexbuf.stringWraper
+(** The default string lexer. Used when we start the lexing. *)
+
+val quotedStringWraper : Lexbuf.stringWraper
+val nestedQuotedStringWraper : Lexbuf.stringWraper
+val dQuotedStringWraper : Lexbuf.stringWraper
+val readLongStringWraper : Lexbuf.stringWraper
diff --git a/lib/qparser/lexbuf.ml b/lib/qparser/lexbuf.ml
index 9498f4a..2433ea5 100644
--- a/lib/qparser/lexbuf.ml
+++ b/lib/qparser/lexbuf.ml
@@ -1,11 +1,3 @@
-type state =
-  | Token
-  | String
-  | DString
-  | MString of int
-  | EndString
-  | Expression
-
 type t = {
   buffer : Sedlexing.lexbuf;
   mutable start_p : Lexing.position option;
@@ -13,6 +5,29 @@ type t = {
   reset_line : bool;
 }
 
+and lexer = t -> Tokens.token
+and buffer_builder = Buffer.t -> lexer
+
+and stringWraper = {
+  start_string : lexer -> lexer;
+      (** Start a new string. This function is used insed the token lexer, in
+          order to identify how to start a new string *)
+  wrap : buffer_builder -> buffer_builder;
+      (** function used to escape the character and add it to the buffer. This
+          function is used inside the string lexer. *)
+  end_string : lexer;
+      (** Function used to match the end of the string. This function is used
+          after the string lexer, in order to identify the end patten for a
+          string *)
+}
+
+and state =
+  | Token of stringWraper
+  | String of stringWraper
+  | MString of int
+  | EndString of stringWraper
+  | Expression
+
 let state : t -> state option = fun t -> Stack.top_opt t.state
 let enter_state : t -> state -> unit = fun t state -> Stack.push state t.state
 let leave_state : t -> unit = fun t -> ignore (Stack.pop_opt t.state)
diff --git a/lib/qparser/lexbuf.mli b/lib/qparser/lexbuf.mli
index 5fda8ff..dd13da4 100644
--- a/lib/qparser/lexbuf.mli
+++ b/lib/qparser/lexbuf.mli
@@ -43,12 +43,28 @@ val rollback : t -> unit
    using a stack for determining the token to send.
 *)
 
+type lexer = t -> Tokens.token
+type buffer_builder = Buffer.t -> lexer
+
+type stringWraper = {
+  start_string : lexer -> lexer;
+      (** Start a new string. This function is used insed the token lexer, in
+          order to identify how to start a new string *)
+  wrap : buffer_builder -> buffer_builder;
+      (** function used to escape the character and add it to the buffer. This
+          function is used inside the string lexer. *)
+  end_string : lexer;
+      (** Function used to match the end of the string. This function is used
+          after the string lexer, in order to identify the end patten for a
+          string *)
+}
+
 type state =
-  | Token  (** Default state, parsing the tokens *)
-  | String  (** String enclosed by [''] *)
-  | DString  (** String enclosed by [""] *)
+  | Token of stringWraper  (** Default state, parsing the tokens *)
+  | String of stringWraper  (** String enclosed by [''] *)
   | MString of int  (** String enclosed by [{}]*)
-  | EndString  (** State raised just before closing the string *)
+  | EndString of stringWraper
+      (** State raised just before closing the string *)
   | Expression  (** Expression where [!] is an operator *)
 
 val state : t -> state option
diff --git a/lib/qparser/lexer.ml b/lib/qparser/lexer.ml
index 7878299..114846c 100644
--- a/lib/qparser/lexer.ml
+++ b/lib/qparser/lexer.ml
@@ -53,7 +53,6 @@ let wait_balance : (Buffer.t -> Lexbuf.t -> 'a) -> Lexbuf.t -> 'a =
   with Not_found -> raise UnclosedQuote
 
 let space = [%sedlex.regexp? ' ' | '\t']
-let spaces = [%sedlex.regexp? Plus space]
 let eol = [%sedlex.regexp? '\r' | '\n' | "\r\n"]
 let coma = [%sedlex.regexp? ',']
 let digit = [%sedlex.regexp? '0' .. '9']
@@ -63,29 +62,6 @@ let location_ident = [%sedlex.regexp? letters | digit]
 let location_prefix = [%sedlex.regexp? '!' | '$' | '#' | '^']
 let location = [%sedlex.regexp? Opt location_prefix, Plus location_ident]
 
-(** Change the state when we are ending a string. Send the text marker to the
-    parser in order to tell the string is over.
-
-    This can work because the state EndString is only raised when the same
-    token is fetched inside the appropriate sting method lexer. The
-    [Lexbuf.rollback] function is called in order to let the same token occur
-    again.
- *)
-let end_string : Lexbuf.t -> token =
- fun buffer ->
-  let lexbuf = Lexbuf.buffer buffer in
-  match%sedlex lexbuf with
-  | '}' ->
-      Lexbuf.leave_state buffer;
-      TEXT_MARKER
-  | '\'' ->
-      Lexbuf.leave_state buffer;
-      TEXT_MARKER
-  | '"' ->
-      Lexbuf.leave_state buffer;
-      TEXT_MARKER
-  | _ -> raise Not_found
-
 let rec read_long_string level buf buffer =
   let lexbuf = Lexbuf.buffer buffer in
   match%sedlex lexbuf with
@@ -96,7 +72,8 @@ let rec read_long_string level buf buffer =
       match level with
       | 0 ->
           Lexbuf.leave_state buffer;
-          Lexbuf.enter_state buffer Lexbuf.EndString;
+          Lexbuf.enter_state buffer
+            (Lexbuf.EndString Lex_state.readLongStringWraper);
           Lexbuf.rollback buffer;
           LITERAL (Buffer.contents buf)
       | _ ->
@@ -111,55 +88,47 @@ let rec read_long_string level buf buffer =
       read_long_string level buf buffer
   | _ -> raise Not_found
 
-let rec read_dquoted_string buf buffer =
+(** Read the text inside a ['] *)
+let rec read_quoted_string f buf buffer =
   let lexbuf = Lexbuf.buffer buffer in
   match%sedlex lexbuf with
-  | "\"\"" ->
-      Buffer.add_char buf '"';
-      read_dquoted_string buf buffer
-  | '"' ->
-      Lexbuf.leave_state buffer;
-      Lexbuf.enter_state buffer Lexbuf.EndString;
-      Lexbuf.rollback buffer;
-      LITERAL (Buffer.contents buf)
-  | any ->
-      Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
-      read_dquoted_string buf buffer
-  | _ -> raise Not_found
-
-let rec read_quoted_string buf buffer =
-  let lexbuf = Lexbuf.buffer buffer in
-  match%sedlex lexbuf with
-  | "''" ->
-      Buffer.add_char buf '\'';
-      read_quoted_string buf buffer
-  | '\'' ->
-      Lexbuf.leave_state buffer;
-      Lexbuf.enter_state buffer Lexbuf.EndString;
-      Lexbuf.rollback buffer;
-      LITERAL (Buffer.contents buf)
   | eol ->
       Buffer.add_string buf (Lexbuf.content buffer);
-      read_quoted_string buf buffer
+      (f (read_quoted_string f)) buf buffer
   | any ->
       Buffer.add_string buf (Lexbuf.content buffer);
-      read_quoted_string buf buffer
-  | _ -> raise Not_found
+      (f (read_quoted_string f)) buf buffer
+  | _ ->
+      let location, _ = Lexbuf.positions buffer in
+      let line = location.Lexing.pos_lnum
+      and file = location.Lexing.pos_fname in
+      Format.eprintf "read_quoted_string : %s:%d\n" file line;
+      raise Not_found
 
 let rec skip_comment buffer =
   let lexbuf = Lexbuf.buffer buffer in
   match%sedlex lexbuf with
   | '{' ->
       let _ = wait_balance (read_long_string 0) buffer in
-      let _ = end_string buffer in
+      let _ = Lex_state.readLongStringWraper.end_string buffer in
       skip_comment buffer
   | '\'' ->
-      let _ = wait_balance read_quoted_string buffer in
-      let _ = end_string buffer in
+      let _ =
+        wait_balance
+          (Lex_state.quotedStringWraper.wrap
+             (read_quoted_string Lex_state.quotedStringWraper.wrap))
+          buffer
+      in
+      let _ = Lex_state.quotedStringWraper.end_string buffer in
       skip_comment buffer
   | '"' ->
-      let _ = wait_balance read_dquoted_string buffer in
-      let _ = end_string buffer in
+      let _ =
+        wait_balance
+          (Lex_state.dQuotedStringWraper.wrap
+             (read_quoted_string Lex_state.dQuotedStringWraper.wrap))
+          buffer
+      in
+      let _ = Lex_state.dQuotedStringWraper.end_string buffer in
       skip_comment buffer
   | eol ->
       (* Ugly hack used in order to put the eol in the front of the next
@@ -170,13 +139,13 @@ let rec skip_comment buffer =
   | _ -> raise Not_found
 
 (** Main lexer *)
-let rec token : Lexbuf.t -> token =
+let rec parse_token : Lexbuf.t -> token =
  fun buffer ->
   let lexbuf = Lexbuf.buffer buffer in
   match%sedlex lexbuf with
   | 0Xfeff ->
       (* Ignore the BOM *)
-      token buffer
+      parse_token buffer
   | '#', Star space, location ->
       (* Extract the location name *)
       let ident = Lexbuf.content buffer in
@@ -215,7 +184,7 @@ let rec token : Lexbuf.t -> token =
       R_PAREN
   | ">>" ->
       Lexbuf.leave_state buffer;
-      token buffer
+      parse_token buffer
   | '<' -> LT
   | '>' -> GT
   | coma -> COMA
@@ -235,16 +204,6 @@ let rec token : Lexbuf.t -> token =
       match Lexbuf.state buffer with
       | Some Lexbuf.Expression -> EXCLAMATION
       | _ -> skip_comment buffer)
-  | spaces -> token buffer
-  | '\'' ->
-      Lexbuf.enter_state buffer Lexbuf.String;
-      TEXT_MARKER
-  | '"' ->
-      Lexbuf.enter_state buffer Lexbuf.DString;
-      TEXT_MARKER
-  | '{' ->
-      Lexbuf.enter_state buffer (Lexbuf.MString 0);
-      TEXT_MARKER
   | '}' -> TEXT_MARKER
   | eof -> raise EOF
   | _ ->
@@ -254,20 +213,27 @@ let rec token : Lexbuf.t -> token =
 
 let main buffer =
   match Lexbuf.state buffer with
-  | Some Lexbuf.String -> wait_balance read_quoted_string buffer
-  | Some Lexbuf.DString -> wait_balance read_dquoted_string buffer
+  | Some (Lexbuf.String w) ->
+      wait_balance (w.wrap @@ read_quoted_string w.wrap) buffer
   | Some (Lexbuf.MString level) -> wait_balance (read_long_string level) buffer
-  | Some Lexbuf.EndString -> end_string buffer
-  | _ -> token buffer
+  | Some (Lexbuf.EndString w) -> w.end_string buffer
+  | Some (Lexbuf.Token w) -> w.start_string parse_token buffer
+  | _ -> Lex_state.defaultWraper.start_string parse_token buffer
 
 let rec discard buffer =
   let lexbuf = Lexbuf.buffer buffer in
   match%sedlex lexbuf with
   | '\'' ->
-      ignore (wait_balance read_quoted_string buffer);
+      ignore
+        (wait_balance
+           (read_quoted_string Lex_state.quotedStringWraper.wrap)
+           buffer);
       discard buffer
   | '"' ->
-      ignore (wait_balance read_dquoted_string buffer);
+      ignore
+        (wait_balance
+           (read_quoted_string Lex_state.quotedStringWraper.wrap)
+           buffer);
       discard buffer
   | '{' ->
       ignore (wait_balance (read_long_string 0) buffer);
diff --git a/lib/qparser/lexer.mli b/lib/qparser/lexer.mli
index 26d59cb..be10ddb 100644
--- a/lib/qparser/lexer.mli
+++ b/lib/qparser/lexer.mli
@@ -1,3 +1,10 @@
+(** Provide a lexer for the langage. The function [main] read the source and
+    identify the next to token to give to the parser.
+
+    Personal note: parsing the QSP is really complicated. The language was
+    designed for regex and I have to twist the lexer in order to get something
+    working. *)
+
 exception EOF
 exception UnclosedQuote
 exception LexError of string
diff --git a/lib/qparser/qsp_expression.mly b/lib/qparser/qsp_expression.mly
index 58da39e..b470a88 100644
--- a/lib/qparser/qsp_expression.mly
+++ b/lib/qparser/qsp_expression.mly
@@ -34,9 +34,7 @@
       op = binary_operator 
       expr2 = expression 
       { Analyzer.Expression.boperator $loc op expr1 expr2 }
-    | TEXT_MARKER 
-      v = LITERAL 
-      TEXT_MARKER 
+    | v = delimited(TEXT_MARKER, LITERAL, TEXT_MARKER)
       { Analyzer.Expression.literal $loc v }
     | i = INTEGER  { Analyzer.Expression.integer $loc i }
     | v = variable { Analyzer.Expression.ident v }
-- 
cgit v1.2.3