From dd060261e35fcb8a57f03b01dbe84ab772a2a199 Mon Sep 17 00:00:00 2001
From: Chimrod <>
Date: Sat, 28 Oct 2023 16:47:23 +0200
Subject: Set up a context for parsing the literal strings
---
lib/qparser/lex_state.ml | 154 +++++++++++++++++++++++++++++++++++++++++
lib/qparser/lex_state.mli | 14 ++++
lib/qparser/lexbuf.ml | 31 ++++++---
lib/qparser/lexbuf.mli | 24 +++++--
lib/qparser/lexer.ml | 120 ++++++++++++--------------------
lib/qparser/lexer.mli | 7 ++
lib/qparser/qsp_expression.mly | 4 +-
test/syntax.ml | 12 ++++
8 files changed, 274 insertions(+), 92 deletions(-)
create mode 100644 lib/qparser/lex_state.ml
create mode 100644 lib/qparser/lex_state.mli
diff --git a/lib/qparser/lex_state.ml b/lib/qparser/lex_state.ml
new file mode 100644
index 0000000..37400e7
--- /dev/null
+++ b/lib/qparser/lex_state.ml
@@ -0,0 +1,154 @@
+exception Out_of_context
+
+let space = [%sedlex.regexp? ' ' | '\t']
+let spaces = [%sedlex.regexp? Plus space]
+let single_quote = [%sedlex.regexp? '\'']
+let double_quote = [%sedlex.regexp? '"']
+
+let leave_text end_wrapper buf buffer =
+ Lexbuf.leave_state buffer;
+ Lexbuf.enter_state buffer (Lexbuf.EndString end_wrapper);
+ Lexbuf.rollback buffer;
+ Tokens.LITERAL (Buffer.contents buf)
+
+let rec nestedQuotedStringWraper : Lexbuf.stringWraper =
+ let rec start_string f buffer =
+ let lexbuf = Lexbuf.buffer buffer in
+ match%sedlex lexbuf with
+ (* There is no more way to add start a quoted string here *)
+ | spaces -> start_string f buffer
+ | double_quote ->
+ Lexbuf.enter_state buffer (Lexbuf.String dQuotedStringWraper);
+ Tokens.TEXT_MARKER
+ | '{' ->
+ Lexbuf.enter_state buffer (Lexbuf.MString 0);
+ Tokens.TEXT_MARKER
+ | _ -> f buffer
+ in
+
+ {
+ start_string;
+ wrap = (fun _ -> raise Out_of_context);
+ end_string =
+ (fun buffer ->
+ let lexbuf = Lexbuf.buffer buffer in
+ match%sedlex lexbuf with
+ | "''" ->
+ Lexbuf.leave_state buffer;
+ TEXT_MARKER
+ | _ -> raise Not_found);
+ }
+
+and quotedStringWraper : Lexbuf.stringWraper =
+ (* This function need to be recursirve in case of successive functions to
+ escape *)
+ let rec wrap f buf buffer =
+ let lexbuf = Lexbuf.buffer buffer in
+ match%sedlex lexbuf with
+ | "''" ->
+ Buffer.add_char buf '\'';
+ wrap f buf buffer
+ | single_quote -> leave_text quotedStringWraper buf buffer
+ | _ -> f buf buffer
+ in
+
+ let rec start_string f buffer =
+ let lexbuf = Lexbuf.buffer buffer in
+ match%sedlex lexbuf with
+ | spaces -> start_string f buffer
+ | "''" ->
+ Lexbuf.enter_state buffer (Lexbuf.String nestedQuotedStringWraper);
+ Tokens.TEXT_MARKER
+ | double_quote ->
+ Lexbuf.enter_state buffer (Lexbuf.String dQuotedStringWraper);
+ Tokens.TEXT_MARKER
+ | '{' ->
+ Lexbuf.enter_state buffer (Lexbuf.MString 0);
+ Tokens.TEXT_MARKER
+ | _ -> f buffer
+ in
+
+ {
+ start_string;
+ wrap;
+ end_string =
+ (fun buffer ->
+ let lexbuf = Lexbuf.buffer buffer in
+ match%sedlex lexbuf with
+ | single_quote ->
+ Lexbuf.leave_state buffer;
+ TEXT_MARKER
+ | _ -> raise Not_found);
+ }
+
+and dQuotedStringWraper : Lexbuf.stringWraper =
+ let rec wrap f buf buffer =
+ let lexbuf = Lexbuf.buffer buffer in
+ match%sedlex lexbuf with
+ | "\"\"" ->
+ Buffer.add_char buf '"';
+ wrap f buf buffer
+ | double_quote -> leave_text dQuotedStringWraper buf buffer
+ | _ -> f buf buffer
+ in
+
+ let rec start_string f buffer =
+ let lexbuf = Lexbuf.buffer buffer in
+ match%sedlex lexbuf with
+ | spaces -> start_string f buffer
+ | single_quote ->
+ Lexbuf.enter_state buffer (Lexbuf.String nestedQuotedStringWraper);
+ Tokens.TEXT_MARKER
+ | '{' ->
+ Lexbuf.enter_state buffer (Lexbuf.MString 0);
+ Tokens.TEXT_MARKER
+ | _ -> f buffer
+ in
+
+ {
+ start_string;
+ wrap;
+ end_string =
+ (fun buffer ->
+ let lexbuf = Lexbuf.buffer buffer in
+ match%sedlex lexbuf with
+ | double_quote ->
+ Lexbuf.leave_state buffer;
+ TEXT_MARKER
+ | _ -> raise Not_found);
+ }
+
+let defaultWraper : Lexbuf.stringWraper =
+ let rec start_string f buffer =
+ let lexbuf = Lexbuf.buffer buffer in
+ match%sedlex lexbuf with
+ | spaces -> start_string f buffer
+ | single_quote ->
+ Lexbuf.enter_state buffer (Lexbuf.String quotedStringWraper);
+ Tokens.TEXT_MARKER
+ | double_quote ->
+ Lexbuf.enter_state buffer (Lexbuf.String dQuotedStringWraper);
+ Tokens.TEXT_MARKER
+ | '{' ->
+ Lexbuf.enter_state buffer (Lexbuf.MString 0);
+ Tokens.TEXT_MARKER
+ | _ -> f buffer
+ in
+ {
+ start_string;
+ wrap = (fun _f _buf _buffer -> raise Out_of_context);
+ end_string = (fun _buffer -> raise Out_of_context);
+ }
+
+let readLongStringWraper : Lexbuf.stringWraper =
+ {
+ defaultWraper with
+ end_string =
+ (fun buffer ->
+ let lexbuf = Lexbuf.buffer buffer in
+ match%sedlex lexbuf with
+ | "}" ->
+ Lexbuf.leave_state buffer;
+ TEXT_MARKER
+ | _ -> raise Not_found);
+ }
diff --git a/lib/qparser/lex_state.mli b/lib/qparser/lex_state.mli
new file mode 100644
index 0000000..1e69faf
--- /dev/null
+++ b/lib/qparser/lex_state.mli
@@ -0,0 +1,14 @@
+(** This module keep a track of the different way to start, escape and end a
+ string in the lexer.
+
+ Depending on how the string was started (a single quote or double quote),
+ we have differents caracters for every of thoses actions.
+*)
+
+val defaultWraper : Lexbuf.stringWraper
+(** The default string lexer. Used when we start the lexing. *)
+
+val quotedStringWraper : Lexbuf.stringWraper
+val nestedQuotedStringWraper : Lexbuf.stringWraper
+val dQuotedStringWraper : Lexbuf.stringWraper
+val readLongStringWraper : Lexbuf.stringWraper
diff --git a/lib/qparser/lexbuf.ml b/lib/qparser/lexbuf.ml
index 9498f4a..2433ea5 100644
--- a/lib/qparser/lexbuf.ml
+++ b/lib/qparser/lexbuf.ml
@@ -1,11 +1,3 @@
-type state =
- | Token
- | String
- | DString
- | MString of int
- | EndString
- | Expression
-
type t = {
buffer : Sedlexing.lexbuf;
mutable start_p : Lexing.position option;
@@ -13,6 +5,29 @@ type t = {
reset_line : bool;
}
+and lexer = t -> Tokens.token
+and buffer_builder = Buffer.t -> lexer
+
+and stringWraper = {
+ start_string : lexer -> lexer;
+ (** Start a new string. This function is used insed the token lexer, in
+ order to identify how to start a new string *)
+ wrap : buffer_builder -> buffer_builder;
+ (** function used to escape the character and add it to the buffer. This
+ function is used inside the string lexer. *)
+ end_string : lexer;
+ (** Function used to match the end of the string. This function is used
+ after the string lexer, in order to identify the end patten for a
+ string *)
+}
+
+and state =
+ | Token of stringWraper
+ | String of stringWraper
+ | MString of int
+ | EndString of stringWraper
+ | Expression
+
let state : t -> state option = fun t -> Stack.top_opt t.state
let enter_state : t -> state -> unit = fun t state -> Stack.push state t.state
let leave_state : t -> unit = fun t -> ignore (Stack.pop_opt t.state)
diff --git a/lib/qparser/lexbuf.mli b/lib/qparser/lexbuf.mli
index 5fda8ff..dd13da4 100644
--- a/lib/qparser/lexbuf.mli
+++ b/lib/qparser/lexbuf.mli
@@ -43,12 +43,28 @@ val rollback : t -> unit
using a stack for determining the token to send.
*)
+type lexer = t -> Tokens.token
+type buffer_builder = Buffer.t -> lexer
+
+type stringWraper = {
+ start_string : lexer -> lexer;
+ (** Start a new string. This function is used insed the token lexer, in
+ order to identify how to start a new string *)
+ wrap : buffer_builder -> buffer_builder;
+ (** function used to escape the character and add it to the buffer. This
+ function is used inside the string lexer. *)
+ end_string : lexer;
+ (** Function used to match the end of the string. This function is used
+ after the string lexer, in order to identify the end patten for a
+ string *)
+}
+
type state =
- | Token (** Default state, parsing the tokens *)
- | String (** String enclosed by [''] *)
- | DString (** String enclosed by [""] *)
+ | Token of stringWraper (** Default state, parsing the tokens *)
+ | String of stringWraper (** String enclosed by [''] *)
| MString of int (** String enclosed by [{}]*)
- | EndString (** State raised just before closing the string *)
+ | EndString of stringWraper
+ (** State raised just before closing the string *)
| Expression (** Expression where [!] is an operator *)
val state : t -> state option
diff --git a/lib/qparser/lexer.ml b/lib/qparser/lexer.ml
index 7878299..114846c 100644
--- a/lib/qparser/lexer.ml
+++ b/lib/qparser/lexer.ml
@@ -53,7 +53,6 @@ let wait_balance : (Buffer.t -> Lexbuf.t -> 'a) -> Lexbuf.t -> 'a =
with Not_found -> raise UnclosedQuote
let space = [%sedlex.regexp? ' ' | '\t']
-let spaces = [%sedlex.regexp? Plus space]
let eol = [%sedlex.regexp? '\r' | '\n' | "\r\n"]
let coma = [%sedlex.regexp? ',']
let digit = [%sedlex.regexp? '0' .. '9']
@@ -63,29 +62,6 @@ let location_ident = [%sedlex.regexp? letters | digit]
let location_prefix = [%sedlex.regexp? '!' | '$' | '#' | '^']
let location = [%sedlex.regexp? Opt location_prefix, Plus location_ident]
-(** Change the state when we are ending a string. Send the text marker to the
- parser in order to tell the string is over.
-
- This can work because the state EndString is only raised when the same
- token is fetched inside the appropriate sting method lexer. The
- [Lexbuf.rollback] function is called in order to let the same token occur
- again.
- *)
-let end_string : Lexbuf.t -> token =
- fun buffer ->
- let lexbuf = Lexbuf.buffer buffer in
- match%sedlex lexbuf with
- | '}' ->
- Lexbuf.leave_state buffer;
- TEXT_MARKER
- | '\'' ->
- Lexbuf.leave_state buffer;
- TEXT_MARKER
- | '"' ->
- Lexbuf.leave_state buffer;
- TEXT_MARKER
- | _ -> raise Not_found
-
let rec read_long_string level buf buffer =
let lexbuf = Lexbuf.buffer buffer in
match%sedlex lexbuf with
@@ -96,7 +72,8 @@ let rec read_long_string level buf buffer =
match level with
| 0 ->
Lexbuf.leave_state buffer;
- Lexbuf.enter_state buffer Lexbuf.EndString;
+ Lexbuf.enter_state buffer
+ (Lexbuf.EndString Lex_state.readLongStringWraper);
Lexbuf.rollback buffer;
LITERAL (Buffer.contents buf)
| _ ->
@@ -111,55 +88,47 @@ let rec read_long_string level buf buffer =
read_long_string level buf buffer
| _ -> raise Not_found
-let rec read_dquoted_string buf buffer =
+(** Read the text inside a ['] *)
+let rec read_quoted_string f buf buffer =
let lexbuf = Lexbuf.buffer buffer in
match%sedlex lexbuf with
- | "\"\"" ->
- Buffer.add_char buf '"';
- read_dquoted_string buf buffer
- | '"' ->
- Lexbuf.leave_state buffer;
- Lexbuf.enter_state buffer Lexbuf.EndString;
- Lexbuf.rollback buffer;
- LITERAL (Buffer.contents buf)
- | any ->
- Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
- read_dquoted_string buf buffer
- | _ -> raise Not_found
-
-let rec read_quoted_string buf buffer =
- let lexbuf = Lexbuf.buffer buffer in
- match%sedlex lexbuf with
- | "''" ->
- Buffer.add_char buf '\'';
- read_quoted_string buf buffer
- | '\'' ->
- Lexbuf.leave_state buffer;
- Lexbuf.enter_state buffer Lexbuf.EndString;
- Lexbuf.rollback buffer;
- LITERAL (Buffer.contents buf)
| eol ->
Buffer.add_string buf (Lexbuf.content buffer);
- read_quoted_string buf buffer
+ (f (read_quoted_string f)) buf buffer
| any ->
Buffer.add_string buf (Lexbuf.content buffer);
- read_quoted_string buf buffer
- | _ -> raise Not_found
+ (f (read_quoted_string f)) buf buffer
+ | _ ->
+ let location, _ = Lexbuf.positions buffer in
+ let line = location.Lexing.pos_lnum
+ and file = location.Lexing.pos_fname in
+ Format.eprintf "read_quoted_string : %s:%d\n" file line;
+ raise Not_found
let rec skip_comment buffer =
let lexbuf = Lexbuf.buffer buffer in
match%sedlex lexbuf with
| '{' ->
let _ = wait_balance (read_long_string 0) buffer in
- let _ = end_string buffer in
+ let _ = Lex_state.readLongStringWraper.end_string buffer in
skip_comment buffer
| '\'' ->
- let _ = wait_balance read_quoted_string buffer in
- let _ = end_string buffer in
+ let _ =
+ wait_balance
+ (Lex_state.quotedStringWraper.wrap
+ (read_quoted_string Lex_state.quotedStringWraper.wrap))
+ buffer
+ in
+ let _ = Lex_state.quotedStringWraper.end_string buffer in
skip_comment buffer
| '"' ->
- let _ = wait_balance read_dquoted_string buffer in
- let _ = end_string buffer in
+ let _ =
+ wait_balance
+ (Lex_state.dQuotedStringWraper.wrap
+ (read_quoted_string Lex_state.dQuotedStringWraper.wrap))
+ buffer
+ in
+ let _ = Lex_state.dQuotedStringWraper.end_string buffer in
skip_comment buffer
| eol ->
(* Ugly hack used in order to put the eol in the front of the next
@@ -170,13 +139,13 @@ let rec skip_comment buffer =
| _ -> raise Not_found
(** Main lexer *)
-let rec token : Lexbuf.t -> token =
+let rec parse_token : Lexbuf.t -> token =
fun buffer ->
let lexbuf = Lexbuf.buffer buffer in
match%sedlex lexbuf with
| 0Xfeff ->
(* Ignore the BOM *)
- token buffer
+ parse_token buffer
| '#', Star space, location ->
(* Extract the location name *)
let ident = Lexbuf.content buffer in
@@ -215,7 +184,7 @@ let rec token : Lexbuf.t -> token =
R_PAREN
| ">>" ->
Lexbuf.leave_state buffer;
- token buffer
+ parse_token buffer
| '<' -> LT
| '>' -> GT
| coma -> COMA
@@ -235,16 +204,6 @@ let rec token : Lexbuf.t -> token =
match Lexbuf.state buffer with
| Some Lexbuf.Expression -> EXCLAMATION
| _ -> skip_comment buffer)
- | spaces -> token buffer
- | '\'' ->
- Lexbuf.enter_state buffer Lexbuf.String;
- TEXT_MARKER
- | '"' ->
- Lexbuf.enter_state buffer Lexbuf.DString;
- TEXT_MARKER
- | '{' ->
- Lexbuf.enter_state buffer (Lexbuf.MString 0);
- TEXT_MARKER
| '}' -> TEXT_MARKER
| eof -> raise EOF
| _ ->
@@ -254,20 +213,27 @@ let rec token : Lexbuf.t -> token =
let main buffer =
match Lexbuf.state buffer with
- | Some Lexbuf.String -> wait_balance read_quoted_string buffer
- | Some Lexbuf.DString -> wait_balance read_dquoted_string buffer
+ | Some (Lexbuf.String w) ->
+ wait_balance (w.wrap @@ read_quoted_string w.wrap) buffer
| Some (Lexbuf.MString level) -> wait_balance (read_long_string level) buffer
- | Some Lexbuf.EndString -> end_string buffer
- | _ -> token buffer
+ | Some (Lexbuf.EndString w) -> w.end_string buffer
+ | Some (Lexbuf.Token w) -> w.start_string parse_token buffer
+ | _ -> Lex_state.defaultWraper.start_string parse_token buffer
let rec discard buffer =
let lexbuf = Lexbuf.buffer buffer in
match%sedlex lexbuf with
| '\'' ->
- ignore (wait_balance read_quoted_string buffer);
+ ignore
+ (wait_balance
+ (read_quoted_string Lex_state.quotedStringWraper.wrap)
+ buffer);
discard buffer
| '"' ->
- ignore (wait_balance read_dquoted_string buffer);
+ ignore
+ (wait_balance
+ (read_quoted_string Lex_state.quotedStringWraper.wrap)
+ buffer);
discard buffer
| '{' ->
ignore (wait_balance (read_long_string 0) buffer);
diff --git a/lib/qparser/lexer.mli b/lib/qparser/lexer.mli
index 26d59cb..be10ddb 100644
--- a/lib/qparser/lexer.mli
+++ b/lib/qparser/lexer.mli
@@ -1,3 +1,10 @@
+(** Provide a lexer for the langage. The function [main] read the source and
+ identify the next to token to give to the parser.
+
+ Personal note: parsing the QSP is really complicated. The language was
+ designed for regex and I have to twist the lexer in order to get something
+ working. *)
+
exception EOF
exception UnclosedQuote
exception LexError of string
diff --git a/lib/qparser/qsp_expression.mly b/lib/qparser/qsp_expression.mly
index 58da39e..b470a88 100644
--- a/lib/qparser/qsp_expression.mly
+++ b/lib/qparser/qsp_expression.mly
@@ -34,9 +34,7 @@
op = binary_operator
expr2 = expression
{ Analyzer.Expression.boperator $loc op expr1 expr2 }
- | TEXT_MARKER
- v = LITERAL
- TEXT_MARKER
+ | v = delimited(TEXT_MARKER, LITERAL, TEXT_MARKER)
{ Analyzer.Expression.literal $loc v }
| i = INTEGER { Analyzer.Expression.integer $loc i }
| v = variable { Analyzer.Expression.ident v }
diff --git a/test/syntax.ml b/test/syntax.ml
index 56fac8e..a49bd1c 100644
--- a/test/syntax.ml
+++ b/test/syntax.ml
@@ -784,6 +784,17 @@ let test_precedence8 () =
Tree.Ast.Integer (_position, "1") ) ));
]
+let nested_string () =
+ _test_instruction
+ {|'Delete'|}
+ [
+ Tree.Ast.Expression
+ (Tree.Ast.Literal
+ ( _position,
+ {|Delete|}
+ ));
+ ]
+
let test =
( "Syntax",
[
@@ -848,4 +859,5 @@ let test =
Alcotest.test_case "inline if else if" `Quick test_mutiple_inline_ifs;
Alcotest.test_case "Precedence7" `Quick test_precedence7;
Alcotest.test_case "Precedence8" `Quick test_precedence8;
+ Alcotest.test_case "Nested string" `Quick nested_string;
] )
--
cgit v1.2.3