aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/idents.ml6
-rw-r--r--lib/lexbuf.ml32
-rw-r--r--lib/lexbuf.mli6
-rw-r--r--lib/lexer.ml73
-rw-r--r--lib/lexer.mli4
5 files changed, 52 insertions, 69 deletions
diff --git a/lib/idents.ml b/lib/idents.ml
index dab0ba0..baf23dc 100644
--- a/lib/idents.ml
+++ b/lib/idents.ml
@@ -3,12 +3,6 @@ module T = Qsp_syntax.T
let keyword_table = Hashtbl.create 53
-let char_of_uchar : Uchar.t -> char =
- fun u -> match Uchar.is_char u with true -> Uchar.to_char u | _ -> '?'
-
-let of_uchars : Uchar.t array -> string =
- fun arr -> Array.to_seq arr |> Seq.map char_of_uchar |> String.of_seq
-
let _ =
List.iter
(fun (kwd, tok) -> Hashtbl.add keyword_table kwd tok)
diff --git a/lib/lexbuf.ml b/lib/lexbuf.ml
index 8ddba2d..6059c8a 100644
--- a/lib/lexbuf.ml
+++ b/lib/lexbuf.ml
@@ -1,7 +1,17 @@
-type t = { buffer : Sedlexing.lexbuf; mutable start_p : Lexing.position option }
+type t = {
+ buffer : Sedlexing.lexbuf;
+ mutable start_p : Lexing.position option;
+ mutable expression_level : int;
+}
let buffer : t -> Sedlexing.lexbuf = fun t -> t.buffer
-let start : t -> unit = fun t -> Sedlexing.start t.buffer
+
+let start : t -> unit =
+ fun t ->
+ let _start_pos, end_pos = Sedlexing.lexing_positions t.buffer in
+ Sedlexing.set_position t.buffer { end_pos with Lexing.pos_lnum = 1 };
+ t.start_p <- None;
+ t.expression_level <- 0
let positions : t -> Lexing.position * Lexing.position =
fun t -> Sedlexing.lexing_positions t.buffer
@@ -9,7 +19,7 @@ let positions : t -> Lexing.position * Lexing.position =
let content : t -> string = fun t -> Sedlexing.Utf8.lexeme t.buffer
let from_lexbuf : Sedlexing.lexbuf -> t =
- fun t -> { buffer = t; start_p = None }
+ fun t -> { buffer = t; start_p = None; expression_level = 0 }
let set_start_position : t -> Lexing.position -> unit =
fun t position -> t.start_p <- Some position
@@ -28,3 +38,19 @@ let tokenize : (t -> 'a) -> t -> unit -> 'a * Lexing.position * Lexing.position
(token, start_p, curr_p)
in
lexer
+
+(* The comment system is terrible. The same symbol can be used for :
+ - starting a comment
+ - inequality operation
+ In order to manage this, I try to identify the context in a very basic way,
+ using a counter for determining the token to send.
+*)
+
+let incr_level : t -> unit =
+ fun t -> t.expression_level <- t.expression_level + 1
+
+let decr_level : t -> unit =
+ fun t -> t.expression_level <- t.expression_level - 1
+
+let reset_level : t -> unit = fun t -> t.expression_level <- 0
+let level : t -> int = fun t -> t.expression_level
diff --git a/lib/lexbuf.mli b/lib/lexbuf.mli
index b058f3c..918c011 100644
--- a/lib/lexbuf.mli
+++ b/lib/lexbuf.mli
@@ -17,3 +17,9 @@ val content : t -> string
val set_start_position : t -> Lexing.position -> unit
val tokenize : (t -> 'a) -> t -> unit -> 'a * Lexing.position * Lexing.position
+val incr_level : t -> unit
+val decr_level : t -> unit
+val reset_level : t -> unit
+
+val level : t -> int
+(** Return the nested expression level *)
diff --git a/lib/lexer.ml b/lib/lexer.ml
index 7014d85..c643577 100644
--- a/lib/lexer.ml
+++ b/lib/lexer.ml
@@ -14,49 +14,16 @@ let pp_pos out { Lexing.pos_lnum; pos_cnum; pos_bol; _ } =
(* Extract the location name from the pattern *)
let location_name = Str.regexp {|.* \(.*\)|}
-module type Encoding = sig
- val lexeme : Sedlexing.lexbuf -> string
-end
-
-(* The comment system is terrible. The same symbol can be used for :
- - starting a comment
- - inequality operation
- In order to manage this, I try to identify the context in a very basic way,
- using a flag True False for determining the token to send.
-*)
-module Bucket = Ephemeron.K1.Bucket
-
-type bucket = (Sedlexing.lexbuf, int) Bucket.t
-
-let is_expression : bucket = Bucket.make ()
-
-let incr_level lexbuf =
- match Bucket.find is_expression lexbuf with
- | None -> Bucket.add is_expression lexbuf 1
- | Some v -> Bucket.add is_expression lexbuf (v + 1)
-
-let decr_level lexbuf =
- match Bucket.find is_expression lexbuf with
- | None -> ()
- | Some v ->
- if v > 1 then Bucket.add is_expression lexbuf (v - 1)
- else Bucket.remove is_expression lexbuf
-
-let build_ident lexbuf =
- let id =
- Sedlexing.lexeme lexbuf |> Idents.of_uchars |> String.uppercase_ascii
- in
+let build_ident buffer =
+ let id = Lexbuf.content buffer |> String.uppercase_ascii in
try
let value = Hashtbl.find Idents.keyword_table id in
- let _ = match value with IF | ELIF -> incr_level lexbuf | _ -> () in
+ let _ =
+ match value with IF | ELIF -> Lexbuf.incr_level buffer | _ -> ()
+ in
value
with Not_found -> IDENT id
-let incr_level lexbuf =
- match Bucket.find is_expression lexbuf with
- | None -> Bucket.add is_expression lexbuf 1
- | Some v -> Bucket.add is_expression lexbuf (v + 1)
-
let wait_balance : (Buffer.t -> Lexbuf.t -> 'a) -> Lexbuf.t -> 'a =
fun rule lexbuf ->
let _, position = Lexbuf.positions lexbuf in
@@ -154,12 +121,9 @@ let rec token : Lexbuf.t -> token =
let lexbuf = Lexbuf.buffer buffer in
match%sedlex lexbuf with
| 0Xfeff ->
- Lexbuf.start buffer;
(* Ignore the BOM *)
token buffer
| '#', Star space, location ->
- let _start_pos, end_pos = Sedlexing.lexing_positions lexbuf in
-
(* Extract the location name *)
let ident = Lexbuf.content buffer in
let () =
@@ -169,12 +133,11 @@ let rec token : Lexbuf.t -> token =
in
(* Restart the line number (new location here) *)
- Sedlexing.start lexbuf;
- Sedlexing.set_position lexbuf { end_pos with Lexing.pos_lnum = 1 };
+ Lexbuf.start buffer;
LOCATION_START ident
| '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) ->
- Bucket.remove is_expression lexbuf;
+ Lexbuf.reset_level buffer;
LOCATION_END
| Plus digit -> INTEGER (Lexbuf.content buffer)
| '+' -> PLUS
@@ -186,33 +149,31 @@ let rec token : Lexbuf.t -> token =
| '*' -> STAR
| ':' ->
(* We are leaving the block, the comment will be handled again *)
- decr_level lexbuf;
+ Lexbuf.decr_level buffer;
+
COLUMN
| '[' -> L_BRACKET
| ']' -> R_BRACKET
| '(' ->
- incr_level lexbuf;
+ Lexbuf.incr_level buffer;
L_PAREN
| ')' ->
- decr_level lexbuf;
+ Lexbuf.decr_level buffer;
R_PAREN
| '<' -> LT
| '>' -> GT
| coma -> COMA
| '=' ->
- incr_level lexbuf;
+ Lexbuf.incr_level buffer;
EQUAL
- | ident -> build_ident lexbuf
+ | ident -> build_ident buffer
| eol ->
- Bucket.add is_expression lexbuf 0;
+ Lexbuf.reset_level buffer;
EOL
| '&' ->
- Bucket.add is_expression lexbuf 0;
+ Lexbuf.reset_level buffer;
AMPERSAND
- | '!' -> (
- match Bucket.find is_expression lexbuf with
- | Some i when i <> 0 -> EXCLAMATION
- | _ -> skip_comment buffer)
+ | '!' -> if Lexbuf.level buffer > 0 then EXCLAMATION else skip_comment buffer
| spaces -> token buffer
| '\'' -> LITERAL (wait_balance read_quoted_string buffer)
| '"' -> LITERAL (wait_balance read_dquoted_string buffer)
@@ -242,7 +203,7 @@ let rec discard buffer =
discard buffer
| eof -> raise EOF
| '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) ->
- Bucket.remove is_expression lexbuf;
+ Lexbuf.reset_level buffer;
()
| '!' ->
ignore @@ skip_comment buffer;
diff --git a/lib/lexer.mli b/lib/lexer.mli
index 65e9fa0..585877c 100644
--- a/lib/lexer.mli
+++ b/lib/lexer.mli
@@ -1,7 +1,3 @@
-module type Encoding = sig
- val lexeme : Sedlexing.lexbuf -> string
-end
-
exception EOF
val token : Lexbuf.t -> Tokens.token