From 7de7b756e6e3168a5bfb579470031d999a6e8585 Mon Sep 17 00:00:00 2001 From: Chimrod <> Date: Fri, 29 Sep 2023 10:33:48 +0200 Subject: Cleaned the code, removed the old Ephemeron trick --- lib/idents.ml | 6 ----- lib/lexbuf.ml | 32 ++++++++++++++++++++++--- lib/lexbuf.mli | 6 +++++ lib/lexer.ml | 73 ++++++++++++++-------------------------------------------- lib/lexer.mli | 4 ---- 5 files changed, 52 insertions(+), 69 deletions(-) (limited to 'lib') diff --git a/lib/idents.ml b/lib/idents.ml index dab0ba0..baf23dc 100644 --- a/lib/idents.ml +++ b/lib/idents.ml @@ -3,12 +3,6 @@ module T = Qsp_syntax.T let keyword_table = Hashtbl.create 53 -let char_of_uchar : Uchar.t -> char = - fun u -> match Uchar.is_char u with true -> Uchar.to_char u | _ -> '?' - -let of_uchars : Uchar.t array -> string = - fun arr -> Array.to_seq arr |> Seq.map char_of_uchar |> String.of_seq - let _ = List.iter (fun (kwd, tok) -> Hashtbl.add keyword_table kwd tok) diff --git a/lib/lexbuf.ml b/lib/lexbuf.ml index 8ddba2d..6059c8a 100644 --- a/lib/lexbuf.ml +++ b/lib/lexbuf.ml @@ -1,7 +1,17 @@ -type t = { buffer : Sedlexing.lexbuf; mutable start_p : Lexing.position option } +type t = { + buffer : Sedlexing.lexbuf; + mutable start_p : Lexing.position option; + mutable expression_level : int; +} let buffer : t -> Sedlexing.lexbuf = fun t -> t.buffer -let start : t -> unit = fun t -> Sedlexing.start t.buffer + +let start : t -> unit = + fun t -> + let _start_pos, end_pos = Sedlexing.lexing_positions t.buffer in + Sedlexing.set_position t.buffer { end_pos with Lexing.pos_lnum = 1 }; + t.start_p <- None; + t.expression_level <- 0 let positions : t -> Lexing.position * Lexing.position = fun t -> Sedlexing.lexing_positions t.buffer @@ -9,7 +19,7 @@ let positions : t -> Lexing.position * Lexing.position = let content : t -> string = fun t -> Sedlexing.Utf8.lexeme t.buffer let from_lexbuf : Sedlexing.lexbuf -> t = - fun t -> { buffer = t; start_p = None } + fun t -> { buffer = t; start_p = None; expression_level = 0 } let set_start_position : t -> Lexing.position -> unit = fun t position -> t.start_p <- Some position @@ -28,3 +38,19 @@ let tokenize : (t -> 'a) -> t -> unit -> 'a * Lexing.position * Lexing.position (token, start_p, curr_p) in lexer + +(* The comment system is terrible. The same symbol can be used for : + - starting a comment + - inequality operation + In order to manage this, I try to identify the context in a very basic way, + using a counter for determining the token to send. +*) + +let incr_level : t -> unit = + fun t -> t.expression_level <- t.expression_level + 1 + +let decr_level : t -> unit = + fun t -> t.expression_level <- t.expression_level - 1 + +let reset_level : t -> unit = fun t -> t.expression_level <- 0 +let level : t -> int = fun t -> t.expression_level diff --git a/lib/lexbuf.mli b/lib/lexbuf.mli index b058f3c..918c011 100644 --- a/lib/lexbuf.mli +++ b/lib/lexbuf.mli @@ -17,3 +17,9 @@ val content : t -> string val set_start_position : t -> Lexing.position -> unit val tokenize : (t -> 'a) -> t -> unit -> 'a * Lexing.position * Lexing.position +val incr_level : t -> unit +val decr_level : t -> unit +val reset_level : t -> unit + +val level : t -> int +(** Return the nested expression level *) diff --git a/lib/lexer.ml b/lib/lexer.ml index 7014d85..c643577 100644 --- a/lib/lexer.ml +++ b/lib/lexer.ml @@ -14,49 +14,16 @@ let pp_pos out { Lexing.pos_lnum; pos_cnum; pos_bol; _ } = (* Extract the location name from the pattern *) let location_name = Str.regexp {|.* \(.*\)|} -module type Encoding = sig - val lexeme : Sedlexing.lexbuf -> string -end - -(* The comment system is terrible. The same symbol can be used for : - - starting a comment - - inequality operation - In order to manage this, I try to identify the context in a very basic way, - using a flag True False for determining the token to send. -*) -module Bucket = Ephemeron.K1.Bucket - -type bucket = (Sedlexing.lexbuf, int) Bucket.t - -let is_expression : bucket = Bucket.make () - -let incr_level lexbuf = - match Bucket.find is_expression lexbuf with - | None -> Bucket.add is_expression lexbuf 1 - | Some v -> Bucket.add is_expression lexbuf (v + 1) - -let decr_level lexbuf = - match Bucket.find is_expression lexbuf with - | None -> () - | Some v -> - if v > 1 then Bucket.add is_expression lexbuf (v - 1) - else Bucket.remove is_expression lexbuf - -let build_ident lexbuf = - let id = - Sedlexing.lexeme lexbuf |> Idents.of_uchars |> String.uppercase_ascii - in +let build_ident buffer = + let id = Lexbuf.content buffer |> String.uppercase_ascii in try let value = Hashtbl.find Idents.keyword_table id in - let _ = match value with IF | ELIF -> incr_level lexbuf | _ -> () in + let _ = + match value with IF | ELIF -> Lexbuf.incr_level buffer | _ -> () + in value with Not_found -> IDENT id -let incr_level lexbuf = - match Bucket.find is_expression lexbuf with - | None -> Bucket.add is_expression lexbuf 1 - | Some v -> Bucket.add is_expression lexbuf (v + 1) - let wait_balance : (Buffer.t -> Lexbuf.t -> 'a) -> Lexbuf.t -> 'a = fun rule lexbuf -> let _, position = Lexbuf.positions lexbuf in @@ -154,12 +121,9 @@ let rec token : Lexbuf.t -> token = let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | 0Xfeff -> - Lexbuf.start buffer; (* Ignore the BOM *) token buffer | '#', Star space, location -> - let _start_pos, end_pos = Sedlexing.lexing_positions lexbuf in - (* Extract the location name *) let ident = Lexbuf.content buffer in let () = @@ -169,12 +133,11 @@ let rec token : Lexbuf.t -> token = in (* Restart the line number (new location here) *) - Sedlexing.start lexbuf; - Sedlexing.set_position lexbuf { end_pos with Lexing.pos_lnum = 1 }; + Lexbuf.start buffer; LOCATION_START ident | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> - Bucket.remove is_expression lexbuf; + Lexbuf.reset_level buffer; LOCATION_END | Plus digit -> INTEGER (Lexbuf.content buffer) | '+' -> PLUS @@ -186,33 +149,31 @@ let rec token : Lexbuf.t -> token = | '*' -> STAR | ':' -> (* We are leaving the block, the comment will be handled again *) - decr_level lexbuf; + Lexbuf.decr_level buffer; + COLUMN | '[' -> L_BRACKET | ']' -> R_BRACKET | '(' -> - incr_level lexbuf; + Lexbuf.incr_level buffer; L_PAREN | ')' -> - decr_level lexbuf; + Lexbuf.decr_level buffer; R_PAREN | '<' -> LT | '>' -> GT | coma -> COMA | '=' -> - incr_level lexbuf; + Lexbuf.incr_level buffer; EQUAL - | ident -> build_ident lexbuf + | ident -> build_ident buffer | eol -> - Bucket.add is_expression lexbuf 0; + Lexbuf.reset_level buffer; EOL | '&' -> - Bucket.add is_expression lexbuf 0; + Lexbuf.reset_level buffer; AMPERSAND - | '!' -> ( - match Bucket.find is_expression lexbuf with - | Some i when i <> 0 -> EXCLAMATION - | _ -> skip_comment buffer) + | '!' -> if Lexbuf.level buffer > 0 then EXCLAMATION else skip_comment buffer | spaces -> token buffer | '\'' -> LITERAL (wait_balance read_quoted_string buffer) | '"' -> LITERAL (wait_balance read_dquoted_string buffer) @@ -242,7 +203,7 @@ let rec discard buffer = discard buffer | eof -> raise EOF | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> - Bucket.remove is_expression lexbuf; + Lexbuf.reset_level buffer; () | '!' -> ignore @@ skip_comment buffer; diff --git a/lib/lexer.mli b/lib/lexer.mli index 65e9fa0..585877c 100644 --- a/lib/lexer.mli +++ b/lib/lexer.mli @@ -1,7 +1,3 @@ -module type Encoding = sig - val lexeme : Sedlexing.lexbuf -> string -end - exception EOF val token : Lexbuf.t -> Tokens.token -- cgit v1.2.3