aboutsummaryrefslogtreecommitdiff
path: root/lib/lexer.ml
diff options
context:
space:
mode:
Diffstat (limited to 'lib/lexer.ml')
-rw-r--r--lib/lexer.ml211
1 files changed, 211 insertions, 0 deletions
diff --git a/lib/lexer.ml b/lib/lexer.ml
new file mode 100644
index 0000000..324171f
--- /dev/null
+++ b/lib/lexer.ml
@@ -0,0 +1,211 @@
+(**
+ Lexer using sedlex
+ *)
+
+open Tokens
+
+exception UnclosedQuote of { content : string; line : int }
+exception LexError of Lexing.position * string
+
+let pp_pos out { Lexing.pos_lnum; pos_cnum; pos_bol; _ } =
+ Format.fprintf out "line %d:%d" pos_lnum (pos_cnum - pos_bol)
+
+module type Encoding = sig
+ val lexeme : Sedlexing.lexbuf -> string
+end
+
+(* The comment system is terrible. The same symbol can be used for :
+ - starting a comment
+ - inequality operation
+ In order to manage this, I try to identify the context in a very basic way,
+ using a flag True False for determining the token to send.
+*)
+module Bucket = Ephemeron.K1.Bucket
+
+type bucket = (Sedlexing.lexbuf, int) Bucket.t
+
+let is_expression : bucket = Bucket.make ()
+
+let incr_level lexbuf =
+ match Bucket.find is_expression lexbuf with
+ | None -> Bucket.add is_expression lexbuf 1
+ | Some v -> Bucket.add is_expression lexbuf (v + 1)
+
+let decr_level lexbuf =
+ match Bucket.find is_expression lexbuf with
+ | None -> ()
+ | Some v ->
+ if v > 1 then Bucket.add is_expression lexbuf (v - 1)
+ else Bucket.remove is_expression lexbuf
+
+let build_ident lexbuf =
+ let id =
+ Sedlexing.lexeme lexbuf |> Idents.of_uchars |> String.uppercase_ascii
+ in
+ try
+ let value = Hashtbl.find Idents.keyword_table id in
+ let _ = match value with IF | ELIF -> incr_level lexbuf | _ -> () in
+ value
+ with Not_found -> IDENT id
+
+let incr_level lexbuf =
+ match Bucket.find is_expression lexbuf with
+ | None -> Bucket.add is_expression lexbuf 1
+ | Some v -> Bucket.add is_expression lexbuf (v + 1)
+
+let wait_balance rule lexbuf =
+ try[@warning "-52"] rule (Buffer.create 17) lexbuf
+ with Failure "lexing: empty token" ->
+ let position, _ = Sedlexing.lexing_positions lexbuf in
+ let line = position.Lexing.pos_lnum
+ and content = Sedlexing.lexeme lexbuf |> Idents.of_uchars in
+ (raise (UnclosedQuote { line; content }) [@warning "+52"])
+
+let space = [%sedlex.regexp? ' ' | '\t']
+let eol = [%sedlex.regexp? '\r' | '\n' | "\r\n"]
+let coma = [%sedlex.regexp? ',']
+let digit = [%sedlex.regexp? '0' .. '9']
+let letters = [%sedlex.regexp? 'a' .. 'z' | 'A' .. 'Z' | '_']
+let spaces = [%sedlex.regexp? Plus space]
+let ident = [%sedlex.regexp? ('$' | letters), Star (digit | letters)]
+let location_ident = [%sedlex.regexp? letters | digit]
+let location_prefix = [%sedlex.regexp? '!' | '$' | '#' | '^']
+let location = [%sedlex.regexp? Opt location_prefix, Plus location_ident]
+
+let rec read_long_string level buf lexbuf =
+ match%sedlex lexbuf with
+ | '{' ->
+ Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
+ read_long_string (level + 1) buf lexbuf
+ | '}' -> (
+ match level with
+ | 0 -> Buffer.contents buf
+ | _ ->
+ Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
+ read_long_string (level - 1) buf lexbuf)
+ | eol ->
+ Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
+ read_long_string level buf lexbuf
+ | any ->
+ Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
+ read_long_string level buf lexbuf
+ | _ -> raise Not_found
+
+let rec read_dquoted_string buf lexbuf =
+ match%sedlex lexbuf with
+ | "\"\"" ->
+ Buffer.add_char buf '"';
+ read_dquoted_string buf lexbuf
+ | '"' -> Buffer.contents buf
+ | any ->
+ Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
+ read_dquoted_string buf lexbuf
+ | _ -> raise Not_found
+
+let rec read_quoted_string buf lexbuf =
+ match%sedlex lexbuf with
+ | "''" ->
+ Buffer.add_char buf '\'';
+ read_quoted_string buf lexbuf
+ | '\'' -> Buffer.contents buf
+ | eol ->
+ Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
+ read_quoted_string buf lexbuf
+ | any ->
+ Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
+ read_quoted_string buf lexbuf
+ | _ -> raise Not_found
+
+let rec skip_comment lexbuf =
+ match%sedlex lexbuf with
+ | '{' ->
+ let _ = wait_balance (read_long_string 0) lexbuf in
+ skip_comment lexbuf
+ | '\'' ->
+ let _ = wait_balance read_quoted_string lexbuf in
+ skip_comment lexbuf
+ | '"' ->
+ let _ = wait_balance read_dquoted_string lexbuf in
+ skip_comment lexbuf
+ | eol ->
+ (* Ugly hack used in order to put the eol in the front of the next
+ parsing. *)
+ Sedlexing.rollback lexbuf;
+ COMMENT
+ | any -> skip_comment lexbuf
+ | _ -> raise Not_found
+
+(** Main lexer *)
+let rec token : (module Encoding.S) -> Sedlexing.lexbuf -> token =
+ fun (module E : Encoding) lexbuf ->
+ match%sedlex lexbuf with
+ | 0Xfeff ->
+ (* Ignore the BOM *)
+ token (module E) lexbuf
+ | '#', Star space, location ->
+ let ident = E.lexeme lexbuf in
+
+ LOCATION_START ident
+ | Plus digit -> INTEGER (E.lexeme lexbuf)
+ | '+' -> PLUS
+ | '-' -> MINUS
+ | "+=" -> INCR
+ | "-=" -> DECR
+ | "*=" -> MULT_EQUAL
+ | '/' -> DIV
+ | '*' -> STAR
+ | ':' ->
+ (* We are leaving the block, the comment will be handled again *)
+ decr_level lexbuf;
+ COLUMN
+ | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> LOCATION_END
+ | '[' -> L_BRACKET
+ | ']' -> R_BRACKET
+ | '(' ->
+ incr_level lexbuf;
+ L_PAREN
+ | ')' ->
+ decr_level lexbuf;
+ R_PAREN
+ | '<' -> LT
+ | '>' -> GT
+ | coma -> COMA
+ | eof ->
+ Bucket.remove is_expression lexbuf;
+ EOF
+ | '=' ->
+ incr_level lexbuf;
+ EQUAL
+ | ident -> build_ident lexbuf
+ | eol ->
+ Bucket.add is_expression lexbuf 0;
+ EOL
+ | '&' ->
+ Bucket.add is_expression lexbuf 0;
+ AMPERSAND
+ | '!' -> (
+ match Bucket.find is_expression lexbuf with
+ | Some i when i <> 0 -> EXCLAMATION
+ | _ -> skip_comment lexbuf)
+ | spaces -> token (module E) lexbuf
+ | '\'' -> LITERAL (wait_balance read_quoted_string lexbuf)
+ | '"' -> LITERAL (wait_balance read_dquoted_string lexbuf)
+ | '{' -> LITERAL (wait_balance (read_long_string 0) lexbuf)
+ | _ ->
+ let position = fst @@ Sedlexing.lexing_positions lexbuf in
+ let tok = E.lexeme lexbuf in
+
+ let msg =
+ Format.asprintf "Unexpected character %S at %a" tok pp_pos position
+ in
+
+ raise @@ LexError (position, msg)
+
+(** Tokenizer for menhir *)
+let lexer :
+ (module Encoding) ->
+ Sedlexing.lexbuf ->
+ unit ->
+ token * Lexing.position * Lexing.position =
+ fun (module E : Encoding.S) lexbuf ->
+ Sedlexing.with_tokenizer (token (module E)) lexbuf