(** Lexer using sedlex *) open Tokens exception UnclosedQuote of { content : string; line : int } exception LexError of Lexing.position * string exception EOF let pp_pos out { Lexing.pos_lnum; pos_cnum; pos_bol; _ } = Format.fprintf out "line %d:%d" pos_lnum (pos_cnum - pos_bol) (* Extract the location name from the pattern *) let location_name = Str.regexp {|.* \(.*\)|} module type Encoding = sig val lexeme : Sedlexing.lexbuf -> string end (* The comment system is terrible. The same symbol can be used for : - starting a comment - inequality operation In order to manage this, I try to identify the context in a very basic way, using a flag True False for determining the token to send. *) module Bucket = Ephemeron.K1.Bucket type bucket = (Sedlexing.lexbuf, int) Bucket.t let is_expression : bucket = Bucket.make () let incr_level lexbuf = match Bucket.find is_expression lexbuf with | None -> Bucket.add is_expression lexbuf 1 | Some v -> Bucket.add is_expression lexbuf (v + 1) let decr_level lexbuf = match Bucket.find is_expression lexbuf with | None -> () | Some v -> if v > 1 then Bucket.add is_expression lexbuf (v - 1) else Bucket.remove is_expression lexbuf let build_ident lexbuf = let id = Sedlexing.lexeme lexbuf |> Idents.of_uchars |> String.uppercase_ascii in try let value = Hashtbl.find Idents.keyword_table id in let _ = match value with IF | ELIF -> incr_level lexbuf | _ -> () in value with Not_found -> IDENT id let incr_level lexbuf = match Bucket.find is_expression lexbuf with | None -> Bucket.add is_expression lexbuf 1 | Some v -> Bucket.add is_expression lexbuf (v + 1) let wait_balance rule lexbuf = try[@warning "-52"] rule (Buffer.create 17) lexbuf with Failure "lexing: empty token" -> let position, _ = Sedlexing.lexing_positions lexbuf in let line = position.Lexing.pos_lnum and content = Sedlexing.lexeme lexbuf |> Idents.of_uchars in (raise (UnclosedQuote { line; content }) [@warning "+52"]) let space = [%sedlex.regexp? ' ' | '\t'] let eol = [%sedlex.regexp? '\r' | '\n' | "\r\n"] let coma = [%sedlex.regexp? ','] let digit = [%sedlex.regexp? '0' .. '9'] let letters = [%sedlex.regexp? 'a' .. 'z' | 'A' .. 'Z' | '_'] let spaces = [%sedlex.regexp? Plus space] let ident = [%sedlex.regexp? ('$' | letters), Star (digit | letters)] let location_ident = [%sedlex.regexp? letters | digit] let location_prefix = [%sedlex.regexp? '!' | '$' | '#' | '^'] let location = [%sedlex.regexp? Opt location_prefix, Plus location_ident] let rec read_long_string level buf lexbuf = match%sedlex lexbuf with | '{' -> Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); read_long_string (level + 1) buf lexbuf | '}' -> ( match level with | 0 -> Buffer.contents buf | _ -> Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); read_long_string (level - 1) buf lexbuf) | eol -> Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); read_long_string level buf lexbuf | any -> Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); read_long_string level buf lexbuf | _ -> raise Not_found let rec read_dquoted_string buf lexbuf = match%sedlex lexbuf with | "\"\"" -> Buffer.add_char buf '"'; read_dquoted_string buf lexbuf | '"' -> Buffer.contents buf | any -> Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); read_dquoted_string buf lexbuf | _ -> raise Not_found let rec read_quoted_string buf lexbuf = match%sedlex lexbuf with | "''" -> Buffer.add_char buf '\''; read_quoted_string buf lexbuf | '\'' -> Buffer.contents buf | eol -> Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); read_quoted_string buf lexbuf | any -> Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); read_quoted_string buf lexbuf | _ -> raise Not_found let rec skip_comment lexbuf = match%sedlex lexbuf with | '{' -> let _ = wait_balance (read_long_string 0) lexbuf in skip_comment lexbuf | '\'' -> let _ = wait_balance read_quoted_string lexbuf in skip_comment lexbuf | '"' -> let _ = wait_balance read_dquoted_string lexbuf in skip_comment lexbuf | eol -> (* Ugly hack used in order to put the eol in the front of the next parsing. *) Sedlexing.rollback lexbuf; COMMENT | any -> skip_comment lexbuf | _ -> raise Not_found (** Main lexer *) let rec token : (module Encoding.S) -> Sedlexing.lexbuf -> token = fun (module E : Encoding) lexbuf -> match%sedlex lexbuf with | 0Xfeff -> Sedlexing.start lexbuf; (* Ignore the BOM *) token (module E) lexbuf | '#', Star space, location -> let _start_pos, end_pos = Sedlexing.lexing_positions lexbuf in (* Extract the location name *) let ident = E.lexeme lexbuf in let () = match Str.string_match location_name ident 0 with | false -> () | true -> Sedlexing.set_filename lexbuf (Str.matched_group 1 ident) in (* Restart the line number (new location here) *) Sedlexing.start lexbuf; Sedlexing.set_position lexbuf { end_pos with Lexing.pos_lnum = 1 }; LOCATION_START ident | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> Bucket.remove is_expression lexbuf; LOCATION_END | Plus digit -> INTEGER (E.lexeme lexbuf) | '+' -> PLUS | '-' -> MINUS | "+=" -> INCR | "-=" -> DECR | "*=" -> MULT_EQUAL | '/' -> DIV | '*' -> STAR | ':' -> (* We are leaving the block, the comment will be handled again *) decr_level lexbuf; COLUMN | '[' -> L_BRACKET | ']' -> R_BRACKET | '(' -> incr_level lexbuf; L_PAREN | ')' -> decr_level lexbuf; R_PAREN | '<' -> LT | '>' -> GT | coma -> COMA | '=' -> incr_level lexbuf; EQUAL | ident -> build_ident lexbuf | eol -> Bucket.add is_expression lexbuf 0; EOL | '&' -> Bucket.add is_expression lexbuf 0; AMPERSAND | '!' -> ( match Bucket.find is_expression lexbuf with | Some i when i <> 0 -> EXCLAMATION | _ -> skip_comment lexbuf) | spaces -> token (module E) lexbuf | '\'' -> LITERAL (wait_balance read_quoted_string lexbuf) | '"' -> LITERAL (wait_balance read_dquoted_string lexbuf) | '{' -> LITERAL (wait_balance (read_long_string 0) lexbuf) | eof -> raise EOF | _ -> let position = fst @@ Sedlexing.lexing_positions lexbuf in let tok = E.lexeme lexbuf in let msg = Format.asprintf "Unexpected character %S at %a" tok pp_pos position in raise @@ LexError (position, msg) let rec discard lexbuf = match%sedlex lexbuf with | '\'' -> ignore (wait_balance read_quoted_string lexbuf); discard lexbuf | '"' -> ignore (wait_balance read_dquoted_string lexbuf); discard lexbuf | '{' -> ignore (wait_balance (read_long_string 0) lexbuf); discard lexbuf | eof -> raise EOF | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> Bucket.remove is_expression lexbuf; () | '!' -> ignore @@ skip_comment lexbuf; discard lexbuf | any -> discard lexbuf | _ -> raise EOF (** Tokenizer for menhir *) let lexer : (module Encoding) -> Sedlexing.lexbuf -> unit -> token * Lexing.position * Lexing.position = fun (module E : Encoding.S) lexbuf -> Sedlexing.with_tokenizer (token (module E)) lexbuf