(** Lexer using sedlex *) open Tokens exception UnclosedQuote exception LexError of string exception EOF (* Extract the location name from the pattern *) let location_name = Str.regexp {|.* \(.*\)|} (** Remove all the expression state when we are leaving the expression itself. *) let rec leave_expression buffer = match Lexbuf.state buffer with | Some Lexbuf.Expression -> Lexbuf.leave_state buffer; leave_expression buffer | _ -> () (** Try to read the identifier and check if this is a function, a keyword, or just a variable. See the tests [Syntax.Operator2] and [Syntax.Call Nl] for two cases. *) let build_ident buffer = let id = Lexbuf.content buffer |> String.uppercase_ascii in try let value = Hashtbl.find Idents.keyword_table id in let _ = match value with | IF | ELIF -> Lexbuf.enter_state buffer Lexbuf.Expression | _ -> () in value with Not_found -> (* If the identifier does not match a keyword and start with [*], then try it as a '*' operator. *) if Char.equal '*' id.[0] then ( Lexbuf.rollback buffer; let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with '*' -> STAR | _ -> IDENT id) else IDENT id let wait_balance : (Buffer.t -> Lexbuf.t -> 'a) -> Lexbuf.t -> 'a = fun rule lexbuf -> let _, position = Lexbuf.positions lexbuf in Lexbuf.set_start_position lexbuf position; try let token = rule (Buffer.create 256) lexbuf in token with Not_found -> raise UnclosedQuote let space = [%sedlex.regexp? ' ' | '\t'] let spaces = [%sedlex.regexp? Plus space] let eol = [%sedlex.regexp? '\r' | '\n' | "\r\n"] let coma = [%sedlex.regexp? ','] let digit = [%sedlex.regexp? '0' .. '9'] let letters = [%sedlex.regexp? 'a' .. 'z' | 'A' .. 'Z' | '_'] let ident = [%sedlex.regexp? Opt ('$' | '*'), letters, Star (digit | letters)] let location_ident = [%sedlex.regexp? letters | digit] let location_prefix = [%sedlex.regexp? '!' | '$' | '#' | '^'] let location = [%sedlex.regexp? Opt location_prefix, Plus location_ident] (** Change the state when we are ending a string. Send the text marker to the parser in order to tell the string is over. This can work because the state EndString is only raised when the same token is fetched inside the appropriate sting method lexer. The [Lexbuf.rollback] function is called in order to let the same token occur again. *) let end_string : Lexbuf.t -> token = fun buffer -> let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | '}' -> Lexbuf.leave_state buffer; TEXT_MARKER | '\'' -> Lexbuf.leave_state buffer; TEXT_MARKER | '"' -> Lexbuf.leave_state buffer; TEXT_MARKER | _ -> raise Not_found let rec read_long_string level buf buffer = let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | '{' -> Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); read_long_string (level + 1) buf buffer | '}' -> ( match level with | 0 -> Lexbuf.leave_state buffer; Lexbuf.enter_state buffer Lexbuf.EndString; Lexbuf.rollback buffer; LITERAL (Buffer.contents buf) | _ -> (* We have nested strings. Do not terminate end *) Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); read_long_string (level - 1) buf buffer) | eol -> Buffer.add_string buf (Lexbuf.content buffer); read_long_string level buf buffer | any -> Buffer.add_string buf (Lexbuf.content buffer); read_long_string level buf buffer | _ -> raise Not_found let rec read_dquoted_string buf buffer = let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | "\"\"" -> Buffer.add_char buf '"'; read_dquoted_string buf buffer | '"' -> Lexbuf.leave_state buffer; Lexbuf.enter_state buffer Lexbuf.EndString; Lexbuf.rollback buffer; LITERAL (Buffer.contents buf) | any -> Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); read_dquoted_string buf buffer | _ -> raise Not_found let rec read_quoted_string buf buffer = let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | "''" -> Buffer.add_char buf '\''; read_quoted_string buf buffer | '\'' -> Lexbuf.leave_state buffer; Lexbuf.enter_state buffer Lexbuf.EndString; Lexbuf.rollback buffer; LITERAL (Buffer.contents buf) | eol -> Buffer.add_string buf (Lexbuf.content buffer); read_quoted_string buf buffer | any -> Buffer.add_string buf (Lexbuf.content buffer); read_quoted_string buf buffer | _ -> raise Not_found let rec skip_comment buffer = let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | '{' -> let _ = wait_balance (read_long_string 0) buffer in let _ = end_string buffer in skip_comment buffer | '\'' -> let _ = wait_balance read_quoted_string buffer in let _ = end_string buffer in skip_comment buffer | '"' -> let _ = wait_balance read_dquoted_string buffer in let _ = end_string buffer in skip_comment buffer | eol -> (* Ugly hack used in order to put the eol in the front of the next parsing. *) Lexbuf.rollback buffer; COMMENT | any -> skip_comment buffer | _ -> raise Not_found (** Main lexer *) let rec token : Lexbuf.t -> token = fun buffer -> let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | 0Xfeff -> (* Ignore the BOM *) token buffer | '#', Star space, location -> (* Extract the location name *) let ident = Lexbuf.content buffer in let () = match Str.string_match location_name ident 0 with | false -> () | true -> Sedlexing.set_filename lexbuf (Str.matched_group 1 ident) in (* Restart the line number (new location here) *) Lexbuf.start buffer; LOCATION_START ident | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> leave_expression buffer; LOCATION_END | Plus digit -> INTEGER (Lexbuf.content buffer) | '+' -> PLUS | '-' -> MINUS | "+=" -> INCR | "-=" -> DECR | "*=" -> MULT_EQUAL | '/' -> DIV | '*' -> STAR | ':' -> (* We are leaving the block, the comment will be handled again *) Lexbuf.leave_state buffer; COLUMN | '[' -> L_BRACKET | ']' -> R_BRACKET | '(' -> Lexbuf.enter_state buffer Lexbuf.Expression; L_PAREN | ')' -> Lexbuf.leave_state buffer; R_PAREN | ">>" -> Lexbuf.leave_state buffer; token buffer | '<' -> LT | '>' -> GT | coma -> COMA | '=' -> Lexbuf.enter_state buffer Lexbuf.Expression; EQUAL | ident -> build_ident buffer | eol -> leave_expression buffer; EOL | '&' -> leave_expression buffer; AMPERSAND | '!' -> ( match Lexbuf.state buffer with | Some Lexbuf.Expression -> EXCLAMATION | _ -> skip_comment buffer) | spaces -> token buffer | '\'' -> Lexbuf.enter_state buffer Lexbuf.String; TEXT_MARKER | '"' -> Lexbuf.enter_state buffer Lexbuf.DString; TEXT_MARKER | '{' -> Lexbuf.enter_state buffer (Lexbuf.MString 0); TEXT_MARKER | '}' -> TEXT_MARKER | eof -> raise EOF | _ -> let tok = Lexbuf.content buffer in let msg = Format.asprintf "Unexpected character %S" tok in raise @@ LexError msg let main buffer = match Lexbuf.state buffer with | Some Lexbuf.String -> wait_balance read_quoted_string buffer | Some Lexbuf.DString -> wait_balance read_dquoted_string buffer | Some (Lexbuf.MString level) -> wait_balance (read_long_string level) buffer | Some Lexbuf.EndString -> end_string buffer | _ -> token buffer let rec discard buffer = let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | '\'' -> ignore (wait_balance read_quoted_string buffer); discard buffer | '"' -> ignore (wait_balance read_dquoted_string buffer); discard buffer | '{' -> ignore (wait_balance (read_long_string 0) buffer); discard buffer | eof -> raise EOF | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> leave_expression buffer; () | '!' -> ignore @@ skip_comment buffer; discard buffer | any -> discard buffer | _ -> raise EOF