(** Lexer using sedlex *) open Tokens exception UnclosedQuote exception LexError of string exception EOF (* Extract the location name from the pattern *) let location_name = Str.regexp {|.* \(.*\)|} (** Remove all the expression state when we are leaving the expression itself. *) let rec leave_expression buffer = match Lexbuf.state buffer with | Some Lexbuf.Expression -> Lexbuf.leave_state buffer; leave_expression buffer | _ -> () (** Try to read the identifier and check if this is a function, a keyword, or just a variable. See the tests [Syntax.Operator2] and [Syntax.Call Nl] for two cases. *) let build_ident buffer = let id = Lexbuf.content buffer |> String.uppercase_ascii in try let value = Hashtbl.find Idents.keyword_table id in let _ = match value with | IF | ELIF -> Lexbuf.enter_state buffer Lexbuf.Expression | _ -> () in value with Not_found -> (* If the identifier does not match a keyword and start with [*], then try it as a '*' operator. *) if Char.equal '*' id.[0] then ( Lexbuf.rollback buffer; let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with '*' -> STAR | _ -> IDENT id) else IDENT id let wait_balance : (Buffer.t -> Lexbuf.t -> 'a) -> Lexbuf.t -> 'a = fun rule lexbuf -> let _, position = Lexbuf.positions lexbuf in Lexbuf.set_start_position lexbuf position; try let token = rule (Buffer.create 256) lexbuf in token with Not_found -> raise UnclosedQuote let space = [%sedlex.regexp? ' ' | '\t'] let eol = [%sedlex.regexp? '\r' | '\n' | "\r\n"] let coma = [%sedlex.regexp? ','] let digit = [%sedlex.regexp? '0' .. '9'] let letters = [%sedlex.regexp? 'a' .. 'z' | 'A' .. 'Z' | '_'] let ident = [%sedlex.regexp? Opt ('$' | '*'), letters, Star (digit | letters)] let location_ident = [%sedlex.regexp? letters | digit] let location_prefix = [%sedlex.regexp? '!' | '$' | '#' | '^'] let location = [%sedlex.regexp? Opt location_prefix, Plus location_ident] let rec read_long_string level buf buffer = let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | '{' -> Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); read_long_string (level + 1) buf buffer | '}' -> ( match level with | 0 -> Lexbuf.leave_state buffer; Lexbuf.enter_state buffer (Lexbuf.EndString Lex_state.readLongStringWraper); Lexbuf.rollback buffer; LITERAL (Buffer.contents buf) | _ -> (* We have nested strings. Do not terminate end *) Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); read_long_string (level - 1) buf buffer) | eol -> Buffer.add_string buf (Lexbuf.content buffer); read_long_string level buf buffer | any -> Buffer.add_string buf (Lexbuf.content buffer); read_long_string level buf buffer | _ -> raise Not_found (** Read the text inside a ['] *) let rec read_quoted_string f buf buffer = let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | eol -> Buffer.add_string buf (Lexbuf.content buffer); (f (read_quoted_string f)) buf buffer | any -> Buffer.add_string buf (Lexbuf.content buffer); (f (read_quoted_string f)) buf buffer | _ -> let location, _ = Lexbuf.positions buffer in let line = location.Lexing.pos_lnum and file = location.Lexing.pos_fname in Format.eprintf "read_quoted_string : %s:%d\n" file line; raise Not_found let rec skip_comment buffer = let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | '{' -> let _ = wait_balance (read_long_string 0) buffer in let _ = Lex_state.readLongStringWraper.end_string buffer in skip_comment buffer | '\'' -> let _ = wait_balance (Lex_state.quotedStringWraper.wrap (read_quoted_string Lex_state.quotedStringWraper.wrap)) buffer in let _ = Lex_state.quotedStringWraper.end_string buffer in skip_comment buffer | '"' -> let _ = wait_balance (Lex_state.dQuotedStringWraper.wrap (read_quoted_string Lex_state.dQuotedStringWraper.wrap)) buffer in let _ = Lex_state.dQuotedStringWraper.end_string buffer in skip_comment buffer | eol -> (* Ugly hack used in order to put the eol in the front of the next parsing. *) Lexbuf.rollback buffer; COMMENT | any -> skip_comment buffer | _ -> raise Not_found (** Main lexer *) let rec parse_token : Lexbuf.t -> token = fun buffer -> let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | 0Xfeff -> (* Ignore the BOM *) parse_token buffer | '#', Star space, location -> (* Extract the location name *) let ident = Lexbuf.content buffer in let () = match Str.string_match location_name ident 0 with | false -> () | true -> Sedlexing.set_filename lexbuf (Str.matched_group 1 ident) in (* Restart the line number (new location here) *) Lexbuf.start buffer; LOCATION_START ident | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> leave_expression buffer; LOCATION_END | Plus digit -> INTEGER (Lexbuf.content buffer) | '+' -> PLUS | '-' -> MINUS | "+=" -> INCR | "-=" -> DECR | "*=" -> MULT_EQUAL | '/' -> DIV | '*' -> STAR | ':' -> (* We are leaving the block, the comment will be handled again *) Lexbuf.leave_state buffer; COLUMN | '[' -> L_BRACKET | ']' -> R_BRACKET | '(' -> Lexbuf.enter_state buffer Lexbuf.Expression; L_PAREN | ')' -> Lexbuf.leave_state buffer; R_PAREN | ">>" -> Lexbuf.leave_state buffer; parse_token buffer | '<' -> LT | '>' -> GT | coma -> COMA | '=' -> Lexbuf.enter_state buffer Lexbuf.Expression; EQUAL | ident -> build_ident buffer | eol -> leave_expression buffer; EOL | '&' -> leave_expression buffer; AMPERSAND | '!' -> ( match Lexbuf.state buffer with | Some Lexbuf.Expression -> EXCLAMATION | _ -> skip_comment buffer) | '}' -> TEXT_MARKER | eof -> raise EOF | _ -> let tok = Lexbuf.content buffer in let msg = Format.asprintf "Unexpected character %S" tok in raise @@ LexError msg let main buffer = match Lexbuf.state buffer with | Some (Lexbuf.String w) -> wait_balance (w.wrap @@ read_quoted_string w.wrap) buffer | Some (Lexbuf.MString level) -> wait_balance (read_long_string level) buffer | Some (Lexbuf.EndString w) -> w.end_string buffer | Some (Lexbuf.Token w) -> w.start_string parse_token buffer | _ -> Lex_state.defaultWraper.start_string parse_token buffer let rec discard buffer = let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | '\'' -> ignore (wait_balance (read_quoted_string Lex_state.quotedStringWraper.wrap) buffer); discard buffer | '"' -> ignore (wait_balance (read_quoted_string Lex_state.quotedStringWraper.wrap) buffer); discard buffer | '{' -> ignore (wait_balance (read_long_string 0) buffer); discard buffer | eof -> raise EOF | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> leave_expression buffer; () | '!' -> ignore @@ skip_comment buffer; discard buffer | any -> discard buffer | _ -> raise EOF