diff options
-rw-r--r-- | bin/qsp_parser.ml | 23 | ||||
-rw-r--r-- | lib/analyzer.ml | 15 | ||||
-rw-r--r-- | lib/analyzer.mli | 1 | ||||
-rw-r--r-- | lib/interpreter.ml | 32 | ||||
-rw-r--r-- | lib/lexbuf.ml | 14 | ||||
-rw-r--r-- | lib/lexbuf.mli | 8 | ||||
-rw-r--r-- | lib/lexer.ml | 119 | ||||
-rw-r--r-- | lib/lexer.mli | 10 | ||||
-rw-r--r-- | test/qsp_parser_test.ml | 7 |
9 files changed, 112 insertions, 117 deletions
diff --git a/bin/qsp_parser.ml b/bin/qsp_parser.ml index 1d846e0..3980ad5 100644 --- a/bin/qsp_parser.ml +++ b/bin/qsp_parser.ml @@ -41,12 +41,9 @@ let filter_report : filters -> Report.t list -> Report.t -> Report.t list = (** Read the source file until getting a report (the whole location has been read properly), or until the first syntax error. *) -let parse_location : - (module Qparser.Lexer.Encoding) -> Sedlexing.lexbuf -> filters -> unit = - fun encoding lexbuf filters -> - let result = - Qparser.Analyzer.parse (module Qsp_syntax.Type_of) encoding lexbuf - in +let parse_location : Sedlexing.lexbuf -> filters -> unit = + fun lexbuf filters -> + let result = Qparser.Analyzer.parse (module Qsp_syntax.Type_of) lexbuf in let result = Result.map @@ -78,23 +75,17 @@ let () = let ic = Stdlib.open_in file_name in (*let lexer = Lexing.from_channel ~with_positions:true ic in*) - let lexer, mod_ = + let lexer = match Filename.extension file_name with - | ".qsrc" -> - ( Sedlexing.Utf8.from_channel ic, - (module Sedlexing.Utf8 : Qparser.Lexer.Encoding) ) - | ".txt" -> - ( Sedlexing.Utf16.from_channel ic (Some Little_endian), - (module struct - let lexeme lexbuf = Sedlexing.Utf8.lexeme lexbuf - end : Qparser.Lexer.Encoding) ) + | ".qsrc" -> Sedlexing.Utf8.from_channel ic + | ".txt" -> Sedlexing.Utf16.from_channel ic (Some Little_endian) | _ -> raise (Failure "unknown extension") in let () = try while true do - parse_location mod_ lexer filters + parse_location lexer filters done with Qparser.Lexer.EOF -> () in diff --git a/lib/analyzer.ml b/lib/analyzer.ml index a6f5e51..547b3da 100644 --- a/lib/analyzer.ml +++ b/lib/analyzer.ml @@ -5,18 +5,19 @@ *) let parse : (module Qsp_syntax.S.Analyzer with type Location.repr = 'a) -> - (module Encoding.S) -> Sedlexing.lexbuf -> ('a, Qsp_syntax.Report.t) Result.t = - fun (type a) (module S : Qsp_syntax.S.Analyzer with type Location.repr = a) - (module E : Encoding.S) -> + fun (type a) (module S : Qsp_syntax.S.Analyzer with type Location.repr = a) -> let module Parser = Parser.Make (S) in let module IncrementalParser = Interpreter.Interpreter (Parser.MenhirInterpreter) in fun lexbuf -> - IncrementalParser.of_lexbuf lexbuf - (Lexer.lexer (module E)) - Parser.Incremental.main + let l = Lexbuf.from_lexbuf lexbuf in + let lexer = Lexbuf.tokenize Lexer.token l in + + let init = Parser.Incremental.main (fst (Lexbuf.positions l)) in + + IncrementalParser.of_lexbuf lexer l init |> Result.map_error (fun e -> let message = match e.IncrementalParser.code with @@ -38,5 +39,5 @@ let parse : (* Discard the remaining file to read. The parser is now in a blank state, it does not make sense to keep feeding it with the new tokens. *) - Lexer.discard lexbuf; + Lexer.discard l; report) diff --git a/lib/analyzer.mli b/lib/analyzer.mli index e7efdb0..d79ea76 100644 --- a/lib/analyzer.mli +++ b/lib/analyzer.mli @@ -1,6 +1,5 @@ val parse : (module Qsp_syntax.S.Analyzer with type Location.repr = 'a) -> - (module Encoding.S) -> Sedlexing.lexbuf -> ('a, Qsp_syntax.Report.t) Result.t (** Read the source and build a analyzis over it. diff --git a/lib/interpreter.ml b/lib/interpreter.ml index d3b24c9..b719600 100644 --- a/lib/interpreter.ml +++ b/lib/interpreter.ml @@ -27,47 +27,41 @@ struct let range_message (start_pos, end_pos) : error_code -> error = fun code -> { code; start_pos; end_pos } - let get_parse_error lexbuf env : error = + let get_parse_error : Lexbuf.t -> 'a MI.env -> error = + fun buffer env -> match MI.stack env with | (lazy Nil) -> (* The parser is in its initial state. We should not get an error here *) - let positions = Sedlexing.lexing_positions lexbuf in + let positions = Lexbuf.positions buffer in + range_message positions UnrecoverableError | (lazy (Cons (MI.Element (state, _, start_pos, end_pos), _))) -> range_message (start_pos, end_pos) (MenhirCode (MI.number state)) let rec _parse : - (Sedlexing.lexbuf -> unit -> step) -> - Sedlexing.lexbuf -> - 'a MI.checkpoint -> - ('a, error) Result.t = - fun get_token (lexbuf : Sedlexing.lexbuf) (checkpoint : 'a MI.checkpoint) -> + Lexbuf.t -> (unit -> step) -> 'a MI.checkpoint -> ('a, error) Result.t = + fun buffer get_token (checkpoint : 'a MI.checkpoint) -> match checkpoint with | MI.InputNeeded _env -> - let token, startp, endp = get_token lexbuf () in + let token, startp, endp = get_token () in let checkpoint = MI.offer checkpoint (token, startp, endp) in - _parse get_token lexbuf checkpoint + _parse buffer get_token checkpoint | MI.Shifting _ | MI.AboutToReduce _ -> let checkpoint = MI.resume checkpoint in - _parse get_token lexbuf checkpoint + _parse buffer get_token checkpoint | MI.HandlingError _env -> - let err = get_parse_error lexbuf _env in + let err = get_parse_error buffer _env in Error err | MI.Accepted v -> Ok v | MI.Rejected -> - let positions = Sedlexing.lexing_positions lexbuf in + let positions = Lexbuf.positions buffer in let err = range_message positions InvalidSyntax in Error err type 'a builder = Lexing.position -> 'a MI.checkpoint let of_lexbuf : - Sedlexing.lexbuf -> - (Sedlexing.lexbuf -> unit -> step) -> - 'a builder -> - ('a, error) result = - fun lexbuf lexer f -> - let init = f (fst (Sedlexing.lexing_positions lexbuf)) in - _parse lexer lexbuf init + (unit -> step) -> Lexbuf.t -> 'a MI.checkpoint -> ('a, error) result = + fun lexer buffer init -> _parse buffer lexer init end diff --git a/lib/lexbuf.ml b/lib/lexbuf.ml new file mode 100644 index 0000000..2ed9099 --- /dev/null +++ b/lib/lexbuf.ml @@ -0,0 +1,14 @@ +type t = Sedlexing.lexbuf + +let buffer : t -> Sedlexing.lexbuf = fun t -> t +let start : t -> unit = fun t -> Sedlexing.start t + +let positions : t -> Lexing.position * Lexing.position = + fun t -> Sedlexing.lexing_positions t + +let content : t -> string = fun t -> Sedlexing.Utf8.lexeme t +let from_lexbuf : Sedlexing.lexbuf -> t = fun t -> t + +let tokenize : (t -> 'a) -> t -> unit -> 'a * Lexing.position * Lexing.position + = + fun f lexbuf -> Sedlexing.with_tokenizer f lexbuf diff --git a/lib/lexbuf.mli b/lib/lexbuf.mli new file mode 100644 index 0000000..cf93c7e --- /dev/null +++ b/lib/lexbuf.mli @@ -0,0 +1,8 @@ +type t + +val start : t -> unit +val buffer : t -> Sedlexing.lexbuf +val positions : t -> Lexing.position * Lexing.position +val content : t -> string +val from_lexbuf : Sedlexing.lexbuf -> t +val tokenize : (t -> 'a) -> t -> unit -> 'a * Lexing.position * Lexing.position diff --git a/lib/lexer.ml b/lib/lexer.ml index 1a2d788..a91bfdb 100644 --- a/lib/lexer.ml +++ b/lib/lexer.ml @@ -57,12 +57,14 @@ let incr_level lexbuf = | None -> Bucket.add is_expression lexbuf 1 | Some v -> Bucket.add is_expression lexbuf (v + 1) -let wait_balance rule lexbuf = - try[@warning "-52"] rule (Buffer.create 17) lexbuf +let wait_balance : (Buffer.t -> Lexbuf.t -> 'a) -> Lexbuf.t -> 'a = + fun rule lexbuf -> + try[@warning "-52"] + let token = rule (Buffer.create 256) lexbuf in + token with Failure "lexing: empty token" -> - let position, _ = Sedlexing.lexing_positions lexbuf in - let line = position.Lexing.pos_lnum - and content = Sedlexing.lexeme lexbuf |> Idents.of_uchars in + let position, _ = Lexbuf.positions lexbuf in + let line = position.Lexing.pos_lnum and content = Lexbuf.content lexbuf in (raise (UnclosedQuote { line; content }) [@warning "+52"]) let space = [%sedlex.regexp? ' ' | '\t'] @@ -76,82 +78,87 @@ let location_ident = [%sedlex.regexp? letters | digit] let location_prefix = [%sedlex.regexp? '!' | '$' | '#' | '^'] let location = [%sedlex.regexp? Opt location_prefix, Plus location_ident] -let rec read_long_string level buf lexbuf = +let rec read_long_string level buf buffer = + let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | '{' -> Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_long_string (level + 1) buf lexbuf + read_long_string (level + 1) buf buffer | '}' -> ( match level with | 0 -> Buffer.contents buf | _ -> Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_long_string (level - 1) buf lexbuf) + read_long_string (level - 1) buf buffer) | eol -> - Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_long_string level buf lexbuf + Buffer.add_string buf (Lexbuf.content buffer); + read_long_string level buf buffer | any -> - Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_long_string level buf lexbuf + Buffer.add_string buf (Lexbuf.content buffer); + read_long_string level buf buffer | _ -> raise Not_found -let rec read_dquoted_string buf lexbuf = +let rec read_dquoted_string buf buffer = + let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | "\"\"" -> Buffer.add_char buf '"'; - read_dquoted_string buf lexbuf + read_dquoted_string buf buffer | '"' -> Buffer.contents buf | any -> Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_dquoted_string buf lexbuf + read_dquoted_string buf buffer | _ -> raise Not_found -let rec read_quoted_string buf lexbuf = +let rec read_quoted_string buf buffer = + let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | "''" -> Buffer.add_char buf '\''; - read_quoted_string buf lexbuf + read_quoted_string buf buffer | '\'' -> Buffer.contents buf | eol -> - Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_quoted_string buf lexbuf + Buffer.add_string buf (Lexbuf.content buffer); + read_quoted_string buf buffer | any -> - Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); - read_quoted_string buf lexbuf + Buffer.add_string buf (Lexbuf.content buffer); + read_quoted_string buf buffer | _ -> raise Not_found -let rec skip_comment lexbuf = +let rec skip_comment buffer = + let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | '{' -> - let _ = wait_balance (read_long_string 0) lexbuf in - skip_comment lexbuf + let _ = wait_balance (read_long_string 0) buffer in + skip_comment buffer | '\'' -> - let _ = wait_balance read_quoted_string lexbuf in - skip_comment lexbuf + let _ = wait_balance read_quoted_string buffer in + skip_comment buffer | '"' -> - let _ = wait_balance read_dquoted_string lexbuf in - skip_comment lexbuf + let _ = wait_balance read_dquoted_string buffer in + skip_comment buffer | eol -> (* Ugly hack used in order to put the eol in the front of the next parsing. *) Sedlexing.rollback lexbuf; COMMENT - | any -> skip_comment lexbuf + | any -> skip_comment buffer | _ -> raise Not_found (** Main lexer *) -let rec token : (module Encoding.S) -> Sedlexing.lexbuf -> token = - fun (module E : Encoding) lexbuf -> +let rec token : Lexbuf.t -> token = + fun buffer -> + let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | 0Xfeff -> - Sedlexing.start lexbuf; + Lexbuf.start buffer; (* Ignore the BOM *) - token (module E) lexbuf + token buffer | '#', Star space, location -> let _start_pos, end_pos = Sedlexing.lexing_positions lexbuf in (* Extract the location name *) - let ident = E.lexeme lexbuf in + let ident = Lexbuf.content buffer in let () = match Str.string_match location_name ident 0 with | false -> () @@ -166,7 +173,7 @@ let rec token : (module Encoding.S) -> Sedlexing.lexbuf -> token = | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> Bucket.remove is_expression lexbuf; LOCATION_END - | Plus digit -> INTEGER (E.lexeme lexbuf) + | Plus digit -> INTEGER (Lexbuf.content buffer) | '+' -> PLUS | '-' -> MINUS | "+=" -> INCR @@ -202,15 +209,15 @@ let rec token : (module Encoding.S) -> Sedlexing.lexbuf -> token = | '!' -> ( match Bucket.find is_expression lexbuf with | Some i when i <> 0 -> EXCLAMATION - | _ -> skip_comment lexbuf) - | spaces -> token (module E) lexbuf - | '\'' -> LITERAL (wait_balance read_quoted_string lexbuf) - | '"' -> LITERAL (wait_balance read_dquoted_string lexbuf) - | '{' -> LITERAL (wait_balance (read_long_string 0) lexbuf) + | _ -> skip_comment buffer) + | spaces -> token buffer + | '\'' -> LITERAL (wait_balance read_quoted_string buffer) + | '"' -> LITERAL (wait_balance read_dquoted_string buffer) + | '{' -> LITERAL (wait_balance (read_long_string 0) buffer) | eof -> raise EOF | _ -> let position = fst @@ Sedlexing.lexing_positions lexbuf in - let tok = E.lexeme lexbuf in + let tok = Lexbuf.content buffer in let msg = Format.asprintf "Unexpected character %S at %a" tok pp_pos position @@ -218,32 +225,24 @@ let rec token : (module Encoding.S) -> Sedlexing.lexbuf -> token = raise @@ LexError (position, msg) -let rec discard lexbuf = +let rec discard buffer = + let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | '\'' -> - ignore (wait_balance read_quoted_string lexbuf); - discard lexbuf + ignore (wait_balance read_quoted_string buffer); + discard buffer | '"' -> - ignore (wait_balance read_dquoted_string lexbuf); - discard lexbuf + ignore (wait_balance read_dquoted_string buffer); + discard buffer | '{' -> - ignore (wait_balance (read_long_string 0) lexbuf); - discard lexbuf + ignore (wait_balance (read_long_string 0) buffer); + discard buffer | eof -> raise EOF | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> Bucket.remove is_expression lexbuf; () | '!' -> - ignore @@ skip_comment lexbuf; - discard lexbuf - | any -> discard lexbuf + ignore @@ skip_comment buffer; + discard buffer + | any -> discard buffer | _ -> raise EOF - -(** Tokenizer for menhir *) -let lexer : - (module Encoding) -> - Sedlexing.lexbuf -> - unit -> - token * Lexing.position * Lexing.position = - fun (module E : Encoding.S) lexbuf -> - Sedlexing.with_tokenizer (token (module E)) lexbuf diff --git a/lib/lexer.mli b/lib/lexer.mli index 0a8ec12..65e9fa0 100644 --- a/lib/lexer.mli +++ b/lib/lexer.mli @@ -4,11 +4,5 @@ end exception EOF -val lexer : - (module Encoding) -> - Sedlexing.lexbuf -> - unit -> - Tokens.token * Lexing.position * Lexing.position -(** Apply the lexer to the source *) - -val discard : Sedlexing.lexbuf -> unit +val token : Lexbuf.t -> Tokens.token +val discard : Lexbuf.t -> unit diff --git a/test/qsp_parser_test.ml b/test/qsp_parser_test.ml index 3e4a96b..6ea9eab 100644 --- a/test/qsp_parser_test.ml +++ b/test/qsp_parser_test.ml @@ -12,12 +12,7 @@ let parse : string -> T.pos location = fun content -> let lexing = Sedlexing.Latin1.from_string content in - match - Qparser.Analyzer.parse - (module Qsp_syntax.Tree) - (module Sedlexing.Utf8) - lexing - with + match Qparser.Analyzer.parse (module Qsp_syntax.Tree) lexing with | Ok e -> e | Error e -> let msg = Format.asprintf "%a" Qsp_syntax.Report.pp e in |