diff options
author | Chimrod <> | 2023-09-27 15:36:13 +0200 |
---|---|---|
committer | Chimrod <> | 2023-09-29 10:00:21 +0200 |
commit | 5dc0c5defdd7ebb152a00e8b2895787b54931779 (patch) | |
tree | 78f9d7c646d3c614a6c934778b195e0707f47821 | |
parent | 40f7b4c7398db2b832b71e3dfb8afb53116fad51 (diff) |
Allow differents file encoding for the source
-rw-r--r-- | bin/qsp_parser.ml | 7 | ||||
-rw-r--r-- | lib/UTF16.ml | 27 | ||||
-rw-r--r-- | lib/analyzer.ml | 10 | ||||
-rw-r--r-- | lib/analyzer.mli | 5 | ||||
-rw-r--r-- | lib/encoding.ml | 3 | ||||
-rw-r--r-- | lib/interpreter.ml | 16 | ||||
-rw-r--r-- | test/qsp_parser_test.ml | 7 |
7 files changed, 56 insertions, 19 deletions
diff --git a/bin/qsp_parser.ml b/bin/qsp_parser.ml index 4ed2ba7..f2b21f9 100644 --- a/bin/qsp_parser.ml +++ b/bin/qsp_parser.ml @@ -45,7 +45,12 @@ let () = let ic = Stdlib.open_in file_name in (*let lexer = Lexing.from_channel ~with_positions:true ic in*) let lexer = Sedlexing.Utf8.from_channel ic in - let result = Qparser.Analyzer.parse (module Qsp_syntax.Type_of) lexer in + let result = + Qparser.Analyzer.parse + (module Qsp_syntax.Type_of) + (module Sedlexing.Utf8) + lexer + in match result with | Ok f -> ( let report = List.fold_left (f []) ~init:[] ~f:(filter_report filters) in diff --git a/lib/UTF16.ml b/lib/UTF16.ml index bdc48c7..e325011 100644 --- a/lib/UTF16.ml +++ b/lib/UTF16.ml @@ -10,6 +10,10 @@ exception LexError of Lexing.position * string let pp_pos out { Lexing.pos_lnum; pos_cnum; pos_bol; _ } = Format.fprintf out "line %d:%d" pos_lnum (pos_cnum - pos_bol) +module type Encoding = sig + val lexeme : Sedlexing.lexbuf -> string +end + (* The comment system is terrible. The same symbol can be used for : - starting a comment - inequality operation @@ -133,16 +137,18 @@ let rec skip_comment lexbuf = | any -> skip_comment lexbuf | _ -> raise Not_found -let rec token lexbuf = +(** Main lexer *) +let rec token : (module Encoding.S) -> Sedlexing.lexbuf -> token = + fun (module E : Encoding) lexbuf -> match%sedlex lexbuf with | 0Xfeff -> (* Ignore the BOM *) - token lexbuf + token (module E) lexbuf | '#', Star space, location -> - let ident = Idents.of_uchars (Sedlexing.lexeme lexbuf) in + let ident = E.lexeme lexbuf in LOCATION_START ident - | Plus digit -> INTEGER (Sedlexing.Utf8.lexeme lexbuf) + | Plus digit -> INTEGER (E.lexeme lexbuf) | '+' -> PLUS | '-' -> MINUS | "+=" -> INCR @@ -184,13 +190,13 @@ let rec token lexbuf = match Bucket.find is_expression lexbuf with | Some i when i <> 0 -> EXCLAMATION | _ -> skip_comment lexbuf) - | spaces -> token lexbuf + | spaces -> token (module E) lexbuf | '\'' -> LITERAL (wait_balance read_quoted_string lexbuf) | '"' -> LITERAL (wait_balance read_dquoted_string lexbuf) | '{' -> LITERAL (wait_balance (read_long_string 0) lexbuf) | _ -> let position = fst @@ Sedlexing.lexing_positions lexbuf in - let tok = Sedlexing.Utf16.lexeme lexbuf Little_endian false in + let tok = E.lexeme lexbuf in let msg = Format.asprintf "Unexpected character %S at %a" tok pp_pos position @@ -198,4 +204,11 @@ let rec token lexbuf = raise @@ LexError (position, msg) -let lexer buf = Sedlexing.with_tokenizer token buf +(** Tokenizer for menhir *) +let lexer : + (module Encoding) -> + Sedlexing.lexbuf -> + unit -> + token * Lexing.position * Lexing.position = + fun (module E : Encoding.S) lexbuf -> + Sedlexing.with_tokenizer (token (module E)) lexbuf diff --git a/lib/analyzer.ml b/lib/analyzer.ml index fe6ae90..b422b24 100644 --- a/lib/analyzer.ml +++ b/lib/analyzer.ml @@ -23,17 +23,21 @@ let format_error : Format.formatter -> error -> unit = *) let parse : (module Qsp_syntax.S.Analyzer with type Location.repr = 'a) -> + (module Encoding.S) -> Sedlexing.lexbuf -> ('a, error) Result.t = - fun (type a) (module S : Qsp_syntax.S.Analyzer with type Location.repr = a) -> + fun (type a) (module S : Qsp_syntax.S.Analyzer with type Location.repr = a) + (module E : Encoding.S) -> let module Parser = Parser.Make (S) in let module IncrementalParser = Interpreter.Interpreter (Parser.MenhirInterpreter) in fun lexbuf -> - IncrementalParser.of_lexbuf lexbuf UTF16.lexer Parser.Incremental.main + IncrementalParser.of_lexbuf lexbuf + (UTF16.lexer (module E)) + Parser.Incremental.main |> Result.map_error (fun e -> let message = - match e.Interpreter.code with + match e.IncrementalParser.code with | Interpreter.InvalidSyntax -> "Invalid Syntax" | Interpreter.MenhirCode c -> String.concat "" diff --git a/lib/analyzer.mli b/lib/analyzer.mli index 02d7b47..43509ba 100644 --- a/lib/analyzer.mli +++ b/lib/analyzer.mli @@ -8,5 +8,10 @@ val format_error : Format.formatter -> error -> unit val parse : (module Qsp_syntax.S.Analyzer with type Location.repr = 'a) -> + (module Encoding.S) -> Sedlexing.lexbuf -> ('a, error) Result.t +(** Read the source and build a analyzis over it. + +This method make the link between the source file and how to read it +(encoding…) and the AST we want to build. *) diff --git a/lib/encoding.ml b/lib/encoding.ml new file mode 100644 index 0000000..30b9c4e --- /dev/null +++ b/lib/encoding.ml @@ -0,0 +1,3 @@ +module type S = sig + val lexeme : Sedlexing.lexbuf -> string +end diff --git a/lib/interpreter.ml b/lib/interpreter.ml index 21c1430..346ceb5 100644 --- a/lib/interpreter.ml +++ b/lib/interpreter.ml @@ -11,20 +11,22 @@ type error_code = InvalidSyntax | MenhirCode of int -type error = { - code : error_code; - start_pos : Lexing.position; - end_pos : Lexing.position; -} - module Interpreter (MI : MenhirLib.IncrementalEngine.INCREMENTAL_ENGINE) = struct + type error = { + code : error_code; + start_pos : Lexing.position; + end_pos : Lexing.position; + + } + module E = MenhirLib.ErrorReports module L = MenhirLib.LexerUtil type step = MI.token * Lexing.position * Lexing.position - let range_message (start_pos, end_pos) code = { code; start_pos; end_pos } + let range_message (start_pos, end_pos) : error_code -> error = + fun code -> { code; start_pos; end_pos } let get_parse_error lexbuf env : error = match MI.stack env with diff --git a/test/qsp_parser_test.ml b/test/qsp_parser_test.ml index 6bf249d..25619b7 100644 --- a/test/qsp_parser_test.ml +++ b/test/qsp_parser_test.ml @@ -12,7 +12,12 @@ let parse : string -> T.pos location = fun content -> let lexing = Sedlexing.Latin1.from_string content in - match Qparser.Analyzer.parse (module Qsp_syntax.Tree) lexing with + match + Qparser.Analyzer.parse + (module Qsp_syntax.Tree) + (module Sedlexing.Utf8) + lexing + with | Ok e -> e | Error e -> let msg = Format.asprintf "%a" Qparser.Analyzer.format_error e in |