aboutsummaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorChimrod <>2023-09-27 15:36:13 +0200
committerChimrod <>2023-09-29 10:00:21 +0200
commit5dc0c5defdd7ebb152a00e8b2895787b54931779 (patch)
tree78f9d7c646d3c614a6c934778b195e0707f47821 /lib
parent40f7b4c7398db2b832b71e3dfb8afb53116fad51 (diff)
Allow differents file encoding for the source
Diffstat (limited to 'lib')
-rw-r--r--lib/UTF16.ml27
-rw-r--r--lib/analyzer.ml10
-rw-r--r--lib/analyzer.mli5
-rw-r--r--lib/encoding.ml3
-rw-r--r--lib/interpreter.ml16
5 files changed, 44 insertions, 17 deletions
diff --git a/lib/UTF16.ml b/lib/UTF16.ml
index bdc48c7..e325011 100644
--- a/lib/UTF16.ml
+++ b/lib/UTF16.ml
@@ -10,6 +10,10 @@ exception LexError of Lexing.position * string
let pp_pos out { Lexing.pos_lnum; pos_cnum; pos_bol; _ } =
Format.fprintf out "line %d:%d" pos_lnum (pos_cnum - pos_bol)
+module type Encoding = sig
+ val lexeme : Sedlexing.lexbuf -> string
+end
+
(* The comment system is terrible. The same symbol can be used for :
- starting a comment
- inequality operation
@@ -133,16 +137,18 @@ let rec skip_comment lexbuf =
| any -> skip_comment lexbuf
| _ -> raise Not_found
-let rec token lexbuf =
+(** Main lexer *)
+let rec token : (module Encoding.S) -> Sedlexing.lexbuf -> token =
+ fun (module E : Encoding) lexbuf ->
match%sedlex lexbuf with
| 0Xfeff ->
(* Ignore the BOM *)
- token lexbuf
+ token (module E) lexbuf
| '#', Star space, location ->
- let ident = Idents.of_uchars (Sedlexing.lexeme lexbuf) in
+ let ident = E.lexeme lexbuf in
LOCATION_START ident
- | Plus digit -> INTEGER (Sedlexing.Utf8.lexeme lexbuf)
+ | Plus digit -> INTEGER (E.lexeme lexbuf)
| '+' -> PLUS
| '-' -> MINUS
| "+=" -> INCR
@@ -184,13 +190,13 @@ let rec token lexbuf =
match Bucket.find is_expression lexbuf with
| Some i when i <> 0 -> EXCLAMATION
| _ -> skip_comment lexbuf)
- | spaces -> token lexbuf
+ | spaces -> token (module E) lexbuf
| '\'' -> LITERAL (wait_balance read_quoted_string lexbuf)
| '"' -> LITERAL (wait_balance read_dquoted_string lexbuf)
| '{' -> LITERAL (wait_balance (read_long_string 0) lexbuf)
| _ ->
let position = fst @@ Sedlexing.lexing_positions lexbuf in
- let tok = Sedlexing.Utf16.lexeme lexbuf Little_endian false in
+ let tok = E.lexeme lexbuf in
let msg =
Format.asprintf "Unexpected character %S at %a" tok pp_pos position
@@ -198,4 +204,11 @@ let rec token lexbuf =
raise @@ LexError (position, msg)
-let lexer buf = Sedlexing.with_tokenizer token buf
+(** Tokenizer for menhir *)
+let lexer :
+ (module Encoding) ->
+ Sedlexing.lexbuf ->
+ unit ->
+ token * Lexing.position * Lexing.position =
+ fun (module E : Encoding.S) lexbuf ->
+ Sedlexing.with_tokenizer (token (module E)) lexbuf
diff --git a/lib/analyzer.ml b/lib/analyzer.ml
index fe6ae90..b422b24 100644
--- a/lib/analyzer.ml
+++ b/lib/analyzer.ml
@@ -23,17 +23,21 @@ let format_error : Format.formatter -> error -> unit =
*)
let parse :
(module Qsp_syntax.S.Analyzer with type Location.repr = 'a) ->
+ (module Encoding.S) ->
Sedlexing.lexbuf ->
('a, error) Result.t =
- fun (type a) (module S : Qsp_syntax.S.Analyzer with type Location.repr = a) ->
+ fun (type a) (module S : Qsp_syntax.S.Analyzer with type Location.repr = a)
+ (module E : Encoding.S) ->
let module Parser = Parser.Make (S) in
let module IncrementalParser =
Interpreter.Interpreter (Parser.MenhirInterpreter) in
fun lexbuf ->
- IncrementalParser.of_lexbuf lexbuf UTF16.lexer Parser.Incremental.main
+ IncrementalParser.of_lexbuf lexbuf
+ (UTF16.lexer (module E))
+ Parser.Incremental.main
|> Result.map_error (fun e ->
let message =
- match e.Interpreter.code with
+ match e.IncrementalParser.code with
| Interpreter.InvalidSyntax -> "Invalid Syntax"
| Interpreter.MenhirCode c ->
String.concat ""
diff --git a/lib/analyzer.mli b/lib/analyzer.mli
index 02d7b47..43509ba 100644
--- a/lib/analyzer.mli
+++ b/lib/analyzer.mli
@@ -8,5 +8,10 @@ val format_error : Format.formatter -> error -> unit
val parse :
(module Qsp_syntax.S.Analyzer with type Location.repr = 'a) ->
+ (module Encoding.S) ->
Sedlexing.lexbuf ->
('a, error) Result.t
+(** Read the source and build a analyzis over it.
+
+This method make the link between the source file and how to read it
+(encoding…) and the AST we want to build. *)
diff --git a/lib/encoding.ml b/lib/encoding.ml
new file mode 100644
index 0000000..30b9c4e
--- /dev/null
+++ b/lib/encoding.ml
@@ -0,0 +1,3 @@
+module type S = sig
+ val lexeme : Sedlexing.lexbuf -> string
+end
diff --git a/lib/interpreter.ml b/lib/interpreter.ml
index 21c1430..346ceb5 100644
--- a/lib/interpreter.ml
+++ b/lib/interpreter.ml
@@ -11,20 +11,22 @@
type error_code = InvalidSyntax | MenhirCode of int
-type error = {
- code : error_code;
- start_pos : Lexing.position;
- end_pos : Lexing.position;
-}
-
module Interpreter (MI : MenhirLib.IncrementalEngine.INCREMENTAL_ENGINE) =
struct
+ type error = {
+ code : error_code;
+ start_pos : Lexing.position;
+ end_pos : Lexing.position;
+
+ }
+
module E = MenhirLib.ErrorReports
module L = MenhirLib.LexerUtil
type step = MI.token * Lexing.position * Lexing.position
- let range_message (start_pos, end_pos) code = { code; start_pos; end_pos }
+ let range_message (start_pos, end_pos) : error_code -> error =
+ fun code -> { code; start_pos; end_pos }
let get_parse_error lexbuf env : error =
match MI.stack env with