From 40f7b4c7398db2b832b71e3dfb8afb53116fad51 Mon Sep 17 00:00:00 2001 From: Chimrod <> Date: Wed, 27 Sep 2023 13:31:14 +0200 Subject: Switched to sedlex instead of ocamllex --- bin/dune | 1 + bin/qsp_parser.ml | 3 +- lib/UTF16.ml | 201 +++++++++++++++++++++++++++ lib/analyzer.ml | 4 +- lib/analyzer.mli | 2 +- lib/dune | 10 +- lib/idents.ml | 185 +++++++++++++++++++++++++ lib/interpreter.ml | 32 ++--- lib/lexer.mll | 362 ------------------------------------------------ test/dune | 1 + test/qsp_parser_test.ml | 10 +- 11 files changed, 421 insertions(+), 390 deletions(-) create mode 100644 lib/UTF16.ml create mode 100644 lib/idents.ml delete mode 100644 lib/lexer.mll diff --git a/bin/dune b/bin/dune index 9702e66..f7aacc3 100644 --- a/bin/dune +++ b/bin/dune @@ -2,6 +2,7 @@ (public_name qsp_parser) (name qsp_parser) (libraries + sedlex qsp_syntax qparser) diff --git a/bin/qsp_parser.ml b/bin/qsp_parser.ml index af12abe..4ed2ba7 100644 --- a/bin/qsp_parser.ml +++ b/bin/qsp_parser.ml @@ -43,7 +43,8 @@ let () = let file_name = List.hd file_names in let ic = Stdlib.open_in file_name in - let lexer = Lexing.from_channel ~with_positions:true ic in + (*let lexer = Lexing.from_channel ~with_positions:true ic in*) + let lexer = Sedlexing.Utf8.from_channel ic in let result = Qparser.Analyzer.parse (module Qsp_syntax.Type_of) lexer in match result with | Ok f -> ( diff --git a/lib/UTF16.ml b/lib/UTF16.ml new file mode 100644 index 0000000..bdc48c7 --- /dev/null +++ b/lib/UTF16.ml @@ -0,0 +1,201 @@ +(** + Lexer using sedlex + *) + +open Tokens + +exception UnclosedQuote of { content : string; line : int } +exception LexError of Lexing.position * string + +let pp_pos out { Lexing.pos_lnum; pos_cnum; pos_bol; _ } = + Format.fprintf out "line %d:%d" pos_lnum (pos_cnum - pos_bol) + +(* The comment system is terrible. The same symbol can be used for : + - starting a comment + - inequality operation + In order to manage this, I try to identify the context in a very basic way, + using a flag True False for determining the token to send. +*) +module Bucket = Ephemeron.K1.Bucket + +type bucket = (Sedlexing.lexbuf, int) Bucket.t + +let is_expression : bucket = Bucket.make () + +let incr_level lexbuf = + match Bucket.find is_expression lexbuf with + | None -> Bucket.add is_expression lexbuf 1 + | Some v -> Bucket.add is_expression lexbuf (v + 1) + +let decr_level lexbuf = + match Bucket.find is_expression lexbuf with + | None -> () + | Some v -> + if v > 1 then Bucket.add is_expression lexbuf (v - 1) + else Bucket.remove is_expression lexbuf + +let build_ident lexbuf = + let id = + Sedlexing.lexeme lexbuf |> Idents.of_uchars |> String.uppercase_ascii + in + try + let value = Hashtbl.find Idents.keyword_table id in + let _ = match value with IF | ELIF -> incr_level lexbuf | _ -> () in + value + with Not_found -> IDENT id + +let incr_level lexbuf = + match Bucket.find is_expression lexbuf with + | None -> Bucket.add is_expression lexbuf 1 + | Some v -> Bucket.add is_expression lexbuf (v + 1) + +let wait_balance rule lexbuf = + try[@warning "-52"] rule (Buffer.create 17) lexbuf + with Failure "lexing: empty token" -> + let position, _ = Sedlexing.lexing_positions lexbuf in + let line = position.Lexing.pos_lnum + and content = Sedlexing.lexeme lexbuf |> Idents.of_uchars in + (raise (UnclosedQuote { line; content }) [@warning "+52"]) + +let space = [%sedlex.regexp? ' ' | '\t'] +let eol = [%sedlex.regexp? '\r' | '\n' | "\r\n"] +let coma = [%sedlex.regexp? ','] +let digit = [%sedlex.regexp? '0' .. '9'] +let letters = [%sedlex.regexp? 'a' .. 'z' | 'A' .. 'Z' | '_'] +let spaces = [%sedlex.regexp? Plus space] +let ident = [%sedlex.regexp? ('$' | letters), Star (digit | letters)] +let location_ident = [%sedlex.regexp? letters | digit] +let location_prefix = [%sedlex.regexp? '!' | '$' | '#' | '^'] +let location = [%sedlex.regexp? Opt location_prefix, Plus location_ident] + +let rec read_long_string level buf lexbuf = + match%sedlex lexbuf with + | '{' -> + Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); + read_long_string (level + 1) buf lexbuf + | '}' -> ( + match level with + | 0 -> Buffer.contents buf + | _ -> + Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); + read_long_string (level - 1) buf lexbuf) + | eol -> + Sedlexing.new_line lexbuf; + Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); + read_long_string level buf lexbuf + | any -> + Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); + read_long_string level buf lexbuf + | _ -> raise Not_found + +let rec read_dquoted_string buf lexbuf = + match%sedlex lexbuf with + | "\"\"" -> + Buffer.add_char buf '"'; + read_dquoted_string buf lexbuf + | '"' -> Buffer.contents buf + | any -> + Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); + read_dquoted_string buf lexbuf + | _ -> raise Not_found + +let rec read_quoted_string buf lexbuf = + match%sedlex lexbuf with + | "''" -> + Buffer.add_char buf '\''; + read_quoted_string buf lexbuf + | '\'' -> Buffer.contents buf + | eol -> + Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); + Sedlexing.new_line lexbuf; + read_quoted_string buf lexbuf + | any -> + Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); + read_quoted_string buf lexbuf + | _ -> raise Not_found + +let rec skip_comment lexbuf = + match%sedlex lexbuf with + | '{' -> + let _ = wait_balance (read_long_string 0) lexbuf in + skip_comment lexbuf + | '\'' -> + let _ = wait_balance read_quoted_string lexbuf in + skip_comment lexbuf + | '"' -> + let _ = wait_balance read_dquoted_string lexbuf in + skip_comment lexbuf + | eol -> + (* Ugly hack used in order to put the eol in the front of the next + parsing. *) + Sedlexing.rollback lexbuf; + COMMENT + | any -> skip_comment lexbuf + | _ -> raise Not_found + +let rec token lexbuf = + match%sedlex lexbuf with + | 0Xfeff -> + (* Ignore the BOM *) + token lexbuf + | '#', Star space, location -> + let ident = Idents.of_uchars (Sedlexing.lexeme lexbuf) in + + LOCATION_START ident + | Plus digit -> INTEGER (Sedlexing.Utf8.lexeme lexbuf) + | '+' -> PLUS + | '-' -> MINUS + | "+=" -> INCR + | "-=" -> DECR + | "*=" -> MULT_EQUAL + | '/' -> DIV + | '*' -> STAR + | ':' -> + (* We are leaving the block, the comment will be handled again *) + decr_level lexbuf; + COLUMN + | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> LOCATION_END + | '[' -> L_BRACKET + | ']' -> R_BRACKET + | '(' -> + incr_level lexbuf; + L_PAREN + | ')' -> + decr_level lexbuf; + R_PAREN + | '<' -> LT + | '>' -> GT + | coma -> COMA + | eof -> + Bucket.remove is_expression lexbuf; + EOF + | '=' -> + incr_level lexbuf; + EQUAL + | ident -> build_ident lexbuf + | eol -> + Bucket.add is_expression lexbuf 0; + Sedlexing.new_line lexbuf; + EOL + | '&' -> + Bucket.add is_expression lexbuf 0; + AMPERSAND + | '!' -> ( + match Bucket.find is_expression lexbuf with + | Some i when i <> 0 -> EXCLAMATION + | _ -> skip_comment lexbuf) + | spaces -> token lexbuf + | '\'' -> LITERAL (wait_balance read_quoted_string lexbuf) + | '"' -> LITERAL (wait_balance read_dquoted_string lexbuf) + | '{' -> LITERAL (wait_balance (read_long_string 0) lexbuf) + | _ -> + let position = fst @@ Sedlexing.lexing_positions lexbuf in + let tok = Sedlexing.Utf16.lexeme lexbuf Little_endian false in + + let msg = + Format.asprintf "Unexpected character %S at %a" tok pp_pos position + in + + raise @@ LexError (position, msg) + +let lexer buf = Sedlexing.with_tokenizer token buf diff --git a/lib/analyzer.ml b/lib/analyzer.ml index f0f8ca5..fe6ae90 100644 --- a/lib/analyzer.ml +++ b/lib/analyzer.ml @@ -23,14 +23,14 @@ let format_error : Format.formatter -> error -> unit = *) let parse : (module Qsp_syntax.S.Analyzer with type Location.repr = 'a) -> - Lexing.lexbuf -> + Sedlexing.lexbuf -> ('a, error) Result.t = fun (type a) (module S : Qsp_syntax.S.Analyzer with type Location.repr = a) -> let module Parser = Parser.Make (S) in let module IncrementalParser = Interpreter.Interpreter (Parser.MenhirInterpreter) in fun lexbuf -> - IncrementalParser.of_lexbuf lexbuf Lexer.token Parser.Incremental.main + IncrementalParser.of_lexbuf lexbuf UTF16.lexer Parser.Incremental.main |> Result.map_error (fun e -> let message = match e.Interpreter.code with diff --git a/lib/analyzer.mli b/lib/analyzer.mli index 3032375..02d7b47 100644 --- a/lib/analyzer.mli +++ b/lib/analyzer.mli @@ -8,5 +8,5 @@ val format_error : Format.formatter -> error -> unit val parse : (module Qsp_syntax.S.Analyzer with type Location.repr = 'a) -> - Lexing.lexbuf -> + Sedlexing.lexbuf -> ('a, error) Result.t diff --git a/lib/dune b/lib/dune index c88994d..5717f90 100644 --- a/lib/dune +++ b/lib/dune @@ -2,10 +2,11 @@ (name qparser) (libraries qsp_syntax - menhirLib) + menhirLib + ) (preprocess (pps - ppx_deriving.show - ppx_deriving.eq )) + sedlex.ppx + )) ) @@ -25,6 +26,3 @@ (flags --table --external-tokens Tokens) (merge_into parser) ) - -(ocamllex lexer) - diff --git a/lib/idents.ml b/lib/idents.ml new file mode 100644 index 0000000..dab0ba0 --- /dev/null +++ b/lib/idents.ml @@ -0,0 +1,185 @@ +open Tokens +module T = Qsp_syntax.T + +let keyword_table = Hashtbl.create 53 + +let char_of_uchar : Uchar.t -> char = + fun u -> match Uchar.is_char u with true -> Uchar.to_char u | _ -> '?' + +let of_uchars : Uchar.t array -> string = + fun arr -> Array.to_seq arr |> Seq.map char_of_uchar |> String.of_seq + +let _ = + List.iter + (fun (kwd, tok) -> Hashtbl.add keyword_table kwd tok) + [ + ("ACT", ACT); + ("ADDLIB", KEYWORD "INCLIB"); + ("ADDOBJ", KEYWORD "ADDOBJ"); + ("ADD OBJ", KEYWORD "ADDOBJ"); + ("ADDQST", KEYWORD "INCLIB"); + ("AND", AND); + ("ARRCOMP", FUNCTION T.Arrcomp); + ("ARRPOS", FUNCTION T.Arrpos); + ("ARRSIZE", FUNCTION T.Arrsize) + (* + ; "BACKIMAGE", KEYWORD "BACKIMAGE" + ; "$BACKIMAGE", KEYWORD "BACKIMAGE" + ; "BCOLOR", KEYWORD "BCOLOR" *); + ("CLA", KEYWORD "CLA"); + ("CLEAR", KEYWORD "CLEAR"); + ("*CLEAR", KEYWORD "*CLEAR"); + ("CLOSE", KEYWORD "CLOSE"); + ("CLOSE ALL", KEYWORD "CLOSEALL"); + ("CLR", KEYWORD "CLEAR"); + ("*CLR", KEYWORD "*CLEAR"); + ("CLS", KEYWORD "CLS"); + ("CMDCLEAR", KEYWORD "CMDCLEAR"); + ("CMDCLR", KEYWORD "CMDCLEAR"); + ("COPYARR", KEYWORD "COPYARR"); + ("COUNTOBJ", FUNCTION T.Countobj); + ("CURACTS", IDENT "CURACTS"); + ("$CURACTS", IDENT "$CURACTS"); + ("CURLOC", IDENT "CURLOC"); + ("$CURLOC", IDENT "$CURLOC") + (* + ; "DEBUG", KEYWORD "DEBUG" +*); + ("DELACT", KEYWORD "DELACT"); + ("DEL ACT", KEYWORD "DELACT"); + ("DELLIB", KEYWORD "FREELIB"); + ("DELOBJ", KEYWORD "DELOBJ"); + ("DEL OBJ", KEYWORD "DELOBJ"); + ("DESC", FUNCTION T.Desc); + ("$DESC", FUNCTION T.Desc') + (* + ; "DISABLESCROLL", KEYWORD "DISABLESCROLL" + ; "DISABLESUBEX", KEYWORD "DISABLESUBEX" +*); + ("DYNAMIC", KEYWORD "DYNAMIC"); + ("DYNEVAL", FUNCTION T.Dyneval); + ("$DYNEVAL", FUNCTION T.Dyneval'); + ("ELSE", ELSE); + ("ELSEIF", ELIF); + ("END", END); + ("EXEC", KEYWORD "EXEC"); + ("EXIT", KEYWORD "EXIT") + (* + ; "FCOLOR", KEYWORD "FCOLOR" + ; "$FNAME", KEYWORD "$FNAME" +*); + ("FREELIB", KEYWORD "FREELIB") + (* + ; "FSIZE", KEYWORD "FSIZE" +*); + ("FUNC", FUNCTION T.Func); + ("$FUNC", FUNCTION T.Func'); + ("GETOBJ", FUNCTION T.Getobj); + ("$GETOBJ", FUNCTION T.Getobj'); + ("GOSUB", KEYWORD "GOSUB"); + ("GOTO", KEYWORD "GOTO") (* + ; "GC", KEYWORD "GC" +*); + ("GS", KEYWORD "GOSUB"); + ("GT", KEYWORD "GOTO"); + ("IF", IF); + ("IIF", FUNCTION T.Iif); + ("$IIF", FUNCTION T.Iif'); + ("INCLIB", KEYWORD "INCLIB"); + ("INPUT", FUNCTION T.Input); + ("$INPUT", FUNCTION T.Input'); + ("INSTR", FUNCTION T.Instr); + ("ISNUM", FUNCTION T.Isnum); + ("ISPLAY", FUNCTION T.Isplay); + ("JUMP", KEYWORD "JUMP"); + ("KILLALL", KEYWORD "KILLALL"); + ("KILLOBJ", KEYWORD "KILLOBJ"); + ("KILLQST", KEYWORD "FREELIB"); + ("KILLVAR", KEYWORD "KILLVAR"); + ("LCASE", FUNCTION T.Lcase); + ("$LCASE", FUNCTION T.Lcase') + (* + ; "LCOLOR", KEYWORD "LCOLOR" +*); + ("LEN", FUNCTION T.Len); + ("LET", LET); + ("LOC", FUNCTION T.Loc); + ("MAINTXT", IDENT "MAINTXT"); + ("$MAINTXT", IDENT "MAINTXT"); + ("MAX", FUNCTION T.Max); + ("$MAX", FUNCTION T.Max'); + ("MENU", KEYWORD "MENU"); + ("MID", FUNCTION T.Mid); + ("$MID", FUNCTION T.Mid'); + ("MIN", FUNCTION T.Min); + ("$MIN", FUNCTION T.Min'); + ("MOD", MOD); + ("MSECSCOUNT", FUNCTION T.Msecscount); + ("MSG", KEYWORD "MSG"); + ("NL", KEYWORD "NL"); + ("*NL", KEYWORD "*NL"); + ("NO", NO) (* + ; "NOSAVE", KEYWORD "NOSAVE" +*); + ("OBJ", OBJ); + ("$ONACTSEL", IDENT "$ONACTSEL"); + ("$ONGLOAD", IDENT "$ONGLOAD"); + ("$ONGSAVE", IDENT "$ONGSAVE"); + ("$ONNEWLOC", IDENT "$ONNEWLOC"); + ("$ONOBJADD", IDENT "$ONOBJADD"); + ("$ONOBJDEL", IDENT "$ONOBJDEL"); + ("$ONOBJSEL", IDENT "$ONOBJSEL"); + ("OPENGAME", KEYWORD "OPENGAME"); + ("OPENQST", KEYWORD "OPENQST"); + ("OR", OR); + ("P", KEYWORD "P"); + ("*P", KEYWORD "*P"); + ("PL", KEYWORD "PL"); + ("*PL", KEYWORD "*PL"); + ("PLAY", KEYWORD "PLAY"); + ("QSPVER", FUNCTION T.Qspver); + ("$QSPVER", FUNCTION T.Qspver'); + ("RAND", FUNCTION T.Rand); + ("REFINT", KEYWORD "REFINT"); + ("REPLACE", FUNCTION T.Replace); + ("$REPLACE", FUNCTION T.Replace'); + ("RGB", FUNCTION T.Rgb); + ("RND", FUNCTION T.Rnd); + ("SAVEGAME", KEYWORD "SAVEGAME"); + ("SELACT", FUNCTION T.Selact); + ("$SELACT", IDENT "$SELACT"); + ("SELOBJ", IDENT "SELOBJ"); + ("$SELOBJ", IDENT "$SELOBJ"); + ("SET", SET); + ("SETTIMER", KEYWORD "SETTIMER"); + ("SHOWACTS", KEYWORD "SHOWACTS"); + ("SHOWINPUT", KEYWORD "SHOWINPUT"); + ("SHOWOBJS", KEYWORD "SHOWOBJS"); + ("SHOWSTAT", KEYWORD "SHOWSTAT"); + ("STATTXT", FUNCTION T.Stattxt); + ("$STATTXT", FUNCTION T.Stattxt'); + ("STR", FUNCTION T.Str); + ("$STR", FUNCTION T.Str'); + ("STRCOMP", FUNCTION T.Strcomp); + ("STRFIND", FUNCTION T.Strfind); + ("$STRFIND", FUNCTION T.Strfind'); + ("STRPOS", FUNCTION T.Strpos); + ("TRIM", FUNCTION T.Trim); + ("$TRIM", FUNCTION T.Trim'); + ("UCASE", FUNCTION T.Ucase); + ("$UCASE", FUNCTION T.Ucase'); + ("UNSEL", KEYWORD "UNSELECT"); + ("UNSELECT", KEYWORD "UNSELECT"); + ("USEHTML", IDENT "USEHTML"); + ("USERCOM", IDENT "USERCOM"); + ("$USERCOM", IDENT "$USERCOM"); + ("USER_TEXT", IDENT "USER_TEXT"); + ("$USER_TEXT", IDENT "USER_TEXT"); + ("USRTXT", IDENT "USER_TEXT"); + ("$USRTXT", IDENT "USER_TEXT"); + ("VAL", FUNCTION T.Val); + ("VIEW", KEYWORD "VIEW"); + ("WAIT", KEYWORD "WAIT"); + ("XGOTO", KEYWORD "XGOTO"); + ("XGT", KEYWORD "XGOTO"); + ] diff --git a/lib/interpreter.ml b/lib/interpreter.ml index b41d74e..21c1430 100644 --- a/lib/interpreter.ml +++ b/lib/interpreter.ml @@ -22,26 +22,27 @@ struct module E = MenhirLib.ErrorReports module L = MenhirLib.LexerUtil - let range_message start_pos end_pos code = { code; start_pos; end_pos } + type step = MI.token * Lexing.position * Lexing.position - let get_parse_error default_position env : error = + let range_message (start_pos, end_pos) code = { code; start_pos; end_pos } + + let get_parse_error lexbuf env : error = match MI.stack env with | (lazy Nil) -> - range_message default_position.Lexing.lex_start_p - default_position.Lexing.lex_curr_p InvalidSyntax + let positions = Sedlexing.lexing_positions lexbuf in + range_message positions InvalidSyntax | (lazy (Cons (MI.Element (state, _, start_pos, end_pos), _))) -> - range_message start_pos end_pos (MenhirCode (MI.number state)) + range_message (start_pos, end_pos) (MenhirCode (MI.number state)) let rec _parse : - (Lexing.lexbuf -> MI.token) -> - Lexing.lexbuf -> + (Sedlexing.lexbuf -> unit -> step) -> + Sedlexing.lexbuf -> 'a MI.checkpoint -> ('a, error) Result.t = - fun get_token (lexbuf : Lexing.lexbuf) (checkpoint : 'a MI.checkpoint) -> + fun get_token (lexbuf : Sedlexing.lexbuf) (checkpoint : 'a MI.checkpoint) -> match checkpoint with | MI.InputNeeded _env -> - let token = get_token lexbuf in - let startp = lexbuf.Lexing.lex_start_p and endp = lexbuf.lex_curr_p in + let token, startp, endp = get_token lexbuf () in let checkpoint = MI.offer checkpoint (token, startp, endp) in _parse get_token lexbuf checkpoint | MI.Shifting _ | MI.AboutToReduce _ -> @@ -52,19 +53,18 @@ struct Error err | MI.Accepted v -> Ok v | MI.Rejected -> - let err = - range_message lexbuf.lex_start_p lexbuf.lex_curr_p InvalidSyntax - in + let positions = Sedlexing.lexing_positions lexbuf in + let err = range_message positions InvalidSyntax in Error err type 'a builder = Lexing.position -> 'a MI.checkpoint let of_lexbuf : - Lexing.lexbuf -> - (Lexing.lexbuf -> MI.token) -> + Sedlexing.lexbuf -> + (Sedlexing.lexbuf -> unit -> step) -> 'a builder -> ('a, error) result = fun lexbuf lexer f -> - let init = f lexbuf.lex_curr_p in + let init = f (fst (Sedlexing.lexing_positions lexbuf)) in _parse lexer lexbuf init end diff --git a/lib/lexer.mll b/lib/lexer.mll deleted file mode 100644 index d3d5e5b..0000000 --- a/lib/lexer.mll +++ /dev/null @@ -1,362 +0,0 @@ -{ - open Tokens - module T = Qsp_syntax.T - - exception UnclosedQuote of { content: string ; line : int} - - (* The comment system is terrible. The same symbol can be used for : - - starting a comment - - inequality operation - In order to manage this, I try to identify the context in a very basic way, - using a flag True False for determining the token to send. - *) - module Bucket = Ephemeron.K1.Bucket - type bucket = (Lexing.lexbuf, int) Bucket.t - let is_expression : bucket = Bucket.make () - - let incr_level lexbuf = - match Bucket.find is_expression lexbuf with - | None -> Bucket.add is_expression lexbuf 1 - | Some v -> Bucket.add is_expression lexbuf (v+1) - - let decr_level lexbuf = - match Bucket.find is_expression lexbuf with - | None -> () - | Some v -> - if v > 1 then - Bucket.add is_expression lexbuf (v-1) - else - Bucket.remove is_expression lexbuf - - let keyword_table = Hashtbl.create 53 - let _ = - List.iter (fun (kwd, tok) -> Hashtbl.add keyword_table kwd tok) - [ "ACT", ACT - ; "ADDLIB", KEYWORD "INCLIB" - ; "ADDOBJ", KEYWORD "ADDOBJ" - ; "ADD OBJ", KEYWORD "ADDOBJ" - ; "ADDQST", KEYWORD "INCLIB" - ; "AND", AND - ; "ARRCOMP", FUNCTION T.Arrcomp - ; "ARRPOS", FUNCTION T.Arrpos - ; "ARRSIZE", FUNCTION T.Arrsize -(* - ; "BACKIMAGE", KEYWORD "BACKIMAGE" - ; "$BACKIMAGE", KEYWORD "BACKIMAGE" - ; "BCOLOR", KEYWORD "BCOLOR" -*) - ; "CLA", KEYWORD "CLA" - ; "CLEAR", KEYWORD "CLEAR" - ; "*CLEAR", KEYWORD "*CLEAR" - ; "CLOSE", KEYWORD "CLOSE" - ; "CLOSE ALL", KEYWORD "CLOSEALL" - ; "CLR", KEYWORD "CLEAR" - ; "*CLR", KEYWORD "*CLEAR" - ; "CLS", KEYWORD "CLS" - ; "CMDCLEAR", KEYWORD "CMDCLEAR" - ; "CMDCLR", KEYWORD "CMDCLEAR" - ; "COPYARR", KEYWORD "COPYARR" - ; "COUNTOBJ", FUNCTION T.Countobj - ; "CURACTS", IDENT "CURACTS" - ; "$CURACTS", IDENT "$CURACTS" - ; "CURLOC", IDENT "CURLOC" - ; "$CURLOC", IDENT "$CURLOC" -(* - ; "DEBUG", KEYWORD "DEBUG" -*) - ; "DELACT", KEYWORD "DELACT" - ; "DEL ACT", KEYWORD "DELACT" - ; "DELLIB", KEYWORD "FREELIB" - ; "DELOBJ", KEYWORD "DELOBJ" - ; "DEL OBJ", KEYWORD "DELOBJ" - ; "DESC", FUNCTION T.Desc - ; "$DESC", FUNCTION T.Desc' -(* - ; "DISABLESCROLL", KEYWORD "DISABLESCROLL" - ; "DISABLESUBEX", KEYWORD "DISABLESUBEX" -*) - ; "DYNAMIC", KEYWORD "DYNAMIC" - ; "DYNEVAL", FUNCTION T.Dyneval - ; "$DYNEVAL", FUNCTION T.Dyneval' - ; "ELSE", ELSE - ; "ELSEIF", ELIF - ; "END", END - ; "EXEC", KEYWORD "EXEC" - ; "EXIT", KEYWORD "EXIT" -(* - ; "FCOLOR", KEYWORD "FCOLOR" - ; "$FNAME", KEYWORD "$FNAME" -*) - ; "FREELIB", KEYWORD "FREELIB" -(* - ; "FSIZE", KEYWORD "FSIZE" -*) - ; "FUNC", FUNCTION T.Func - ; "$FUNC", FUNCTION T.Func' - ; "GETOBJ", FUNCTION T.Getobj - ; "$GETOBJ", FUNCTION T.Getobj' - ; "GOSUB", KEYWORD "GOSUB" - ; "GOTO", KEYWORD "GOTO" -(* - ; "GC", KEYWORD "GC" -*) - ; "GS", KEYWORD "GOSUB" - ; "GT", KEYWORD "GOTO" - ; "IF", IF - ; "IIF", FUNCTION T.Iif - ; "$IIF", FUNCTION T.Iif' - ; "INCLIB", KEYWORD "INCLIB" - ; "INPUT", FUNCTION T.Input - ; "$INPUT", FUNCTION T.Input' - ; "INSTR", FUNCTION T.Instr - ; "ISNUM", FUNCTION T.Isnum - ; "ISPLAY", FUNCTION T.Isplay - ; "JUMP", KEYWORD "JUMP" - ; "KILLALL", KEYWORD "KILLALL" - ; "KILLOBJ", KEYWORD "KILLOBJ" - ; "KILLQST", KEYWORD "FREELIB" - ; "KILLVAR", KEYWORD "KILLVAR" - ; "LCASE", FUNCTION T.Lcase - ; "$LCASE", FUNCTION T.Lcase' -(* - ; "LCOLOR", KEYWORD "LCOLOR" -*) - ; "LEN", FUNCTION T.Len - ; "LET", LET - ; "LOC", FUNCTION T.Loc - ; "MAINTXT", IDENT "MAINTXT" - ; "$MAINTXT", IDENT "MAINTXT" - ; "MAX", FUNCTION T.Max - ; "$MAX", FUNCTION T.Max' - ; "MENU", KEYWORD "MENU" - ; "MID", FUNCTION T.Mid - ; "$MID", FUNCTION T.Mid' - ; "MIN", FUNCTION T.Min - ; "$MIN", FUNCTION T.Min' - ; "MOD", MOD - ; "MSECSCOUNT", FUNCTION T.Msecscount - ; "MSG", KEYWORD "MSG" - ; "NL", KEYWORD "NL" - ; "*NL", KEYWORD "*NL" - ; "NO", NO -(* - ; "NOSAVE", KEYWORD "NOSAVE" -*) - ; "OBJ", OBJ - ; "$ONACTSEL", IDENT "$ONACTSEL" - ; "$ONGLOAD", IDENT "$ONGLOAD" - ; "$ONGSAVE", IDENT "$ONGSAVE" - ; "$ONNEWLOC", IDENT "$ONNEWLOC" - ; "$ONOBJADD", IDENT "$ONOBJADD" - ; "$ONOBJDEL", IDENT "$ONOBJDEL" - ; "$ONOBJSEL", IDENT "$ONOBJSEL" - ; "OPENGAME", KEYWORD "OPENGAME" - ; "OPENQST", KEYWORD "OPENQST" - ; "OR", OR - ; "P", KEYWORD "P" - ; "*P", KEYWORD "*P" - ; "PL", KEYWORD "PL" - ; "*PL", KEYWORD "*PL" - ; "PLAY", KEYWORD "PLAY" - ; "QSPVER", FUNCTION T.Qspver - ; "$QSPVER", FUNCTION T.Qspver' - ; "RAND", FUNCTION T.Rand - ; "REFINT", KEYWORD "REFINT" - ; "REPLACE", FUNCTION T.Replace - ; "$REPLACE", FUNCTION T.Replace' - ; "RGB", FUNCTION T.Rgb - ; "RND", FUNCTION T.Rnd - ; "SAVEGAME", KEYWORD "SAVEGAME" - ; "SELACT", FUNCTION T.Selact - ; "$SELACT", IDENT "$SELACT" - ; "SELOBJ", IDENT "SELOBJ" - ; "$SELOBJ", IDENT "$SELOBJ" - ; "SET", SET - ; "SETTIMER", KEYWORD "SETTIMER" - ; "SHOWACTS", KEYWORD "SHOWACTS" - ; "SHOWINPUT", KEYWORD "SHOWINPUT" - ; "SHOWOBJS", KEYWORD "SHOWOBJS" - ; "SHOWSTAT", KEYWORD "SHOWSTAT" - ; "STATTXT", FUNCTION T.Stattxt - ; "$STATTXT", FUNCTION T.Stattxt' - ; "STR", FUNCTION T.Str - ; "$STR", FUNCTION T.Str' - ; "STRCOMP", FUNCTION T.Strcomp - ; "STRFIND", FUNCTION T.Strfind - ; "$STRFIND", FUNCTION T.Strfind' - ; "STRPOS", FUNCTION T.Strpos - ; "TRIM", FUNCTION T.Trim - ; "$TRIM", FUNCTION T.Trim' - ; "UCASE", FUNCTION T.Ucase - ; "$UCASE", FUNCTION T.Ucase' - ; "UNSEL", KEYWORD "UNSELECT" - ; "UNSELECT", KEYWORD "UNSELECT" - ; "USEHTML", IDENT "USEHTML" - ; "USERCOM", IDENT "USERCOM" - ; "$USERCOM", IDENT "$USERCOM" - ; "USER_TEXT", IDENT "USER_TEXT" - ; "$USER_TEXT", IDENT "USER_TEXT" - ; "USRTXT", IDENT "USER_TEXT" - ; "$USRTXT", IDENT "USER_TEXT" - ; "VAL", FUNCTION T.Val - ; "VIEW", KEYWORD "VIEW" - ; "WAIT", KEYWORD "WAIT" - ; "XGOTO", KEYWORD "XGOTO" - ; "XGT", KEYWORD "XGOTO" - ] - - let ident lexbuf id = - let id = String.uppercase_ascii id in - try - let value = Hashtbl.find keyword_table id in - let _ = match value with - | IF | ELIF -> incr_level lexbuf - | _ -> () in - value - - with Not_found -> IDENT id - - let wait_balance rule lexbuf = - try[@warning "-52"] - rule (Buffer.create 17) lexbuf - with - Failure "lexing: empty token" -> - let line = lexbuf.Lexing.lex_curr_p.pos_lnum - and content = Bytes.to_string lexbuf.Lexing.lex_buffer in - raise (UnclosedQuote {line; content}) - [@warning "+52"] -} - -let space = [ ' ' '\t' ] -let coma = ',' -let letters = [^ '!' ':' '&' '=' '<' '>' '+' '-' '*' '/' ',' '"' '\'' '(' ')' '[' ']' ' ' '\t' '\n' '\r' '{' '}'] -let digit = [ '0'-'9' ] -let eol = [ '\r' '\n' ] -let spaces = space+ -let ident = (letters # digit) letters* - -rule token = parse - -| '#' spaces* (('!' | '$' | '#' | '^')+)? (ident as loc) { - Lexing.set_filename lexbuf loc; - LOCATION_START loc} -| (digit+) as l { INTEGER l} -| '+' { PLUS } -| '-' { MINUS } -| "+=" { INCR } -| "-=" { DECR } -| "*=" { MULT_EQUAL } -| '/' { DIV } -| '*' { STAR } -| ':' { - (* We are leaving the block, the comment will be handled again *) - decr_level lexbuf; - COLUMN } -| '-' '-'+ [ ^ '\r' '\n']* { LOCATION_END } -| '[' { L_BRACKET } -| ']' { R_BRACKET } -| '(' { - incr_level lexbuf; - L_PAREN } -| ')' { - decr_level lexbuf; - R_PAREN } -| '<' { LT } -| '>' { GT } -| coma { COMA } -| eof { - Bucket.remove is_expression lexbuf; - EOF } -| '=' { - incr_level lexbuf; - EQUAL } -| ident as l { ident lexbuf l} -| eol { - Bucket.add is_expression lexbuf 0; - Lexing.new_line lexbuf; EOL } -| '&' { - Bucket.add is_expression lexbuf 0; - AMPERSAND } -| '!' { - match Bucket.find is_expression lexbuf with - | Some i when i <> 0 -> EXCLAMATION - | _ -> skip_comment lexbuf -} -| spaces { token lexbuf } -| '\'' { LITERAL (wait_balance read_quoted_string lexbuf) } -| '"' { LITERAL (wait_balance read_dquoted_string lexbuf) } -| '{' { LITERAL (wait_balance (read_long_string 0) lexbuf) } - -and skip_comment = parse -| [^ '\'' '"' '{' '\r' '\n' ] { skip_comment lexbuf } -| '{' { - let _ = wait_balance (read_long_string 0) lexbuf in - skip_comment lexbuf } -| '\'' { - let _ = wait_balance read_quoted_string lexbuf in - skip_comment lexbuf -} -| '"' { - let _ = wait_balance read_dquoted_string lexbuf in - skip_comment lexbuf -} -| eol { - (* Ugly hack used in order to put the eol in the front of the next - parsing. *) - lexbuf.lex_curr_pos <- lexbuf.lex_curr_pos - 1; - (*Lexing.new_line lexbuf;*) - COMMENT -} - -(* Read the content until we got another one quote *) -and read_quoted_string buf = parse - | "''" - { Buffer.add_char buf '\''; - read_quoted_string buf lexbuf - } - | [^ '\'' '\n' '\r']+ - { Buffer.add_string buf (Lexing.lexeme lexbuf); - read_quoted_string buf lexbuf - } - | '\'' - { (Buffer.contents buf) - } - | ['\n' '\r'] - { Buffer.add_string buf (Lexing.lexeme lexbuf); - Lexing.new_line lexbuf ; - read_quoted_string buf lexbuf - } - -and read_dquoted_string buf = parse - | [^ '"' ]+ - { Buffer.add_string buf (Lexing.lexeme lexbuf); - read_dquoted_string buf lexbuf - } - | "\"\"" - { Buffer.add_char buf '"'; - read_dquoted_string buf lexbuf - } - | '"' - { (Buffer.contents buf) - } - -and read_long_string level buf = parse - | [^ '{' '}' '\r' '\n' ]+ - { Buffer.add_string buf (Lexing.lexeme lexbuf); - read_long_string level buf lexbuf - } - | '{' - { Buffer.add_string buf (Lexing.lexeme lexbuf); - read_long_string (level + 1) buf lexbuf } - | '}' - { match level with - | 0 -> (Buffer.contents buf) - | _ -> Buffer.add_string buf (Lexing.lexeme lexbuf); - read_long_string (level - 1) buf lexbuf - } -| eol -{ - Lexing.new_line lexbuf; - Buffer.add_string buf (Lexing.lexeme lexbuf); - read_long_string level buf lexbuf -} diff --git a/test/dune b/test/dune index 2f4e803..74546a0 100644 --- a/test/dune +++ b/test/dune @@ -2,6 +2,7 @@ (name qsp_parser_test) (libraries alcotest + sedlex qparser qsp_syntax fmt diff --git a/test/qsp_parser_test.ml b/test/qsp_parser_test.ml index b833fe5..6bf249d 100644 --- a/test/qsp_parser_test.ml +++ b/test/qsp_parser_test.ml @@ -7,10 +7,16 @@ let _position = (Lexing.dummy_pos, Lexing.dummy_pos) type 'a location = 'a * 'a Ast.statement list [@@deriving eq, show] +(** Run the parser with the given expression and return the result *) let parse : string -> T.pos location = fun content -> - let lexing = Lexing.from_string content in - Parser.main Qparser.Lexer.token lexing + let lexing = Sedlexing.Latin1.from_string content in + + match Qparser.Analyzer.parse (module Qsp_syntax.Tree) lexing with + | Ok e -> e + | Error e -> + let msg = Format.asprintf "%a" Qparser.Analyzer.format_error e in + raise (Failure msg) let location : T.pos location Alcotest.testable = let equal = equal_location (fun _ _ -> true) in -- cgit v1.2.3