aboutsummaryrefslogtreecommitdiff
path: root/lib/lexer.mll
diff options
context:
space:
mode:
Diffstat (limited to 'lib/lexer.mll')
-rw-r--r--lib/lexer.mll361
1 files changed, 361 insertions, 0 deletions
diff --git a/lib/lexer.mll b/lib/lexer.mll
new file mode 100644
index 0000000..4796e62
--- /dev/null
+++ b/lib/lexer.mll
@@ -0,0 +1,361 @@
+{
+ open Tokens
+ module T = Qsp_syntax.T
+
+ exception UnclosedQuote of { content: string ; line : int}
+
+ (* The comment system is terrible. The same symbol can be used for :
+ - starting a comment
+ - inequality operation
+ In order to manage this, I try to identify the context in a very basic way,
+ using a flag True False for determining the token to send.
+ *)
+ module Bucket = Ephemeron.K1.Bucket
+ type bucket = (Lexing.lexbuf, int) Bucket.t
+ let is_expression : bucket = Bucket.make ()
+
+ let incr_level lexbuf =
+ match Bucket.find is_expression lexbuf with
+ | None -> Bucket.add is_expression lexbuf 1
+ | Some v -> Bucket.add is_expression lexbuf (v+1)
+
+ let decr_level lexbuf =
+ match Bucket.find is_expression lexbuf with
+ | None -> ()
+ | Some v ->
+ if v > 1 then
+ Bucket.add is_expression lexbuf (v-1)
+ else
+ Bucket.remove is_expression lexbuf
+
+ let keyword_table = Hashtbl.create 53
+ let _ =
+ List.iter (fun (kwd, tok) -> Hashtbl.add keyword_table kwd tok)
+ [ "ACT", ACT
+ ; "ADDLIB", KEYWORD "INCLIB"
+ ; "ADDOBJ", KEYWORD "ADDOBJ"
+ ; "ADD OBJ", KEYWORD "ADDOBJ"
+ ; "ADDQST", KEYWORD "INCLIB"
+ ; "AND", AND
+ ; "ARRCOMP", FUNCTION T.Arrcomp
+ ; "ARRPOS", FUNCTION T.Arrpos
+ ; "ARRSIZE", FUNCTION T.Arrsize
+(*
+ ; "BACKIMAGE", KEYWORD "BACKIMAGE"
+ ; "$BACKIMAGE", KEYWORD "BACKIMAGE"
+ ; "BCOLOR", KEYWORD "BCOLOR"
+*)
+ ; "CLA", KEYWORD "CLA"
+ ; "CLEAR", KEYWORD "CLEAR"
+ ; "*CLEAR", KEYWORD "*CLEAR"
+ ; "CLOSE", KEYWORD "CLOSE"
+ ; "CLOSE ALL", KEYWORD "CLOSEALL"
+ ; "CLR", KEYWORD "CLEAR"
+ ; "*CLR", KEYWORD "*CLEAR"
+ ; "CLS", KEYWORD "CLS"
+ ; "CMDCLEAR", KEYWORD "CMDCLEAR"
+ ; "CMDCLR", KEYWORD "CMDCLEAR"
+ ; "COPYARR", KEYWORD "COPYARR"
+ ; "COUNTOBJ", FUNCTION T.Countobj
+ ; "CURACTS", IDENT "CURACTS"
+ ; "$CURACTS", IDENT "CURACTS"
+ ; "CURLOC", IDENT "CURLOC"
+ ; "$CURLOC", IDENT "CURLOC"
+(*
+ ; "DEBUG", KEYWORD "DEBUG"
+*)
+ ; "DELACT", KEYWORD "DELACT"
+ ; "DEL ACT", KEYWORD "DELACT"
+ ; "DELLIB", KEYWORD "FREELIB"
+ ; "DELOBJ", KEYWORD "DELOBJ"
+ ; "DEL OBJ", KEYWORD "DELOBJ"
+ ; "DESC", FUNCTION T.Desc
+ ; "$DESC", FUNCTION T.Desc'
+(*
+ ; "DISABLESCROLL", KEYWORD "DISABLESCROLL"
+ ; "DISABLESUBEX", KEYWORD "DISABLESUBEX"
+*)
+ ; "DYNAMIC", KEYWORD "DYNAMIC"
+ ; "DYNEVAL", FUNCTION T.Dyneval
+ ; "$DYNEVAL", FUNCTION T.Dyneval'
+ ; "ELSE", ELSE
+ ; "ELSEIF", ELIF
+ ; "END", END
+ ; "EXEC", KEYWORD "EXEC"
+ ; "EXIT", KEYWORD "EXIT"
+(*
+ ; "FCOLOR", KEYWORD "FCOLOR"
+ ; "$FNAME", KEYWORD "$FNAME"
+*)
+ ; "FREELIB", KEYWORD "FREELIB"
+(*
+ ; "FSIZE", KEYWORD "FSIZE"
+*)
+ ; "FUNC", FUNCTION T.Func
+ ; "$FUNC", FUNCTION T.Func'
+ ; "GETOBJ", FUNCTION T.Getobj
+ ; "$GETOBJ", FUNCTION T.Getobj'
+ ; "GOSUB", KEYWORD "GOSUB"
+ ; "GOTO", KEYWORD "GOTO"
+(*
+ ; "GC", KEYWORD "GC"
+*)
+ ; "GS", KEYWORD "GOSUB"
+ ; "GT", KEYWORD "GOTO"
+ ; "IF", IF
+ ; "IIF", FUNCTION T.Iif
+ ; "$IIF", FUNCTION T.Iif'
+ ; "INCLIB", KEYWORD "INCLIB"
+ ; "INPUT", FUNCTION T.Input
+ ; "$INPUT", FUNCTION T.Input'
+ ; "INSTR", FUNCTION T.Instr
+ ; "ISNUM", FUNCTION T.Isnum
+ ; "ISPLAY", FUNCTION T.Isplay
+ ; "JUMP", KEYWORD "JUMP"
+ ; "KILLALL", KEYWORD "KILLALL"
+ ; "KILLOBJ", KEYWORD "KILLOBJ"
+ ; "KILLQST", KEYWORD "FREELIB"
+ ; "KILLVAR", KEYWORD "KILLVAR"
+ ; "LCASE", FUNCTION T.Lcase
+ ; "$LCASE", FUNCTION T.Lcase'
+(*
+ ; "LCOLOR", KEYWORD "LCOLOR"
+*)
+ ; "LEN", FUNCTION T.Len
+ ; "LET", LET
+ ; "LOC", LOC
+ ; "MAINTXT", IDENT "MAINTXT"
+ ; "$MAINTXT", IDENT "MAINTXT"
+ ; "MAX", FUNCTION T.Max
+ ; "$MAX", FUNCTION T.Max'
+ ; "MENU", KEYWORD "MENU"
+ ; "MID", FUNCTION T.Mid
+ ; "$MID", FUNCTION T.Mid'
+ ; "MIN", FUNCTION T.Min
+ ; "$MIN", FUNCTION T.Min'
+ ; "MOD", MOD
+ ; "MSECSCOUNT", FUNCTION T.Msecscount
+ ; "MSG", KEYWORD "MSG"
+ ; "NL", KEYWORD "NL"
+ ; "*NL", KEYWORD "*NL"
+ ; "NO", NO
+(*
+ ; "NOSAVE", KEYWORD "NOSAVE"
+*)
+ ; "OBJ", OBJ
+ ; "$ONACTSEL", IDENT "$ONACTSEL"
+ ; "$ONGLOAD", IDENT "$ONGLOAD"
+ ; "$ONGSAVE", IDENT "$ONGSAVE"
+ ; "$ONNEWLOC", IDENT "$ONNEWLOC"
+ ; "$ONOBJADD", IDENT "$ONOBJADD"
+ ; "$ONOBJDEL", IDENT "$ONOBJDEL"
+ ; "$ONOBJSEL", IDENT "$ONOBJSEL"
+ ; "OPENGAME", KEYWORD "OPENGAME"
+ ; "OPENQST", KEYWORD "OPENQST"
+ ; "OR", OR
+ ; "P", KEYWORD "P"
+ ; "*P", KEYWORD "*P"
+ ; "PL", KEYWORD "PL"
+ ; "*PL", KEYWORD "*PL"
+ ; "PLAY", KEYWORD "PLAY"
+ ; "QSPVER", FUNCTION T.Qspver
+ ; "$QSPVER", FUNCTION T.Qspver'
+ ; "RAND", FUNCTION T.Rand
+ ; "REFINT", KEYWORD "REFINT"
+ ; "REPLACE", FUNCTION T.Replace
+ ; "$REPLACE", FUNCTION T.Replace'
+ ; "RGB", FUNCTION T.Rgb
+ ; "RND", FUNCTION T.Rnd
+ ; "SAVEGAME", KEYWORD "SAVEGAME"
+ ; "SELACT", FUNCTION T.Selact
+ ; "$SELACT", IDENT "SELACT"
+ ; "SELOBJ", IDENT "SELOBJ"
+ ; "$SELOBJ", IDENT "SELOBJ"
+ ; "SET", SET
+ ; "SETTIMER", KEYWORD "SETTIMER"
+ ; "SHOWACTS", KEYWORD "SHOWACTS"
+ ; "SHOWINPUT", KEYWORD "SHOWINPUT"
+ ; "SHOWOBJS", KEYWORD "SHOWOBJS"
+ ; "SHOWSTAT", KEYWORD "SHOWSTAT"
+ ; "STATTXT", FUNCTION T.Stattxt
+ ; "$STATTXT", FUNCTION T.Stattxt'
+ ; "STR", FUNCTION T.Str
+ ; "$STR", FUNCTION T.Str'
+ ; "STRCOMP", FUNCTION T.Strcomp
+ ; "STRFIND", FUNCTION T.Strfind
+ ; "$STRFIND", FUNCTION T.Strfind'
+ ; "STRPOS", FUNCTION T.Strpos
+ ; "TRIM", FUNCTION T.Trim
+ ; "$TRIM", FUNCTION T.Trim'
+ ; "UCASE", FUNCTION T.Ucase
+ ; "$UCASE", FUNCTION T.Ucase'
+ ; "UNSEL", KEYWORD "UNSELECT"
+ ; "UNSELECT", KEYWORD "UNSELECT"
+ ; "USEHTML", IDENT "USEHTML"
+ ; "USERCOM", IDENT "USERCOM"
+ ; "$USERCOM", IDENT "USERCOM"
+ ; "USER_TEXT", IDENT "USER_TEXT"
+ ; "$USER_TEXT", IDENT "USER_TEXT"
+ ; "USRTXT", IDENT "USER_TEXT"
+ ; "$USRTXT", IDENT "USER_TEXT"
+ ; "VAL", FUNCTION T.Val
+ ; "VIEW", KEYWORD "VIEW"
+ ; "WAIT", KEYWORD "WAIT"
+ ; "XGOTO", KEYWORD "XGOTO"
+ ; "XGT", KEYWORD "XGOTO"
+ ]
+
+ let ident lexbuf id =
+ let id = String.uppercase_ascii id in
+ try
+ let value = Hashtbl.find keyword_table id in
+ let _ = match value with
+ | IF | ELIF -> incr_level lexbuf
+ | _ -> () in
+ value
+
+ with Not_found -> IDENT id
+
+ let wait_balance rule lexbuf =
+ try[@warning "-52"]
+ rule (Buffer.create 17) lexbuf
+ with
+ Failure "lexing: empty token" ->
+ let line = lexbuf.Lexing.lex_curr_p.pos_lnum
+ and content = Bytes.to_string lexbuf.Lexing.lex_buffer in
+ raise (UnclosedQuote {line; content})
+ [@warning "+52"]
+}
+
+let space = [ ' ' '\t' ]
+let coma = ','
+let letters = [^ '!' ':' '&' '=' '<' '>' '+' '-' '*' '/' ',' '"' '\'' '(' ')' '[' ']' ' ' '\t' '\n' '\r' '{' '}']
+let digit = [ '0'-'9' ]
+let eol = [ '\r' '\n' ]
+let spaces = space+
+let ident = (letters # digit) letters*
+
+rule token = parse
+
+| '#' spaces* (('!' | '$' | '#' | '^')+)? (ident as loc) {
+ Lexing.set_filename lexbuf loc;
+ LOCATION_START loc}
+| (digit+) as l { INTEGER l}
+| '+' { PLUS }
+| '-' { MINUS }
+| "+=" { INCR }
+| "-=" { DECR }
+| '/' { DIV }
+| '*' { STAR }
+| ':' {
+ (* We are leaving the block, the comment will be handled again *)
+ decr_level lexbuf;
+ COLUMN }
+| '-' '-'+ [ ^ '\r' '\n']* { LOCATION_END }
+| '[' { L_BRACKET }
+| ']' { R_BRACKET }
+| '(' {
+ incr_level lexbuf;
+ L_PAREN }
+| ')' {
+ decr_level lexbuf;
+ R_PAREN }
+| '<' { LT }
+| '>' { GT }
+| coma { COMA }
+| eof {
+ Bucket.remove is_expression lexbuf;
+ EOF }
+| '=' {
+ incr_level lexbuf;
+ EQUAL }
+| ident as l { ident lexbuf l}
+| eol {
+ Bucket.add is_expression lexbuf 0;
+ Lexing.new_line lexbuf; EOL }
+| '&' {
+ Bucket.add is_expression lexbuf 0;
+ AMPERSAND }
+| '!' {
+ match Bucket.find is_expression lexbuf with
+ | Some i when i <> 0 -> EXCLAMATION
+ | _ -> skip_comment lexbuf
+}
+| spaces { token lexbuf }
+| '\'' { LITERAL (wait_balance read_quoted_string lexbuf) }
+| '"' { LITERAL (wait_balance read_dquoted_string lexbuf) }
+| '{' { LITERAL (wait_balance (read_long_string 0) lexbuf) }
+
+and skip_comment = parse
+| [^ '\'' '"' '{' '\r' '\n' ] { skip_comment lexbuf }
+| '{' {
+ let _ = wait_balance (read_long_string 0) lexbuf in
+ skip_comment lexbuf }
+| '\'' {
+ let _ = wait_balance read_quoted_string lexbuf in
+ skip_comment lexbuf
+}
+| '"' {
+ let _ = wait_balance read_dquoted_string lexbuf in
+ skip_comment lexbuf
+}
+| eol {
+ (* Ugly hack used in order to put the eol in the front of the next
+ parsing. *)
+ lexbuf.lex_curr_pos <- lexbuf.lex_curr_pos - 1;
+ (*Lexing.new_line lexbuf;*)
+ COMMENT
+}
+
+(* Read the content until we got another one quote *)
+and read_quoted_string buf = parse
+ | "''"
+ { Buffer.add_char buf '\'';
+ read_quoted_string buf lexbuf
+ }
+ | [^ '\'' '\n' '\r']+
+ { Buffer.add_string buf (Lexing.lexeme lexbuf);
+ read_quoted_string buf lexbuf
+ }
+ | '\''
+ { (Buffer.contents buf)
+ }
+ | ['\n' '\r']
+ { Buffer.add_string buf (Lexing.lexeme lexbuf);
+ Lexing.new_line lexbuf ;
+ read_quoted_string buf lexbuf
+ }
+
+and read_dquoted_string buf = parse
+ | [^ '"' ]+
+ { Buffer.add_string buf (Lexing.lexeme lexbuf);
+ read_dquoted_string buf lexbuf
+ }
+ | "\"\""
+ { Buffer.add_char buf '"';
+ read_dquoted_string buf lexbuf
+ }
+ | '"'
+ { (Buffer.contents buf)
+ }
+
+and read_long_string level buf = parse
+ | [^ '{' '}' '\r' '\n' ]+
+ { Buffer.add_string buf (Lexing.lexeme lexbuf);
+ read_long_string level buf lexbuf
+ }
+ | '{'
+ { Buffer.add_string buf (Lexing.lexeme lexbuf);
+ read_long_string (level + 1) buf lexbuf }
+ | '}'
+ { match level with
+ | 0 -> (Buffer.contents buf)
+ | _ -> Buffer.add_string buf (Lexing.lexeme lexbuf);
+ read_long_string (level - 1) buf lexbuf
+ }
+| eol
+{
+ Lexing.new_line lexbuf;
+ Buffer.add_string buf (Lexing.lexeme lexbuf);
+ read_long_string level buf lexbuf
+}