diff options
Diffstat (limited to 'lib/lexer.mll')
-rw-r--r-- | lib/lexer.mll | 361 |
1 files changed, 361 insertions, 0 deletions
diff --git a/lib/lexer.mll b/lib/lexer.mll new file mode 100644 index 0000000..4796e62 --- /dev/null +++ b/lib/lexer.mll @@ -0,0 +1,361 @@ +{ + open Tokens + module T = Qsp_syntax.T + + exception UnclosedQuote of { content: string ; line : int} + + (* The comment system is terrible. The same symbol can be used for : + - starting a comment + - inequality operation + In order to manage this, I try to identify the context in a very basic way, + using a flag True False for determining the token to send. + *) + module Bucket = Ephemeron.K1.Bucket + type bucket = (Lexing.lexbuf, int) Bucket.t + let is_expression : bucket = Bucket.make () + + let incr_level lexbuf = + match Bucket.find is_expression lexbuf with + | None -> Bucket.add is_expression lexbuf 1 + | Some v -> Bucket.add is_expression lexbuf (v+1) + + let decr_level lexbuf = + match Bucket.find is_expression lexbuf with + | None -> () + | Some v -> + if v > 1 then + Bucket.add is_expression lexbuf (v-1) + else + Bucket.remove is_expression lexbuf + + let keyword_table = Hashtbl.create 53 + let _ = + List.iter (fun (kwd, tok) -> Hashtbl.add keyword_table kwd tok) + [ "ACT", ACT + ; "ADDLIB", KEYWORD "INCLIB" + ; "ADDOBJ", KEYWORD "ADDOBJ" + ; "ADD OBJ", KEYWORD "ADDOBJ" + ; "ADDQST", KEYWORD "INCLIB" + ; "AND", AND + ; "ARRCOMP", FUNCTION T.Arrcomp + ; "ARRPOS", FUNCTION T.Arrpos + ; "ARRSIZE", FUNCTION T.Arrsize +(* + ; "BACKIMAGE", KEYWORD "BACKIMAGE" + ; "$BACKIMAGE", KEYWORD "BACKIMAGE" + ; "BCOLOR", KEYWORD "BCOLOR" +*) + ; "CLA", KEYWORD "CLA" + ; "CLEAR", KEYWORD "CLEAR" + ; "*CLEAR", KEYWORD "*CLEAR" + ; "CLOSE", KEYWORD "CLOSE" + ; "CLOSE ALL", KEYWORD "CLOSEALL" + ; "CLR", KEYWORD "CLEAR" + ; "*CLR", KEYWORD "*CLEAR" + ; "CLS", KEYWORD "CLS" + ; "CMDCLEAR", KEYWORD "CMDCLEAR" + ; "CMDCLR", KEYWORD "CMDCLEAR" + ; "COPYARR", KEYWORD "COPYARR" + ; "COUNTOBJ", FUNCTION T.Countobj + ; "CURACTS", IDENT "CURACTS" + ; "$CURACTS", IDENT "CURACTS" + ; "CURLOC", IDENT "CURLOC" + ; "$CURLOC", IDENT "CURLOC" +(* + ; "DEBUG", KEYWORD "DEBUG" +*) + ; "DELACT", KEYWORD "DELACT" + ; "DEL ACT", KEYWORD "DELACT" + ; "DELLIB", KEYWORD "FREELIB" + ; "DELOBJ", KEYWORD "DELOBJ" + ; "DEL OBJ", KEYWORD "DELOBJ" + ; "DESC", FUNCTION T.Desc + ; "$DESC", FUNCTION T.Desc' +(* + ; "DISABLESCROLL", KEYWORD "DISABLESCROLL" + ; "DISABLESUBEX", KEYWORD "DISABLESUBEX" +*) + ; "DYNAMIC", KEYWORD "DYNAMIC" + ; "DYNEVAL", FUNCTION T.Dyneval + ; "$DYNEVAL", FUNCTION T.Dyneval' + ; "ELSE", ELSE + ; "ELSEIF", ELIF + ; "END", END + ; "EXEC", KEYWORD "EXEC" + ; "EXIT", KEYWORD "EXIT" +(* + ; "FCOLOR", KEYWORD "FCOLOR" + ; "$FNAME", KEYWORD "$FNAME" +*) + ; "FREELIB", KEYWORD "FREELIB" +(* + ; "FSIZE", KEYWORD "FSIZE" +*) + ; "FUNC", FUNCTION T.Func + ; "$FUNC", FUNCTION T.Func' + ; "GETOBJ", FUNCTION T.Getobj + ; "$GETOBJ", FUNCTION T.Getobj' + ; "GOSUB", KEYWORD "GOSUB" + ; "GOTO", KEYWORD "GOTO" +(* + ; "GC", KEYWORD "GC" +*) + ; "GS", KEYWORD "GOSUB" + ; "GT", KEYWORD "GOTO" + ; "IF", IF + ; "IIF", FUNCTION T.Iif + ; "$IIF", FUNCTION T.Iif' + ; "INCLIB", KEYWORD "INCLIB" + ; "INPUT", FUNCTION T.Input + ; "$INPUT", FUNCTION T.Input' + ; "INSTR", FUNCTION T.Instr + ; "ISNUM", FUNCTION T.Isnum + ; "ISPLAY", FUNCTION T.Isplay + ; "JUMP", KEYWORD "JUMP" + ; "KILLALL", KEYWORD "KILLALL" + ; "KILLOBJ", KEYWORD "KILLOBJ" + ; "KILLQST", KEYWORD "FREELIB" + ; "KILLVAR", KEYWORD "KILLVAR" + ; "LCASE", FUNCTION T.Lcase + ; "$LCASE", FUNCTION T.Lcase' +(* + ; "LCOLOR", KEYWORD "LCOLOR" +*) + ; "LEN", FUNCTION T.Len + ; "LET", LET + ; "LOC", LOC + ; "MAINTXT", IDENT "MAINTXT" + ; "$MAINTXT", IDENT "MAINTXT" + ; "MAX", FUNCTION T.Max + ; "$MAX", FUNCTION T.Max' + ; "MENU", KEYWORD "MENU" + ; "MID", FUNCTION T.Mid + ; "$MID", FUNCTION T.Mid' + ; "MIN", FUNCTION T.Min + ; "$MIN", FUNCTION T.Min' + ; "MOD", MOD + ; "MSECSCOUNT", FUNCTION T.Msecscount + ; "MSG", KEYWORD "MSG" + ; "NL", KEYWORD "NL" + ; "*NL", KEYWORD "*NL" + ; "NO", NO +(* + ; "NOSAVE", KEYWORD "NOSAVE" +*) + ; "OBJ", OBJ + ; "$ONACTSEL", IDENT "$ONACTSEL" + ; "$ONGLOAD", IDENT "$ONGLOAD" + ; "$ONGSAVE", IDENT "$ONGSAVE" + ; "$ONNEWLOC", IDENT "$ONNEWLOC" + ; "$ONOBJADD", IDENT "$ONOBJADD" + ; "$ONOBJDEL", IDENT "$ONOBJDEL" + ; "$ONOBJSEL", IDENT "$ONOBJSEL" + ; "OPENGAME", KEYWORD "OPENGAME" + ; "OPENQST", KEYWORD "OPENQST" + ; "OR", OR + ; "P", KEYWORD "P" + ; "*P", KEYWORD "*P" + ; "PL", KEYWORD "PL" + ; "*PL", KEYWORD "*PL" + ; "PLAY", KEYWORD "PLAY" + ; "QSPVER", FUNCTION T.Qspver + ; "$QSPVER", FUNCTION T.Qspver' + ; "RAND", FUNCTION T.Rand + ; "REFINT", KEYWORD "REFINT" + ; "REPLACE", FUNCTION T.Replace + ; "$REPLACE", FUNCTION T.Replace' + ; "RGB", FUNCTION T.Rgb + ; "RND", FUNCTION T.Rnd + ; "SAVEGAME", KEYWORD "SAVEGAME" + ; "SELACT", FUNCTION T.Selact + ; "$SELACT", IDENT "SELACT" + ; "SELOBJ", IDENT "SELOBJ" + ; "$SELOBJ", IDENT "SELOBJ" + ; "SET", SET + ; "SETTIMER", KEYWORD "SETTIMER" + ; "SHOWACTS", KEYWORD "SHOWACTS" + ; "SHOWINPUT", KEYWORD "SHOWINPUT" + ; "SHOWOBJS", KEYWORD "SHOWOBJS" + ; "SHOWSTAT", KEYWORD "SHOWSTAT" + ; "STATTXT", FUNCTION T.Stattxt + ; "$STATTXT", FUNCTION T.Stattxt' + ; "STR", FUNCTION T.Str + ; "$STR", FUNCTION T.Str' + ; "STRCOMP", FUNCTION T.Strcomp + ; "STRFIND", FUNCTION T.Strfind + ; "$STRFIND", FUNCTION T.Strfind' + ; "STRPOS", FUNCTION T.Strpos + ; "TRIM", FUNCTION T.Trim + ; "$TRIM", FUNCTION T.Trim' + ; "UCASE", FUNCTION T.Ucase + ; "$UCASE", FUNCTION T.Ucase' + ; "UNSEL", KEYWORD "UNSELECT" + ; "UNSELECT", KEYWORD "UNSELECT" + ; "USEHTML", IDENT "USEHTML" + ; "USERCOM", IDENT "USERCOM" + ; "$USERCOM", IDENT "USERCOM" + ; "USER_TEXT", IDENT "USER_TEXT" + ; "$USER_TEXT", IDENT "USER_TEXT" + ; "USRTXT", IDENT "USER_TEXT" + ; "$USRTXT", IDENT "USER_TEXT" + ; "VAL", FUNCTION T.Val + ; "VIEW", KEYWORD "VIEW" + ; "WAIT", KEYWORD "WAIT" + ; "XGOTO", KEYWORD "XGOTO" + ; "XGT", KEYWORD "XGOTO" + ] + + let ident lexbuf id = + let id = String.uppercase_ascii id in + try + let value = Hashtbl.find keyword_table id in + let _ = match value with + | IF | ELIF -> incr_level lexbuf + | _ -> () in + value + + with Not_found -> IDENT id + + let wait_balance rule lexbuf = + try[@warning "-52"] + rule (Buffer.create 17) lexbuf + with + Failure "lexing: empty token" -> + let line = lexbuf.Lexing.lex_curr_p.pos_lnum + and content = Bytes.to_string lexbuf.Lexing.lex_buffer in + raise (UnclosedQuote {line; content}) + [@warning "+52"] +} + +let space = [ ' ' '\t' ] +let coma = ',' +let letters = [^ '!' ':' '&' '=' '<' '>' '+' '-' '*' '/' ',' '"' '\'' '(' ')' '[' ']' ' ' '\t' '\n' '\r' '{' '}'] +let digit = [ '0'-'9' ] +let eol = [ '\r' '\n' ] +let spaces = space+ +let ident = (letters # digit) letters* + +rule token = parse + +| '#' spaces* (('!' | '$' | '#' | '^')+)? (ident as loc) { + Lexing.set_filename lexbuf loc; + LOCATION_START loc} +| (digit+) as l { INTEGER l} +| '+' { PLUS } +| '-' { MINUS } +| "+=" { INCR } +| "-=" { DECR } +| '/' { DIV } +| '*' { STAR } +| ':' { + (* We are leaving the block, the comment will be handled again *) + decr_level lexbuf; + COLUMN } +| '-' '-'+ [ ^ '\r' '\n']* { LOCATION_END } +| '[' { L_BRACKET } +| ']' { R_BRACKET } +| '(' { + incr_level lexbuf; + L_PAREN } +| ')' { + decr_level lexbuf; + R_PAREN } +| '<' { LT } +| '>' { GT } +| coma { COMA } +| eof { + Bucket.remove is_expression lexbuf; + EOF } +| '=' { + incr_level lexbuf; + EQUAL } +| ident as l { ident lexbuf l} +| eol { + Bucket.add is_expression lexbuf 0; + Lexing.new_line lexbuf; EOL } +| '&' { + Bucket.add is_expression lexbuf 0; + AMPERSAND } +| '!' { + match Bucket.find is_expression lexbuf with + | Some i when i <> 0 -> EXCLAMATION + | _ -> skip_comment lexbuf +} +| spaces { token lexbuf } +| '\'' { LITERAL (wait_balance read_quoted_string lexbuf) } +| '"' { LITERAL (wait_balance read_dquoted_string lexbuf) } +| '{' { LITERAL (wait_balance (read_long_string 0) lexbuf) } + +and skip_comment = parse +| [^ '\'' '"' '{' '\r' '\n' ] { skip_comment lexbuf } +| '{' { + let _ = wait_balance (read_long_string 0) lexbuf in + skip_comment lexbuf } +| '\'' { + let _ = wait_balance read_quoted_string lexbuf in + skip_comment lexbuf +} +| '"' { + let _ = wait_balance read_dquoted_string lexbuf in + skip_comment lexbuf +} +| eol { + (* Ugly hack used in order to put the eol in the front of the next + parsing. *) + lexbuf.lex_curr_pos <- lexbuf.lex_curr_pos - 1; + (*Lexing.new_line lexbuf;*) + COMMENT +} + +(* Read the content until we got another one quote *) +and read_quoted_string buf = parse + | "''" + { Buffer.add_char buf '\''; + read_quoted_string buf lexbuf + } + | [^ '\'' '\n' '\r']+ + { Buffer.add_string buf (Lexing.lexeme lexbuf); + read_quoted_string buf lexbuf + } + | '\'' + { (Buffer.contents buf) + } + | ['\n' '\r'] + { Buffer.add_string buf (Lexing.lexeme lexbuf); + Lexing.new_line lexbuf ; + read_quoted_string buf lexbuf + } + +and read_dquoted_string buf = parse + | [^ '"' ]+ + { Buffer.add_string buf (Lexing.lexeme lexbuf); + read_dquoted_string buf lexbuf + } + | "\"\"" + { Buffer.add_char buf '"'; + read_dquoted_string buf lexbuf + } + | '"' + { (Buffer.contents buf) + } + +and read_long_string level buf = parse + | [^ '{' '}' '\r' '\n' ]+ + { Buffer.add_string buf (Lexing.lexeme lexbuf); + read_long_string level buf lexbuf + } + | '{' + { Buffer.add_string buf (Lexing.lexeme lexbuf); + read_long_string (level + 1) buf lexbuf } + | '}' + { match level with + | 0 -> (Buffer.contents buf) + | _ -> Buffer.add_string buf (Lexing.lexeme lexbuf); + read_long_string (level - 1) buf lexbuf + } +| eol +{ + Lexing.new_line lexbuf; + Buffer.add_string buf (Lexing.lexeme lexbuf); + read_long_string level buf lexbuf +} |