aboutsummaryrefslogtreecommitdiff
path: root/css/lib/lexer.ml
diff options
context:
space:
mode:
Diffstat (limited to 'css/lib/lexer.ml')
-rwxr-xr-xcss/lib/lexer.ml351
1 files changed, 351 insertions, 0 deletions
diff --git a/css/lib/lexer.ml b/css/lib/lexer.ml
new file mode 100755
index 0000000..9185acc
--- /dev/null
+++ b/css/lib/lexer.ml
@@ -0,0 +1,351 @@
(** CSS lexer.
 * Reference:
 * https://www.w3.org/TR/css-syntax-3/
 * https://github.com/yahoo/css-js/blob/master/src/l/css.3.l *)

(* The code generated by the sedlex ppx refers to a module named
 * [Sedlexing]; aliasing it to [Lex_buffer] makes the generated matchers
 * drive our own location-tracking buffer instead of the stock sedlex
 * one. *)
module Sedlexing = Lex_buffer

exception LexingError of (Lexing.position * string)
(** Signals a lexing error at the provided source location. *)

exception
  ParseError of (Menhir_parser.token * Lexing.position * Lexing.position)
(** Signals a parsing error at the provided token and its start and end
 * locations. *)

exception GrammarError of (string * Location.t)
(** Signals a grammar error at the provided location. *)
+
(** Renders a lexing position as ["[line,bol+col]"], where [col] is the
    number of characters since the beginning of the line. *)
let position_to_string { Lexing.pos_lnum; pos_bol; pos_cnum; _ } =
  Printf.sprintf "[%d,%d+%d]" pos_lnum pos_bol (pos_cnum - pos_bol)
+
(** Renders a source span as ["start..end"], each endpoint formatted by
    {!position_to_string}. *)
let location_to_string loc =
  let { Location.loc_start; loc_end; _ } = loc in
  Printf.sprintf "%s..%s" (position_to_string loc_start)
    (position_to_string loc_end)
+
(** Lower-case name of a dimension kind, as shown in token dumps. *)
let dimension_to_string d =
  match d with
  | Types.Length -> "length"
  | Types.Angle -> "angle"
  | Types.Time -> "time"
  | Types.Frequency -> "frequency"
+
(** Human-readable rendering of a parser token, used in error messages. *)
let token_to_string t =
  (* Render a token carrying a payload as "TAG(payload)". *)
  let tag name payload = Printf.sprintf "%s(%s)" name payload in
  match t with
  | Menhir_parser.EOF -> "EOF"
  | LEFT_BRACE -> "{"
  | RIGHT_BRACE -> "}"
  | LEFT_PAREN -> "("
  | RIGHT_PAREN -> ")"
  | LEFT_BRACKET -> "["
  | RIGHT_BRACKET -> "]"
  | COLON -> ":"
  | DOT -> "."
  (* Whitespace is significant only in selectors, right before ":", "."
   * and "#", to tell "p :first-child" apart from "p:first-child"; such a
   * whitespace token is rendered as "*". *)
  | WHITESPACE -> "*"
  | SEMI_COLON -> ";"
  | PERCENTAGE -> "%"
  | IMPORTANT -> "!important"
  | IDENT s -> tag "IDENT" s
  | STRING s -> tag "STRING" s
  | URI s -> tag "URI" s
  | OPERATOR s -> tag "OPERATOR" s
  | DELIM s -> tag "DELIM" s
  | NESTED_AT_RULE s -> tag "NESTED_AT_RULE" s
  | AT_RULE_WITHOUT_BODY s -> tag "AT_RULE_WITHOUT_BODY" s
  | AT_RULE s -> tag "AT_RULE" s
  | FUNCTION s -> tag "FUNCTION" s
  | HASH s -> tag "HASH" s
  | NUMBER s -> tag "NUMBER" s
  | UNICODE_RANGE s -> tag "UNICODE_RANGE" s
  | FLOAT_DIMENSION (n, s, d) ->
      tag "FLOAT_DIMENSION" (n ^ ", " ^ s ^ ", " ^ dimension_to_string d)
  | DIMENSION (n, d) -> tag "DIMENSION" (n ^ ", " ^ d)
+
(* Hook this module's exceptions into the compiler-libs error-reporting
 * machinery so they are printed with proper source locations. *)
let () =
  Location.register_error_of_exn (function
    | LexingError (pos, msg) ->
        (* A lexing error is reported at a single point: start = end. *)
        let loc = Lex_buffer.make_loc_and_fix pos pos in
        Some (Location.error ~loc msg)
    | ParseError (token, start_pos, end_pos) ->
        let loc = Lex_buffer.make_loc_and_fix start_pos end_pos in
        let msg =
          Printf.sprintf "Parse error while reading token '%s'"
            (token_to_string token)
        in
        Some (Location.error ~loc msg)
    | GrammarError (msg, loc) -> Some (Location.error ~loc msg)
    (* Any other exception is not ours to report. *)
    | _ -> None)
+
(* Regexes *)

(* Line terminators recognized by CSS: LF, CRLF, CR and form feed. *)
let newline = [%sedlex.regexp? '\n' | "\r\n" | '\r' | '\012']

let white_space = [%sedlex.regexp? " " | '\t' | newline]

(* A possibly-empty run of whitespace. *)
let ws = [%sedlex.regexp? Star white_space]

let hex_digit = [%sedlex.regexp? '0' .. '9' | 'a' .. 'f' | 'A' .. 'F']

let digit = [%sedlex.regexp? '0' .. '9']

(* Bytes 0xA0-0xFF; the buffer is latin1-decoded, and non-ASCII code
 * points are valid identifier characters in CSS. *)
let non_ascii = [%sedlex.regexp? '\160' .. '\255']

let up_to_6_hex_digits = [%sedlex.regexp? Rep (hex_digit, 1 .. 6)]

(* A backslash escape of up to 6 hex digits; the optional trailing
 * whitespace terminates the hex run. *)
let unicode = [%sedlex.regexp? '\\', up_to_6_hex_digits, Opt white_space]

(* Body of a unicode-range token, e.g. "0025??" or "0025-00FF". The "u+"
 * prefix is matched separately in [get_next_tokens]. *)
let unicode_range =
  [%sedlex.regexp?
    ( Rep ((hex_digit | '?'), 1 .. 6)
    | up_to_6_hex_digits, '-', up_to_6_hex_digits )]

(* A unicode escape, or a backslash followed by any character that is
 * neither a newline nor a hex digit. *)
let escape =
  [%sedlex.regexp? unicode | '\\', Compl ('\r' | '\n' | '\012' | hex_digit)]

let ident_start =
  [%sedlex.regexp? '_' | 'a' .. 'z' | 'A' .. 'Z' | non_ascii | escape]

let ident_char =
  [%sedlex.regexp?
    '_' | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '-' | non_ascii | escape]

(* An identifier: optional leading '-' (or the "--" custom-property
 * prefix) followed by identifier characters. *)
let ident = [%sedlex.regexp? (Opt '-', ident_start | '-', '-'), Star ident_char]

(* Double-quoted string; escapes and escaped newlines may appear inside. *)
let string_quote =
  [%sedlex.regexp?
    '"', Star (Compl ('\n' | '\r' | '\012' | '"') | '\\', newline | escape), '"']

(* Single-quoted string, same shape as [string_quote]. *)
let string_apos =
  [%sedlex.regexp?
    ( '\'',
      Star (Compl ('\n' | '\r' | '\012' | '\'') | '\\', newline | escape),
      '\'' )]

let string = [%sedlex.regexp? string_quote | string_apos]

(* One or more identifier characters; used for the part after "#". *)
let name = [%sedlex.regexp? Plus ident_char]
+
(* A number token: optional sign, then either digits with an optional
 * fractional part, or a leading-dot fraction, each optionally followed by
 * a scientific exponent.
 * Fix: per css-syntax-3 the exponent sign is optional (e.g. "1e3" and
 * "1E-5" are both numbers); the original required a sign, which made
 * "1e3" lex as the number "1" followed by the unit ident "e3". The sign
 * is therefore wrapped in [Opt] below. *)
let number =
  [%sedlex.regexp?
    ( ( Opt ('+' | '-'),
        Plus digit,
        Opt ('.', Plus digit),
        Opt (('e' | 'E'), Opt ('+' | '-'), Plus digit) )
    | ( Opt ('+' | '-'),
        '.',
        Plus digit,
        Opt (('e' | 'E'), Opt ('+' | '-'), Plus digit) ) )]
+
(* Control characters that may not appear in an unquoted URL. *)
let non_printable =
  [%sedlex.regexp? '\x00' .. '\x08' | '\x0B' | '\x0E' .. '\x1F' | '\x7F']

(* NOTE: [Star] also matches the empty string, so an empty unquoted URL
 * is accepted. *)
let url_unquoted =
  [%sedlex.regexp?
    Star (Compl ('"' | '\'' | '(' | ')' | '\\' | non_printable) | escape)]

let url = [%sedlex.regexp? url_unquoted | string]

(* Attribute-selector match operators and the column combinator. *)
let operator = [%sedlex.regexp? "~=" | "|=" | "^=" | "$=" | "*=" | "||"]

let at_rule = [%sedlex.regexp? "@", ident]

(* At-rules terminated by ";" instead of a block. Must be tried before
 * the generic [at_rule] pattern in [get_next_tokens], since both match
 * the same input with equal length and sedlex keeps the first listed. *)
let at_rule_without_body =
  [%sedlex.regexp? "@", ("charset" | "import" | "namespace")]

let vendor_prefix = [%sedlex.regexp? "-webkit-" | "-moz-" | "-o-" | "-ms-"]

(* At-rules whose body contains nested rules rather than declarations. *)
let nested_at_rule =
  [%sedlex.regexp?
    ( "@",
      ( "document" | "keyframes"
      | vendor_prefix, "keyframes"
      | "media" | "supports" | "scope" ) )]
+
(* Case-insensitive ASCII letter classes. CSS unit names and keywords are
 * matched case-insensitively, so the patterns below are assembled from
 * these one-letter regexes. *)
let _a = [%sedlex.regexp? 'A' | 'a']

let _b = [%sedlex.regexp? 'B' | 'b']

let _c = [%sedlex.regexp? 'C' | 'c']

let _d = [%sedlex.regexp? 'D' | 'd']

let _e = [%sedlex.regexp? 'e' | 'E']

let _f = [%sedlex.regexp? 'F' | 'f']

let _g = [%sedlex.regexp? 'G' | 'g']

let _h = [%sedlex.regexp? 'H' | 'h']

let _i = [%sedlex.regexp? 'I' | 'i']

let _j = [%sedlex.regexp? 'J' | 'j']

let _k = [%sedlex.regexp? 'K' | 'k']

let _l = [%sedlex.regexp? 'L' | 'l']

let _m = [%sedlex.regexp? 'M' | 'm']

let _n = [%sedlex.regexp? 'N' | 'n']

let _o = [%sedlex.regexp? 'O' | 'o']

let _p = [%sedlex.regexp? 'P' | 'p']

let _q = [%sedlex.regexp? 'Q' | 'q']

let _r = [%sedlex.regexp? 'R' | 'r']

let _s = [%sedlex.regexp? 'S' | 's']

let _t = [%sedlex.regexp? 'T' | 't']

let _u = [%sedlex.regexp? 'U' | 'u']

let _v = [%sedlex.regexp? 'V' | 'v']

let _w = [%sedlex.regexp? 'W' | 'w']

let _x = [%sedlex.regexp? 'X' | 'x']

let _y = [%sedlex.regexp? 'Y' | 'y']

let _z = [%sedlex.regexp? 'Z' | 'z']

(* The "!important" flag; whitespace is allowed between "!" and the
 * keyword. *)
let important = [%sedlex.regexp? "!", ws, _i, _m, _p, _o, _r, _t, _a, _n, _t]
+
(* Length units: font-relative, viewport-relative and absolute,
 * case-insensitive. *)
let length =
  [%sedlex.regexp?
    ( _c, _a, _p
    | _c, _h
    | _e, _m
    | _e, _x
    | _i, _c
    | _l, _h
    | _r, _e, _m
    | _r, _l, _h
    | _v, _h
    | _v, _w
    | _v, _i
    | _v, _b
    | _v, _m, _i, _n
    | _v, _m, _a, _x
    | _c, _m
    | _m, _m
    | _q
    | _i, _n
    | _p, _c
    | _p, _t
    | _p, _x )]

(* Angle units: deg, grad, rad, turn. *)
let angle =
  [%sedlex.regexp? _d, _e, _g | _g, _r, _a, _d | _r, _a, _d | _t, _u, _r, _n]

(* Time units: s and ms. *)
let time = [%sedlex.regexp? _s | _m, _s]

(* Frequency units: Hz and kHz. *)
let frequency = [%sedlex.regexp? _h, _z | _k, _h, _z]
+
(* Skips whitespace and comments preceding the next token.
 * Returns [true] iff at least one whitespace run was discarded; the
 * tokenizer uses that flag to emit a synthetic WHITESPACE token in
 * selector positions. *)
let discard_comments_and_white_spaces buf =
  let rec discard_white_spaces buf spaces_detected =
    match%sedlex buf with
    | Plus white_space -> discard_white_spaces buf true
    | "/*" -> discard_comments buf spaces_detected
    (* The next character starts a real token: stop skipping. *)
    | _ -> spaces_detected
  and discard_comments buf spaces_detected =
    match%sedlex buf with
    | eof ->
        raise (LexingError (buf.Lex_buffer.pos, "Unterminated comment at EOF"))
    | "*/" -> discard_white_spaces buf spaces_detected
    | any -> discard_comments buf spaces_detected
    | _ -> assert false
  in
  discard_white_spaces buf false
+
(* Lexes the next token(s) from [buf]. A list is returned because a
 * significant whitespace in a selector is materialized as an extra
 * WHITESPACE token in front of ":", "." and "#". [spaces_detected] is the
 * flag produced by [discard_comments_and_white_spaces]. Pattern order
 * matters: sedlex keeps the longest match and, on ties, the first listed
 * pattern. *)
let rec get_next_tokens buf spaces_detected =
  let open Menhir_parser in
  match%sedlex buf with
  | eof -> [ EOF ]
  | ';' -> [ SEMI_COLON ]
  | '}' -> [ RIGHT_BRACE ]
  | '{' -> [ LEFT_BRACE ]
  | ':' -> if spaces_detected then [ WHITESPACE; COLON ] else [ COLON ]
  | '.' -> if spaces_detected then [ WHITESPACE; DOT ] else [ DOT ]
  | '(' -> [ LEFT_PAREN ]
  | ')' -> [ RIGHT_PAREN ]
  | '[' -> [ LEFT_BRACKET ]
  | ']' -> [ RIGHT_BRACKET ]
  | '%' -> [ PERCENTAGE ]
  | operator -> [ OPERATOR (Lex_buffer.latin1 buf) ]
  (* ~skip/~drop strip the surrounding quotes from the lexeme. *)
  | string -> [ STRING (Lex_buffer.latin1 ~skip:1 ~drop:1 buf) ]
  (* The URL body is lexed by a dedicated sub-lexer, [get_url]. *)
  | "url(" -> [ get_url "" buf ]
  | important -> [ IMPORTANT ]
  (* ~skip:1 strips the leading "@" from at-rule names. *)
  | nested_at_rule -> [ NESTED_AT_RULE (Lex_buffer.latin1 ~skip:1 buf) ]
  | at_rule_without_body ->
      [ AT_RULE_WITHOUT_BODY (Lex_buffer.latin1 ~skip:1 buf) ]
  | at_rule -> [ AT_RULE (Lex_buffer.latin1 ~skip:1 buf) ]
  (* NOTE: should be placed above ident, otherwise pattern with
   * '-[0-9a-z]{1,6}' cannot be matched *)
  | _u, '+', unicode_range -> [ UNICODE_RANGE (Lex_buffer.latin1 buf) ]
  (* ~drop:1 strips the trailing "(" from the function name. *)
  | ident, '(' -> [ FUNCTION (Lex_buffer.latin1 ~drop:1 buf) ]
  | ident -> [ IDENT (Lex_buffer.latin1 buf) ]
  | '#', name ->
      if spaces_detected then
        [ WHITESPACE; HASH (Lex_buffer.latin1 ~skip:1 buf) ]
      else [ HASH (Lex_buffer.latin1 ~skip:1 buf) ]
  (* Numbers may be followed by a unit; resolved by [get_dimension]. *)
  | number -> [ get_dimension (Lex_buffer.latin1 buf) buf ]
  (* Any other single character becomes a DELIM token. *)
  | any -> [ DELIM (Lex_buffer.latin1 buf) ]
  | _ -> assert false

(* Having just lexed the number [n], reads the unit immediately following
 * it, if any. Unknown idents become DIMENSION; no unit at all yields a
 * plain NUMBER. *)
and get_dimension n buf =
  match%sedlex buf with
  | length -> FLOAT_DIMENSION (n, Lex_buffer.latin1 buf, Types.Length)
  | angle -> FLOAT_DIMENSION (n, Lex_buffer.latin1 buf, Types.Angle)
  | time -> FLOAT_DIMENSION (n, Lex_buffer.latin1 buf, Types.Time)
  | frequency -> FLOAT_DIMENSION (n, Lex_buffer.latin1 buf, Types.Frequency)
  | ident -> DIMENSION (n, Lex_buffer.latin1 buf)
  | _ -> NUMBER n

(* Lexes the body of "url( ... )" after the opening "url(" has been
 * consumed, returning a URI token at the closing parenthesis.
 * NOTE(review): each matched [url] segment REPLACES the accumulated
 * [url] argument instead of appending to it — this assumes the whole URL
 * body is matched in one run; confirm behavior for bodies containing
 * internal whitespace. *)
and get_url url buf =
  match%sedlex buf with
  | ws -> get_url url buf
  | url -> get_url (Lex_buffer.latin1 buf) buf
  | ")" -> URI url
  | eof -> raise (LexingError (buf.Lex_buffer.pos, "Incomplete URI"))
  | any ->
      raise
        (LexingError
           ( buf.Lex_buffer.pos,
             "Unexpected token: " ^ Lex_buffer.latin1 buf ^ " parsing an URI" ))
  | _ -> assert false
+
(* Tokens already lexed but not yet delivered to the parser.
 * NOTE(review): module-level mutable state — one queue is shared by every
 * parse in the process, so parses must not be interleaved. *)
let token_queue = Queue.create ()

(* Lexes the next batch of tokens and enqueues each of them tagged with
 * the same start/end positions (the span of the whole batch, measured
 * after leading whitespace and comments have been discarded). *)
let queue_next_tokens_with_location buf =
  let spaces_detected = discard_comments_and_white_spaces buf in
  let loc_start = Lex_buffer.next_loc buf in
  let tokens = get_next_tokens buf spaces_detected in
  let loc_end = Lex_buffer.next_loc buf in
  List.iter (fun t -> Queue.add (t, loc_start, loc_end) token_queue) tokens
+
(* Runs the Menhir entry point [p] over [buf], feeding it one located
 * token at a time. [last_token] remembers the most recently delivered
 * token so a parser failure can be reported at its location. *)
let parse buf p =
  let last_token =
    ref (Menhir_parser.EOF, Lexing.dummy_pos, Lexing.dummy_pos)
  in
  let next_token () =
    if Queue.is_empty token_queue then queue_next_tokens_with_location buf;
    last_token := Queue.take token_queue;
    !last_token
  in
  (* Lexing errors already carry a location and are re-raised as-is;
   * anything else raised during parsing is wrapped as a ParseError at the
   * last delivered token.
   * NOTE(review): the catch-all discards the original exception. *)
  try MenhirLib.Convert.Simplified.traditional2revised p next_token with
  | LexingError _ as e -> raise e
  | _ -> raise (ParseError !last_token)
+
(** Parses the CSS source [s] with the Menhir entry point [p].
    [container_lnum], when given, seeds [Lex_buffer.container_lnum_ref] —
    presumably the line of the container embedding this CSS, so reported
    locations line up with it; [pos] is forwarded as the start position of
    [s]. *)
let parse_string ?container_lnum ?pos s p =
  Option.iter
    (fun lnum -> Lex_buffer.container_lnum_ref := lnum)
    container_lnum;
  parse (Lex_buffer.of_ascii_string ?pos s) p