aboutsummaryrefslogtreecommitdiff
path: root/css/lib/lexer.ml
diff options
context:
space:
mode:
Diffstat (limited to 'css/lib/lexer.ml')
-rwxr-xr-xcss/lib/lexer.ml351
1 files changed, 351 insertions, 0 deletions
diff --git a/css/lib/lexer.ml b/css/lib/lexer.ml
new file mode 100755
index 0000000..9185acc
--- /dev/null
+++ b/css/lib/lexer.ml
@@ -0,0 +1,351 @@
(** CSS lexer.
 * Reference:
 * https://www.w3.org/TR/css-syntax-3/
 * https://github.com/yahoo/css-js/blob/master/src/l/css.3.l *)

(* The code generated by the sedlex ppx refers to a module named
 * [Sedlexing]; aliasing it to [Lex_buffer] makes the generated matchers
 * drive our own location-tracking buffer instead of the stock sedlex
 * one. *)
module Sedlexing = Lex_buffer

exception LexingError of (Lexing.position * string)
(** Signals a lexing error at the provided source location. *)

exception
  ParseError of (Menhir_parser.token * Lexing.position * Lexing.position)
(** Signals a parsing error at the provided token and its start and end
 * locations. *)

exception GrammarError of (string * Location.t)
(** Signals a grammar error at the provided location. *)
+
(** Renders a lexing position as ["[line,bol+col]"], where [col] is the
    number of characters since the beginning of the line. *)
let position_to_string { Lexing.pos_lnum; pos_bol; pos_cnum; _ } =
  Printf.sprintf "[%d,%d+%d]" pos_lnum pos_bol (pos_cnum - pos_bol)
+
(** Renders a source span as ["start..end"], each endpoint formatted by
    {!position_to_string}. *)
let location_to_string loc =
  let { Location.loc_start; loc_end; _ } = loc in
  Printf.sprintf "%s..%s" (position_to_string loc_start)
    (position_to_string loc_end)
+
(** Lower-case name of a dimension kind, as shown in token dumps. *)
let dimension_to_string d =
  match d with
  | Types.Length -> "length"
  | Types.Angle -> "angle"
  | Types.Time -> "time"
  | Types.Frequency -> "frequency"
+
(** Human-readable rendering of a parser token, used in error messages. *)
let token_to_string t =
  (* Render a token carrying a payload as "TAG(payload)". *)
  let tag name payload = Printf.sprintf "%s(%s)" name payload in
  match t with
  | Menhir_parser.EOF -> "EOF"
  | LEFT_BRACE -> "{"
  | RIGHT_BRACE -> "}"
  | LEFT_PAREN -> "("
  | RIGHT_PAREN -> ")"
  | LEFT_BRACKET -> "["
  | RIGHT_BRACKET -> "]"
  | COLON -> ":"
  | DOT -> "."
  (* Whitespace is significant only in selectors, right before ":", "."
   * and "#", to tell "p :first-child" apart from "p:first-child"; such a
   * whitespace token is rendered as "*". *)
  | WHITESPACE -> "*"
  | SEMI_COLON -> ";"
  | PERCENTAGE -> "%"
  | IMPORTANT -> "!important"
  | IDENT s -> tag "IDENT" s
  | STRING s -> tag "STRING" s
  | URI s -> tag "URI" s
  | OPERATOR s -> tag "OPERATOR" s
  | DELIM s -> tag "DELIM" s
  | NESTED_AT_RULE s -> tag "NESTED_AT_RULE" s
  | AT_RULE_WITHOUT_BODY s -> tag "AT_RULE_WITHOUT_BODY" s
  | AT_RULE s -> tag "AT_RULE" s
  | FUNCTION s -> tag "FUNCTION" s
  | HASH s -> tag "HASH" s
  | NUMBER s -> tag "NUMBER" s
  | UNICODE_RANGE s -> tag "UNICODE_RANGE" s
  | FLOAT_DIMENSION (n, s, d) ->
      tag "FLOAT_DIMENSION" (n ^ ", " ^ s ^ ", " ^ dimension_to_string d)
  | DIMENSION (n, d) -> tag "DIMENSION" (n ^ ", " ^ d)
+
(* Hook this module's exceptions into the compiler-libs error-reporting
 * machinery so they are printed with proper source locations. *)
let () =
  Location.register_error_of_exn (function
    | LexingError (pos, msg) ->
        (* A lexing error is reported at a single point: start = end. *)
        let loc = Lex_buffer.make_loc_and_fix pos pos in
        Some (Location.error ~loc msg)
    | ParseError (token, start_pos, end_pos) ->
        let loc = Lex_buffer.make_loc_and_fix start_pos end_pos in
        let msg =
          Printf.sprintf "Parse error while reading token '%s'"
            (token_to_string token)
        in
        Some (Location.error ~loc msg)
    | GrammarError (msg, loc) -> Some (Location.error ~loc msg)
    (* Any other exception is not ours to report. *)
    | _ -> None)
+
(* Regexes *)

(* Line terminators recognized by CSS: LF, CRLF, CR and form feed. *)
let newline = [%sedlex.regexp? '\n' | "\r\n" | '\r' | '\012']

let white_space = [%sedlex.regexp? " " | '\t' | newline]

(* A possibly-empty run of whitespace. *)
let ws = [%sedlex.regexp? Star white_space]

let hex_digit = [%sedlex.regexp? '0' .. '9' | 'a' .. 'f' | 'A' .. 'F']

let digit = [%sedlex.regexp? '0' .. '9']

(* Bytes 0xA0-0xFF; the buffer is latin1-decoded, and non-ASCII code
 * points are valid identifier characters in CSS. *)
let non_ascii = [%sedlex.regexp? '\160' .. '\255']

let up_to_6_hex_digits = [%sedlex.regexp? Rep (hex_digit, 1 .. 6)]

(* A backslash escape of up to 6 hex digits; the optional trailing
 * whitespace terminates the hex run. *)
let unicode = [%sedlex.regexp? '\\', up_to_6_hex_digits, Opt white_space]

(* Body of a unicode-range token, e.g. "0025??" or "0025-00FF". The "u+"
 * prefix is matched separately in [get_next_tokens]. *)
let unicode_range =
  [%sedlex.regexp?
    ( Rep ((hex_digit | '?'), 1 .. 6)
    | up_to_6_hex_digits, '-', up_to_6_hex_digits )]

(* A unicode escape, or a backslash followed by any character that is
 * neither a newline nor a hex digit. *)
let escape =
  [%sedlex.regexp? unicode | '\\', Compl ('\r' | '\n' | '\012' | hex_digit)]

let ident_start =
  [%sedlex.regexp? '_' | 'a' .. 'z' | 'A' .. 'Z' | non_ascii | escape]

let ident_char =
  [%sedlex.regexp?
    '_' | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '-' | non_ascii | escape]

(* An identifier: optional leading '-' (or the "--" custom-property
 * prefix) followed by identifier characters. *)
let ident = [%sedlex.regexp? (Opt '-', ident_start | '-', '-'), Star ident_char]

(* Double-quoted string; escapes and escaped newlines may appear inside. *)
let string_quote =
  [%sedlex.regexp?
    '"', Star (Compl ('\n' | '\r' | '\012' | '"') | '\\', newline | escape), '"']

(* Single-quoted string, same shape as [string_quote]. *)
let string_apos =
  [%sedlex.regexp?
    ( '\'',
      Star (Compl ('\n' | '\r' | '\012' | '\'') | '\\', newline | escape),
      '\'' )]

let string = [%sedlex.regexp? string_quote | string_apos]

(* One or more identifier characters; used for the part after "#". *)
let name = [%sedlex.regexp? Plus ident_char]
+
(* A number token: optional sign, then either digits with an optional
 * fractional part, or a leading-dot fraction, each optionally followed by
 * a scientific exponent.
 * Fix: per css-syntax-3 the exponent sign is optional (e.g. "1e3" and
 * "1E-5" are both numbers); the original required a sign, which made
 * "1e3" lex as the number "1" followed by the unit ident "e3". The sign
 * is therefore wrapped in [Opt] below. *)
let number =
  [%sedlex.regexp?
    ( ( Opt ('+' | '-'),
        Plus digit,
        Opt ('.', Plus digit),
        Opt (('e' | 'E'), Opt ('+' | '-'), Plus digit) )
    | ( Opt ('+' | '-'),
        '.',
        Plus digit,
        Opt (('e' | 'E'), Opt ('+' | '-'), Plus digit) ) )]
+
(* Control characters that may not appear in an unquoted URL. *)
let non_printable =
  [%sedlex.regexp? '\x00' .. '\x08' | '\x0B' | '\x0E' .. '\x1F' | '\x7F']

(* NOTE: [Star] also matches the empty string, so an empty unquoted URL
 * is accepted. *)
let url_unquoted =
  [%sedlex.regexp?
    Star (Compl ('"' | '\'' | '(' | ')' | '\\' | non_printable) | escape)]

let url = [%sedlex.regexp? url_unquoted | string]

(* Attribute-selector match operators and the column combinator. *)
let operator = [%sedlex.regexp? "~=" | "|=" | "^=" | "$=" | "*=" | "||"]

let at_rule = [%sedlex.regexp? "@", ident]

(* At-rules terminated by ";" instead of a block. Must be tried before
 * the generic [at_rule] pattern in [get_next_tokens], since both match
 * the same input with equal length and sedlex keeps the first listed. *)
let at_rule_without_body =
  [%sedlex.regexp? "@", ("charset" | "import" | "namespace")]

let vendor_prefix = [%sedlex.regexp? "-webkit-" | "-moz-" | "-o-" | "-ms-"]

(* At-rules whose body contains nested rules rather than declarations. *)
let nested_at_rule =
  [%sedlex.regexp?
    ( "@",
      ( "document" | "keyframes"
      | vendor_prefix, "keyframes"
      | "media" | "supports" | "scope" ) )]
+
(* Case-insensitive ASCII letter classes. CSS unit names and keywords are
 * matched case-insensitively, so the patterns below are assembled from
 * these one-letter regexes. *)
let _a = [%sedlex.regexp? 'A' | 'a']

let _b = [%sedlex.regexp? 'B' | 'b']

let _c = [%sedlex.regexp? 'C' | 'c']

let _d = [%sedlex.regexp? 'D' | 'd']

let _e = [%sedlex.regexp? 'e' | 'E']

let _f = [%sedlex.regexp? 'F' | 'f']

let _g = [%sedlex.regexp? 'G' | 'g']

let _h = [%sedlex.regexp? 'H' | 'h']

let _i = [%sedlex.regexp? 'I' | 'i']

let _j = [%sedlex.regexp? 'J' | 'j']

let _k = [%sedlex.regexp? 'K' | 'k']

let _l = [%sedlex.regexp? 'L' | 'l']

let _m = [%sedlex.regexp? 'M' | 'm']

let _n = [%sedlex.regexp? 'N' | 'n']

let _o = [%sedlex.regexp? 'O' | 'o']

let _p = [%sedlex.regexp? 'P' | 'p']

let _q = [%sedlex.regexp? 'Q' | 'q']

let _r = [%sedlex.regexp? 'R' | 'r']

let _s = [%sedlex.regexp? 'S' | 's']

let _t = [%sedlex.regexp? 'T' | 't']

let _u = [%sedlex.regexp? 'U' | 'u']

let _v = [%sedlex.regexp? 'V' | 'v']

let _w = [%sedlex.regexp? 'W' | 'w']

let _x = [%sedlex.regexp? 'X' | 'x']

let _y = [%sedlex.regexp? 'Y' | 'y']

let _z = [%sedlex.regexp? 'Z' | 'z']

(* The "!important" flag; whitespace is allowed between "!" and the
 * keyword. *)
let important = [%sedlex.regexp? "!", ws, _i, _m, _p, _o, _r, _t, _a, _n, _t]
+
(* Length units: font-relative, viewport-relative and absolute,
 * case-insensitive. *)
let length =
  [%sedlex.regexp?
    ( _c, _a, _p
    | _c, _h
    | _e, _m
    | _e, _x
    | _i, _c
    | _l, _h
    | _r, _e, _m
    | _r, _l, _h
    | _v, _h
    | _v, _w
    | _v, _i
    | _v, _b
    | _v, _m, _i, _n
    | _v, _m, _a, _x
    | _c, _m
    | _m, _m
    | _q
    | _i, _n
    | _p, _c
    | _p, _t
    | _p, _x )]

(* Angle units: deg, grad, rad, turn. *)
let angle =
  [%sedlex.regexp? _d, _e, _g | _g, _r, _a, _d | _r, _a, _d | _t, _u, _r, _n]

(* Time units: s and ms. *)
let time = [%sedlex.regexp? _s | _m, _s]

(* Frequency units: Hz and kHz. *)
let frequency = [%sedlex.regexp? _h, _z | _k, _h, _z]
+
(* Skips whitespace and comments preceding the next token.
 * Returns [true] iff at least one whitespace run was discarded; the
 * tokenizer uses that flag to emit a synthetic WHITESPACE token in
 * selector positions. *)
let discard_comments_and_white_spaces buf =
  let rec discard_white_spaces buf spaces_detected =
    match%sedlex buf with
    | Plus white_space -> discard_white_spaces buf true
    | "/*" -> discard_comments buf spaces_detected
    (* The next character starts a real token: stop skipping. *)
    | _ -> spaces_detected
  and discard_comments buf spaces_detected =
    match%sedlex buf with
    | eof ->
        raise (LexingError (buf.Lex_buffer.pos, "Unterminated comment at EOF"))
    | "*/" -> discard_white_spaces buf spaces_detected
    | any -> discard_comments buf spaces_detected
    | _ -> assert false
  in
  discard_white_spaces buf false
+
(* Lexes the next token(s) from [buf]. A list is returned because a
 * significant whitespace in a selector is materialized as an extra
 * WHITESPACE token in front of ":", "." and "#". [spaces_detected] is the
 * flag produced by [discard_comments_and_white_spaces]. Pattern order
 * matters: sedlex keeps the longest match and, on ties, the first listed
 * pattern. *)
let rec get_next_tokens buf spaces_detected =
  let open Menhir_parser in
  match%sedlex buf with
  | eof -> [ EOF ]
  | ';' -> [ SEMI_COLON ]
  | '}' -> [ RIGHT_BRACE ]
  | '{' -> [ LEFT_BRACE ]
  | ':' -> if spaces_detected then [ WHITESPACE; COLON ] else [ COLON ]
  | '.' -> if spaces_detected then [ WHITESPACE; DOT ] else [ DOT ]
  | '(' -> [ LEFT_PAREN ]
  | ')' -> [ RIGHT_PAREN ]
  | '[' -> [ LEFT_BRACKET ]
  | ']' -> [ RIGHT_BRACKET ]
  | '%' -> [ PERCENTAGE ]
  | operator -> [ OPERATOR (Lex_buffer.latin1 buf) ]
  (* ~skip/~drop strip the surrounding quotes from the lexeme. *)
  | string -> [ STRING (Lex_buffer.latin1 ~skip:1 ~drop:1 buf) ]
  (* The URL body is lexed by a dedicated sub-lexer, [get_url]. *)
  | "url(" -> [ get_url "" buf ]
  | important -> [ IMPORTANT ]
  (* ~skip:1 strips the leading "@" from at-rule names. *)
  | nested_at_rule -> [ NESTED_AT_RULE (Lex_buffer.latin1 ~skip:1 buf) ]
  | at_rule_without_body ->
      [ AT_RULE_WITHOUT_BODY (Lex_buffer.latin1 ~skip:1 buf) ]
  | at_rule -> [ AT_RULE (Lex_buffer.latin1 ~skip:1 buf) ]
  (* NOTE: should be placed above ident, otherwise pattern with
   * '-[0-9a-z]{1,6}' cannot be matched *)
  | _u, '+', unicode_range -> [ UNICODE_RANGE (Lex_buffer.latin1 buf) ]
  (* ~drop:1 strips the trailing "(" from the function name. *)
  | ident, '(' -> [ FUNCTION (Lex_buffer.latin1 ~drop:1 buf) ]
  | ident -> [ IDENT (Lex_buffer.latin1 buf) ]
  | '#', name ->
      if spaces_detected then
        [ WHITESPACE; HASH (Lex_buffer.latin1 ~skip:1 buf) ]
      else [ HASH (Lex_buffer.latin1 ~skip:1 buf) ]
  (* Numbers may be followed by a unit; resolved by [get_dimension]. *)
  | number -> [ get_dimension (Lex_buffer.latin1 buf) buf ]
  (* Any other single character becomes a DELIM token. *)
  | any -> [ DELIM (Lex_buffer.latin1 buf) ]
  | _ -> assert false

(* Having just lexed the number [n], reads the unit immediately following
 * it, if any. Unknown idents become DIMENSION; no unit at all yields a
 * plain NUMBER. *)
and get_dimension n buf =
  match%sedlex buf with
  | length -> FLOAT_DIMENSION (n, Lex_buffer.latin1 buf, Types.Length)
  | angle -> FLOAT_DIMENSION (n, Lex_buffer.latin1 buf, Types.Angle)
  | time -> FLOAT_DIMENSION (n, Lex_buffer.latin1 buf, Types.Time)
  | frequency -> FLOAT_DIMENSION (n, Lex_buffer.latin1 buf, Types.Frequency)
  | ident -> DIMENSION (n, Lex_buffer.latin1 buf)
  | _ -> NUMBER n

(* Lexes the body of "url( ... )" after the opening "url(" has been
 * consumed, returning a URI token at the closing parenthesis.
 * NOTE(review): each matched [url] segment REPLACES the accumulated
 * [url] argument instead of appending to it — this assumes the whole URL
 * body is matched in one run; confirm behavior for bodies containing
 * internal whitespace. *)
and get_url url buf =
  match%sedlex buf with
  | ws -> get_url url buf
  | url -> get_url (Lex_buffer.latin1 buf) buf
  | ")" -> URI url
  | eof -> raise (LexingError (buf.Lex_buffer.pos, "Incomplete URI"))
  | any ->
      raise
        (LexingError
           ( buf.Lex_buffer.pos,
             "Unexpected token: " ^ Lex_buffer.latin1 buf ^ " parsing an URI" ))
  | _ -> assert false
+
(* Tokens already lexed but not yet delivered to the parser.
 * NOTE(review): module-level mutable state — one queue is shared by every
 * parse in the process, so parses must not be interleaved. *)
let token_queue = Queue.create ()

(* Lexes the next batch of tokens and enqueues each of them tagged with
 * the same start/end positions (the span of the whole batch, measured
 * after leading whitespace and comments have been discarded). *)
let queue_next_tokens_with_location buf =
  let spaces_detected = discard_comments_and_white_spaces buf in
  let loc_start = Lex_buffer.next_loc buf in
  let tokens = get_next_tokens buf spaces_detected in
  let loc_end = Lex_buffer.next_loc buf in
  List.iter (fun t -> Queue.add (t, loc_start, loc_end) token_queue) tokens
+
(* Runs the Menhir entry point [p] over [buf], feeding it one located
 * token at a time. [last_token] remembers the most recently delivered
 * token so a parser failure can be reported at its location. *)
let parse buf p =
  let last_token =
    ref (Menhir_parser.EOF, Lexing.dummy_pos, Lexing.dummy_pos)
  in
  let next_token () =
    if Queue.is_empty token_queue then queue_next_tokens_with_location buf;
    last_token := Queue.take token_queue;
    !last_token
  in
  (* Lexing errors already carry a location and are re-raised as-is;
   * anything else raised during parsing is wrapped as a ParseError at the
   * last delivered token.
   * NOTE(review): the catch-all discards the original exception. *)
  try MenhirLib.Convert.Simplified.traditional2revised p next_token with
  | LexingError _ as e -> raise e
  | _ -> raise (ParseError !last_token)
+
(** Parses the CSS source [s] with the Menhir entry point [p].
    [container_lnum], when given, seeds [Lex_buffer.container_lnum_ref] —
    presumably the line of the container embedding this CSS, so reported
    locations line up with it; [pos] is forwarded as the start position of
    [s]. *)
let parse_string ?container_lnum ?pos s p =
  Option.iter
    (fun lnum -> Lex_buffer.container_lnum_ref := lnum)
    container_lnum;
  parse (Lex_buffer.of_ascii_string ?pos s) p