(** CSS lexer. * Reference: * https://www.w3.org/TR/css-syntax-3/ * https://github.com/yahoo/css-js/blob/master/src/l/css.3.l *) module Sedlexing = Lex_buffer exception LexingError of (Lexing.position * string) (** Signals a lexing error at the provided source location. *) exception ParseError of (Menhir_parser.token * Lexing.position * Lexing.position) (** Signals a parsing error at the provided token and its start and end * locations. *) exception GrammarError of (string * Location.t) (** Signals a grammar error at the provided location. *) let position_to_string pos = Printf.sprintf "[%d,%d+%d]" pos.Lexing.pos_lnum pos.Lexing.pos_bol (pos.Lexing.pos_cnum - pos.Lexing.pos_bol) let location_to_string loc = Printf.sprintf "%s..%s" (position_to_string loc.Location.loc_start) (position_to_string loc.Location.loc_end) let dimension_to_string = function | Types.Length -> "length" | Angle -> "angle" | Time -> "time" | Frequency -> "frequency" let token_to_string = function | Menhir_parser.EOF -> "EOF" | LEFT_BRACE -> "{" | RIGHT_BRACE -> "}" | LEFT_PAREN -> "(" | RIGHT_PAREN -> ")" | LEFT_BRACKET -> "[" | RIGHT_BRACKET -> "]" | COLON -> ":" | DOT -> "." (* Whitespaces are detected only in selectors, before ":", ".", and "#", to * disambiguate between "p :first-child" and "p:first-child", these * whitespaces are replaced with "*" *) | WHITESPACE -> "*" | SEMI_COLON -> ";" | PERCENTAGE -> "%" | IMPORTANT -> "!important" | IDENT s -> "IDENT(" ^ s ^ ")" | STRING s -> "STRING(" ^ s ^ ")" | URI s -> "URI(" ^ s ^ ")" | OPERATOR s -> "OPERATOR(" ^ s ^ ")" | DELIM s -> "DELIM(" ^ s ^ ")" | NESTED_AT_RULE s -> "NESTED_AT_RULE(" ^ s ^ ")" | AT_RULE_WITHOUT_BODY s -> "AT_RULE_WITHOUT_BODY(" ^ s ^ ")" | AT_RULE s -> "AT_RULE(" ^ s ^ ")" | FUNCTION s -> "FUNCTION(" ^ s ^ ")" | HASH s -> "HASH(" ^ s ^ ")" | NUMBER s -> "NUMBER(" ^ s ^ ")" | UNICODE_RANGE s -> "UNICODE_RANGE(" ^ s ^ ")" | FLOAT_DIMENSION (n, s, d) -> "FLOAT_DIMENSION(" ^ n ^ ", " ^ s ^ ", " ^ dimension_to_string d ^ ")" | DIMENSION (n, d) -> "DIMENSION(" ^ n ^ ", " ^ d ^ ")" let () = Location.register_error_of_exn (function | LexingError (pos, msg) -> let loc = Lex_buffer.make_loc_and_fix pos pos in Some (Location.error ~loc msg) | ParseError (token, start_pos, end_pos) -> let loc = Lex_buffer.make_loc_and_fix start_pos end_pos in let msg = Printf.sprintf "Parse error while reading token '%s'" (token_to_string token) in Some (Location.error ~loc msg) | GrammarError (msg, loc) -> Some (Location.error ~loc msg) | _ -> None) (* Regexes *) let newline = [%sedlex.regexp? '\n' | "\r\n" | '\r' | '\012'] let white_space = [%sedlex.regexp? " " | '\t' | newline] let ws = [%sedlex.regexp? Star white_space] let hex_digit = [%sedlex.regexp? '0' .. '9' | 'a' .. 'f' | 'A' .. 'F'] let digit = [%sedlex.regexp? '0' .. '9'] let non_ascii = [%sedlex.regexp? '\160' .. '\255'] let up_to_6_hex_digits = [%sedlex.regexp? Rep (hex_digit, 1 .. 6)] let unicode = [%sedlex.regexp? '\\', up_to_6_hex_digits, Opt white_space] let unicode_range = [%sedlex.regexp? ( Rep ((hex_digit | '?'), 1 .. 6) | up_to_6_hex_digits, '-', up_to_6_hex_digits )] let escape = [%sedlex.regexp? unicode | '\\', Compl ('\r' | '\n' | '\012' | hex_digit)] let ident_start = [%sedlex.regexp? '_' | 'a' .. 'z' | 'A' .. 'Z' | non_ascii | escape] let ident_char = [%sedlex.regexp? '_' | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '-' | non_ascii | escape] let ident = [%sedlex.regexp? (Opt '-', ident_start | '-', '-'), Star ident_char] let string_quote = [%sedlex.regexp? '"', Star (Compl ('\n' | '\r' | '\012' | '"') | '\\', newline | escape), '"'] let string_apos = [%sedlex.regexp? ( '\'', Star (Compl ('\n' | '\r' | '\012' | '\'') | '\\', newline | escape), '\'' )] let string = [%sedlex.regexp? string_quote | string_apos] let name = [%sedlex.regexp? Plus ident_char] let number = [%sedlex.regexp? ( ( Opt ('+' | '-'), Plus digit, Opt ('.', Plus digit), Opt (('e' | 'E'), ('+' | '-'), Plus digit) ) | ( Opt ('+' | '-'), '.', Plus digit, Opt (('e' | 'E'), ('+' | '-'), Plus digit) ) )] let non_printable = [%sedlex.regexp? '\x00' .. '\x08' | '\x0B' | '\x0E' .. '\x1F' | '\x7F'] let url_unquoted = [%sedlex.regexp? Star (Compl ('"' | '\'' | '(' | ')' | '\\' | non_printable) | escape)] let url = [%sedlex.regexp? url_unquoted | string] let operator = [%sedlex.regexp? "~=" | "|=" | "^=" | "$=" | "*=" | "||"] let at_rule = [%sedlex.regexp? "@", ident] let at_rule_without_body = [%sedlex.regexp? "@", ("charset" | "import" | "namespace")] let vendor_prefix = [%sedlex.regexp? "-webkit-" | "-moz-" | "-o-" | "-ms-"] let nested_at_rule = [%sedlex.regexp? ( "@", ( "document" | "keyframes" | vendor_prefix, "keyframes" | "media" | "supports" | "scope" ) )] let _a = [%sedlex.regexp? 'A' | 'a'] let _b = [%sedlex.regexp? 'B' | 'b'] let _c = [%sedlex.regexp? 'C' | 'c'] let _d = [%sedlex.regexp? 'D' | 'd'] let _e = [%sedlex.regexp? 'e' | 'E'] let _f = [%sedlex.regexp? 'F' | 'f'] let _g = [%sedlex.regexp? 'G' | 'g'] let _h = [%sedlex.regexp? 'H' | 'h'] let _i = [%sedlex.regexp? 'I' | 'i'] let _j = [%sedlex.regexp? 'J' | 'j'] let _k = [%sedlex.regexp? 'K' | 'k'] let _l = [%sedlex.regexp? 'L' | 'l'] let _m = [%sedlex.regexp? 'M' | 'm'] let _n = [%sedlex.regexp? 'N' | 'n'] let _o = [%sedlex.regexp? 'O' | 'o'] let _p = [%sedlex.regexp? 'P' | 'p'] let _q = [%sedlex.regexp? 'Q' | 'q'] let _r = [%sedlex.regexp? 'R' | 'r'] let _s = [%sedlex.regexp? 'S' | 's'] let _t = [%sedlex.regexp? 'T' | 't'] let _u = [%sedlex.regexp? 'U' | 'u'] let _v = [%sedlex.regexp? 'V' | 'v'] let _w = [%sedlex.regexp? 'W' | 'w'] let _x = [%sedlex.regexp? 'X' | 'x'] let _y = [%sedlex.regexp? 'Y' | 'y'] let _z = [%sedlex.regexp? 'Z' | 'z'] let important = [%sedlex.regexp? "!", ws, _i, _m, _p, _o, _r, _t, _a, _n, _t] let length = [%sedlex.regexp? ( _c, _a, _p | _c, _h | _e, _m | _e, _x | _i, _c | _l, _h | _r, _e, _m | _r, _l, _h | _v, _h | _v, _w | _v, _i | _v, _b | _v, _m, _i, _n | _v, _m, _a, _x | _c, _m | _m, _m | _q | _i, _n | _p, _c | _p, _t | _p, _x )] let angle = [%sedlex.regexp? _d, _e, _g | _g, _r, _a, _d | _r, _a, _d | _t, _u, _r, _n] let time = [%sedlex.regexp? _s | _m, _s] let frequency = [%sedlex.regexp? _h, _z | _k, _h, _z] (* Returns true if white spaces were discarded *) let discard_comments_and_white_spaces buf = let rec discard_white_spaces buf spaces_detected = match%sedlex buf with | Plus white_space -> discard_white_spaces buf true | "/*" -> discard_comments buf spaces_detected | _ -> spaces_detected and discard_comments buf spaces_detected = match%sedlex buf with | eof -> raise (LexingError (buf.Lex_buffer.pos, "Unterminated comment at EOF")) | "*/" -> discard_white_spaces buf spaces_detected | any -> discard_comments buf spaces_detected | _ -> assert false in discard_white_spaces buf false let rec get_next_tokens buf spaces_detected = let open Menhir_parser in match%sedlex buf with | eof -> [ EOF ] | ';' -> [ SEMI_COLON ] | '}' -> [ RIGHT_BRACE ] | '{' -> [ LEFT_BRACE ] | ':' -> if spaces_detected then [ WHITESPACE; COLON ] else [ COLON ] | '.' -> if spaces_detected then [ WHITESPACE; DOT ] else [ DOT ] | '(' -> [ LEFT_PAREN ] | ')' -> [ RIGHT_PAREN ] | '[' -> [ LEFT_BRACKET ] | ']' -> [ RIGHT_BRACKET ] | '%' -> [ PERCENTAGE ] | operator -> [ OPERATOR (Lex_buffer.latin1 buf) ] | string -> [ STRING (Lex_buffer.latin1 ~skip:1 ~drop:1 buf) ] | "url(" -> [ get_url "" buf ] | important -> [ IMPORTANT ] | nested_at_rule -> [ NESTED_AT_RULE (Lex_buffer.latin1 ~skip:1 buf) ] | at_rule_without_body -> [ AT_RULE_WITHOUT_BODY (Lex_buffer.latin1 ~skip:1 buf) ] | at_rule -> [ AT_RULE (Lex_buffer.latin1 ~skip:1 buf) ] (* NOTE: should be placed above ident, otherwise pattern with * '-[0-9a-z]{1,6}' cannot be matched *) | _u, '+', unicode_range -> [ UNICODE_RANGE (Lex_buffer.latin1 buf) ] | ident, '(' -> [ FUNCTION (Lex_buffer.latin1 ~drop:1 buf) ] | ident -> [ IDENT (Lex_buffer.latin1 buf) ] | '#', name -> if spaces_detected then [ WHITESPACE; HASH (Lex_buffer.latin1 ~skip:1 buf) ] else [ HASH (Lex_buffer.latin1 ~skip:1 buf) ] | number -> [ get_dimension (Lex_buffer.latin1 buf) buf ] | any -> [ DELIM (Lex_buffer.latin1 buf) ] | _ -> assert false and get_dimension n buf = match%sedlex buf with | length -> FLOAT_DIMENSION (n, Lex_buffer.latin1 buf, Types.Length) | angle -> FLOAT_DIMENSION (n, Lex_buffer.latin1 buf, Types.Angle) | time -> FLOAT_DIMENSION (n, Lex_buffer.latin1 buf, Types.Time) | frequency -> FLOAT_DIMENSION (n, Lex_buffer.latin1 buf, Types.Frequency) | ident -> DIMENSION (n, Lex_buffer.latin1 buf) | _ -> NUMBER n and get_url url buf = match%sedlex buf with | ws -> get_url url buf | url -> get_url (Lex_buffer.latin1 buf) buf | ")" -> URI url | eof -> raise (LexingError (buf.Lex_buffer.pos, "Incomplete URI")) | any -> raise (LexingError ( buf.Lex_buffer.pos, "Unexpected token: " ^ Lex_buffer.latin1 buf ^ " parsing an URI" )) | _ -> assert false let token_queue = Queue.create () let queue_next_tokens_with_location buf = let spaces_detected = discard_comments_and_white_spaces buf in let loc_start = Lex_buffer.next_loc buf in let tokens = get_next_tokens buf spaces_detected in let loc_end = Lex_buffer.next_loc buf in List.iter (fun t -> Queue.add (t, loc_start, loc_end) token_queue) tokens let parse buf p = let last_token = ref (Menhir_parser.EOF, Lexing.dummy_pos, Lexing.dummy_pos) in let next_token () = if Queue.is_empty token_queue then queue_next_tokens_with_location buf; last_token := Queue.take token_queue; !last_token in try MenhirLib.Convert.Simplified.traditional2revised p next_token with | LexingError _ as e -> raise e | _ -> raise (ParseError !last_token) let parse_string ?container_lnum ?pos s p = (match container_lnum with | None -> () | Some lnum -> Lex_buffer.container_lnum_ref := lnum); parse (Lex_buffer.of_ascii_string ?pos s) p