(** CSS lexer.
 * Produces Menhir_parser tokens from a CSS source buffer, using sedlex
 * for the regular-expression layer.
 * Reference:
 * https://www.w3.org/TR/css-syntax-3/
 * https://github.com/yahoo/css-js/blob/master/src/l/css.3.l *)

(* Alias so the code generated by sedlex's ppx ([match%sedlex]) operates on
 * our custom Lex_buffer instead of the stock Sedlexing module. *)
module Sedlexing = Lex_buffer

exception LexingError of (Lexing.position * string)
(** Signals a lexing error at the provided source location. *)

exception
  ParseError of (Menhir_parser.token * Lexing.position * Lexing.position)
(** Signals a parsing error at the provided token and its start and end
 * locations. *)

exception GrammarError of (string * Location.t)
(** Signals a grammar error at the provided location. *)

(* Renders a position as "[line,bol+column]" for diagnostics. *)
let position_to_string pos =
  Printf.sprintf "[%d,%d+%d]" pos.Lexing.pos_lnum pos.Lexing.pos_bol
    (pos.Lexing.pos_cnum - pos.Lexing.pos_bol)

(* Renders a location as "start..end" using [position_to_string]. *)
let location_to_string loc =
  Printf.sprintf "%s..%s"
    (position_to_string loc.Location.loc_start)
    (position_to_string loc.Location.loc_end)

(* Human-readable name of a dimension kind, for token printing. *)
let dimension_to_string = function
  | Types.Length -> "length"
  | Angle -> "angle"
  | Time -> "time"
  | Frequency -> "frequency"

(* Human-readable rendering of every parser token; used in error messages. *)
let token_to_string = function
  | Menhir_parser.EOF -> "EOF"
  | LEFT_BRACE -> "{"
  | RIGHT_BRACE -> "}"
  | LEFT_PAREN -> "("
  | RIGHT_PAREN -> ")"
  | LEFT_BRACKET -> "["
  | RIGHT_BRACKET -> "]"
  | COLON -> ":"
  | DOT -> "."
  (* Whitespaces are detected only in selectors, before ":", ".", and "#", to
   * disambiguate between "p :first-child" and "p:first-child", these
   * whitespaces are replaced with "*" *)
  | WHITESPACE -> "*"
  | SEMI_COLON -> ";"
  | PERCENTAGE -> "%"
  | IMPORTANT -> "!important"
  | IDENT s -> "IDENT(" ^ s ^ ")"
  | STRING s -> "STRING(" ^ s ^ ")"
  | URI s -> "URI(" ^ s ^ ")"
  | OPERATOR s -> "OPERATOR(" ^ s ^ ")"
  | DELIM s -> "DELIM(" ^ s ^ ")"
  | NESTED_AT_RULE s -> "NESTED_AT_RULE(" ^ s ^ ")"
  | AT_RULE_WITHOUT_BODY s -> "AT_RULE_WITHOUT_BODY(" ^ s ^ ")"
  | AT_RULE s -> "AT_RULE(" ^ s ^ ")"
  | FUNCTION s -> "FUNCTION(" ^ s ^ ")"
  | HASH s -> "HASH(" ^ s ^ ")"
  | NUMBER s -> "NUMBER(" ^ s ^ ")"
  | UNICODE_RANGE s -> "UNICODE_RANGE(" ^ s ^ ")"
  | FLOAT_DIMENSION (n, s, d) ->
      "FLOAT_DIMENSION(" ^ n ^ ", " ^ s ^ ", " ^ dimension_to_string d ^ ")"
  | DIMENSION (n, d) -> "DIMENSION(" ^ n ^ ", " ^ d ^ ")"

(* Register pretty error reporting for this module's exceptions with the
 * compiler-libs Location machinery, so they render like OCaml diagnostics. *)
let () =
  Location.register_error_of_exn (function
    | LexingError (pos, msg) ->
        let loc = Lex_buffer.make_loc_and_fix pos pos in
        Some (Location.error ~loc msg)
    | ParseError (token, start_pos, end_pos) ->
        let loc = Lex_buffer.make_loc_and_fix start_pos end_pos in
        let msg =
          Printf.sprintf "Parse error while reading token '%s'"
            (token_to_string token)
        in
        Some (Location.error ~loc msg)
    | GrammarError (msg, loc) -> Some (Location.error ~loc msg)
    | _ -> None)

(* Regexes *)

(* Note: '\012' is form feed; CSS treats it as a newline character. *)
let newline = [%sedlex.regexp? '\n' | "\r\n" | '\r' | '\012']

let white_space = [%sedlex.regexp? " " | '\t' | newline]

(* Zero or more white-space characters. *)
let ws = [%sedlex.regexp? Star white_space]

let hex_digit = [%sedlex.regexp? '0' .. '9' | 'a' .. 'f' | 'A' .. 'F']

let digit = [%sedlex.regexp? '0' .. '9']

let non_ascii = [%sedlex.regexp? '\160' .. '\255']

let up_to_6_hex_digits = [%sedlex.regexp? Rep (hex_digit, 1 .. 6)]

(* Backslash escape of a code point, e.g. "\0026 ". *)
let unicode = [%sedlex.regexp? '\\', up_to_6_hex_digits, Opt white_space]

(* E.g. "0?????" or "0000-00FF" (matched after the "U+" prefix below). *)
let unicode_range =
  [%sedlex.regexp?
    ( Rep ((hex_digit | '?'), 1 .. 6)
    | up_to_6_hex_digits, '-', up_to_6_hex_digits )]

let escape =
  [%sedlex.regexp? unicode | '\\', Compl ('\r' | '\n' | '\012' | hex_digit)]

let ident_start =
  [%sedlex.regexp? '_' | 'a' .. 'z' | 'A' .. 'Z' | non_ascii | escape]

let ident_char =
  [%sedlex.regexp?
    '_' | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '-' | non_ascii | escape]

(* CSS identifier; the '-','-' alternative admits custom properties ("--x"). *)
let ident = [%sedlex.regexp? (Opt '-', ident_start | '-', '-'), Star ident_char]

(* Double-quoted string; backslash-newline continuations are allowed inside. *)
let string_quote =
  [%sedlex.regexp?
    '"', Star (Compl ('\n' | '\r' | '\012' | '"') | '\\', newline | escape), '"']

(* Single-quoted string, same escaping rules as [string_quote]. *)
let string_apos =
  [%sedlex.regexp?
    ( '\'',
      Star (Compl ('\n' | '\r' | '\012' | '\'') | '\\', newline | escape),
      '\'' )]

let string = [%sedlex.regexp? string_quote | string_apos]

let name = [%sedlex.regexp? Plus ident_char]

(* Numeric literal with optional sign, fraction and exponent; the second
 * alternative covers forms starting with a bare '.', e.g. ".5". *)
let number =
  [%sedlex.regexp?
    ( ( Opt ('+' | '-'),
        Plus digit,
        Opt ('.', Plus digit),
        Opt (('e' | 'E'), ('+' | '-'), Plus digit) )
    | ( Opt ('+' | '-'),
        '.',
        Plus digit,
        Opt (('e' | 'E'), ('+' | '-'), Plus digit) ) )]

let non_printable =
  [%sedlex.regexp? '\x00' .. '\x08' | '\x0B' | '\x0E' .. '\x1F' | '\x7F']

(* Body of an unquoted url(...); quotes, parens, backslash and non-printable
 * characters must be escaped. *)
let url_unquoted =
  [%sedlex.regexp?
    Star (Compl ('"' | '\'' | '(' | ')' | '\\' | non_printable) | escape)]

let url = [%sedlex.regexp? url_unquoted | string]

(* Attribute-selector match operators and the column combinator. *)
let operator = [%sedlex.regexp? "~=" | "|=" | "^=" | "$=" | "*=" | "||"]

let at_rule = [%sedlex.regexp? "@", ident]

(* At-rules that are terminated by ';' rather than a block. *)
let at_rule_without_body =
  [%sedlex.regexp? "@", ("charset" | "import" | "namespace")]

let vendor_prefix = [%sedlex.regexp? "-webkit-" | "-moz-" | "-o-" | "-ms-"]

(* At-rules whose body contains nested rules rather than declarations. *)
let nested_at_rule =
  [%sedlex.regexp?
    ( "@",
      ( "document" | "keyframes"
      | vendor_prefix, "keyframes"
      | "media" | "supports" | "scope" ) )]

(* One case-insensitive regexp per ASCII letter, used to spell out
 * case-insensitive keywords ("!important", units) below. *)
let _a = [%sedlex.regexp? 'A' | 'a']

let _b = [%sedlex.regexp? 'B' | 'b']

let _c = [%sedlex.regexp? 'C' | 'c']

let _d = [%sedlex.regexp? 'D' | 'd']

let _e = [%sedlex.regexp? 'e' | 'E']

let _f = [%sedlex.regexp? 'F' | 'f']

let _g = [%sedlex.regexp? 'G' | 'g']

let _h = [%sedlex.regexp? 'H' | 'h']

let _i = [%sedlex.regexp? 'I' | 'i']

let _j = [%sedlex.regexp? 'J' | 'j']

let _k = [%sedlex.regexp? 'K' | 'k']

let _l = [%sedlex.regexp? 'L' | 'l']

let _m = [%sedlex.regexp? 'M' | 'm']

let _n = [%sedlex.regexp? 'N' | 'n']

let _o = [%sedlex.regexp? 'O' | 'o']

let _p = [%sedlex.regexp? 'P' | 'p']

let _q = [%sedlex.regexp? 'Q' | 'q']

let _r = [%sedlex.regexp? 'R' | 'r']

let _s = [%sedlex.regexp? 'S' | 's']

let _t = [%sedlex.regexp? 'T' | 't']

let _u = [%sedlex.regexp? 'U' | 'u']

let _v = [%sedlex.regexp? 'V' | 'v']

let _w = [%sedlex.regexp? 'W' | 'w']

let _x = [%sedlex.regexp? 'X' | 'x']

let _y = [%sedlex.regexp? 'Y' | 'y']

let _z = [%sedlex.regexp? 'Z' | 'z']

(* "!important", case-insensitive, with optional whitespace after '!'. *)
let important = [%sedlex.regexp? "!", ws, _i, _m, _p, _o, _r, _t, _a, _n, _t]

(* Length units, case-insensitive (cap, ch, em, ex, ic, lh, rem, rlh,
 * vh, vw, vi, vb, vmin, vmax, cm, mm, q, in, pc, pt, px). *)
let length =
  [%sedlex.regexp?
    ( _c, _a, _p
    | _c, _h
    | _e, _m
    | _e, _x
    | _i, _c
    | _l, _h
    | _r, _e, _m
    | _r, _l, _h
    | _v, _h
    | _v, _w
    | _v, _i
    | _v, _b
    | _v, _m, _i, _n
    | _v, _m, _a, _x
    | _c, _m
    | _m, _m
    | _q
    | _i, _n
    | _p, _c
    | _p, _t
    | _p, _x )]

(* Angle units: deg, grad, rad, turn. *)
let angle =
  [%sedlex.regexp? _d, _e, _g | _g, _r, _a, _d | _r, _a, _d | _t, _u, _r, _n]

(* Time units: s, ms. *)
let time = [%sedlex.regexp? _s | _m, _s]

(* Frequency units: hz, khz. *)
let frequency = [%sedlex.regexp? _h, _z | _k, _h, _z]

(* Returns true if white spaces were discarded *)
(* Skips any interleaving of whitespace runs and block comments before the
 * next token.  The boolean result feeds the WHITESPACE-injection logic in
 * [get_next_tokens] (selector disambiguation). *)
let discard_comments_and_white_spaces buf =
  let rec discard_white_spaces buf spaces_detected =
    match%sedlex buf with
    | Plus white_space -> discard_white_spaces buf true
    | "/*" -> discard_comments buf spaces_detected
    | _ -> spaces_detected
  and discard_comments buf spaces_detected =
    match%sedlex buf with
    | eof ->
        raise (LexingError (buf.Lex_buffer.pos, "Unterminated comment at EOF"))
    | "*/" -> discard_white_spaces buf spaces_detected
    | any -> discard_comments buf spaces_detected
    | _ -> assert false
  in
  discard_white_spaces buf false

(* Scans the next token(s).  Returns a list because a single match can yield
 * two tokens: a synthetic WHITESPACE is prepended before ':', '.' and '#'
 * when [spaces_detected] is true (see [token_to_string]'s comment).  Rule
 * order is significant: sedlex prefers the longest match, and within this
 * chain earlier rules win ties. *)
let rec get_next_tokens buf spaces_detected =
  let open Menhir_parser in
  match%sedlex buf with
  | eof -> [ EOF ]
  | ';' -> [ SEMI_COLON ]
  | '}' -> [ RIGHT_BRACE ]
  | '{' -> [ LEFT_BRACE ]
  | ':' -> if spaces_detected then [ WHITESPACE; COLON ] else [ COLON ]
  | '.' -> if spaces_detected then [ WHITESPACE; DOT ] else [ DOT ]
  | '(' -> [ LEFT_PAREN ]
  | ')' -> [ RIGHT_PAREN ]
  | '[' -> [ LEFT_BRACKET ]
  | ']' -> [ RIGHT_BRACKET ]
  | '%' -> [ PERCENTAGE ]
  | operator -> [ OPERATOR (Lex_buffer.latin1 buf) ]
  (* ~skip/~drop trim the surrounding quote characters. *)
  | string -> [ STRING (Lex_buffer.latin1 ~skip:1 ~drop:1 buf) ]
  | "url(" -> [ get_url "" buf ]
  | important -> [ IMPORTANT ]
  (* ~skip:1 drops the leading '@' of at-rule names. *)
  | nested_at_rule -> [ NESTED_AT_RULE (Lex_buffer.latin1 ~skip:1 buf) ]
  | at_rule_without_body ->
      [ AT_RULE_WITHOUT_BODY (Lex_buffer.latin1 ~skip:1 buf) ]
  | at_rule -> [ AT_RULE (Lex_buffer.latin1 ~skip:1 buf) ]
  (* NOTE: should be placed above ident, otherwise pattern with
   * '-[0-9a-z]{1,6}' cannot be matched *)
  | _u, '+', unicode_range -> [ UNICODE_RANGE (Lex_buffer.latin1 buf) ]
  (* ~drop:1 removes the trailing '(' from the function name. *)
  | ident, '(' -> [ FUNCTION (Lex_buffer.latin1 ~drop:1 buf) ]
  | ident -> [ IDENT (Lex_buffer.latin1 buf) ]
  | '#', name ->
      if spaces_detected then
        [ WHITESPACE; HASH (Lex_buffer.latin1 ~skip:1 buf) ]
      else [ HASH (Lex_buffer.latin1 ~skip:1 buf) ]
  | number -> [ get_dimension (Lex_buffer.latin1 buf) buf ]
  (* Any single character not matched above becomes a DELIM token. *)
  | any -> [ DELIM (Lex_buffer.latin1 buf) ]
  | _ -> assert false

(* Classifies what follows a numeric literal [n]: a known unit yields a
 * FLOAT_DIMENSION, any other identifier a DIMENSION, nothing a NUMBER. *)
and get_dimension n buf =
  match%sedlex buf with
  | length -> FLOAT_DIMENSION (n, Lex_buffer.latin1 buf, Types.Length)
  | angle -> FLOAT_DIMENSION (n, Lex_buffer.latin1 buf, Types.Angle)
  | time -> FLOAT_DIMENSION (n, Lex_buffer.latin1 buf, Types.Time)
  | frequency -> FLOAT_DIMENSION (n, Lex_buffer.latin1 buf, Types.Frequency)
  | ident -> DIMENSION (n, Lex_buffer.latin1 buf)
  | _ -> NUMBER n

(* Consumes the inside of "url( ... )" after [get_next_tokens] has eaten the
 * opening "url(".  [url] accumulates the last body match; recursion stops at
 * the closing ')'. *)
and get_url url buf =
  match%sedlex buf with
  | ws -> get_url url buf
  | url -> get_url (Lex_buffer.latin1 buf) buf
  | ")" -> URI url
  | eof -> raise (LexingError (buf.Lex_buffer.pos, "Incomplete URI"))
  | any ->
      raise
        (LexingError
           ( buf.Lex_buffer.pos,
             "Unexpected token: " ^ Lex_buffer.latin1 buf ^ " parsing an URI" ))
  | _ -> assert false

(* Pending tokens: one lexer step can emit several tokens (see
 * [get_next_tokens]), but Menhir consumes them one at a time. *)
let token_queue = Queue.create ()

(* Runs one lexer step and enqueues the resulting tokens.  All tokens from
 * the same step share the same start/end locations. *)
let queue_next_tokens_with_location buf =
  let spaces_detected = discard_comments_and_white_spaces buf in
  let loc_start = Lex_buffer.next_loc buf in
  let tokens = get_next_tokens buf spaces_detected in
  let loc_end = Lex_buffer.next_loc buf in
  List.iter (fun t -> Queue.add (t, loc_start, loc_end) token_queue) tokens

(* Drives Menhir's revised API over [buf] with entry point [p].
 * [last_token] is kept only so a parser failure can be reported as a
 * ParseError at the offending token; lexing errors propagate unchanged. *)
let parse buf p =
  let last_token =
    ref (Menhir_parser.EOF, Lexing.dummy_pos, Lexing.dummy_pos)
  in
  let next_token () =
    if Queue.is_empty token_queue then queue_next_tokens_with_location buf;
    last_token := Queue.take token_queue;
    !last_token
  in
  try MenhirLib.Convert.Simplified.traditional2revised p next_token with
  | LexingError _ as e -> raise e
  | _ -> raise (ParseError !last_token)

(* Parses the string [s] with entry point [p].  [container_lnum] offsets
 * reported line numbers when the CSS is embedded in a larger file. *)
let parse_string ?container_lnum ?pos s p =
  (match container_lnum with
  | None -> ()
  | Some lnum -> Lex_buffer.container_lnum_ref := lnum);
  parse (Lex_buffer.of_ascii_string ?pos s) p