diff --git a/COPYRIGHT b/COPYRIGHT
index 47aefca1fbab..788f4a3370a0 100644
--- a/COPYRIGHT
+++ b/COPYRIGHT
@@ -53,6 +53,10 @@ License: other
 are permitted in any medium without royalty provided the copyright
 notice and this notice are preserved.
 
+Files: src/tinycss/*
+Copyright: Simon Sapin
+License: BSD
+
 Files: src/calibre/ebooks/readability/*
 Copyright: Unknown
 License: Apache 2.0
diff --git a/src/tinycss/__init__.py b/src/tinycss/__init__.py
new file mode 100644
index 000000000000..9eca2b1b4648
--- /dev/null
+++ b/src/tinycss/__init__.py
@@ -0,0 +1,44 @@
+# coding: utf8
+"""
+    tinycss
+    -------
+
+    A CSS parser, and nothing else.
+
+    :copyright: (c) 2012 by Simon Sapin.
+    :license: BSD, see LICENSE for more details.
+"""
+
+import sys
+
+from .version import VERSION
+__version__ = VERSION
+
+from .css21 import CSS21Parser
+from .page3 import CSSPage3Parser
+
+
+PARSER_MODULES = {
+    'page3': CSSPage3Parser,
+}
+
+
+def make_parser(*features, **kwargs):
+    """Make a parser object with the chosen features.
+
+    :param features:
+        Positional arguments are base classes the new parser class will
+        extend.  The string ``'page3'`` is accepted as short for
+        :class:`~page3.CSSPage3Parser`.
+    :param kwargs:
+        Keyword arguments are passed to the parser’s constructor.
+    :returns:
+        An instance of a new subclass of :class:`CSS21Parser`
+
+    """
+    if features:
+        bases = tuple(PARSER_MODULES.get(f, f) for f in features)
+        parser_class = type('CustomCSSParser', bases + (CSS21Parser,), {})
+    else:
+        parser_class = CSS21Parser
+    return parser_class(**kwargs)
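A quick sketch of how `make_parser()` is meant to be used (the stylesheet text here is invented for illustration):

```python
from tinycss import make_parser

# Equivalent to subclassing CSSPage3Parser yourself:
parser = make_parser('page3')
sheet = parser.parse_stylesheet('p { margin: 0 }\n@page { size: A5 }')

for rule in sheet.rules:
    # RuleSet objects have at_keyword == None; at-rules have e.g. '@page'.
    print(rule.at_keyword)
print(sheet.errors)   # invalid constructs are logged here, not raised
```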
diff --git a/src/tinycss/color3.py b/src/tinycss/color3.py
new file mode 100644
index 000000000000..187196e7a001
--- /dev/null
+++ b/src/tinycss/color3.py
@@ -0,0 +1,382 @@
+# coding: utf8
+"""
+    tinycss.color3
+    --------------
+
+    Parser for CSS 3 color values
+    http://www.w3.org/TR/css3-color/
+
+    This module does not provide anything that integrates into a parser
+    class, only functions that parse single tokens from (eg.) a property
+    value.
+
+    :copyright: (c) 2012 by Simon Sapin.
+    :license: BSD, see LICENSE for more details.
+"""
+
+from __future__ import unicode_literals, division
+import collections
+import itertools
+import re
+
+from .tokenizer import tokenize_grouped
+
+
+class RGBA(collections.namedtuple('RGBA', ['red', 'green', 'blue', 'alpha'])):
+    """An RGBA color.
+
+    A tuple of four floats in the 0..1 range: ``(r, g, b, a)``.
+    Also has ``red``, ``green``, ``blue`` and ``alpha`` attributes to access
+    the same values.
+
+    """
+
+
+def parse_color_string(css_string):
+    """Parse a CSS string as a color value.
+
+    This is a convenience wrapper around :func:`parse_color` in case you
+    have a string that is not from a CSS stylesheet.
+
+    :param css_string:
+        A Unicode string in CSS syntax.
+    :returns:
+        Same as :func:`parse_color`.
+
+    """
+    tokens = list(tokenize_grouped(css_string.strip()))
+    if len(tokens) == 1:
+        return parse_color(tokens[0])
+
+
+def parse_color(token):
+    """Parse a single token as a color value.
+
+    :param token:
+        A single :class:`~.token_data.Token` or
+        :class:`~.token_data.ContainerToken`, as found eg. in a
+        property value.
+    :returns:
+        * ``None``, if the token is not a valid CSS 3 color value.
+          (No exception is raised.)
+        * For the *currentColor* keyword: the string ``'currentColor'``
+        * Every other value (including keywords, HSL and HSLA) is converted
+          to RGBA and returned as an :class:`RGBA` object (a 4-tuple with
+          attribute access).
+          The alpha channel is clipped to [0, 1], but R, G, or B can be
+          out of range (eg. ``rgb(-51, 306, 0)`` is represented as
+          ``(-.2, 1.2, 0, 1)``.)
+
+    """
+    if token.type == 'IDENT':
+        return COLOR_KEYWORDS.get(token.value.lower())
+    elif token.type == 'HASH':
+        for multiplier, regexp in HASH_REGEXPS:
+            match = regexp(token.value)
+            if match:
+                r, g, b = [int(group * multiplier, 16) / 255
+                           for group in match.groups()]
+                return RGBA(r, g, b, 1.)
+    elif token.type == 'FUNCTION':
+        args = parse_comma_separated(token.content)
+        if args:
+            name = token.function_name.lower()
+            if name == 'rgb':
+                return parse_rgb(args, alpha=1.)
+            elif name == 'rgba':
+                alpha = parse_alpha(args[3:])
+                if alpha is not None:
+                    return parse_rgb(args[:3], alpha)
+            elif name == 'hsl':
+                return parse_hsl(args, alpha=1.)
+            elif name == 'hsla':
+                alpha = parse_alpha(args[3:])
+                if alpha is not None:
+                    return parse_hsl(args[:3], alpha)
+
+
+def parse_alpha(args):
+    """
+    If args is a list of a single INTEGER or NUMBER token,
+    return its value clipped to the 0..1 range.
+    Otherwise, return None.
+    """
+    if len(args) == 1 and args[0].type in ('NUMBER', 'INTEGER'):
+        return min(1, max(0, args[0].value))
+
+
+def parse_rgb(args, alpha):
+    """
+    If args is a list of 3 INTEGER tokens or 3 PERCENTAGE tokens,
+    return RGB values as a tuple of 3 floats in 0..1.
+    Otherwise, return None.
+    """
+    types = [arg.type for arg in args]
+    if types == ['INTEGER', 'INTEGER', 'INTEGER']:
+        r, g, b = [arg.value / 255 for arg in args[:3]]
+        return RGBA(r, g, b, alpha)
+    elif types == ['PERCENTAGE', 'PERCENTAGE', 'PERCENTAGE']:
+        r, g, b = [arg.value / 100 for arg in args[:3]]
+        return RGBA(r, g, b, alpha)
+
+
+def parse_hsl(args, alpha):
+    """
+    If args is a list of 1 INTEGER token and 2 PERCENTAGE tokens,
+    return RGB values as a tuple of 3 floats in 0..1.
+    Otherwise, return None.
+    """
+    types = [arg.type for arg in args]
+    if types == ['INTEGER', 'PERCENTAGE', 'PERCENTAGE']:
+        hsl = [arg.value for arg in args[:3]]
+        r, g, b = hsl_to_rgb(*hsl)
+        return RGBA(r, g, b, alpha)
+
+
+def hsl_to_rgb(hue, saturation, lightness):
+    """
+    :param hue: degrees
+    :param saturation: percentage
+    :param lightness: percentage
+    :returns: (r, g, b) as floats in the 0..1 range
+    """
+    hue = (hue / 360) % 1
+    saturation = min(1, max(0, saturation / 100))
+    lightness = min(1, max(0, lightness / 100))
+
+    # Translated from ABC: http://www.w3.org/TR/css3-color/#hsl-color
+    def hue_to_rgb(m1, m2, h):
+        if h < 0:
+            h += 1
+        if h > 1:
+            h -= 1
+        if h * 6 < 1:
+            return m1 + (m2 - m1) * h * 6
+        if h * 2 < 1:
+            return m2
+        if h * 3 < 2:
+            return m1 + (m2 - m1) * (2 / 3 - h) * 6
+        return m1
+
+    if lightness <= 0.5:
+        m2 = lightness * (saturation + 1)
+    else:
+        m2 = lightness + saturation - lightness * saturation
+    m1 = lightness * 2 - m2
+    return (
+        hue_to_rgb(m1, m2, hue + 1 / 3),
+        hue_to_rgb(m1, m2, hue),
+        hue_to_rgb(m1, m2, hue - 1 / 3),
+    )
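A sketch of what the helpers above return (the input strings are invented):

```python
from tinycss.color3 import parse_color_string

parse_color_string('#f80')                 # RGBA(1.0, 0.533..., 0.0, 1.0)
parse_color_string('hsl(120, 100%, 50%)')  # RGBA(0.0, 1.0, 0.0, 1.0)
parse_color_string('rgba(0, 0, 0, 0.5)')   # half-transparent black
parse_color_string('currentColor')         # the string 'currentColor'
parse_color_string('no-such-color')        # None, no exception raised
```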
+def parse_comma_separated(tokens):
+    """Parse a list of tokens (typically the content of a function token)
+    as arguments made of a single token each, separated by mandatory commas,
+    with optional white space around each argument.
+
+    Return the argument list without commas or white space,
+    or None if the function token content does not match the description
+    above.
+
+    """
+    tokens = [token for token in tokens if token.type != 'S']
+    if not tokens:
+        return []
+    if len(tokens) % 2 == 1 and all(
+            token.type == 'DELIM' and token.value == ','
+            for token in tokens[1::2]):
+        return tokens[::2]
+
+
+HASH_REGEXPS = (
+    (2, re.compile(r'^#([\da-f])([\da-f])([\da-f])$', re.I).match),
+    (1, re.compile(r'^#([\da-f]{2})([\da-f]{2})([\da-f]{2})$', re.I).match),
+)
+
+
+# (r, g, b) in 0..255
+BASIC_COLOR_KEYWORDS = [
+    ('black', (0, 0, 0)),
+    ('silver', (192, 192, 192)),
+    ('gray', (128, 128, 128)),
+    ('white', (255, 255, 255)),
+    ('maroon', (128, 0, 0)),
+    ('red', (255, 0, 0)),
+    ('purple', (128, 0, 128)),
+    ('fuchsia', (255, 0, 255)),
+    ('green', (0, 128, 0)),
+    ('lime', (0, 255, 0)),
+    ('olive', (128, 128, 0)),
+    ('yellow', (255, 255, 0)),
+    ('navy', (0, 0, 128)),
+    ('blue', (0, 0, 255)),
+    ('teal', (0, 128, 128)),
+    ('aqua', (0, 255, 255)),
+]
+
+
+# (r, g, b) in 0..255
+EXTENDED_COLOR_KEYWORDS = [
+    ('aliceblue', (240, 248, 255)),
+    ('antiquewhite', (250, 235, 215)),
+    ('aqua', (0, 255, 255)),
+    ('aquamarine', (127, 255, 212)),
+    ('azure', (240, 255, 255)),
+    ('beige', (245, 245, 220)),
+    ('bisque', (255, 228, 196)),
+    ('black', (0, 0, 0)),
+    ('blanchedalmond', (255, 235, 205)),
+    ('blue', (0, 0, 255)),
+    ('blueviolet', (138, 43, 226)),
+    ('brown', (165, 42, 42)),
+    ('burlywood', (222, 184, 135)),
+    ('cadetblue', (95, 158, 160)),
+    ('chartreuse', (127, 255, 0)),
+    ('chocolate', (210, 105, 30)),
+    ('coral', (255, 127, 80)),
+    ('cornflowerblue', (100, 149, 237)),
+    ('cornsilk', (255, 248, 220)),
+    ('crimson', (220, 20, 60)),
+    ('cyan', (0, 255, 255)),
+    ('darkblue', (0, 0, 139)),
+    ('darkcyan', (0, 139, 139)),
+    ('darkgoldenrod', (184, 134, 11)),
+    ('darkgray', (169, 169, 169)),
+    ('darkgreen', (0, 100, 0)),
+    ('darkgrey', (169, 169, 169)),
+    ('darkkhaki', (189, 183, 107)),
+    ('darkmagenta', (139, 0, 139)),
+    ('darkolivegreen', (85, 107, 47)),
+    ('darkorange', (255, 140, 0)),
+    ('darkorchid', (153, 50, 204)),
+    ('darkred', (139, 0, 0)),
+    ('darksalmon', (233, 150, 122)),
+    ('darkseagreen', (143, 188, 143)),
+    ('darkslateblue', (72, 61, 139)),
+    ('darkslategray', (47, 79, 79)),
+    ('darkslategrey', (47, 79, 79)),
+    ('darkturquoise', (0, 206, 209)),
+    ('darkviolet', (148, 0, 211)),
+    ('deeppink', (255, 20, 147)),
+    ('deepskyblue', (0, 191, 255)),
+    ('dimgray', (105, 105, 105)),
+    ('dimgrey', (105, 105, 105)),
+    ('dodgerblue', (30, 144, 255)),
+    ('firebrick', (178, 34, 34)),
+    ('floralwhite', (255, 250, 240)),
+    ('forestgreen', (34, 139, 34)),
+    ('fuchsia', (255, 0, 255)),
+    ('gainsboro', (220, 220, 220)),
+    ('ghostwhite', (248, 248, 255)),
+    ('gold', (255, 215, 0)),
+    ('goldenrod', (218, 165, 32)),
+    ('gray', (128, 128, 128)),
+    ('green', (0, 128, 0)),
+    ('greenyellow', (173, 255, 47)),
+    ('grey', (128, 128, 128)),
+    ('honeydew', (240, 255, 240)),
+    ('hotpink', (255, 105, 180)),
+    ('indianred', (205, 92, 92)),
+    ('indigo', (75, 0, 130)),
+    ('ivory', (255, 255, 240)),
+    ('khaki', (240, 230, 140)),
+    ('lavender', (230, 230, 250)),
+    ('lavenderblush', (255, 240, 245)),
+    ('lawngreen', (124, 252, 0)),
+    ('lemonchiffon', (255, 250, 205)),
+    ('lightblue', (173, 216, 230)),
+    ('lightcoral', (240, 128, 128)),
+    ('lightcyan', (224, 255, 255)),
+    ('lightgoldenrodyellow', (250, 250, 210)),
+    ('lightgray', (211, 211, 211)),
+    ('lightgreen', (144, 238, 144)),
+    ('lightgrey', (211, 211, 211)),
+    ('lightpink', (255, 182, 193)),
+    ('lightsalmon', (255, 160, 122)),
+    ('lightseagreen', (32, 178, 170)),
+    ('lightskyblue', (135, 206, 250)),
+    ('lightslategray', (119, 136, 153)),
('lightslategrey', (119, 136, 153)), + ('lightsteelblue', (176, 196, 222)), + ('lightyellow', (255, 255, 224)), + ('lime', (0, 255, 0)), + ('limegreen', (50, 205, 50)), + ('linen', (250, 240, 230)), + ('magenta', (255, 0, 255)), + ('maroon', (128, 0, 0)), + ('mediumaquamarine', (102, 205, 170)), + ('mediumblue', (0, 0, 205)), + ('mediumorchid', (186, 85, 211)), + ('mediumpurple', (147, 112, 219)), + ('mediumseagreen', (60, 179, 113)), + ('mediumslateblue', (123, 104, 238)), + ('mediumspringgreen', (0, 250, 154)), + ('mediumturquoise', (72, 209, 204)), + ('mediumvioletred', (199, 21, 133)), + ('midnightblue', (25, 25, 112)), + ('mintcream', (245, 255, 250)), + ('mistyrose', (255, 228, 225)), + ('moccasin', (255, 228, 181)), + ('navajowhite', (255, 222, 173)), + ('navy', (0, 0, 128)), + ('oldlace', (253, 245, 230)), + ('olive', (128, 128, 0)), + ('olivedrab', (107, 142, 35)), + ('orange', (255, 165, 0)), + ('orangered', (255, 69, 0)), + ('orchid', (218, 112, 214)), + ('palegoldenrod', (238, 232, 170)), + ('palegreen', (152, 251, 152)), + ('paleturquoise', (175, 238, 238)), + ('palevioletred', (219, 112, 147)), + ('papayawhip', (255, 239, 213)), + ('peachpuff', (255, 218, 185)), + ('peru', (205, 133, 63)), + ('pink', (255, 192, 203)), + ('plum', (221, 160, 221)), + ('powderblue', (176, 224, 230)), + ('purple', (128, 0, 128)), + ('red', (255, 0, 0)), + ('rosybrown', (188, 143, 143)), + ('royalblue', (65, 105, 225)), + ('saddlebrown', (139, 69, 19)), + ('salmon', (250, 128, 114)), + ('sandybrown', (244, 164, 96)), + ('seagreen', (46, 139, 87)), + ('seashell', (255, 245, 238)), + ('sienna', (160, 82, 45)), + ('silver', (192, 192, 192)), + ('skyblue', (135, 206, 235)), + ('slateblue', (106, 90, 205)), + ('slategray', (112, 128, 144)), + ('slategrey', (112, 128, 144)), + ('snow', (255, 250, 250)), + ('springgreen', (0, 255, 127)), + ('steelblue', (70, 130, 180)), + ('tan', (210, 180, 140)), + ('teal', (0, 128, 128)), + ('thistle', (216, 191, 216)), + ('tomato', (255, 99, 71)), + ('turquoise', (64, 224, 208)), + ('violet', (238, 130, 238)), + ('wheat', (245, 222, 179)), + ('white', (255, 255, 255)), + ('whitesmoke', (245, 245, 245)), + ('yellow', (255, 255, 0)), + ('yellowgreen', (154, 205, 50)), +] + + +# (r, g, b, a) in 0..1 or a string marker +SPECIAL_COLOR_KEYWORDS = { + 'currentcolor': 'currentColor', + 'transparent': RGBA(0., 0., 0., 0.), +} + + +# RGBA namedtuples of (r, g, b, a) in 0..1 or a string marker +COLOR_KEYWORDS = SPECIAL_COLOR_KEYWORDS.copy() +COLOR_KEYWORDS.update( + # 255 maps to 1, 0 to 0, the rest is linear. + (keyword, RGBA(r / 255., g / 255., b / 255., 1.)) + for keyword, (r, g, b) in itertools.chain( + BASIC_COLOR_KEYWORDS, EXTENDED_COLOR_KEYWORDS)) diff --git a/src/tinycss/css21.py b/src/tinycss/css21.py new file mode 100644 index 000000000000..51e6529226f7 --- /dev/null +++ b/src/tinycss/css21.py @@ -0,0 +1,815 @@ +# coding: utf8 +""" + tinycss.css21 + ------------- + + Parser for CSS 2.1 + http://www.w3.org/TR/CSS21/syndata.html + + :copyright: (c) 2012 by Simon Sapin. + :license: BSD, see LICENSE for more details. 
+""" + +from __future__ import unicode_literals +from itertools import chain, islice + +from .decoding import decode +from .token_data import TokenList +from .tokenizer import tokenize_grouped +from .parsing import (strip_whitespace, remove_whitespace, split_on_comma, + validate_value, validate_block, validate_any, ParseError) + + +# stylesheet : [ CDO | CDC | S | statement ]*; +# statement : ruleset | at-rule; +# at-rule : ATKEYWORD S* any* [ block | ';' S* ]; +# block : '{' S* [ any | block | ATKEYWORD S* | ';' S* ]* '}' S*; +# ruleset : selector? '{' S* declaration? [ ';' S* declaration? ]* '}' S*; +# selector : any+; +# declaration : property S* ':' S* value; +# property : IDENT; +# value : [ any | block | ATKEYWORD S* ]+; +# any : [ IDENT | NUMBER | PERCENTAGE | DIMENSION | STRING +# | DELIM | URI | HASH | UNICODE-RANGE | INCLUDES +# | DASHMATCH | ':' | FUNCTION S* [any|unused]* ')' +# | '(' S* [any|unused]* ')' | '[' S* [any|unused]* ']' +# ] S*; +# unused : block | ATKEYWORD S* | ';' S* | CDO S* | CDC S*; + + +class Stylesheet(object): + """ + A parsed CSS stylesheet. + + .. attribute:: rules + + A mixed list, in source order, of :class:`RuleSet` and various + at-rules such as :class:`ImportRule`, :class:`MediaRule` + and :class:`PageRule`. + Use their :obj:`at_keyword` attribute to distinguish them. + + .. attribute:: errors + + A list of :class:`~.parsing.ParseError`. Invalid rules and declarations + are ignored, with the details logged in this list. + + .. attribute:: encoding + + The character encoding that was used to decode the stylesheet + from bytes, or ``None`` for Unicode stylesheets. + + """ + def __init__(self, rules, errors, encoding): + self.rules = rules + self.errors = errors + self.encoding = encoding + + def __repr__(self): + return '<{0.__class__.__name__} {1} rules {2} errors>'.format( + self, len(self.rules), len(self.errors)) + + +class AtRule(object): + """ + An unparsed at-rule. + + .. attribute:: at_keyword + + The normalized (lower-case) at-keyword as a string. Eg: ``'@page'`` + + .. attribute:: head + + The part of the at-rule between the at-keyword and the ``{`` + marking the body, or the ``;`` marking the end of an at-rule without + a body. A :class:`~.token_data.TokenList`. + + .. attribute:: body + + The content of the body between ``{`` and ``}`` as a + :class:`~.token_data.TokenList`, or ``None`` if there is no body + (ie. if the rule ends with ``;``). + + The head was validated against the core grammar but **not** the body, + as the body might contain declarations. In case of an error in a + declaration, parsing should continue from the next declaration. + The whole rule should not be ignored as it would be for an error + in the head. + + These at-rules are expected to be parsed further before reaching + the user API. + + """ + def __init__(self, at_keyword, head, body, line, column): + self.at_keyword = at_keyword + self.head = TokenList(head) + self.body = TokenList(body) if body is not None else body + self.line = line + self.column = column + + def __repr__(self): + return ('<{0.__class__.__name__} {0.line}:{0.column} {0.at_keyword}>' + .format(self)) + + +class RuleSet(object): + """A ruleset. + + .. attribute:: at_keyword + + Always ``None``. Helps to tell rulesets apart from at-rules. + + .. attribute:: selector + + The selector as a :class:`~.token_data.TokenList`. + In CSS 3, this is actually called a selector group. + + ``rule.selector.as_css()`` gives the selector as a string. 
+        This string can be used with *cssselect*, see :ref:`selectors3`.
+
+    .. attribute:: declarations
+
+        The list of :class:`Declaration`, in source order.
+
+    """
+
+    at_keyword = None
+
+    def __init__(self, selector, declarations, line, column):
+        self.selector = TokenList(selector)
+        self.declarations = declarations
+        self.line = line
+        self.column = column
+
+    def __repr__(self):
+        return ('<{0.__class__.__name__} at {0.line}:{0.column} {1}>'
+                .format(self, self.selector.as_css()))
+
+
+class Declaration(object):
+    """A property declaration.
+
+    .. attribute:: name
+
+        The property name as a normalized (lower-case) string.
+
+    .. attribute:: value
+
+        The property value as a :class:`~.token_data.TokenList`.
+
+        The value is not parsed. UAs using tinycss may only support
+        some properties or some values and tinycss does not know which.
+        They need to parse values themselves and ignore declarations with
+        unknown or unsupported properties or values, and fall back
+        on any previous declaration.
+
+        :mod:`tinycss.color3` parses color values, but other values
+        will need specific parsing/validation code.
+
+    .. attribute:: priority
+
+        Either the string ``'important'`` or ``None``.
+
+    """
+    def __init__(self, name, value, priority, line, column):
+        self.name = name
+        self.value = TokenList(value)
+        self.priority = priority
+        self.line = line
+        self.column = column
+
+    def __repr__(self):
+        priority = ' !' + self.priority if self.priority else ''
+        return ('<{0.__class__.__name__} {0.line}:{0.column}'
+                ' {0.name}: {1}{2}>'.format(
+                    self, self.value.as_css(), priority))
+
+
+class PageRule(object):
+    """A parsed CSS 2.1 @page rule.
+
+    .. attribute:: at_keyword
+
+        Always ``'@page'``
+
+    .. attribute:: selector
+
+        The page selector.
+        In CSS 2.1 this is either ``None`` (no selector), or the string
+        ``'first'``, ``'left'`` or ``'right'`` for the pseudo class
+        of the same name.
+
+    .. attribute:: specificity
+
+        Specificity of the page selector. This is a tuple of integers,
+        but these tuples are mostly meant to be compared to each other.
+
+    .. attribute:: declarations
+
+        A list of :class:`Declaration`, in source order.
+
+    .. attribute:: at_rules
+
+        The list of parsed at-rules inside the @page block, in source order.
+        Always empty for CSS 2.1.
+
+    """
+    at_keyword = '@page'
+
+    def __init__(self, selector, specificity, declarations, at_rules,
+                 line, column):
+        self.selector = selector
+        self.specificity = specificity
+        self.declarations = declarations
+        self.at_rules = at_rules
+        self.line = line
+        self.column = column
+
+    def __repr__(self):
+        return ('<{0.__class__.__name__} {0.line}:{0.column}'
+                ' {0.selector}>'.format(self))
+
+
+class MediaRule(object):
+    """A parsed @media rule.
+
+    .. attribute:: at_keyword
+
+        Always ``'@media'``
+
+    .. attribute:: media
+
+        For CSS 2.1 without media queries: the media types
+        as a list of strings.
+
+    .. attribute:: rules
+
+        The list of :class:`RuleSet` and various at-rules inside the @media
+        block, in source order.
+
+    """
+    at_keyword = '@media'
+
+    def __init__(self, media, rules, line, column):
+        self.media = media
+        self.rules = rules
+        self.line = line
+        self.column = column
+
+    def __repr__(self):
+        return ('<{0.__class__.__name__} {0.line}:{0.column}'
+                ' {0.media}>'.format(self))
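A sketch of how these rule objects are told apart when walking a parsed stylesheet (invented stylesheet text; `ImportRule` follows below):

```python
from tinycss import make_parser

sheet = make_parser().parse_stylesheet(
    '@media print { .article { color: black } }\n'
    'h1 { font-weight: bold !important }')
for rule in sheet.rules:
    if rule.at_keyword is None:          # a RuleSet
        print(rule.selector.as_css(),
              [(d.name, d.priority) for d in rule.declarations])
    elif rule.at_keyword == '@media':    # a MediaRule
        print(rule.media, 'with', len(rule.rules), 'nested rule(s)')
```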
+class ImportRule(object):
+    """A parsed @import rule.
+
+    .. attribute:: at_keyword
+
+        Always ``'@import'``
+
+    .. attribute:: uri
+
+        The URI to be imported, as read from the stylesheet.
+        (URIs are not made absolute.)
+
+    .. attribute:: media
+
+        For CSS 2.1 without media queries: the media types
+        as a list of strings.
+        This attribute is explicitly ``['all']`` if the media was omitted
+        in the source.
+
+    """
+    at_keyword = '@import'
+
+    def __init__(self, uri, media, line, column):
+        self.uri = uri
+        self.media = media
+        self.line = line
+        self.column = column
+
+    def __repr__(self):
+        return ('<{0.__class__.__name__} {0.line}:{0.column}'
+                ' {0.uri}>'.format(self))
+
+
+def _remove_at_charset(tokens):
+    """Remove any valid @charset at the beginning of a token stream.
+
+    :param tokens:
+        An iterable of tokens
+    :returns:
+        A possibly truncated iterable of tokens
+
+    """
+    tokens = iter(tokens)
+    header = list(islice(tokens, 4))
+    if [t.type for t in header] == ['ATKEYWORD', 'S', 'STRING', ';']:
+        atkw, space, string, semicolon = header
+        if ((atkw.value, space.value) == ('@charset', ' ')
+                and string.as_css()[0] == '"'):
+            # Found a valid @charset rule, only keep what’s after it.
+            return tokens
+    return chain(header, tokens)
+
+
+class CSS21Parser(object):
+    """Parser for CSS 2.1
+
+    This parser supports the core CSS syntax as well as @import, @media,
+    @page and !important.
+
+    Note that property values are still not parsed, as UAs using this
+    parser may only support some properties or some values.
+
+    Currently the parser holds no state; it is a class only to allow
+    subclassing and overriding its methods.
+
+    """
+
+    # User API:
+
+    def parse_stylesheet_file(self, css_file, protocol_encoding=None,
+                              linking_encoding=None, document_encoding=None):
+        """Parse a stylesheet from a file or filename.
+
+        Character encoding-related parameters and behavior are the same
+        as in :meth:`parse_stylesheet_bytes`.
+
+        :param css_file:
+            Either a file (any object with a :meth:`~file.read` method)
+            or a filename.
+        :return:
+            A :class:`Stylesheet`.
+
+        """
+        if hasattr(css_file, 'read'):
+            css_bytes = css_file.read()
+        else:
+            with open(css_file, 'rb') as fd:
+                css_bytes = fd.read()
+        return self.parse_stylesheet_bytes(css_bytes, protocol_encoding,
+                                           linking_encoding,
+                                           document_encoding)
+
+    def parse_stylesheet_bytes(self, css_bytes, protocol_encoding=None,
+                               linking_encoding=None,
+                               document_encoding=None):
+        """Parse a stylesheet from a byte string.
+
+        The character encoding is determined from the passed metadata and the
+        ``@charset`` rule in the stylesheet (if any).
+        If no encoding information is available or decoding fails,
+        decoding defaults to UTF-8 and then falls back on ISO-8859-1.
+
+        :param css_bytes:
+            A CSS stylesheet as a byte string.
+        :param protocol_encoding:
+            The "charset" parameter of a "Content-Type" HTTP header (if any),
+            or similar metadata for other protocols.
+        :param linking_encoding:
+            ``<link charset="">`` or other metadata from the linking
+            mechanism (if any)
+        :param document_encoding:
+            Encoding of the referring style sheet or document (if any)
+        :return:
+            A :class:`Stylesheet`.
+
+        """
+        css_unicode, encoding = decode(css_bytes, protocol_encoding,
+                                       linking_encoding, document_encoding)
+        return self.parse_stylesheet(css_unicode, encoding=encoding)
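A sketch of the byte-oriented entry point (the bytes are invented; the sniffing itself lives in `tinycss.decoding.decode`, later in this patch):

```python
from tinycss import make_parser

css = '@charset "utf-8";\np:before { content: "café" }'.encode('utf-8')
sheet = make_parser().parse_stylesheet_bytes(css)
print(sheet.encoding)   # 'utf-8', sniffed from the @charset rule
```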
+    def parse_stylesheet(self, css_unicode, encoding=None):
+        """Parse a stylesheet from a Unicode string.
+
+        :param css_unicode:
+            A CSS stylesheet as a Unicode string.
+        :param encoding:
+            The character encoding used to decode the stylesheet from bytes,
+            if any.
+        :return:
+            A :class:`Stylesheet`.
+
+        """
+        tokens = tokenize_grouped(css_unicode)
+        if encoding:
+            tokens = _remove_at_charset(tokens)
+        rules, errors = self.parse_rules(tokens, context='stylesheet')
+        return Stylesheet(rules, errors, encoding)
+
+    def parse_style_attr(self, css_source):
+        """Parse a "style" attribute (eg. of an HTML element).
+
+        This method only accepts Unicode, as the source (HTML) document
+        is supposed to handle the character encoding.
+
+        :param css_source:
+            The attribute value, as a Unicode string.
+        :return:
+            A tuple of the list of valid :class:`Declaration` and
+            a list of :class:`~.parsing.ParseError`.
+        """
+        return self.parse_declaration_list(tokenize_grouped(css_source))
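Sketch of `parse_style_attr` on an invented attribute value; note how the broken declaration is reported rather than raised:

```python
from tinycss import make_parser

declarations, errors = make_parser().parse_style_attr(
    'color: red; margin: 0 auto; bogus')
print([(d.name, d.value.as_css()) for d in declarations])
# [('color', 'red'), ('margin', '0 auto')]
print(errors)   # one ParseError for 'bogus' (no ':' found)
```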
+    # API for subclasses:
+
+    def parse_rules(self, tokens, context):
+        """Parse a sequence of rules (rulesets and at-rules).
+
+        :param tokens:
+            An iterable of tokens.
+        :param context:
+            Either ``'stylesheet'`` or an at-keyword such as ``'@media'``.
+            (Most at-rules are only allowed in some contexts.)
+        :return:
+            A tuple of a list of parsed rules and a list of
+            :class:`~.parsing.ParseError`.
+
+        """
+        rules = []
+        errors = []
+        tokens = iter(tokens)
+        for token in tokens:
+            if token.type not in ('S', 'CDO', 'CDC'):
+                try:
+                    if token.type == 'ATKEYWORD':
+                        rule = self.read_at_rule(token, tokens)
+                        result = self.parse_at_rule(
+                            rule, rules, errors, context)
+                        rules.append(result)
+                    else:
+                        rule, rule_errors = self.parse_ruleset(token, tokens)
+                        rules.append(rule)
+                        errors.extend(rule_errors)
+                except ParseError as exc:
+                    errors.append(exc)
+                    # Skip the entire rule
+        return rules, errors
+
+    def read_at_rule(self, at_keyword_token, tokens):
+        """Read an at-rule from a token stream.
+
+        :param at_keyword_token:
+            The ATKEYWORD token that starts this at-rule.
+            You may have read it already to distinguish the rule
+            from a ruleset.
+        :param tokens:
+            An iterator of subsequent tokens. Will be consumed just enough
+            for one at-rule.
+        :return:
+            An unparsed :class:`AtRule`.
+        :raises:
+            :class:`~.parsing.ParseError` if the head is invalid for the core
+            grammar. The body is **not** validated. See :class:`AtRule`.
+
+        """
+        # CSS syntax is case-insensitive
+        at_keyword = at_keyword_token.value.lower()
+        head = []
+        # For the ParseError in case `tokens` is empty:
+        token = at_keyword_token
+        for token in tokens:
+            if token.type in '{;':
+                break
+            # Ignore white space just after the at-keyword.
+            else:
+                head.append(token)
+        # On unexpected end of stylesheet, pretend that a ';' was there
+        head = strip_whitespace(head)
+        for head_token in head:
+            validate_any(head_token, 'at-rule head')
+        body = token.content if token.type == '{' else None
+        return AtRule(at_keyword, head, body,
+                      at_keyword_token.line, at_keyword_token.column)
+
+    def parse_at_rule(self, rule, previous_rules, errors, context):
+        """Parse an at-rule.
+
+        Subclasses that override this method must use ``super()`` and
+        pass its return value for at-rules they do not know.
+
+        In CSS 2.1, this method handles @charset, @import, @media and @page
+        rules.
+
+        :param rule:
+            An unparsed :class:`AtRule`.
+        :param previous_rules:
+            The list of at-rules and rulesets that have been parsed so far
+            in this context. This list can be used to decide if the current
+            rule is valid. (For example, @import rules are only allowed
+            before anything but a @charset rule.)
+        :param context:
+            Either ``'stylesheet'`` or an at-keyword such as ``'@media'``.
+            (Most at-rules are only allowed in some contexts.)
+        :raises:
+            :class:`~.parsing.ParseError` if the rule is invalid.
+        :return:
+            A parsed at-rule
+
+        """
+        if rule.at_keyword == '@page':
+            if context != 'stylesheet':
+                raise ParseError(rule, '@page rule not allowed in ' + context)
+            selector, specificity = self.parse_page_selector(rule.head)
+            if rule.body is None:
+                raise ParseError(rule,
+                    'invalid {0} rule: missing block'.format(rule.at_keyword))
+            declarations, at_rules, rule_errors = \
+                self.parse_declarations_and_at_rules(rule.body, '@page')
+            errors.extend(rule_errors)
+            return PageRule(selector, specificity, declarations, at_rules,
+                            rule.line, rule.column)
+
+        elif rule.at_keyword == '@media':
+            if context != 'stylesheet':
+                raise ParseError(rule, '@media rule not allowed in ' + context)
+            if not rule.head:
+                raise ParseError(rule, 'expected media types for @media')
+            media = self.parse_media(rule.head)
+            if rule.body is None:
+                raise ParseError(rule,
+                    'invalid {0} rule: missing block'.format(rule.at_keyword))
+            rules, rule_errors = self.parse_rules(rule.body, '@media')
+            errors.extend(rule_errors)
+            return MediaRule(media, rules, rule.line, rule.column)
+
+        elif rule.at_keyword == '@import':
+            if context != 'stylesheet':
+                raise ParseError(rule,
+                    '@import rule not allowed in ' + context)
+            for previous_rule in previous_rules:
+                if previous_rule.at_keyword not in ('@charset', '@import'):
+                    if previous_rule.at_keyword:
+                        type_ = 'an {0} rule'.format(previous_rule.at_keyword)
+                    else:
+                        type_ = 'a ruleset'
+                    raise ParseError(previous_rule,
+                        '@import rule not allowed after ' + type_)
+            head = rule.head
+            if not head:
+                raise ParseError(rule,
+                    'expected URI or STRING for @import rule')
+            if head[0].type not in ('URI', 'STRING'):
+                raise ParseError(rule,
+                    'expected URI or STRING for @import rule, got ' +
+                    head[0].type)
+            uri = head[0].value
+            media = self.parse_media(strip_whitespace(head[1:]))
+            if rule.body is not None:
+                # The position of the ';' token would be best, but we don’t
+                # have it anymore here.
+                raise ParseError(head[-1], "expected ';', got a block")
+            return ImportRule(uri, media, rule.line, rule.column)
+
+        elif rule.at_keyword == '@charset':
+            raise ParseError(rule, 'mis-placed or malformed @charset rule')
+
+        else:
+            raise ParseError(rule, 'unknown at-rule in {0} context: {1}'
+                             .format(context, rule.at_keyword))
+
+    def parse_media(self, tokens):
+        """For CSS 2.1, parse a list of media types.
+
+        Media Queries are expected to override this.
+
+        :param tokens:
+            A list of tokens
+        :raises:
+            :class:`~.parsing.ParseError` on invalid media types/queries
+        :returns:
+            For CSS 2.1, a list of media types as strings
+        """
+        if not tokens:
+            return ['all']
+        media_types = []
+        for part in split_on_comma(remove_whitespace(tokens)):
+            types = [token.type for token in part]
+            if types == ['IDENT']:
+                media_types.append(part[0].value)
+            else:
+                raise ParseError(tokens[0], 'expected a media type' +
+                    ((', got ' + ', '.join(types)) if types else ''))
+        return media_types
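The `parse_at_rule` contract above is the extension point for custom at-rules. A minimal sketch, with an invented `@snippet` rule (not part of any CSS spec):

```python
from tinycss.css21 import CSS21Parser
from tinycss.parsing import ParseError

class SnippetRule(object):
    at_keyword = '@snippet'            # hypothetical rule, for illustration
    def __init__(self, body, line, column):
        self.body = body
        self.line, self.column = line, column

class SnippetParser(CSS21Parser):
    def parse_at_rule(self, rule, previous_rules, errors, context):
        if rule.at_keyword == '@snippet':
            if rule.body is None:
                raise ParseError(rule, 'expected a block for @snippet')
            return SnippetRule(rule.body, rule.line, rule.column)
        # Defer everything else, as the docstring requires:
        return super(SnippetParser, self).parse_at_rule(
            rule, previous_rules, errors, context)
```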
+    def parse_page_selector(self, tokens):
+        """Parse an @page selector.
+
+        :param tokens:
+            An iterable of tokens, typically from the ``head`` attribute of
+            an unparsed :class:`AtRule`.
+        :returns:
+            A ``(selector, specificity)`` tuple. For CSS 2.1, the selector
+            is ``'first'``, ``'left'``, ``'right'`` or ``None``.
+        :raises:
+            :class:`~.parsing.ParseError` on invalid selectors
+
+        """
+        if not tokens:
+            return None, (0, 0)
+        if (len(tokens) == 2 and tokens[0].type == ':'
+                and tokens[1].type == 'IDENT'):
+            pseudo_class = tokens[1].value
+            specificity = {
+                'first': (1, 0), 'left': (0, 1), 'right': (0, 1),
+            }.get(pseudo_class)
+            if specificity:
+                return pseudo_class, specificity
+        raise ParseError(tokens[0], 'invalid @page selector')
+
+    def parse_declarations_and_at_rules(self, tokens, context):
+        """Parse a mixed list of declarations and at rules, as found eg.
+        in the body of an @page rule.
+
+        Note that to add supported at-rules inside @page,
+        :class:`~.page3.CSSPage3Parser` extends :meth:`parse_at_rule`,
+        not this method.
+
+        :param tokens:
+            An iterable of tokens, typically from the ``body`` attribute of
+            an unparsed :class:`AtRule`.
+        :param context:
+            An at-keyword such as ``'@page'``.
+            (Most at-rules are only allowed in some contexts.)
+        :returns:
+            A tuple of:
+
+            * A list of :class:`Declaration`
+            * A list of parsed at-rules (empty for CSS 2.1)
+            * A list of :class:`~.parsing.ParseError`
+
+        """
+        at_rules = []
+        declarations = []
+        errors = []
+        tokens = iter(tokens)
+        for token in tokens:
+            if token.type == 'ATKEYWORD':
+                try:
+                    rule = self.read_at_rule(token, tokens)
+                    result = self.parse_at_rule(
+                        rule, at_rules, errors, context)
+                    at_rules.append(result)
+                except ParseError as err:
+                    errors.append(err)
+            elif token.type != 'S':
+                declaration_tokens = []
+                while token and token.type != ';':
+                    declaration_tokens.append(token)
+                    token = next(tokens, None)
+                if declaration_tokens:
+                    try:
+                        declarations.append(
+                            self.parse_declaration(declaration_tokens))
+                    except ParseError as err:
+                        errors.append(err)
+        return declarations, at_rules, errors
+
+    def parse_ruleset(self, first_token, tokens):
+        """Parse a ruleset: a selector followed by a declaration block.
+
+        :param first_token:
+            The first token of the ruleset (probably of the selector).
+            You may have read it already to distinguish the rule
+            from an at-rule.
+        :param tokens:
+            an iterator of subsequent tokens. Will be consumed just enough
+            for one ruleset.
+        :return:
+            a tuple of a :class:`RuleSet` and an error list.
+            The errors are recovered :class:`~.parsing.ParseError` in
+            declarations. (Parsing continues from the next declaration on
+            such errors.)
+        :raises:
+            :class:`~.parsing.ParseError` if the selector is invalid for the
+            core grammar.
+            Note that a selector can be valid for the core grammar but
+            not for CSS 2.1 or another level.
+
+        """
+        selector = []
+        for token in chain([first_token], tokens):
+            if token.type == '{':
+                # Parse/validate once we’ve read the whole rule
+                selector = strip_whitespace(selector)
+                if not selector:
+                    raise ParseError(first_token, 'empty selector')
+                for selector_token in selector:
+                    validate_any(selector_token, 'selector')
+                declarations, errors = self.parse_declaration_list(
+                    token.content)
+                ruleset = RuleSet(selector, declarations,
+                                  first_token.line, first_token.column)
+                return ruleset, errors
+            else:
+                selector.append(token)
+        raise ParseError(token, 'no declaration block found for ruleset')
+
+    def parse_declaration_list(self, tokens):
+        """Parse a ``;`` separated declaration list.
+
+        You may want to use :meth:`parse_declarations_and_at_rules` (or
+        some other method that uses :func:`parse_declaration` directly)
+        instead if you have not just declarations in the same context.
+
+        :param tokens:
+            an iterable of tokens.
Should stop at (before) the end + of the block, as marked by ``}``. + :return: + a tuple of the list of valid :class:`Declaration` and a list + of :class:`~.parsing.ParseError` + + """ + # split at ';' + parts = [] + this_part = [] + for token in tokens: + if token.type == ';': + parts.append(this_part) + this_part = [] + else: + this_part.append(token) + parts.append(this_part) + + declarations = [] + errors = [] + for tokens in parts: + tokens = strip_whitespace(tokens) + if tokens: + try: + declarations.append(self.parse_declaration(tokens)) + except ParseError as exc: + errors.append(exc) + # Skip the entire declaration + return declarations, errors + + def parse_declaration(self, tokens): + """Parse a single declaration. + + :param tokens: + an iterable of at least one token. Should stop at (before) + the end of the declaration, as marked by a ``;`` or ``}``. + Empty declarations (ie. consecutive ``;`` with only white space + in-between) should be skipped earlier and not passed to + this method. + :returns: + a :class:`Declaration` + :raises: + :class:`~.parsing.ParseError` if the tokens do not match the + 'declaration' production of the core grammar. + + """ + tokens = iter(tokens) + + name_token = next(tokens) # assume there is at least one + if name_token.type == 'IDENT': + # CSS syntax is case-insensitive + property_name = name_token.value.lower() + else: + raise ParseError(name_token, + 'expected a property name, got {0}'.format(name_token.type)) + + token = name_token # In case ``tokens`` is now empty + for token in tokens: + if token.type == ':': + break + elif token.type != 'S': + raise ParseError( + token, "expected ':', got {0}".format(token.type)) + else: + raise ParseError(token, "expected ':'") + + value = strip_whitespace(list(tokens)) + if not value: + raise ParseError(token, 'expected a property value') + validate_value(value) + value, priority = self.parse_value_priority(value) + return Declaration( + property_name, value, priority, name_token.line, name_token.column) + + def parse_value_priority(self, tokens): + """Separate any ``!important`` marker at the end of a property value. + + :param tokens: + A list of tokens for the property value. + :returns: + A tuple of the actual property value (a list of tokens) + and the :attr:`~Declaration.priority`. + """ + value = list(tokens) + # Walk the token list from the end + token = value.pop() + if token.type == 'IDENT' and token.value.lower() == 'important': + while value: + token = value.pop() + if token.type == 'DELIM' and token.value == '!': + # Skip any white space before the '!' + while value and value[-1].type == 'S': + value.pop() + if not value: + raise ParseError( + token, 'expected a value before !important') + return value, 'important' + # Skip white space between '!' and 'important' + elif token.type != 'S': + break + return tokens, None diff --git a/src/tinycss/decoding.py b/src/tinycss/decoding.py new file mode 100644 index 000000000000..6303e1afda3c --- /dev/null +++ b/src/tinycss/decoding.py @@ -0,0 +1,254 @@ +# coding: utf8 +""" + tinycss.decoding + ---------------- + + Decoding stylesheets from bytes to Unicode. + http://www.w3.org/TR/CSS21/syndata.html#charset + + :copyright: (c) 2012 by Simon Sapin. + :license: BSD, see LICENSE for more details. 
+"""
+
+from __future__ import unicode_literals
+
+from binascii import unhexlify
+import operator
+import re
+import sys
+
+
+__all__ = ['decode']  # Everything else is implementation detail
+
+
+def decode(css_bytes, protocol_encoding=None,
+           linking_encoding=None, document_encoding=None):
+    """
+    Determine the character encoding from the passed metadata and the
+    ``@charset`` rule in the stylesheet (if any); and decode accordingly.
+    If no encoding information is available or decoding fails,
+    decoding defaults to UTF-8 and then falls back on ISO-8859-1.
+
+    :param css_bytes:
+        a CSS stylesheet as a byte string
+    :param protocol_encoding:
+        The "charset" parameter of a "Content-Type" HTTP header (if any),
+        or similar metadata for other protocols.
+    :param linking_encoding:
+        ``<link charset="">`` or other metadata from the linking mechanism
+        (if any)
+    :param document_encoding:
+        Encoding of the referring style sheet or document (if any)
+    :return:
+        A tuple of a Unicode string, with any BOM removed, and the
+        encoding that was used.
+
+    """
+    if protocol_encoding:
+        css_unicode = try_encoding(css_bytes, protocol_encoding)
+        if css_unicode is not None:
+            return css_unicode, protocol_encoding
+    for encoding, pattern in ENCODING_MAGIC_NUMBERS:
+        match = pattern(css_bytes)
+        if match:
+            has_at_charset = isinstance(encoding, tuple)
+            if has_at_charset:
+                extract, endianness = encoding
+                encoding = extract(match.group(1))
+                # Get an ASCII-only unicode value.
+                # This is the only thing that works on both Python 2 and 3
+                # for bytes.decode()
+                # Non-ASCII encoding names are invalid anyway,
+                # but make sure they stay invalid.
+                encoding = encoding.decode('ascii', 'replace')
+                encoding = encoding.replace('\ufffd', '?')
+                if encoding.replace('-', '').replace('_', '').lower() in [
+                        'utf16', 'utf32']:
+                    encoding += endianness
+                encoding = encoding.encode('ascii', 'replace').decode('ascii')
+            css_unicode = try_encoding(css_bytes, encoding)
+            if css_unicode and not (has_at_charset and not
+                    css_unicode.startswith('@charset "')):
+                return css_unicode, encoding
+            break
+    for encoding in [linking_encoding, document_encoding]:
+        if encoding:
+            css_unicode = try_encoding(css_bytes, encoding)
+            if css_unicode is not None:
+                return css_unicode, encoding
+    css_unicode = try_encoding(css_bytes, 'UTF-8')
+    if css_unicode is not None:
+        return css_unicode, 'UTF-8'
+    return try_encoding(css_bytes, 'ISO-8859-1', fallback=False), 'ISO-8859-1'
+
+
+def try_encoding(css_bytes, encoding, fallback=True):
+    if fallback:
+        try:
+            css_unicode = css_bytes.decode(encoding)
+        # LookupError means unknown encoding
+        except (UnicodeDecodeError, LookupError):
+            return None
+    else:
+        css_unicode = css_bytes.decode(encoding)
+    if css_unicode and css_unicode[0] == '\ufeff':
+        # Remove any Byte Order Mark
+        css_unicode = css_unicode[1:]
+    return css_unicode
+
+
+def hex2re(hex_data):
+    return re.escape(unhexlify(hex_data.replace(' ', '').encode('ascii')))
+
+
+class Slicer(object):
+    """Slice()[start:stop:step] == slice(start, stop, step)"""
+    def __getitem__(self, slice_):
+        return operator.itemgetter(slice_)
+
+Slice = Slicer()
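What `decode` does for a few invented inputs:

```python
from tinycss.decoding import decode

decode('body { color: red }'.encode('utf-8'))
# ('body { color: red }', 'UTF-8')   -- the default attempt succeeded
decode('\ufeffbody {}'.encode('utf-16-be'))
# ('body {}', 'UTF-16-BE')           -- sniffed from the BOM, BOM removed
decode(b'body {}', protocol_encoding='ISO-8859-15')
# ('body {}', 'ISO-8859-15')         -- explicit protocol metadata wins
```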
+# List of (encoding, pattern) pairs.
+#   encoding is either a string, or a (slice_, endianness) tuple for
+#   "as specified" @charset rules. slice_ is a slice object: how to extract
+#   the encoding name from the regexp match. endianness is appended to
+#   UTF-16/UTF-32 encoding names.
+
+ENCODING_MAGIC_NUMBERS = [
+    ((Slice[:], ''), re.compile(
+        hex2re('EF BB BF 40 63 68 61 72 73 65 74 20 22')
+        + b'([^\x22]*?)'
+        + hex2re('22 3B')).match),
+
+    ('UTF-8', re.compile(
+        hex2re('EF BB BF')).match),
+
+    ((Slice[:], ''), re.compile(
+        hex2re('40 63 68 61 72 73 65 74 20 22')
+        + b'([^\x22]*?)'
+        + hex2re('22 3B')).match),
+
+    ((Slice[1::2], '-BE'), re.compile(
+        hex2re('FE FF 00 40 00 63 00 68 00 61 00 72 00 73 00 65 00'
+               '74 00 20 00 22')
+        + b'((\x00[^\x22])*?)'
+        + hex2re('00 22 00 3B')).match),
+
+    ((Slice[1::2], '-BE'), re.compile(
+        hex2re('00 40 00 63 00 68 00 61 00 72 00 73 00 65 00 74 00'
+               '20 00 22')
+        + b'((\x00[^\x22])*?)'
+        + hex2re('00 22 00 3B')).match),
+
+    ((Slice[::2], '-LE'), re.compile(
+        hex2re('FF FE 40 00 63 00 68 00 61 00 72 00 73 00 65 00 74'
+               '00 20 00 22 00')
+        + b'(([^\x22]\x00)*?)'
+        + hex2re('22 00 3B 00')).match),
+
+    ((Slice[::2], '-LE'), re.compile(
+        hex2re('40 00 63 00 68 00 61 00 72 00 73 00 65 00 74 00 20'
+               '00 22 00')
+        + b'(([^\x22]\x00)*?)'
+        + hex2re('22 00 3B 00')).match),
+
+    ((Slice[3::4], '-BE'), re.compile(
+        hex2re('00 00 FE FF 00 00 00 40 00 00 00 63 00 00 00 68 00'
+               '00 00 61 00 00 00 72 00 00 00 73 00 00 00 65 00 00'
+               '00 74 00 00 00 20 00 00 00 22')
+        + b'((\x00\x00\x00[^\x22])*?)'
+        + hex2re('00 00 00 22 00 00 00 3B')).match),
+
+    ((Slice[3::4], '-BE'), re.compile(
+        hex2re('00 00 00 40 00 00 00 63 00 00 00 68 00 00 00 61 00'
+               '00 00 72 00 00 00 73 00 00 00 65 00 00 00 74 00 00'
+               '00 20 00 00 00 22')
+        + b'((\x00\x00\x00[^\x22])*?)'
+        + hex2re('00 00 00 22 00 00 00 3B')).match),
+
+
+# Python does not support 2143 or 3412 endianness, AFAIK.
+# I guess we could fix it up ourselves but meh. Patches welcome.
+
+#    ((Slice[2::4], '-2143'), re.compile(
+#        hex2re('00 00 FF FE 00 00 40 00 00 00 63 00 00 00 68 00 00'
+#               '00 61 00 00 00 72 00 00 00 73 00 00 00 65 00 00 00'
+#               '74 00 00 00 20 00 00 00 22 00')
+#        + b'((\x00\x00[^\x22]\x00)*?)'
+#        + hex2re('00 00 22 00 00 00 3B 00')).match),
+
+#    ((Slice[2::4], '-2143'), re.compile(
+#        hex2re('00 00 40 00 00 00 63 00 00 00 68 00 00 00 61 00 00'
+#               '00 72 00 00 00 73 00 00 00 65 00 00 00 74 00 00 00'
+#               '20 00 00 00 22 00')
+#        + b'((\x00\x00[^\x22]\x00)*?)'
+#        + hex2re('00 00 22 00 00 00 3B 00')).match),
+
+#    ((Slice[1::4], '-3412'), re.compile(
+#        hex2re('FE FF 00 00 00 40 00 00 00 63 00 00 00 68 00 00 00'
+#               '61 00 00 00 72 00 00 00 73 00 00 00 65 00 00 00 74'
+#               '00 00 00 20 00 00 00 22 00 00')
+#        + b'((\x00[^\x22]\x00\x00)*?)'
+#        + hex2re('00 22 00 00 00 3B 00 00')).match),
+
+#    ((Slice[1::4], '-3412'), re.compile(
+#        hex2re('00 40 00 00 00 63 00 00 00 68 00 00 00 61 00 00 00'
+#               '72 00 00 00 73 00 00 00 65 00 00 00 74 00 00 00 20'
+#               '00 00 00 22 00 00')
+#        + b'((\x00[^\x22]\x00\x00)*?)'
+#        + hex2re('00 22 00 00 00 3B 00 00')).match),
+
+    ((Slice[::4], '-LE'), re.compile(
+        hex2re('FF FE 00 00 40 00 00 00 63 00 00 00 68 00 00 00 61'
+               '00 00 00 72 00 00 00 73 00 00 00 65 00 00 00 74 00'
+               '00 00 20 00 00 00 22 00 00 00')
+        + b'(([^\x22]\x00\x00\x00)*?)'
+        + hex2re('22 00 00 00 3B 00 00 00')).match),
+
+    ((Slice[::4], '-LE'), re.compile(
+        hex2re('40 00 00 00 63 00 00 00 68 00 00 00 61 00 00 00 72'
+               '00 00 00 73 00 00 00 65 00 00 00 74 00 00 00 20 00'
+               '00 00 22 00 00 00')
+        + b'(([^\x22]\x00\x00\x00)*?)'
+        + hex2re('22 00 00 00 3B 00 00 00')).match),
+
+    ('UTF-32-BE', re.compile(
+        hex2re('00 00 FE FF')).match),
+
+    ('UTF-32-LE', re.compile(
+        hex2re('FF FE 00 00')).match),
+
+#    ('UTF-32-2143', re.compile(
+#        hex2re('00 00 FF FE')).match),
+
+#    ('UTF-32-3412', re.compile(
+#        hex2re('FE FF 00 00')).match),
+
+    ('UTF-16-BE', re.compile(
+        hex2re('FE FF')).match),
+
+    ('UTF-16-LE', re.compile(
+        hex2re('FF FE')).match),
+
+
+# Some of these are supported by Python, but I didn’t bother.
+# You know the story with patches ...
+
+#    # as specified, transcoded from EBCDIC to ASCII
+#    ('as_specified-EBCDIC', re.compile(
+#        hex2re('7C 83 88 81 99 A2 85 A3 40 7F')
+#        + b'([^\x7F]*?)'
+#        + hex2re('7F 5E')).match),
+
+#    # as specified, transcoded from IBM1026 to ASCII
+#    ('as_specified-IBM1026', re.compile(
+#        hex2re('AE 83 88 81 99 A2 85 A3 40 FC')
+#        + b'([^\xFC]*?)'
+#        + hex2re('FC 5E')).match),
+
+#    # as specified, transcoded from GSM 03.38 to ASCII
+#    ('as_specified-GSM_03.38', re.compile(
+#        hex2re('00 63 68 61 72 73 65 74 20 22')
+#        + b'([^\x22]*?)'
+#        + hex2re('22 3B')).match),
+]
diff --git a/src/tinycss/page3.py b/src/tinycss/page3.py
new file mode 100644
index 000000000000..3c8786002d1c
--- /dev/null
+++ b/src/tinycss/page3.py
@@ -0,0 +1,159 @@
+# coding: utf8
+"""
+    tinycss.page3
+    -------------
+
+    Support for CSS 3 Paged Media syntax:
+    http://dev.w3.org/csswg/css3-page/
+
+    Adds support for named page selectors and margin rules.
+
+    :copyright: (c) 2012 by Simon Sapin.
+    :license: BSD, see LICENSE for more details.
+"""
+
+from __future__ import unicode_literals, division
+from .css21 import CSS21Parser, ParseError
+
+
+class MarginRule(object):
+    """A parsed at-rule for a margin box.
+
+    .. attribute:: at_keyword
+
+        One of the 16 following strings:
+
+        * ``@top-left-corner``
+        * ``@top-left``
+        * ``@top-center``
+        * ``@top-right``
+        * ``@top-right-corner``
+        * ``@bottom-left-corner``
+        * ``@bottom-left``
+        * ``@bottom-center``
+        * ``@bottom-right``
+        * ``@bottom-right-corner``
+        * ``@left-top``
+        * ``@left-middle``
+        * ``@left-bottom``
+        * ``@right-top``
+        * ``@right-middle``
+        * ``@right-bottom``
+
+    .. attribute:: declarations
+
+        A list of :class:`~.css21.Declaration` objects.
+
+    .. attribute:: line
+
+        Source line where this was read.
+
+    .. attribute:: column
+
+        Source column where this was read.
+
+    """
+
+    def __init__(self, at_keyword, declarations, line, column):
+        self.at_keyword = at_keyword
+        self.declarations = declarations
+        self.line = line
+        self.column = column
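What the Paged Media parser (defined just below) produces for an invented stylesheet with a margin box:

```python
from tinycss import make_parser

sheet = make_parser('page3').parse_stylesheet(
    '@page narrow:first {\n'
    '    margin: 1cm;\n'
    '    @top-center { content: "Preface" }\n'
    '}')
page = sheet.rules[0]
print(page.selector)                  # ('narrow', 'first')
print(page.at_rules[0].at_keyword)    # '@top-center' (a MarginRule)
```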
+class CSSPage3Parser(CSS21Parser):
+    """Extend :class:`~.css21.CSS21Parser` for `CSS 3 Paged Media`_ syntax.
+
+    .. _CSS 3 Paged Media: http://dev.w3.org/csswg/css3-page/
+
+    Compared to CSS 2.1, the ``at_rules`` and ``selector`` attributes of
+    :class:`~.css21.PageRule` objects are modified:
+
+    * ``at_rules`` is not always empty, it is a list of :class:`MarginRule`
+      objects.
+
+    * ``selector``, instead of a single string, is a tuple of the page name
+      and the pseudo class. Each of these may be a ``None`` or a string.
+
+    +--------------------------+------------------------+
+    | CSS                      | Parsed selectors       |
+    +==========================+========================+
+    | .. code-block:: css      | .. code-block:: python |
+    |                          |                        |
+    |     @page {}             | (None, None)           |
+    |     @page :first {}      | (None, 'first')        |
+    |     @page chapter {}     | ('chapter', None)      |
+    |     @page table:right {} | ('table', 'right')     |
+    +--------------------------+------------------------+
+
+    """
+
+    PAGE_MARGIN_AT_KEYWORDS = [
+        '@top-left-corner',
+        '@top-left',
+        '@top-center',
+        '@top-right',
+        '@top-right-corner',
+        '@bottom-left-corner',
+        '@bottom-left',
+        '@bottom-center',
+        '@bottom-right',
+        '@bottom-right-corner',
+        '@left-top',
+        '@left-middle',
+        '@left-bottom',
+        '@right-top',
+        '@right-middle',
+        '@right-bottom',
+    ]
+
+    def parse_at_rule(self, rule, previous_rules, errors, context):
+        if rule.at_keyword in self.PAGE_MARGIN_AT_KEYWORDS:
+            if context != '@page':
+                raise ParseError(rule,
+                    '%s rule not allowed in %s' % (rule.at_keyword, context))
+            if rule.head:
+                raise ParseError(rule.head[0],
+                    'unexpected %s token in %s rule header'
+                    % (rule.head[0].type, rule.at_keyword))
+            declarations, body_errors = self.parse_declaration_list(rule.body)
+            errors.extend(body_errors)
+            return MarginRule(rule.at_keyword, declarations,
+                              rule.line, rule.column)
+        return super(CSSPage3Parser, self).parse_at_rule(
+            rule, previous_rules, errors, context)
+
+    def parse_page_selector(self, head):
+        """Parse an @page selector.
+
+        :param head:
+            The ``head`` attribute of an unparsed :class:`AtRule`.
+        :returns:
+            A ``((name, pseudo_class), specificity)`` pair. In CSS 2.1 the
+            pseudo class is ``'first'``, ``'left'``, ``'right'`` or ``None``;
+            ``'blank'`` is added by GCPM.
+        :raises:
+            :class:`~.parsing.ParseError` on invalid selectors
+
+        """
+        if not head:
+            return (None, None), (0, 0, 0)
+        if head[0].type == 'IDENT':
+            name = head.pop(0).value
+            while head and head[0].type == 'S':
+                head.pop(0)
+            if not head:
+                return (name, None), (1, 0, 0)
+            name_specificity = (1,)
+        else:
+            name = None
+            name_specificity = (0,)
+        if (len(head) == 2 and head[0].type == ':'
+                and head[1].type == 'IDENT'):
+            pseudo_class = head[1].value
+            specificity = {
+                'first': (1, 0), 'blank': (1, 0),
+                'left': (0, 1), 'right': (0, 1),
+            }.get(pseudo_class)
+            if specificity:
+                return (name, pseudo_class), (name_specificity + specificity)
+        raise ParseError(head[0], 'invalid @page selector')
diff --git a/src/tinycss/parsing.py b/src/tinycss/parsing.py
new file mode 100644
index 000000000000..86e93c07f7b4
--- /dev/null
+++ b/src/tinycss/parsing.py
@@ -0,0 +1,165 @@
+# coding: utf8
+"""
+    tinycss.parsing
+    ---------------
+
+    Utilities for parsing lists of tokens.
+
+    :copyright: (c) 2012 by Simon Sapin.
+    :license: BSD, see LICENSE for more details.
+"""
+
+from __future__ import unicode_literals
+
+
+# TODO: unit tests
+
+def split_on_comma(tokens):
+    """Split a list of tokens on commas, ie ``,`` DELIM tokens.
+
+    Only "top-level" comma tokens are splitting points, not commas inside a
+    function or other :class:`ContainerToken`.
+
+    :param tokens:
+        An iterable of :class:`~.token_data.Token` or
+        :class:`~.token_data.ContainerToken`.
+    :returns:
+        A list of lists of tokens
+
+    """
+    parts = []
+    this_part = []
+    for token in tokens:
+        if token.type == 'DELIM' and token.value == ',':
+            parts.append(this_part)
+            this_part = []
+        else:
+            this_part.append(token)
+    parts.append(this_part)
+    return parts
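Sketch of `split_on_comma` on a tokenized, invented font list:

```python
from tinycss.parsing import remove_whitespace, split_on_comma
from tinycss.tokenizer import tokenize_grouped

tokens = remove_whitespace(list(
    tokenize_grouped('Arial, "Helvetica Neue", sans-serif')))
print([part[0].value for part in split_on_comma(tokens)])
# ['Arial', 'Helvetica Neue', 'sans-serif']
```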
+def strip_whitespace(tokens):
+    """Remove whitespace at the beginning and end of a token list.
+
+    Whitespace tokens in-between other tokens in the list are preserved.
+
+    :param tokens:
+        A list of :class:`~.token_data.Token` or
+        :class:`~.token_data.ContainerToken`.
+    :return:
+        A new sub-sequence of the list.
+
+    """
+    for i, token in enumerate(tokens):
+        if token.type != 'S':
+            break
+    else:
+        return []  # only whitespace
+    tokens = tokens[i:]
+    while tokens and tokens[-1].type == 'S':
+        tokens.pop()
+    return tokens
+
+
+def remove_whitespace(tokens):
+    """Remove any top-level whitespace in a token list.
+
+    Whitespace tokens inside recursive :class:`~.token_data.ContainerToken`
+    are preserved.
+
+    :param tokens:
+        A list of :class:`~.token_data.Token` or
+        :class:`~.token_data.ContainerToken`.
+    :return:
+        A new sub-sequence of the list.
+
+    """
+    return [token for token in tokens if token.type != 'S']
+
+
+def validate_value(tokens):
+    """Validate a property value.
+
+    :param tokens:
+        an iterable of tokens
+    :raises:
+        :class:`ParseError` if there is any invalid token for the 'value'
+        production of the core grammar.
+
+    """
+    for token in tokens:
+        type_ = token.type
+        if type_ == '{':
+            validate_block(token.content, 'property value')
+        else:
+            validate_any(token, 'property value')
+
+
+def validate_block(tokens, context):
+    """
+    :raises:
+        :class:`ParseError` if there is any invalid token for the 'block'
+        production of the core grammar.
+    :param tokens: an iterable of tokens
+    :param context: a string for the 'unexpected in ...' message
+
+    """
+    for token in tokens:
+        type_ = token.type
+        if type_ == '{':
+            validate_block(token.content, context)
+        elif type_ not in (';', 'ATKEYWORD'):
+            validate_any(token, context)
+
+
+def validate_any(token, context):
+    """
+    :raises:
+        :class:`ParseError` if this is an invalid token for the
+        'any' production of the core grammar.
+    :param token: a single token
+    :param context: a string for the 'unexpected in ...' message
+
+    """
+    type_ = token.type
+    if type_ in ('FUNCTION', '(', '['):
+        for token in token.content:
+            validate_any(token, type_)
+    elif type_ not in ('S', 'IDENT', 'DIMENSION', 'PERCENTAGE', 'NUMBER',
+                       'INTEGER', 'URI', 'DELIM', 'STRING', 'HASH', ':',
+                       'UNICODE-RANGE'):
+        if type_ in ('}', ')', ']'):
+            adjective = 'unmatched'
+        else:
+            adjective = 'unexpected'
+        raise ParseError(token,
+            '{0} {1} token in {2}'.format(adjective, type_, context))
+
+
+class ParseError(ValueError):
+    """Details about a CSS syntax error. Usually indicates that something
+    (a rule or a declaration) was ignored and will not appear as a parsed
+    object.
+
+    This exception is typically logged in a list rather than being propagated
+    to the user API.
+
+    .. attribute:: line
+
+        Source line where the error occurred.
+
+    .. attribute:: column
+
+        Column in the source line where the error occurred.
+
+    .. attribute:: reason
+
+        What happened (a string).
+
+    """
+    def __init__(self, subject, reason):
+        self.line = subject.line
+        self.column = subject.column
+        self.reason = reason
+        super(ParseError, self).__init__(
+            'Parse error at {0.line}:{0.column}, {0.reason}'.format(self))
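`ParseError` objects are collected rather than propagated for recoverable problems; a sketch with an invented stylesheet:

```python
from tinycss import make_parser

sheet = make_parser().parse_stylesheet('p { color red; margin: 0 }')
print([d.name for d in sheet.rules[0].declarations])   # ['margin']
for error in sheet.errors:
    print(error.line, error.column, error.reason)
    # e.g. "expected ':', got IDENT" for the broken 'color' declaration
```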
diff --git a/src/tinycss/token_data.py b/src/tinycss/token_data.py
new file mode 100644
index 000000000000..dcd923229997
--- /dev/null
+++ b/src/tinycss/token_data.py
@@ -0,0 +1,441 @@
+# coding: utf8
+"""
+    tinycss.token_data
+    ------------------
+
+    Shared data for both implementations (Cython and Python) of the
+    tokenizer.
+
+    :copyright: (c) 2012 by Simon Sapin.
+    :license: BSD, see LICENSE for more details.
+"""
+
+from __future__ import unicode_literals
+
+import re
+import sys
+import operator
+import functools
+import string
+
+
+# * Raw strings with the r'' notation are used so that \ do not need
+#   to be escaped.
+# * Names and regexps are separated by a tabulation.
+# * Macros are re-ordered so that only previous definitions are needed.
+# * {} are used for macro substitution with ``string.Formatter``,
+#   so other uses of { or } have been doubled.
+# * The syntax is otherwise compatible with re.compile.
+# * Some parentheses were added to add capturing groups.
+#   (in unicode, DIMENSION and URI)
+
+# *** Willful violation: ***
+# Numbers can take a + or - sign, but the sign is a separate DELIM token.
+# Since comments are allowed anywhere between tokens, this makes
+# the following valid. It means 10 negative pixels:
+#   margin-top: -/**/10px
+
+# This makes parsing numbers a pain, so instead we’ll do the same as Firefox
+# and make the sign part of the 'num' macro. The above CSS will be invalid.
+# See discussion:
+# http://lists.w3.org/Archives/Public/www-style/2011Oct/0028.html
+MACROS = r'''
+    nl	\n|\r\n|\r|\f
+    w	[ \t\r\n\f]*
+    nonascii	[^\0-\237]
+    unicode	\\([0-9a-f]{{1,6}})(\r\n|[ \n\r\t\f])?
+    simple_escape	[^\n\r\f0-9a-f]
+    escape	{unicode}|\\{simple_escape}
+    nmstart	[_a-z]|{nonascii}|{escape}
+    nmchar	[_a-z0-9-]|{nonascii}|{escape}
+    name	{nmchar}+
+    ident	[-]?{nmstart}{nmchar}*
+    num	[-+]?(?:[0-9]*\.[0-9]+|[0-9]+)
+    string1	\"([^\n\r\f\\"]|\\{nl}|{escape})*\"
+    string2	\'([^\n\r\f\\']|\\{nl}|{escape})*\'
+    string	{string1}|{string2}
+    badstring1	\"([^\n\r\f\\"]|\\{nl}|{escape})*\\?
+    badstring2	\'([^\n\r\f\\']|\\{nl}|{escape})*\\?
+    badstring	{badstring1}|{badstring2}
+    badcomment1	\/\*[^*]*\*+([^/*][^*]*\*+)*
+    badcomment2	\/\*[^*]*(\*+[^/*][^*]*)*
+    badcomment	{badcomment1}|{badcomment2}
+    baduri1	url\({w}([!#$%&*-~]|{nonascii}|{escape})*{w}
+    baduri2	url\({w}{string}{w}
+    baduri3	url\({w}{badstring}
+    baduri	{baduri1}|{baduri2}|{baduri3}
+'''.replace(r'\0', '\0').replace(r'\237', '\237')
+
+# Removed these tokens. Instead, they’re tokenized as two DELIM each.
+#    INCLUDES	~=
+#    DASHMATCH	|=
+# They are only used in selectors but selectors3 also have ^=, *= and $=.
+# We don’t actually parse selectors anyway
+
+# Re-ordered so that the longest match is always the first.
+# For example, "url('foo')" matches URI, BAD_URI, FUNCTION and IDENT,
+# but URI would always be a longer match than the others.
+TOKENS = r'''
+    S	[ \t\r\n\f]+
+
+    URI	url\({w}({string}|([!#$%&*-\[\]-~]|{nonascii}|{escape})*){w}\)
+    BAD_URI	{baduri}
+    FUNCTION	{ident}\(
+    UNICODE-RANGE	u\+[0-9a-f?]{{1,6}}(-[0-9a-f]{{1,6}})?
+    IDENT	{ident}
+
+    ATKEYWORD	@{ident}
+    HASH	#{name}
+
+    DIMENSION	({num})({ident})
+    PERCENTAGE	{num}%
+    NUMBER	{num}
+
+    STRING	{string}
+    BAD_STRING	{badstring}
+
+    COMMENT	\/\*[^*]*\*+([^/*][^*]*\*+)*\/
+    BAD_COMMENT	{badcomment}
+
+    :	:
+    ;	;
+    {	\{{
+    }	\}}
+    (	\(
+    )	\)
+    [	\[
+    ]	\]
+    CDO	<!--
+    CDC	-->
+'''
+
+
+# Strings with {macro} expanded
+COMPILED_MACROS = {}
+
+
+COMPILED_TOKEN_REGEXPS = []  # [(name, regexp.match)] ordered
+COMPILED_TOKEN_INDEXES = {}  # {name: i} helper for the C speedups
+
+
+# Indexed by codepoint value of the first character of a token.
+# Codepoints >= 160 (aka nonascii) all use the index 160.
+# values are (i, name, regexp.match)
+TOKEN_DISPATCH = []
+
+
+try:
+    unichr
+except NameError:
+    # Python 3
+    unichr = chr
+    unicode = str
+
+
+def _init():
+    """Import-time initialization."""
+    COMPILED_MACROS.clear()
+    for line in MACROS.splitlines():
+        if line.strip():
+            name, value = line.split('\t')
+            COMPILED_MACROS[name.strip()] = '(?:%s)' \
+                % value.format(**COMPILED_MACROS)
+
+    COMPILED_TOKEN_REGEXPS[:] = (
+        (
+            name.strip(),
+            re.compile(
+                value.format(**COMPILED_MACROS),
+                # Case-insensitive when matching eg.
+
+
+def _init():
+    """Import-time initialization."""
+    COMPILED_MACROS.clear()
+    for line in MACROS.splitlines():
+        if line.strip():
+            name, value = line.split('\t')
+            COMPILED_MACROS[name.strip()] = '(?:%s)' \
+                % value.format(**COMPILED_MACROS)
+
+    COMPILED_TOKEN_REGEXPS[:] = (
+        (
+            name.strip(),
+            re.compile(
+                value.format(**COMPILED_MACROS),
+                # Case-insensitive when matching eg. uRL(foo)
+                # but preserve the case in extracted groups
+                re.I
+            ).match
+        )
+        for line in TOKENS.splitlines()
+        if line.strip()
+        for name, value in [line.split('\t')]
+    )
+
+    COMPILED_TOKEN_INDEXES.clear()
+    for i, (name, regexp) in enumerate(COMPILED_TOKEN_REGEXPS):
+        COMPILED_TOKEN_INDEXES[name] = i
+
+    dispatch = [[] for i in range(161)]
+    for chars, names in [
+        (' \t\r\n\f', ['S']),
+        ('uU', ['URI', 'BAD_URI', 'UNICODE-RANGE']),
+        # \ is an escape outside of another token
+        (string.ascii_letters + '\\_-' + unichr(160), ['FUNCTION', 'IDENT']),
+        (string.digits + '.+-', ['DIMENSION', 'PERCENTAGE', 'NUMBER']),
+        ('@', ['ATKEYWORD']),
+        ('#', ['HASH']),
+        ('\'"', ['STRING', 'BAD_STRING']),
+        ('/', ['COMMENT', 'BAD_COMMENT']),
+        ('<', ['CDO']),
+        ('-', ['CDC']),
+    ]:
+        for char in chars:
+            dispatch[ord(char)].extend(names)
+    for char in ':;{}()[]':
+        dispatch[ord(char)] = [char]
+
+    TOKEN_DISPATCH[:] = (
+        [
+            (index,) + COMPILED_TOKEN_REGEXPS[index]
+            for name in names
+            for index in [COMPILED_TOKEN_INDEXES[name]]
+        ]
+        for names in dispatch
+    )
+
+_init()
+
+
+def _unicode_replace(match, int=int, unichr=unichr, maxunicode=sys.maxunicode):
+    codepoint = int(match.group(1), 16)
+    if codepoint <= maxunicode:
+        return unichr(codepoint)
+    else:
+        return '\N{REPLACEMENT CHARACTER}'  # U+FFFD
+
+UNICODE_UNESCAPE = functools.partial(
+    re.compile(COMPILED_MACROS['unicode'], re.I).sub,
+    _unicode_replace)
+
+NEWLINE_UNESCAPE = functools.partial(
+    re.compile(r'()\\' + COMPILED_MACROS['nl']).sub,
+    '')
+
+SIMPLE_UNESCAPE = functools.partial(
+    re.compile(r'\\(%s)' % COMPILED_MACROS['simple_escape'], re.I).sub,
+    # Same as r'\1', but faster on CPython
+    operator.methodcaller('group', 1))
+
+FIND_NEWLINES = re.compile(COMPILED_MACROS['nl']).finditer
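Roughly, what the three unescaping helpers above do; a sketch with expected values, worth double-checking against the compiled macros:

    assert SIMPLE_UNESCAPE(r'red\,ish') == 'red,ish'  # \ + one non-hex char
    assert UNICODE_UNESCAPE('\\0026 B') == '&B'       # hex escape; U+0026 is '&'
    assert NEWLINE_UNESCAPE('a\\\nb') == 'ab'         # escaped newline vanishes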
+
+
+class Token(object):
+    """A single atomic token.
+
+    .. attribute:: is_container
+
+        Always ``False``.
+        Helps to tell :class:`Token` apart from :class:`ContainerToken`.
+
+    .. attribute:: type
+
+        The type of token as a string:
+
+        ``S``
+            A sequence of white space
+
+        ``IDENT``
+            An identifier: a name that does not start with a digit.
+            A name is a sequence of letters, digits, ``_``, ``-``, escaped
+            characters and non-ASCII characters. Eg: ``margin-left``
+
+        ``HASH``
+            ``#`` followed immediately by a name. Eg: ``#ff8800``
+
+        ``ATKEYWORD``
+            ``@`` followed immediately by an identifier. Eg: ``@page``
+
+        ``URI``
+            Eg: ``url(foo)``. The content may or may not be quoted.
+
+        ``UNICODE-RANGE``
+            ``U+`` followed by one or two hexadecimal
+            Unicode codepoints. Eg: ``U+20-00FF``
+
+        ``INTEGER``
+            An integer with an optional ``+`` or ``-`` sign
+
+        ``NUMBER``
+            A non-integer number with an optional ``+`` or ``-`` sign
+
+        ``DIMENSION``
+            An integer or number followed immediately by an
+            identifier (the unit). Eg: ``12px``
+
+        ``PERCENTAGE``
+            An integer or number followed immediately by ``%``
+
+        ``STRING``
+            A string, quoted with ``"`` or ``'``
+
+        ``:`` or ``;``
+            That character.
+
+        ``DELIM``
+            A single character not matched in another token. Eg: ``,``
+
+        See the source of the :mod:`.token_data` module for the precise
+        regular expressions that match various tokens.
+
+        Note that other token types exist in the early tokenization steps,
+        but these are ignored, are syntax errors, or are later transformed
+        into :class:`ContainerToken` or :class:`FunctionToken`.
+
+    .. attribute:: value
+
+        The parsed value:
+
+        * INTEGER, NUMBER, PERCENTAGE or DIMENSION tokens: the numeric value
+          as an int or float.
+        * STRING tokens: the unescaped string without quotes
+        * URI tokens: the unescaped URI without quotes or
+          ``url(`` and ``)`` markers.
+        * IDENT, ATKEYWORD or HASH tokens: the unescaped token,
+          with ``@`` or ``#`` markers left as-is
+        * Other tokens: same as :meth:`as_css`
+
+        *Unescaped* refers to the various escaping methods based on the
+        backslash ``\`` character in CSS syntax.
+
+    .. attribute:: unit
+
+        * DIMENSION tokens: the normalized (unescaped, lower-case)
+          unit name as a string. eg. ``'px'``
+        * PERCENTAGE tokens: the string ``'%'``
+        * Other tokens: ``None``
+
+    .. attribute:: line
+
+        The line number in the CSS source of the start of this token.
+
+    .. attribute:: column
+
+        The column number (inside a source line) of the start of this token.
+
+    """
+    is_container = False
+    __slots__ = 'type', '_as_css', 'value', 'unit', 'line', 'column'
+
+    def __init__(self, type_, css_value, value, unit, line, column):
+        self.type = type_
+        self._as_css = css_value
+        self.value = value
+        self.unit = unit
+        self.line = line
+        self.column = column
+
+    def as_css(self):
+        """
+        Return as a Unicode string the CSS representation of the token,
+        as parsed in the source.
+        """
+        return self._as_css
+
+    def __repr__(self):
+        return ('<Token {0.type} at {0.line}:{0.column} {0.value!r}{1}>'
+                .format(self, self.unit or ''))
+
+
+class ContainerToken(object):
+    """A token that contains other (nested) tokens.
+
+    .. attribute:: is_container
+
+        Always ``True``.
+        Helps to tell :class:`ContainerToken` apart from :class:`Token`.
+
+    .. attribute:: type
+
+        The type of token as a string. One of ``{``, ``(``, ``[`` or
+        ``FUNCTION``. For ``FUNCTION``, the object is actually a
+        :class:`FunctionToken`.
+
+    .. attribute:: unit
+
+        Always ``None``. Included to make :class:`ContainerToken` behave
+        more like :class:`Token`.
+
+    .. attribute:: content
+
+        A list of :class:`Token` or nested :class:`ContainerToken`,
+        not including the opening or closing token.
+
+    .. attribute:: line
+
+        The line number in the CSS source of the start of this token.
+
+    .. attribute:: column
+
+        The column number (inside a source line) of the start of this token.
+
+    """
+    is_container = True
+    unit = None
+    __slots__ = 'type', '_css_start', '_css_end', 'content', 'line', 'column'
+
+    def __init__(self, type_, css_start, css_end, content, line, column):
+        self.type = type_
+        self._css_start = css_start
+        self._css_end = css_end
+        self.content = content
+        self.line = line
+        self.column = column
+
+    def as_css(self):
+        """
+        Return as a Unicode string the CSS representation of the token,
+        as parsed in the source.
+        """
+        parts = [self._css_start]
+        parts.extend(token.as_css() for token in self.content)
+        parts.append(self._css_end)
+        return ''.join(parts)
+
+    format_string = '<ContainerToken {0.type} at {0.line}:{0.column}>'
+
+    def __repr__(self):
+        return (self.format_string + ' {0.content}').format(self)
+
+
+class FunctionToken(ContainerToken):
+    """A specialized :class:`ContainerToken` for a ``FUNCTION`` group.
+    Has an additional attribute:
+
+    .. attribute:: function_name
+
+        The unescaped name of the function, with the ``(`` marker removed.
+
+    """
+    __slots__ = 'function_name',
+
+    def __init__(self, type_, css_start, css_end, function_name, content,
+                 line, column):
+        super(FunctionToken, self).__init__(
+            type_, css_start, css_end, content, line, column)
+        # Remove the ( marker:
+        self.function_name = function_name[:-1]
+
+    format_string = ('<FunctionToken {0.function_name}() at '
+                     '{0.line}:{0.column}>')
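What these classes look like in practice for a function value; a sketch that uses tokenize_grouped from the tokenizer module below:

    from tinycss.tokenizer import tokenize_grouped

    token, = tokenize_grouped('rgb(0, 128, 255)')
    assert token.type == 'FUNCTION' and token.is_container
    assert token.function_name == 'rgb'
    # content: INTEGER, DELIM, S, INTEGER, DELIM, S, INTEGER
    assert len(token.content) == 7
    assert token.as_css() == 'rgb(0, 128, 255)'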
+
+
+class TokenList(list):
+    """
+    A mixed list of :class:`~.token_data.Token` and
+    :class:`~.token_data.ContainerToken` objects.
+
+    This is a subclass of the builtin :class:`~builtins.list` type.
+    It can be iterated, indexed and sliced as usual, but also has some
+    additional API:
+
+    """
+    @property
+    def line(self):
+        """The line number in the CSS source of the first token."""
+        return self[0].line
+
+    @property
+    def column(self):
+        """The column number (inside a source line) of the first token."""
+        return self[0].column
+
+    def as_css(self):
+        """
+        Return as a Unicode string the CSS representation of the tokens,
+        as parsed in the source.
+        """
+        return ''.join(token.as_css() for token in self)
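Because every token keeps the exact source slice it was parsed from, serialization is lossless; a small sketch of that round-trip property:

    from tinycss.tokenizer import tokenize_grouped

    css = 'margin:  0 auto  /* center */'
    tokens = list(tokenize_grouped(css, ignore_comments=False))
    assert ''.join(token.as_css() for token in tokens) == css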
diff --git a/src/tinycss/tokenizer.py b/src/tinycss/tokenizer.py new file mode 100644 index 000000000000..eba44c1b933b --- /dev/null +++ b/src/tinycss/tokenizer.py @@ -0,0 +1,216 @@
+# coding: utf8
+"""
+    tinycss.tokenizer
+    -----------------
+
+    Tokenizer for the CSS core syntax:
+    http://www.w3.org/TR/CSS21/syndata.html#tokenization
+
+    This is the pure-Python implementation. See also speedups.pyx
+
+    :copyright: (c) 2012 by Simon Sapin.
+    :license: BSD, see LICENSE for more details.
+"""
+
+from __future__ import unicode_literals
+
+from . import token_data
+
+
+def tokenize_flat(css_source, ignore_comments=True,
+        # Make these local variables to avoid global lookups in the loop
+        tokens_dispatch=token_data.TOKEN_DISPATCH,
+        unicode_unescape=token_data.UNICODE_UNESCAPE,
+        newline_unescape=token_data.NEWLINE_UNESCAPE,
+        simple_unescape=token_data.SIMPLE_UNESCAPE,
+        find_newlines=token_data.FIND_NEWLINES,
+        Token=token_data.Token,
+        len=len,
+        int=int,
+        float=float,
+        list=list,
+        _None=None,
+    ):
+    """
+    :param css_source:
+        CSS as a Unicode string
+    :param ignore_comments:
+        if true (the default) comments will not be included in the
+        return value
+    :return:
+        A list of :class:`Token`
+
+    """
+
+    pos = 0
+    line = 1
+    column = 1
+    source_len = len(css_source)
+    tokens = []
+    while pos < source_len:
+        char = css_source[pos]
+        if char in ':;{}()[]':
+            type_ = char
+            css_value = char
+        else:
+            codepoint = min(ord(char), 160)
+            for _index, type_, regexp in tokens_dispatch[codepoint]:
+                match = regexp(css_source, pos)
+                if match:
+                    # First match is the longest. See comments on TOKENS above.
+                    css_value = match.group()
+                    break
+            else:
+                # No match.
+                # "Any other character not matched by the above rules,
+                #  and neither a single nor a double quote."
+                # ... but quotes at the start of a token are always matched
+                # by STRING or BAD_STRING. So DELIM is any single character.
+                type_ = 'DELIM'
+                css_value = char
+        length = len(css_value)
+        next_pos = pos + length
+
+        # A BAD_COMMENT is a comment at EOF. Ignore it too.
+        if not (ignore_comments and type_ in ('COMMENT', 'BAD_COMMENT')):
+            # Parse numbers, extract strings and URIs, unescape
+            unit = _None
+            if type_ == 'DIMENSION':
+                value = match.group(1)
+                value = float(value) if '.' in value else int(value)
+                unit = match.group(2)
+                unit = simple_unescape(unit)
+                unit = unicode_unescape(unit)
+                unit = unit.lower()  # normalize
+            elif type_ == 'PERCENTAGE':
+                value = css_value[:-1]
+                value = float(value) if '.' in value else int(value)
+                unit = '%'
+            elif type_ == 'NUMBER':
+                value = css_value
+                if '.' in value:
+                    value = float(value)
+                else:
+                    value = int(value)
+                    type_ = 'INTEGER'
+            elif type_ in ('IDENT', 'ATKEYWORD', 'HASH', 'FUNCTION'):
+                value = simple_unescape(css_value)
+                value = unicode_unescape(value)
+            elif type_ == 'URI':
+                value = match.group(1)
+                if value and value[0] in '"\'':
+                    value = value[1:-1]  # Remove quotes
+                    value = newline_unescape(value)
+                value = simple_unescape(value)
+                value = unicode_unescape(value)
+            elif type_ == 'STRING':
+                value = css_value[1:-1]  # Remove quotes
+                value = newline_unescape(value)
+                value = simple_unescape(value)
+                value = unicode_unescape(value)
+            # BAD_STRING can only be one of:
+            # * Unclosed string at the end of the stylesheet:
+            #   Close the string, but this is not an error.
+            #   Make it a "good" STRING token.
+            # * Unclosed string at the (unescaped) end of the line:
+            #   Close the string, but this is an error.
+            #   Leave it as a BAD_STRING, don’t bother parsing it.
+            # See http://www.w3.org/TR/CSS21/syndata.html#parsing-errors
+            elif type_ == 'BAD_STRING' and next_pos == source_len:
+                type_ = 'STRING'
+                value = css_value[1:]  # Remove quote
+                value = newline_unescape(value)
+                value = simple_unescape(value)
+                value = unicode_unescape(value)
+            else:
+                value = css_value
+            tokens.append(Token(type_, css_value, value, unit, line, column))
+
+        pos = next_pos
+        newlines = list(find_newlines(css_value))
+        if newlines:
+            line += len(newlines)
+            # Add 1 to have lines start at column 1, not 0
+            column = length - newlines[-1].end() + 1
+        else:
+            column += length
+    return tokens
+
+
+def regroup(tokens):
+    """
+    Match pairs of tokens: () [] {} function()
+    (Strings in "" or '' are taken care of by the tokenizer.)
+
+    Opening tokens are replaced by a :class:`ContainerToken`.
+    Closing tokens are removed. Unmatched closing tokens are invalid
+    but left as-is. All nested structures that are still open at
+    the end of the stylesheet are implicitly closed.
+
+    :param tokens:
+        a *flat* iterable of tokens, as returned by :func:`tokenize_flat`.
+    :return:
+        A tree of tokens.
+
+    """
+    # "global" objects for the inner recursion
+    pairs = {'FUNCTION': ')', '(': ')', '[': ']', '{': '}'}
+    tokens = iter(tokens)
+    eof = [False]
+
+    def _regroup_inner(stop_at=None,
+                       tokens=tokens, pairs=pairs, eof=eof,
+                       ContainerToken=token_data.ContainerToken,
+                       FunctionToken=token_data.FunctionToken):
+        for token in tokens:
+            type_ = token.type
+            if type_ == stop_at:
+                return
+
+            end = pairs.get(type_)
+            if end is None:
+                yield token  # Not a grouping token
+            else:
+                assert not isinstance(token, ContainerToken), (
+                    'Token looks already grouped: {0}'.format(token))
+                content = list(_regroup_inner(end))
+                if eof[0]:
+                    end = ''  # Implicit end of structure at EOF.
+                if type_ == 'FUNCTION':
+                    yield FunctionToken(token.type, token.as_css(), end,
+                                        token.value, content,
+                                        token.line, token.column)
+                else:
+                    yield ContainerToken(token.type, token.as_css(), end,
+                                         content,
+                                         token.line, token.column)
+        else:
+            eof[0] = True  # end of file/stylesheet
+    return _regroup_inner()
+
+
+def tokenize_grouped(css_source, ignore_comments=True):
+    """
+    :param css_source:
+        CSS as a Unicode string
+    :param ignore_comments:
+        if true (the default) comments will not be included in the
+        return value
+    :return:
+        An iterator of :class:`Token`
+
+    """
+    return regroup(tokenize_flat(css_source, ignore_comments))
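A sketch of the implicit-close behaviour that regroup() documents: a structure left open at EOF still becomes a container, and its missing end serializes as an empty string:

    outer, = regroup(tokenize_flat('(a (b)'))  # outer '(' is never closed
    assert outer.type == '('
    inner = [t for t in outer.content if t.is_container]
    assert inner[0].as_css() == '(b)'  # the closed inner group
    assert outer.as_css() == '(a (b)'  # the implicit end adds nothing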
+
+
+# Optional Cython version of tokenize_flat
+# Make both versions available with explicit names for tests.
+python_tokenize_flat = tokenize_flat
+try:
+    from . import speedups
+except ImportError:
+    cython_tokenize_flat = None
+else:
+    cython_tokenize_flat = speedups.tokenize_flat
+    # Default to the Cython version if available
+    tokenize_flat = cython_tokenize_flat
diff --git a/src/tinycss/version.py b/src/tinycss/version.py new file mode 100644 index 000000000000..014a8e46baa2 --- /dev/null +++ b/src/tinycss/version.py @@ -0,0 +1 @@
+VERSION = '0.3'
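Finally, the explicit names exported above allow the two implementations to be checked against each other; a hypothetical parity test, skipped when the compiled speedups module is absent:

    from tinycss.tokenizer import python_tokenize_flat, cython_tokenize_flat

    if cython_tokenize_flat is not None:
        css = 'a { margin: -2px }'
        py = [(t.type, t.value, t.unit) for t in python_tokenize_flat(css)]
        cy = [(t.type, t.value, t.unit) for t in cython_tokenize_flat(css)]
        assert py == cy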