diff --git a/COPYRIGHT b/COPYRIGHT
index 47aefca1fbab..788f4a3370a0 100644
--- a/COPYRIGHT
+++ b/COPYRIGHT
@@ -53,6 +53,10 @@ License: other
are permitted in any medium without royalty provided the copyright
notice and this notice are preserved.
+Files: src/tinycss/*
+Copyright: Simon Sapin
+License: BSD
+
Files: src/calibre/ebooks/readability/*
Copyright: Unknown
License: Apache 2.0
diff --git a/src/tinycss/__init__.py b/src/tinycss/__init__.py
new file mode 100644
index 000000000000..9eca2b1b4648
--- /dev/null
+++ b/src/tinycss/__init__.py
@@ -0,0 +1,44 @@
+# coding: utf8
+"""
+ tinycss
+ -------
+
+ A CSS parser, and nothing else.
+
+ :copyright: (c) 2012 by Simon Sapin.
+ :license: BSD, see LICENSE for more details.
+"""
+
+import sys
+
+from .version import VERSION
+__version__ = VERSION
+
+from .css21 import CSS21Parser
+from .page3 import CSSPage3Parser
+
+
+PARSER_MODULES = {
+ 'page3': CSSPage3Parser,
+}
+
+
+def make_parser(*features, **kwargs):
+ """Make a parser object with the chosen features.
+
+ :param features:
+ Positional arguments are base classes the new parser class will extend.
+ The string ``'page3'`` is accepted as short for
+ :class:`~page3.CSSPage3Parser`.
+ :param kwargs:
+ Keyword arguments are passed to the parser’s constructor.
+ :returns:
+ An instance of a new subclass of :class:`CSS21Parser`
+
+ """
+ if features:
+ bases = tuple(PARSER_MODULES.get(f, f) for f in features)
+ parser_class = type('CustomCSSParser', bases + (CSS21Parser,), {})
+ else:
+ parser_class = CSS21Parser
+ return parser_class(**kwargs)
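+
+
+# Example (illustrative sketch, not part of the original library):
+#
+#     parser = make_parser('page3')        # CSS 2.1 plus Paged Media syntax
+#     isinstance(parser, CSS21Parser)      # True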
diff --git a/src/tinycss/color3.py b/src/tinycss/color3.py
new file mode 100644
index 000000000000..187196e7a001
--- /dev/null
+++ b/src/tinycss/color3.py
@@ -0,0 +1,382 @@
+# coding: utf8
+"""
+    tinycss.color3
+    --------------
+
+ Parser for CSS 3 color values
+ http://www.w3.org/TR/css3-color/
+
+ This module does not provide anything that integrates in a parser class,
+ only functions that parse single tokens from (eg.) a property value.
+
+ :copyright: (c) 2012 by Simon Sapin.
+ :license: BSD, see LICENSE for more details.
+"""
+
+from __future__ import unicode_literals, division
+import collections
+import itertools
+import re
+
+from .tokenizer import tokenize_grouped
+
+
+class RGBA(collections.namedtuple('RGBA', ['red', 'green', 'blue', 'alpha'])):
+ """An RGBA color.
+
+ A tuple of four floats in the 0..1 range: ``(r, g, b, a)``.
+ Also has ``red``, ``green``, ``blue`` and ``alpha`` attributes to access
+ the same values.
+
+ """
+
+
+def parse_color_string(css_string):
+ """Parse a CSS string as a color value.
+
+ This is a convenience wrapper around :func:`parse_color` in case you
+ have a string that is not from a CSS stylesheet.
+
+ :param css_string:
+        A Unicode string in CSS syntax.
+ :returns:
+ Same as :func:`parse_color`.
+
+ """
+ tokens = list(tokenize_grouped(css_string.strip()))
+ if len(tokens) == 1:
+ return parse_color(tokens[0])
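+
+# Hedged sanity checks (illustrative, not from the original source):
+# parse_color_string('red') == RGBA(1.0, 0.0, 0.0, 1.0)
+# parse_color_string('#0f0') == RGBA(0.0, 1.0, 0.0, 1.0)
+# parse_color_string('hsl(120, 100%, 50%)') == RGBA(0.0, 1.0, 0.0, 1.0)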
+
+
+def parse_color(token):
+    """Parse a single token as a color value.
+
+ :param token:
+ A single :class:`~.token_data.Token` or
+ :class:`~.token_data.ContainerToken`, as found eg. in a
+ property value.
+ :returns:
+ * ``None``, if the token is not a valid CSS 3 color value.
+ (No exception is raised.)
+ * For the *currentColor* keyword: the string ``'currentColor'``
+      * Every other value (including keywords, HSL and HSLA) is converted
+ to RGBA and returned as an :class:`RGBA` object (a 4-tuple with
+ attribute access).
+ The alpha channel is clipped to [0, 1], but R, G, or B can be
+ out of range (eg. ``rgb(-51, 306, 0)`` is represented as
+ ``(-.2, 1.2, 0, 1)``.)
+
+ """
+ if token.type == 'IDENT':
+ return COLOR_KEYWORDS.get(token.value.lower())
+ elif token.type == 'HASH':
+ for multiplier, regexp in HASH_REGEXPS:
+ match = regexp(token.value)
+ if match:
+ r, g, b = [int(group * multiplier, 16) / 255
+ for group in match.groups()]
+ return RGBA(r, g, b, 1.)
+ elif token.type == 'FUNCTION':
+ args = parse_comma_separated(token.content)
+ if args:
+ name = token.function_name.lower()
+ if name == 'rgb':
+ return parse_rgb(args, alpha=1.)
+ elif name == 'rgba':
+ alpha = parse_alpha(args[3:])
+ if alpha is not None:
+ return parse_rgb(args[:3], alpha)
+ elif name == 'hsl':
+ return parse_hsl(args, alpha=1.)
+ elif name == 'hsla':
+ alpha = parse_alpha(args[3:])
+ if alpha is not None:
+ return parse_hsl(args[:3], alpha)
+
+
+def parse_alpha(args):
+ """
+ If args is a list of a single INTEGER or NUMBER token,
+    return its value clipped to the 0..1 range.
+ Otherwise, return None.
+ """
+ if len(args) == 1 and args[0].type in ('NUMBER', 'INTEGER'):
+ return min(1, max(0, args[0].value))
+
+
+def parse_rgb(args, alpha):
+ """
+ If args is a list of 3 INTEGER tokens or 3 PERCENTAGE tokens,
+ return RGB values as a tuple of 3 floats in 0..1.
+ Otherwise, return None.
+ """
+ types = [arg.type for arg in args]
+ if types == ['INTEGER', 'INTEGER', 'INTEGER']:
+ r, g, b = [arg.value / 255 for arg in args[:3]]
+ return RGBA(r, g, b, alpha)
+ elif types == ['PERCENTAGE', 'PERCENTAGE', 'PERCENTAGE']:
+ r, g, b = [arg.value / 100 for arg in args[:3]]
+ return RGBA(r, g, b, alpha)
+
+
+def parse_hsl(args, alpha):
+ """
+ If args is a list of 1 INTEGER token and 2 PERCENTAGE tokens,
+ return RGB values as a tuple of 3 floats in 0..1.
+ Otherwise, return None.
+ """
+ types = [arg.type for arg in args]
+ if types == ['INTEGER', 'PERCENTAGE', 'PERCENTAGE']:
+ hsl = [arg.value for arg in args[:3]]
+ r, g, b = hsl_to_rgb(*hsl)
+ return RGBA(r, g, b, alpha)
+
+
+def hsl_to_rgb(hue, saturation, lightness):
+ """
+ :param hue: degrees
+ :param saturation: percentage
+ :param lightness: percentage
+ :returns: (r, g, b) as floats in the 0..1 range
+ """
+ hue = (hue / 360) % 1
+ saturation = min(1, max(0, saturation / 100))
+ lightness = min(1, max(0, lightness / 100))
+
+ # Translated from ABC: http://www.w3.org/TR/css3-color/#hsl-color
+ def hue_to_rgb(m1, m2, h):
+ if h < 0:
+ h += 1
+ if h > 1:
+ h -= 1
+ if h * 6 < 1:
+ return m1 + (m2 - m1) * h * 6
+ if h * 2 < 1:
+ return m2
+ if h * 3 < 2:
+ return m1 + (m2 - m1) * (2 / 3 - h) * 6
+ return m1
+
+ if lightness <= 0.5:
+ m2 = lightness * (saturation + 1)
+ else:
+ m2 = lightness + saturation - lightness * saturation
+ m1 = lightness * 2 - m2
+ return (
+ hue_to_rgb(m1, m2, hue + 1 / 3),
+ hue_to_rgb(m1, m2, hue),
+ hue_to_rgb(m1, m2, hue - 1 / 3),
+ )
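+
+# Worked example, hand-checked against the algorithm above:
+# hsl(0, 100%, 50%) is pure red: hsl_to_rgb(0, 100, 50) == (1.0, 0.0, 0.0)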
+
+
+def parse_comma_separated(tokens):
+ """Parse a list of tokens (typically the content of a function token)
+ as arguments made of a single token each, separated by mandatory commas,
+ with optional white space around each argument.
+
+ return the argument list without commas or white space;
+    or None if the function token content does not match the description above.
+
+ """
+ tokens = [token for token in tokens if token.type != 'S']
+ if not tokens:
+ return []
+ if len(tokens) % 2 == 1 and all(
+ token.type == 'DELIM' and token.value == ','
+ for token in tokens[1::2]):
+ return tokens[::2]
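+
+# For example, the content of ``rgb(255, 0, 0)`` (three INTEGER tokens
+# separated by ',' DELIM tokens and white space) parses to the three
+# INTEGER tokens; a missing comma, eg. ``rgb(255 0 0)``, yields None.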
+
+
+HASH_REGEXPS = (
+    (2, re.compile(r'^#([\da-f])([\da-f])([\da-f])$', re.I).match),
+    (1, re.compile(r'^#([\da-f]{2})([\da-f]{2})([\da-f]{2})$', re.I).match),
+)
+
+
+# (r, g, b) in 0..255
+BASIC_COLOR_KEYWORDS = [
+ ('black', (0, 0, 0)),
+ ('silver', (192, 192, 192)),
+ ('gray', (128, 128, 128)),
+ ('white', (255, 255, 255)),
+ ('maroon', (128, 0, 0)),
+ ('red', (255, 0, 0)),
+ ('purple', (128, 0, 128)),
+ ('fuchsia', (255, 0, 255)),
+ ('green', (0, 128, 0)),
+ ('lime', (0, 255, 0)),
+ ('olive', (128, 128, 0)),
+ ('yellow', (255, 255, 0)),
+ ('navy', (0, 0, 128)),
+ ('blue', (0, 0, 255)),
+ ('teal', (0, 128, 128)),
+ ('aqua', (0, 255, 255)),
+]
+
+
+# (r, g, b) in 0..255
+EXTENDED_COLOR_KEYWORDS = [
+ ('aliceblue', (240, 248, 255)),
+ ('antiquewhite', (250, 235, 215)),
+ ('aqua', (0, 255, 255)),
+ ('aquamarine', (127, 255, 212)),
+ ('azure', (240, 255, 255)),
+ ('beige', (245, 245, 220)),
+ ('bisque', (255, 228, 196)),
+ ('black', (0, 0, 0)),
+ ('blanchedalmond', (255, 235, 205)),
+ ('blue', (0, 0, 255)),
+ ('blueviolet', (138, 43, 226)),
+ ('brown', (165, 42, 42)),
+ ('burlywood', (222, 184, 135)),
+ ('cadetblue', (95, 158, 160)),
+ ('chartreuse', (127, 255, 0)),
+ ('chocolate', (210, 105, 30)),
+ ('coral', (255, 127, 80)),
+ ('cornflowerblue', (100, 149, 237)),
+ ('cornsilk', (255, 248, 220)),
+ ('crimson', (220, 20, 60)),
+ ('cyan', (0, 255, 255)),
+ ('darkblue', (0, 0, 139)),
+ ('darkcyan', (0, 139, 139)),
+ ('darkgoldenrod', (184, 134, 11)),
+ ('darkgray', (169, 169, 169)),
+ ('darkgreen', (0, 100, 0)),
+ ('darkgrey', (169, 169, 169)),
+ ('darkkhaki', (189, 183, 107)),
+ ('darkmagenta', (139, 0, 139)),
+ ('darkolivegreen', (85, 107, 47)),
+ ('darkorange', (255, 140, 0)),
+ ('darkorchid', (153, 50, 204)),
+ ('darkred', (139, 0, 0)),
+ ('darksalmon', (233, 150, 122)),
+ ('darkseagreen', (143, 188, 143)),
+ ('darkslateblue', (72, 61, 139)),
+ ('darkslategray', (47, 79, 79)),
+ ('darkslategrey', (47, 79, 79)),
+ ('darkturquoise', (0, 206, 209)),
+ ('darkviolet', (148, 0, 211)),
+ ('deeppink', (255, 20, 147)),
+ ('deepskyblue', (0, 191, 255)),
+ ('dimgray', (105, 105, 105)),
+ ('dimgrey', (105, 105, 105)),
+ ('dodgerblue', (30, 144, 255)),
+ ('firebrick', (178, 34, 34)),
+ ('floralwhite', (255, 250, 240)),
+ ('forestgreen', (34, 139, 34)),
+ ('fuchsia', (255, 0, 255)),
+ ('gainsboro', (220, 220, 220)),
+ ('ghostwhite', (248, 248, 255)),
+ ('gold', (255, 215, 0)),
+ ('goldenrod', (218, 165, 32)),
+ ('gray', (128, 128, 128)),
+ ('green', (0, 128, 0)),
+ ('greenyellow', (173, 255, 47)),
+ ('grey', (128, 128, 128)),
+ ('honeydew', (240, 255, 240)),
+ ('hotpink', (255, 105, 180)),
+ ('indianred', (205, 92, 92)),
+ ('indigo', (75, 0, 130)),
+ ('ivory', (255, 255, 240)),
+ ('khaki', (240, 230, 140)),
+ ('lavender', (230, 230, 250)),
+ ('lavenderblush', (255, 240, 245)),
+ ('lawngreen', (124, 252, 0)),
+ ('lemonchiffon', (255, 250, 205)),
+ ('lightblue', (173, 216, 230)),
+ ('lightcoral', (240, 128, 128)),
+ ('lightcyan', (224, 255, 255)),
+ ('lightgoldenrodyellow', (250, 250, 210)),
+ ('lightgray', (211, 211, 211)),
+ ('lightgreen', (144, 238, 144)),
+ ('lightgrey', (211, 211, 211)),
+ ('lightpink', (255, 182, 193)),
+ ('lightsalmon', (255, 160, 122)),
+ ('lightseagreen', (32, 178, 170)),
+ ('lightskyblue', (135, 206, 250)),
+ ('lightslategray', (119, 136, 153)),
+ ('lightslategrey', (119, 136, 153)),
+ ('lightsteelblue', (176, 196, 222)),
+ ('lightyellow', (255, 255, 224)),
+ ('lime', (0, 255, 0)),
+ ('limegreen', (50, 205, 50)),
+ ('linen', (250, 240, 230)),
+ ('magenta', (255, 0, 255)),
+ ('maroon', (128, 0, 0)),
+ ('mediumaquamarine', (102, 205, 170)),
+ ('mediumblue', (0, 0, 205)),
+ ('mediumorchid', (186, 85, 211)),
+ ('mediumpurple', (147, 112, 219)),
+ ('mediumseagreen', (60, 179, 113)),
+ ('mediumslateblue', (123, 104, 238)),
+ ('mediumspringgreen', (0, 250, 154)),
+ ('mediumturquoise', (72, 209, 204)),
+ ('mediumvioletred', (199, 21, 133)),
+ ('midnightblue', (25, 25, 112)),
+ ('mintcream', (245, 255, 250)),
+ ('mistyrose', (255, 228, 225)),
+ ('moccasin', (255, 228, 181)),
+ ('navajowhite', (255, 222, 173)),
+ ('navy', (0, 0, 128)),
+ ('oldlace', (253, 245, 230)),
+ ('olive', (128, 128, 0)),
+ ('olivedrab', (107, 142, 35)),
+ ('orange', (255, 165, 0)),
+ ('orangered', (255, 69, 0)),
+ ('orchid', (218, 112, 214)),
+ ('palegoldenrod', (238, 232, 170)),
+ ('palegreen', (152, 251, 152)),
+ ('paleturquoise', (175, 238, 238)),
+ ('palevioletred', (219, 112, 147)),
+ ('papayawhip', (255, 239, 213)),
+ ('peachpuff', (255, 218, 185)),
+ ('peru', (205, 133, 63)),
+ ('pink', (255, 192, 203)),
+ ('plum', (221, 160, 221)),
+ ('powderblue', (176, 224, 230)),
+ ('purple', (128, 0, 128)),
+ ('red', (255, 0, 0)),
+ ('rosybrown', (188, 143, 143)),
+ ('royalblue', (65, 105, 225)),
+ ('saddlebrown', (139, 69, 19)),
+ ('salmon', (250, 128, 114)),
+ ('sandybrown', (244, 164, 96)),
+ ('seagreen', (46, 139, 87)),
+ ('seashell', (255, 245, 238)),
+ ('sienna', (160, 82, 45)),
+ ('silver', (192, 192, 192)),
+ ('skyblue', (135, 206, 235)),
+ ('slateblue', (106, 90, 205)),
+ ('slategray', (112, 128, 144)),
+ ('slategrey', (112, 128, 144)),
+ ('snow', (255, 250, 250)),
+ ('springgreen', (0, 255, 127)),
+ ('steelblue', (70, 130, 180)),
+ ('tan', (210, 180, 140)),
+ ('teal', (0, 128, 128)),
+ ('thistle', (216, 191, 216)),
+ ('tomato', (255, 99, 71)),
+ ('turquoise', (64, 224, 208)),
+ ('violet', (238, 130, 238)),
+ ('wheat', (245, 222, 179)),
+ ('white', (255, 255, 255)),
+ ('whitesmoke', (245, 245, 245)),
+ ('yellow', (255, 255, 0)),
+ ('yellowgreen', (154, 205, 50)),
+]
+
+
+# (r, g, b, a) in 0..1 or a string marker
+SPECIAL_COLOR_KEYWORDS = {
+ 'currentcolor': 'currentColor',
+ 'transparent': RGBA(0., 0., 0., 0.),
+}
+
+
+# RGBA namedtuples of (r, g, b, a) in 0..1 or a string marker
+COLOR_KEYWORDS = SPECIAL_COLOR_KEYWORDS.copy()
+COLOR_KEYWORDS.update(
+ # 255 maps to 1, 0 to 0, the rest is linear.
+ (keyword, RGBA(r / 255., g / 255., b / 255., 1.))
+ for keyword, (r, g, b) in itertools.chain(
+ BASIC_COLOR_KEYWORDS, EXTENDED_COLOR_KEYWORDS))
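+
+
+# Illustrative lookups against the tables above:
+# COLOR_KEYWORDS['navy'] == RGBA(0.0, 0.0, 128 / 255., 1.0)
+# COLOR_KEYWORDS['transparent'] == RGBA(0.0, 0.0, 0.0, 0.0)
+# COLOR_KEYWORDS['currentcolor'] == 'currentColor'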
diff --git a/src/tinycss/css21.py b/src/tinycss/css21.py
new file mode 100644
index 000000000000..51e6529226f7
--- /dev/null
+++ b/src/tinycss/css21.py
@@ -0,0 +1,815 @@
+# coding: utf8
+"""
+ tinycss.css21
+ -------------
+
+ Parser for CSS 2.1
+ http://www.w3.org/TR/CSS21/syndata.html
+
+ :copyright: (c) 2012 by Simon Sapin.
+ :license: BSD, see LICENSE for more details.
+"""
+
+from __future__ import unicode_literals
+from itertools import chain, islice
+
+from .decoding import decode
+from .token_data import TokenList
+from .tokenizer import tokenize_grouped
+from .parsing import (strip_whitespace, remove_whitespace, split_on_comma,
+ validate_value, validate_block, validate_any, ParseError)
+
+
+# stylesheet : [ CDO | CDC | S | statement ]*;
+# statement : ruleset | at-rule;
+# at-rule : ATKEYWORD S* any* [ block | ';' S* ];
+# block : '{' S* [ any | block | ATKEYWORD S* | ';' S* ]* '}' S*;
+# ruleset : selector? '{' S* declaration? [ ';' S* declaration? ]* '}' S*;
+# selector : any+;
+# declaration : property S* ':' S* value;
+# property : IDENT;
+# value : [ any | block | ATKEYWORD S* ]+;
+# any : [ IDENT | NUMBER | PERCENTAGE | DIMENSION | STRING
+# | DELIM | URI | HASH | UNICODE-RANGE | INCLUDES
+# | DASHMATCH | ':' | FUNCTION S* [any|unused]* ')'
+# | '(' S* [any|unused]* ')' | '[' S* [any|unused]* ']'
+# ] S*;
+# unused : block | ATKEYWORD S* | ';' S* | CDO S* | CDC S*;
+
+
+class Stylesheet(object):
+ """
+ A parsed CSS stylesheet.
+
+ .. attribute:: rules
+
+ A mixed list, in source order, of :class:`RuleSet` and various
+ at-rules such as :class:`ImportRule`, :class:`MediaRule`
+ and :class:`PageRule`.
+ Use their :obj:`at_keyword` attribute to distinguish them.
+
+ .. attribute:: errors
+
+ A list of :class:`~.parsing.ParseError`. Invalid rules and declarations
+ are ignored, with the details logged in this list.
+
+ .. attribute:: encoding
+
+ The character encoding that was used to decode the stylesheet
+ from bytes, or ``None`` for Unicode stylesheets.
+
+ """
+ def __init__(self, rules, errors, encoding):
+ self.rules = rules
+ self.errors = errors
+ self.encoding = encoding
+
+ def __repr__(self):
+ return '<{0.__class__.__name__} {1} rules {2} errors>'.format(
+ self, len(self.rules), len(self.errors))
+
+
+class AtRule(object):
+ """
+ An unparsed at-rule.
+
+ .. attribute:: at_keyword
+
+ The normalized (lower-case) at-keyword as a string. Eg: ``'@page'``
+
+ .. attribute:: head
+
+ The part of the at-rule between the at-keyword and the ``{``
+ marking the body, or the ``;`` marking the end of an at-rule without
+ a body. A :class:`~.token_data.TokenList`.
+
+ .. attribute:: body
+
+ The content of the body between ``{`` and ``}`` as a
+ :class:`~.token_data.TokenList`, or ``None`` if there is no body
+ (ie. if the rule ends with ``;``).
+
+ The head was validated against the core grammar but **not** the body,
+ as the body might contain declarations. In case of an error in a
+ declaration, parsing should continue from the next declaration.
+ The whole rule should not be ignored as it would be for an error
+ in the head.
+
+ These at-rules are expected to be parsed further before reaching
+ the user API.
+
+ """
+ def __init__(self, at_keyword, head, body, line, column):
+ self.at_keyword = at_keyword
+ self.head = TokenList(head)
+ self.body = TokenList(body) if body is not None else body
+ self.line = line
+ self.column = column
+
+ def __repr__(self):
+ return ('<{0.__class__.__name__} {0.line}:{0.column} {0.at_keyword}>'
+ .format(self))
+
+
+class RuleSet(object):
+ """A ruleset.
+
+ .. attribute:: at_keyword
+
+ Always ``None``. Helps to tell rulesets apart from at-rules.
+
+ .. attribute:: selector
+
+ The selector as a :class:`~.token_data.TokenList`.
+ In CSS 3, this is actually called a selector group.
+
+ ``rule.selector.as_css()`` gives the selector as a string.
+ This string can be used with *cssselect*, see :ref:`selectors3`.
+
+ .. attribute:: declarations
+
+ The list of :class:`Declaration`, in source order.
+
+ """
+
+ at_keyword = None
+
+ def __init__(self, selector, declarations, line, column):
+ self.selector = TokenList(selector)
+ self.declarations = declarations
+ self.line = line
+ self.column = column
+
+ def __repr__(self):
+ return ('<{0.__class__.__name__} at {0.line}:{0.column} {1}>'
+ .format(self, self.selector.as_css()))
+
+
+class Declaration(object):
+ """A property declaration.
+
+ .. attribute:: name
+
+ The property name as a normalized (lower-case) string.
+
+ .. attribute:: value
+
+ The property value as a :class:`~.token_data.TokenList`.
+
+ The value is not parsed. UAs using tinycss may only support
+ some properties or some values and tinycss does not know which.
+ They need to parse values themselves and ignore declarations with
+ unknown or unsupported properties or values, and fall back
+ on any previous declaration.
+
+ :mod:`tinycss.color3` parses color values, but other values
+ will need specific parsing/validation code.
+
+ .. attribute:: priority
+
+ Either the string ``'important'`` or ``None``.
+
+ """
+ def __init__(self, name, value, priority, line, column):
+ self.name = name
+ self.value = TokenList(value)
+ self.priority = priority
+ self.line = line
+ self.column = column
+
+ def __repr__(self):
+ priority = ' !' + self.priority if self.priority else ''
+ return ('<{0.__class__.__name__} {0.line}:{0.column}'
+ ' {0.name}: {1}{2}>'.format(
+ self, self.value.as_css(), priority))
+
+
+class PageRule(object):
+ """A parsed CSS 2.1 @page rule.
+
+ .. attribute:: at_keyword
+
+ Always ``'@page'``
+
+ .. attribute:: selector
+
+ The page selector.
+ In CSS 2.1 this is either ``None`` (no selector), or the string
+ ``'first'``, ``'left'`` or ``'right'`` for the pseudo class
+ of the same name.
+
+ .. attribute:: specificity
+
+ Specificity of the page selector. This is a tuple of four integers,
+ but these tuples are mostly meant to be compared to each other.
+
+ .. attribute:: declarations
+
+ A list of :class:`Declaration`, in source order.
+
+ .. attribute:: at_rules
+
+ The list of parsed at-rules inside the @page block, in source order.
+ Always empty for CSS 2.1.
+
+ """
+ at_keyword = '@page'
+
+ def __init__(self, selector, specificity, declarations, at_rules,
+ line, column):
+ self.selector = selector
+ self.specificity = specificity
+ self.declarations = declarations
+ self.at_rules = at_rules
+ self.line = line
+ self.column = column
+
+ def __repr__(self):
+ return ('<{0.__class__.__name__} {0.line}:{0.column}'
+ ' {0.selector}>'.format(self))
+
+
+class MediaRule(object):
+ """A parsed @media rule.
+
+ .. attribute:: at_keyword
+
+ Always ``'@media'``
+
+ .. attribute:: media
+
+ For CSS 2.1 without media queries: the media types
+ as a list of strings.
+
+ .. attribute:: rules
+
+ The list :class:`RuleSet` and various at-rules inside the @media
+        The list of :class:`RuleSet` and various at-rules inside the @media
+
+ """
+ at_keyword = '@media'
+
+ def __init__(self, media, rules, line, column):
+ self.media = media
+ self.rules = rules
+ self.line = line
+ self.column = column
+
+ def __repr__(self):
+ return ('<{0.__class__.__name__} {0.line}:{0.column}'
+ ' {0.media}>'.format(self))
+
+
+class ImportRule(object):
+ """A parsed @import rule.
+
+ .. attribute:: at_keyword
+
+ Always ``'@import'``
+
+ .. attribute:: uri
+
+ The URI to be imported, as read from the stylesheet.
+ (URIs are not made absolute.)
+
+ .. attribute:: media
+
+ For CSS 2.1 without media queries: the media types
+ as a list of strings.
+ This attribute is explicitly ``['all']`` if the media was omitted
+ in the source.
+
+ """
+ at_keyword = '@import'
+
+ def __init__(self, uri, media, line, column):
+ self.uri = uri
+ self.media = media
+ self.line = line
+ self.column = column
+
+ def __repr__(self):
+ return ('<{0.__class__.__name__} {0.line}:{0.column}'
+ ' {0.uri}>'.format(self))
+
+
+
+def _remove_at_charset(tokens):
+    """Remove any valid @charset at the beginning of a token stream.
+
+ :param tokens:
+ An iterable of tokens
+ :returns:
+ A possibly truncated iterable of tokens
+
+ """
+ tokens = iter(tokens)
+ header = list(islice(tokens, 4))
+ if [t.type for t in header] == ['ATKEYWORD', 'S', 'STRING', ';']:
+ atkw, space, string, semicolon = header
+ if ((atkw.value, space.value) == ('@charset', ' ')
+ and string.as_css()[0] == '"'):
+ # Found a valid @charset rule, only keep what’s after it.
+ return tokens
+ return chain(header, tokens)
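+
+# For example, the token stream of '@charset "utf-8"; a{}' comes back with
+# the @charset rule removed, while a malformed rule (eg. a single-quoted
+# encoding name) is passed through unchanged.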
+
+
+class CSS21Parser(object):
+ """Parser for CSS 2.1
+
+ This parser supports the core CSS syntax as well as @import, @media,
+ @page and !important.
+
+ Note that property values are still not parsed, as UAs using this
+ parser may only support some properties or some values.
+
+ Currently the parser holds no state. It being a class only allows
+ subclassing and overriding its methods.
+
+ """
+
+ # User API:
+
+ def parse_stylesheet_file(self, css_file, protocol_encoding=None,
+ linking_encoding=None, document_encoding=None):
+ """Parse a stylesheet from a file or filename.
+
+ Character encoding-related parameters and behavior are the same
+ as in :meth:`parse_stylesheet_bytes`.
+
+ :param css_file:
+ Either a file (any object with a :meth:`~file.read` method)
+ or a filename.
+ :return:
+ A :class:`Stylesheet`.
+
+ """
+ if hasattr(css_file, 'read'):
+ css_bytes = css_file.read()
+ else:
+ with open(css_file, 'rb') as fd:
+ css_bytes = fd.read()
+ return self.parse_stylesheet_bytes(css_bytes, protocol_encoding,
+ linking_encoding, document_encoding)
+
+ def parse_stylesheet_bytes(self, css_bytes, protocol_encoding=None,
+ linking_encoding=None, document_encoding=None):
+ """Parse a stylesheet from a byte string.
+
+ The character encoding is determined from the passed metadata and the
+ ``@charset`` rule in the stylesheet (if any).
+ If no encoding information is available or decoding fails,
+        decoding defaults to UTF-8 and then falls back on ISO-8859-1.
+
+ :param css_bytes:
+ A CSS stylesheet as a byte string.
+ :param protocol_encoding:
+ The "charset" parameter of a "Content-Type" HTTP header (if any),
+ or similar metadata for other protocols.
+ :param linking_encoding:
+            ``<link charset="">`` or other metadata from the linking mechanism
+ (if any)
+ :param document_encoding:
+ Encoding of the referring style sheet or document (if any)
+ :return:
+ A :class:`Stylesheet`.
+
+ """
+ css_unicode, encoding = decode(css_bytes, protocol_encoding,
+ linking_encoding, document_encoding)
+ return self.parse_stylesheet(css_unicode, encoding=encoding)
+
+ def parse_stylesheet(self, css_unicode, encoding=None):
+        """Parse a stylesheet from a Unicode string.
+
+ :param css_unicode:
+            A CSS stylesheet as a Unicode string.
+ :param encoding:
+ The character encoding used to decode the stylesheet from bytes,
+ if any.
+ :return:
+ A :class:`Stylesheet`.
+
+ """
+ tokens = tokenize_grouped(css_unicode)
+ if encoding:
+ tokens = _remove_at_charset(tokens)
+ rules, errors = self.parse_rules(tokens, context='stylesheet')
+ return Stylesheet(rules, errors, encoding)
+
+ def parse_style_attr(self, css_source):
+ """Parse a "style" attribute (eg. of an HTML element).
+
+ This method only accepts Unicode as the source (HTML) document
+ is supposed to handle the character encoding.
+
+ :param css_source:
+            The attribute value, as a Unicode string.
+ :return:
+ A tuple of the list of valid :class:`Declaration` and
+ a list of :class:`~.parsing.ParseError`.
+ """
+ return self.parse_declaration_list(tokenize_grouped(css_source))
+
+ # API for subclasses:
+
+ def parse_rules(self, tokens, context):
+ """Parse a sequence of rules (rulesets and at-rules).
+
+ :param tokens:
+ An iterable of tokens.
+ :param context:
+ Either ``'stylesheet'`` or an at-keyword such as ``'@media'``.
+ (Most at-rules are only allowed in some contexts.)
+ :return:
+ A tuple of a list of parsed rules and a list of
+ :class:`~.parsing.ParseError`.
+
+ """
+ rules = []
+ errors = []
+ tokens = iter(tokens)
+ for token in tokens:
+ if token.type not in ('S', 'CDO', 'CDC'):
+ try:
+ if token.type == 'ATKEYWORD':
+ rule = self.read_at_rule(token, tokens)
+ result = self.parse_at_rule(
+ rule, rules, errors, context)
+ rules.append(result)
+ else:
+ rule, rule_errors = self.parse_ruleset(token, tokens)
+ rules.append(rule)
+ errors.extend(rule_errors)
+ except ParseError as exc:
+ errors.append(exc)
+ # Skip the entire rule
+ return rules, errors
+
+ def read_at_rule(self, at_keyword_token, tokens):
+ """Read an at-rule from a token stream.
+
+ :param at_keyword_token:
+            The ATKEYWORD token that starts this at-rule.
+ You may have read it already to distinguish the rule
+ from a ruleset.
+ :param tokens:
+ An iterator of subsequent tokens. Will be consumed just enough
+ for one at-rule.
+ :return:
+ An unparsed :class:`AtRule`.
+ :raises:
+ :class:`~.parsing.ParseError` if the head is invalid for the core
+ grammar. The body is **not** validated. See :class:`AtRule`.
+
+ """
+ # CSS syntax is case-insensitive
+ at_keyword = at_keyword_token.value.lower()
+ head = []
+ # For the ParseError in case `tokens` is empty:
+ token = at_keyword_token
+ for token in tokens:
+ if token.type in '{;':
+ break
+ # Ignore white space just after the at-keyword.
+            elif head or token.type != 'S':
+ head.append(token)
+ # On unexpected end of stylesheet, pretend that a ';' was there
+ head = strip_whitespace(head)
+ for head_token in head:
+ validate_any(head_token, 'at-rule head')
+ body = token.content if token.type == '{' else None
+ return AtRule(at_keyword, head, body,
+ at_keyword_token.line, at_keyword_token.column)
+
+ def parse_at_rule(self, rule, previous_rules, errors, context):
+ """Parse an at-rule.
+
+ Subclasses that override this method must use ``super()`` and
+ pass its return value for at-rules they do not know.
+
+ In CSS 2.1, this method handles @charset, @import, @media and @page
+ rules.
+
+ :param rule:
+ An unparsed :class:`AtRule`.
+ :param previous_rules:
+ The list of at-rules and rulesets that have been parsed so far
+ in this context. This list can be used to decide if the current
+ rule is valid. (For example, @import rules are only allowed
+            after @charset and other @import rules.)
+ :param context:
+ Either ``'stylesheet'`` or an at-keyword such as ``'@media'``.
+ (Most at-rules are only allowed in some contexts.)
+ :raises:
+ :class:`~.parsing.ParseError` if the rule is invalid.
+ :return:
+ A parsed at-rule
+
+ """
+ if rule.at_keyword == '@page':
+ if context != 'stylesheet':
+ raise ParseError(rule, '@page rule not allowed in ' + context)
+ selector, specificity = self.parse_page_selector(rule.head)
+ if rule.body is None:
+ raise ParseError(rule,
+ 'invalid {0} rule: missing block'.format(rule.at_keyword))
+ declarations, at_rules, rule_errors = \
+ self.parse_declarations_and_at_rules(rule.body, '@page')
+ errors.extend(rule_errors)
+ return PageRule(selector, specificity, declarations, at_rules,
+ rule.line, rule.column)
+
+ elif rule.at_keyword == '@media':
+ if context != 'stylesheet':
+ raise ParseError(rule, '@media rule not allowed in ' + context)
+ if not rule.head:
+ raise ParseError(rule, 'expected media types for @media')
+ media = self.parse_media(rule.head)
+ if rule.body is None:
+ raise ParseError(rule,
+ 'invalid {0} rule: missing block'.format(rule.at_keyword))
+ rules, rule_errors = self.parse_rules(rule.body, '@media')
+ errors.extend(rule_errors)
+ return MediaRule(media, rules, rule.line, rule.column)
+
+ elif rule.at_keyword == '@import':
+ if context != 'stylesheet':
+ raise ParseError(rule,
+ '@import rule not allowed in ' + context)
+ for previous_rule in previous_rules:
+ if previous_rule.at_keyword not in ('@charset', '@import'):
+ if previous_rule.at_keyword:
+ type_ = 'an {0} rule'.format(previous_rule.at_keyword)
+ else:
+ type_ = 'a ruleset'
+ raise ParseError(previous_rule,
+ '@import rule not allowed after ' + type_)
+ head = rule.head
+ if not head:
+ raise ParseError(rule,
+ 'expected URI or STRING for @import rule')
+ if head[0].type not in ('URI', 'STRING'):
+ raise ParseError(rule,
+ 'expected URI or STRING for @import rule, got '
+ + head[0].type)
+ uri = head[0].value
+ media = self.parse_media(strip_whitespace(head[1:]))
+ if rule.body is not None:
+ # The position of the ';' token would be best, but we don’t
+ # have it anymore here.
+ raise ParseError(head[-1], "expected ';', got a block")
+ return ImportRule(uri, media, rule.line, rule.column)
+
+ elif rule.at_keyword == '@charset':
+ raise ParseError(rule, 'mis-placed or malformed @charset rule')
+
+ else:
+ raise ParseError(rule, 'unknown at-rule in {0} context: {1}'
+ .format(context, rule.at_keyword))
+
+ def parse_media(self, tokens):
+ """For CSS 2.1, parse a list of media types.
+
+ Media Queries are expected to override this.
+
+ :param tokens:
+ A list of tokens
+ :raises:
+ :class:`~.parsing.ParseError` on invalid media types/queries
+ :returns:
+ For CSS 2.1, a list of media types as strings
+ """
+ if not tokens:
+ return ['all']
+ media_types = []
+ for part in split_on_comma(remove_whitespace(tokens)):
+ types = [token.type for token in part]
+ if types == ['IDENT']:
+ media_types.append(part[0].value)
+ else:
+ raise ParseError(tokens[0], 'expected a media type'
+ + ((', got ' + ', '.join(types)) if types else ''))
+ return media_types
+
+ def parse_page_selector(self, tokens):
+ """Parse an @page selector.
+
+ :param tokens:
+            An iterable of tokens, typically from the ``head`` attribute of
+            an unparsed :class:`AtRule`.
+        :returns:
+            A ``(selector, specificity)`` tuple. For CSS 2.1, the selector
+            is ``'first'``, ``'left'``, ``'right'`` or ``None``.
+ :raises:
+ :class:`~.parsing.ParseError` on invalid selectors
+
+ """
+ if not tokens:
+ return None, (0, 0)
+ if (len(tokens) == 2 and tokens[0].type == ':'
+ and tokens[1].type == 'IDENT'):
+ pseudo_class = tokens[1].value
+ specificity = {
+ 'first': (1, 0), 'left': (0, 1), 'right': (0, 1),
+ }.get(pseudo_class)
+ if specificity:
+ return pseudo_class, specificity
+ raise ParseError(tokens[0], 'invalid @page selector')
+
+ def parse_declarations_and_at_rules(self, tokens, context):
+ """Parse a mixed list of declarations and at rules, as found eg.
+ in the body of an @page rule.
+
+ Note that to add supported at-rules inside @page,
+ :class:`~.page3.CSSPage3Parser` extends :meth:`parse_at_rule`,
+ not this method.
+
+ :param tokens:
+            An iterable of tokens, typically from the ``body`` attribute of
+ an unparsed :class:`AtRule`.
+ :param context:
+ An at-keyword such as ``'@page'``.
+ (Most at-rules are only allowed in some contexts.)
+ :returns:
+ A tuple of:
+
+ * A list of :class:`Declaration`
+ * A list of parsed at-rules (empty for CSS 2.1)
+ * A list of :class:`~.parsing.ParseError`
+
+ """
+ at_rules = []
+ declarations = []
+ errors = []
+ tokens = iter(tokens)
+ for token in tokens:
+ if token.type == 'ATKEYWORD':
+ try:
+ rule = self.read_at_rule(token, tokens)
+ result = self.parse_at_rule(
+ rule, at_rules, errors, context)
+ at_rules.append(result)
+ except ParseError as err:
+ errors.append(err)
+ elif token.type != 'S':
+ declaration_tokens = []
+ while token and token.type != ';':
+ declaration_tokens.append(token)
+ token = next(tokens, None)
+ if declaration_tokens:
+ try:
+ declarations.append(
+ self.parse_declaration(declaration_tokens))
+ except ParseError as err:
+ errors.append(err)
+ return declarations, at_rules, errors
+
+ def parse_ruleset(self, first_token, tokens):
+ """Parse a ruleset: a selector followed by declaration block.
+
+ :param first_token:
+ The first token of the ruleset (probably of the selector).
+ You may have read it already to distinguish the rule
+ from an at-rule.
+ :param tokens:
+ an iterator of subsequent tokens. Will be consumed just enough
+ for one ruleset.
+ :return:
+ a tuple of a :class:`RuleSet` and an error list.
+            The errors are :class:`~.parsing.ParseError` objects recovered
+            from declarations; parsing continues from the next declaration
+            after such an error.
+ :raises:
+ :class:`~.parsing.ParseError` if the selector is invalid for the
+ core grammar.
+            Note that a selector can be valid for the core grammar but
+ not for CSS 2.1 or another level.
+
+ """
+ selector = []
+ for token in chain([first_token], tokens):
+ if token.type == '{':
+ # Parse/validate once we’ve read the whole rule
+ selector = strip_whitespace(selector)
+ if not selector:
+ raise ParseError(first_token, 'empty selector')
+ for selector_token in selector:
+ validate_any(selector_token, 'selector')
+ declarations, errors = self.parse_declaration_list(
+ token.content)
+ ruleset = RuleSet(selector, declarations,
+ first_token.line, first_token.column)
+ return ruleset, errors
+ else:
+ selector.append(token)
+ raise ParseError(token, 'no declaration block found for ruleset')
+
+ def parse_declaration_list(self, tokens):
+ """Parse a ``;`` separated declaration list.
+
+ You may want to use :meth:`parse_declarations_and_at_rules` (or
+ some other method that uses :func:`parse_declaration` directly)
+        instead if you have more than just declarations in the same context.
+
+ :param tokens:
+ an iterable of tokens. Should stop at (before) the end
+ of the block, as marked by ``}``.
+ :return:
+ a tuple of the list of valid :class:`Declaration` and a list
+ of :class:`~.parsing.ParseError`
+
+ """
+ # split at ';'
+ parts = []
+ this_part = []
+ for token in tokens:
+ if token.type == ';':
+ parts.append(this_part)
+ this_part = []
+ else:
+ this_part.append(token)
+ parts.append(this_part)
+
+ declarations = []
+ errors = []
+ for tokens in parts:
+ tokens = strip_whitespace(tokens)
+ if tokens:
+ try:
+ declarations.append(self.parse_declaration(tokens))
+ except ParseError as exc:
+ errors.append(exc)
+ # Skip the entire declaration
+ return declarations, errors
+
+ def parse_declaration(self, tokens):
+ """Parse a single declaration.
+
+ :param tokens:
+ an iterable of at least one token. Should stop at (before)
+ the end of the declaration, as marked by a ``;`` or ``}``.
+ Empty declarations (ie. consecutive ``;`` with only white space
+ in-between) should be skipped earlier and not passed to
+ this method.
+ :returns:
+ a :class:`Declaration`
+ :raises:
+ :class:`~.parsing.ParseError` if the tokens do not match the
+ 'declaration' production of the core grammar.
+
+ """
+ tokens = iter(tokens)
+
+ name_token = next(tokens) # assume there is at least one
+ if name_token.type == 'IDENT':
+ # CSS syntax is case-insensitive
+ property_name = name_token.value.lower()
+ else:
+ raise ParseError(name_token,
+ 'expected a property name, got {0}'.format(name_token.type))
+
+ token = name_token # In case ``tokens`` is now empty
+ for token in tokens:
+ if token.type == ':':
+ break
+ elif token.type != 'S':
+ raise ParseError(
+ token, "expected ':', got {0}".format(token.type))
+ else:
+ raise ParseError(token, "expected ':'")
+
+ value = strip_whitespace(list(tokens))
+ if not value:
+ raise ParseError(token, 'expected a property value')
+ validate_value(value)
+ value, priority = self.parse_value_priority(value)
+ return Declaration(
+ property_name, value, priority, name_token.line, name_token.column)
+
+ def parse_value_priority(self, tokens):
+ """Separate any ``!important`` marker at the end of a property value.
+
+ :param tokens:
+ A list of tokens for the property value.
+ :returns:
+ A tuple of the actual property value (a list of tokens)
+ and the :attr:`~Declaration.priority`.
+ """
+ value = list(tokens)
+ # Walk the token list from the end
+ token = value.pop()
+ if token.type == 'IDENT' and token.value.lower() == 'important':
+ while value:
+ token = value.pop()
+ if token.type == 'DELIM' and token.value == '!':
+ # Skip any white space before the '!'
+ while value and value[-1].type == 'S':
+ value.pop()
+ if not value:
+ raise ParseError(
+ token, 'expected a value before !important')
+ return value, 'important'
+ # Skip white space between '!' and 'important'
+ elif token.type != 'S':
+ break
+ return tokens, None
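+
+
+# Minimal usage sketch (illustrative; relies only on the API defined above):
+#
+#     parser = CSS21Parser()
+#     sheet = parser.parse_stylesheet('p { color: red !important }')
+#     rule = sheet.rules[0]                # a RuleSet
+#     rule.selector.as_css()               # 'p'
+#     decl = rule.declarations[0]
+#     decl.name, decl.priority             # ('color', 'important')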
diff --git a/src/tinycss/decoding.py b/src/tinycss/decoding.py
new file mode 100644
index 000000000000..6303e1afda3c
--- /dev/null
+++ b/src/tinycss/decoding.py
@@ -0,0 +1,254 @@
+# coding: utf8
+"""
+ tinycss.decoding
+ ----------------
+
+ Decoding stylesheets from bytes to Unicode.
+ http://www.w3.org/TR/CSS21/syndata.html#charset
+
+ :copyright: (c) 2012 by Simon Sapin.
+ :license: BSD, see LICENSE for more details.
+"""
+
+from __future__ import unicode_literals
+
+from binascii import unhexlify
+import operator
+import re
+import sys
+
+
+__all__ = ['decode'] # Everything else is implementation detail
+
+
+def decode(css_bytes, protocol_encoding=None,
+ linking_encoding=None, document_encoding=None):
+ """
+ Determine the character encoding from the passed metadata and the
+ ``@charset`` rule in the stylesheet (if any); and decode accordingly.
+ If no encoding information is available or decoding fails,
+    decoding defaults to UTF-8 and then falls back on ISO-8859-1.
+
+ :param css_bytes:
+ a CSS stylesheet as a byte string
+ :param protocol_encoding:
+ The "charset" parameter of a "Content-Type" HTTP header (if any),
+ or similar metadata for other protocols.
+ :param linking_encoding:
+        ``<link charset="">`` or other metadata from the linking mechanism
+ (if any)
+ :param document_encoding:
+ Encoding of the referring style sheet or document (if any)
+ :return:
+        A tuple of a Unicode string, with any BOM removed, and the
+ encoding that was used.
+
+ """
+ if protocol_encoding:
+ css_unicode = try_encoding(css_bytes, protocol_encoding)
+ if css_unicode is not None:
+ return css_unicode, protocol_encoding
+ for encoding, pattern in ENCODING_MAGIC_NUMBERS:
+ match = pattern(css_bytes)
+ if match:
+ has_at_charset = isinstance(encoding, tuple)
+ if has_at_charset:
+ extract, endianness = encoding
+ encoding = extract(match.group(1))
+ # Get an ASCII-only unicode value.
+ # This is the only thing that works on both Python 2 and 3
+ # for bytes.decode()
+ # Non-ASCII encoding names are invalid anyway,
+ # but make sure they stay invalid.
+ encoding = encoding.decode('ascii', 'replace')
+ encoding = encoding.replace('\ufffd', '?')
+ if encoding.replace('-', '').replace('_', '').lower() in [
+ 'utf16', 'utf32']:
+ encoding += endianness
+ encoding = encoding.encode('ascii', 'replace').decode('ascii')
+ css_unicode = try_encoding(css_bytes, encoding)
+ if css_unicode and not (has_at_charset and not
+ css_unicode.startswith('@charset "')):
+ return css_unicode, encoding
+ break
+ for encoding in [linking_encoding, document_encoding]:
+ if encoding:
+ css_unicode = try_encoding(css_bytes, encoding)
+ if css_unicode is not None:
+ return css_unicode, encoding
+ css_unicode = try_encoding(css_bytes, 'UTF-8')
+ if css_unicode is not None:
+ return css_unicode, 'UTF-8'
+ return try_encoding(css_bytes, 'ISO-8859-1', fallback=False), 'ISO-8859-1'
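+
+# Illustrative behaviour (not from the original source):
+# decode(b'a { color: red }') == ('a { color: red }', 'UTF-8')
+# decode(b'@charset "ISO-8859-1";a{}')[1] == 'ISO-8859-1'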
+
+
+def try_encoding(css_bytes, encoding, fallback=True):
+ if fallback:
+ try:
+ css_unicode = css_bytes.decode(encoding)
+ # LookupError means unknown encoding
+ except (UnicodeDecodeError, LookupError):
+ return None
+ else:
+ css_unicode = css_bytes.decode(encoding)
+ if css_unicode and css_unicode[0] == '\ufeff':
+ # Remove any Byte Order Mark
+ css_unicode = css_unicode[1:]
+ return css_unicode
+
+
+def hex2re(hex_data):
+ return re.escape(unhexlify(hex_data.replace(' ', '').encode('ascii')))
+
+
+class Slicer(object):
+ """Slice()[start:stop:end] == slice(start, stop, end)"""
+ def __getitem__(self, slice_):
+ return operator.itemgetter(slice_)
+
+Slice = Slicer()
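+
+# For example, ``Slice[1::2]`` is ``operator.itemgetter(slice(1, None, 2))``,
+# so Slice[1::2](b'\x00a\x00b') == b'ab' (every second byte, as needed to
+# pull an ASCII encoding name out of UTF-16-BE bytes).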
+
+
+# List of (encoding, pattern) pairs.
+# encoding is a string, or a (slice_, endianness) tuple for "as specified"
+# encodings. slice_ is a slice object describing how to extract the
+# specified encoding name from the matched bytes.
+
+ENCODING_MAGIC_NUMBERS = [
+ ((Slice[:], ''), re.compile(
+ hex2re('EF BB BF 40 63 68 61 72 73 65 74 20 22')
+ + b'([^\x22]*?)'
+ + hex2re('22 3B')).match),
+
+ ('UTF-8', re.compile(
+ hex2re('EF BB BF')).match),
+
+ ((Slice[:], ''), re.compile(
+ hex2re('40 63 68 61 72 73 65 74 20 22')
+ + b'([^\x22]*?)'
+ + hex2re('22 3B')).match),
+
+ ((Slice[1::2], '-BE'), re.compile(
+ hex2re('FE FF 00 40 00 63 00 68 00 61 00 72 00 73 00 65 00'
+ '74 00 20 00 22')
+ + b'((\x00[^\x22])*?)'
+ + hex2re('00 22 00 3B')).match),
+
+ ((Slice[1::2], '-BE'), re.compile(
+ hex2re('00 40 00 63 00 68 00 61 00 72 00 73 00 65 00 74 00'
+ '20 00 22')
+ + b'((\x00[^\x22])*?)'
+ + hex2re('00 22 00 3B')).match),
+
+ ((Slice[::2], '-LE'), re.compile(
+ hex2re('FF FE 40 00 63 00 68 00 61 00 72 00 73 00 65 00 74'
+ '00 20 00 22 00')
+ + b'(([^\x22]\x00)*?)'
+ + hex2re('22 00 3B 00')).match),
+
+ ((Slice[::2], '-LE'), re.compile(
+ hex2re('40 00 63 00 68 00 61 00 72 00 73 00 65 00 74 00 20'
+ '00 22 00')
+ + b'(([^\x22]\x00)*?)'
+ + hex2re('22 00 3B 00')).match),
+
+ ((Slice[3::4], '-BE'), re.compile(
+ hex2re('00 00 FE FF 00 00 00 40 00 00 00 63 00 00 00 68 00'
+ '00 00 61 00 00 00 72 00 00 00 73 00 00 00 65 00 00'
+ '00 74 00 00 00 20 00 00 00 22')
+ + b'((\x00\x00\x00[^\x22])*?)'
+ + hex2re('00 00 00 22 00 00 00 3B')).match),
+
+ ((Slice[3::4], '-BE'), re.compile(
+ hex2re('00 00 00 40 00 00 00 63 00 00 00 68 00 00 00 61 00'
+ '00 00 72 00 00 00 73 00 00 00 65 00 00 00 74 00 00'
+ '00 20 00 00 00 22')
+ + b'((\x00\x00\x00[^\x22])*?)'
+ + hex2re('00 00 00 22 00 00 00 3B')).match),
+
+
+# Python does not support 2143 or 3412 endianness, AFAIK.
+# I guess we could fix it up ourselves but meh. Patches welcome.
+
+# ((Slice[2::4], '-2143'), re.compile(
+# hex2re('00 00 FF FE 00 00 40 00 00 00 63 00 00 00 68 00 00'
+# '00 61 00 00 00 72 00 00 00 73 00 00 00 65 00 00 00'
+# '74 00 00 00 20 00 00 00 22 00')
+# + b'((\x00\x00[^\x22]\x00)*?)'
+# + hex2re('00 00 22 00 00 00 3B 00')).match),
+
+# ((Slice[2::4], '-2143'), re.compile(
+# hex2re('00 00 40 00 00 00 63 00 00 00 68 00 00 00 61 00 00'
+# '00 72 00 00 00 73 00 00 00 65 00 00 00 74 00 00 00'
+# '20 00 00 00 22 00')
+# + b'((\x00\x00[^\x22]\x00)*?)'
+# + hex2re('00 00 22 00 00 00 3B 00')).match),
+
+# ((Slice[1::4], '-3412'), re.compile(
+# hex2re('FE FF 00 00 00 40 00 00 00 63 00 00 00 68 00 00 00'
+# '61 00 00 00 72 00 00 00 73 00 00 00 65 00 00 00 74'
+# '00 00 00 20 00 00 00 22 00 00')
+# + b'((\x00[^\x22]\x00\x00)*?)'
+# + hex2re('00 22 00 00 00 3B 00 00')).match),
+
+# ((Slice[1::4], '-3412'), re.compile(
+# hex2re('00 40 00 00 00 63 00 00 00 68 00 00 00 61 00 00 00'
+# '72 00 00 00 73 00 00 00 65 00 00 00 74 00 00 00 20'
+# '00 00 00 22 00 00')
+# + b'((\x00[^\x22]\x00\x00)*?)'
+# + hex2re('00 22 00 00 00 3B 00 00')).match),
+
+ ((Slice[::4], '-LE'), re.compile(
+ hex2re('FF FE 00 00 40 00 00 00 63 00 00 00 68 00 00 00 61'
+ '00 00 00 72 00 00 00 73 00 00 00 65 00 00 00 74 00'
+ '00 00 20 00 00 00 22 00 00 00')
+ + b'(([^\x22]\x00\x00\x00)*?)'
+ + hex2re('22 00 00 00 3B 00 00 00')).match),
+
+ ((Slice[::4], '-LE'), re.compile(
+ hex2re('40 00 00 00 63 00 00 00 68 00 00 00 61 00 00 00 72'
+ '00 00 00 73 00 00 00 65 00 00 00 74 00 00 00 20 00'
+ '00 00 22 00 00 00')
+ + b'(([^\x22]\x00\x00\x00)*?)'
+ + hex2re('22 00 00 00 3B 00 00 00')).match),
+
+ ('UTF-32-BE', re.compile(
+ hex2re('00 00 FE FF')).match),
+
+ ('UTF-32-LE', re.compile(
+ hex2re('FF FE 00 00')).match),
+
+# ('UTF-32-2143', re.compile(
+# hex2re('00 00 FF FE')).match),
+
+# ('UTF-32-3412', re.compile(
+# hex2re('FE FF 00 00')).match),
+
+ ('UTF-16-BE', re.compile(
+ hex2re('FE FF')).match),
+
+ ('UTF-16-LE', re.compile(
+ hex2re('FF FE')).match),
+
+
+# Some of these are supported by Python, but I didn’t bother.
+# You know the story with patches ...
+
+# # as specified, transcoded from EBCDIC to ASCII
+# ('as_specified-EBCDIC', re.compile(
+# hex2re('7C 83 88 81 99 A2 85 A3 40 7F')
+# + b'([^\x7F]*?)'
+# + hex2re('7F 5E')).match),
+
+# # as specified, transcoded from IBM1026 to ASCII
+# ('as_specified-IBM1026', re.compile(
+# hex2re('AE 83 88 81 99 A2 85 A3 40 FC')
+# + b'([^\xFC]*?)'
+# + hex2re('FC 5E')).match),
+
+# # as specified, transcoded from GSM 03.38 to ASCII
+# ('as_specified-GSM_03.38', re.compile(
+# hex2re('00 63 68 61 72 73 65 74 20 22')
+# + b'([^\x22]*?)'
+# + hex2re('22 3B')).match),
+]
diff --git a/src/tinycss/page3.py b/src/tinycss/page3.py
new file mode 100644
index 000000000000..3c8786002d1c
--- /dev/null
+++ b/src/tinycss/page3.py
@@ -0,0 +1,159 @@
+# coding: utf8
+"""
+ tinycss.page3
+    -------------
+
+ Support for CSS 3 Paged Media syntax:
+ http://dev.w3.org/csswg/css3-page/
+
+ Adds support for named page selectors and margin rules.
+
+ :copyright: (c) 2012 by Simon Sapin.
+ :license: BSD, see LICENSE for more details.
+"""
+
+from __future__ import unicode_literals, division
+from .css21 import CSS21Parser, ParseError
+
+
+class MarginRule(object):
+ """A parsed at-rule for margin box.
+
+ .. attribute:: at_keyword
+
+ One of the 16 following strings:
+
+ * ``@top-left-corner``
+ * ``@top-left``
+ * ``@top-center``
+ * ``@top-right``
+ * ``@top-right-corner``
+ * ``@bottom-left-corner``
+ * ``@bottom-left``
+ * ``@bottom-center``
+ * ``@bottom-right``
+ * ``@bottom-right-corner``
+ * ``@left-top``
+ * ``@left-middle``
+ * ``@left-bottom``
+ * ``@right-top``
+ * ``@right-middle``
+ * ``@right-bottom``
+
+ .. attribute:: declarations
+
+ A list of :class:`~.css21.Declaration` objects.
+
+ .. attribute:: line
+
+ Source line where this was read.
+
+ .. attribute:: column
+
+ Source column where this was read.
+
+ """
+
+ def __init__(self, at_keyword, declarations, line, column):
+ self.at_keyword = at_keyword
+ self.declarations = declarations
+ self.line = line
+ self.column = column
+
+
+class CSSPage3Parser(CSS21Parser):
+ """Extend :class:`~.css21.CSS21Parser` for `CSS 3 Paged Media`_ syntax.
+
+ .. _CSS 3 Paged Media: http://dev.w3.org/csswg/css3-page/
+
+ Compared to CSS 2.1, the ``at_rules`` and ``selector`` attributes of
+ :class:`~.css21.PageRule` objects are modified:
+
+ * ``at_rules`` is not always empty, it is a list of :class:`MarginRule`
+ objects.
+
+ * ``selector``, instead of a single string, is a tuple of the page name
+ and the pseudo class. Each of these may be a ``None`` or a string.
+
+ +--------------------------+------------------------+
+ | CSS | Parsed selectors |
+ +==========================+========================+
+ | .. code-block:: css | .. code-block:: python |
+ | | |
+ | @page {} | (None, None) |
+ | @page :first {} | (None, 'first') |
+ | @page chapter {} | ('chapter', None) |
+ | @page table:right {} | ('table', 'right') |
+ +--------------------------+------------------------+
+
+ """
+
+ PAGE_MARGIN_AT_KEYWORDS = [
+ '@top-left-corner',
+ '@top-left',
+ '@top-center',
+ '@top-right',
+ '@top-right-corner',
+ '@bottom-left-corner',
+ '@bottom-left',
+ '@bottom-center',
+ '@bottom-right',
+ '@bottom-right-corner',
+ '@left-top',
+ '@left-middle',
+ '@left-bottom',
+ '@right-top',
+ '@right-middle',
+ '@right-bottom',
+ ]
+
+ def parse_at_rule(self, rule, previous_rules, errors, context):
+ if rule.at_keyword in self.PAGE_MARGIN_AT_KEYWORDS:
+ if context != '@page':
+ raise ParseError(rule,
+ '%s rule not allowed in %s' % (rule.at_keyword, context))
+ if rule.head:
+ raise ParseError(rule.head[0],
+ 'unexpected %s token in %s rule header'
+ % (rule.head[0].type, rule.at_keyword))
+ declarations, body_errors = self.parse_declaration_list(rule.body)
+ errors.extend(body_errors)
+ return MarginRule(rule.at_keyword, declarations,
+ rule.line, rule.column)
+ return super(CSSPage3Parser, self).parse_at_rule(
+ rule, previous_rules, errors, context)
+
+ def parse_page_selector(self, head):
+ """Parse an @page selector.
+
+ :param head:
+ The ``head`` attribute of an unparsed :class:`AtRule`.
+ :returns:
+            A ``((name, pseudo_class), specificity)`` tuple. ``name`` and
+            ``pseudo_class`` are each either ``None`` or a string. Compared
+            to CSS 2.1, the ``'blank'`` pseudo class is added by GCPM.
+        :raises:
+            :class:`~.parsing.ParseError` on invalid selectors
+
+ """
+ if not head:
+ return (None, None), (0, 0, 0)
+ if head[0].type == 'IDENT':
+ name = head.pop(0).value
+ while head and head[0].type == 'S':
+ head.pop(0)
+ if not head:
+ return (name, None), (1, 0, 0)
+ name_specificity = (1,)
+ else:
+ name = None
+ name_specificity = (0,)
+ if (len(head) == 2 and head[0].type == ':'
+ and head[1].type == 'IDENT'):
+ pseudo_class = head[1].value
+ specificity = {
+ 'first': (1, 0), 'blank': (1, 0),
+ 'left': (0, 1), 'right': (0, 1),
+ }.get(pseudo_class)
+ if specificity:
+ return (name, pseudo_class), (name_specificity + specificity)
+ raise ParseError(head[0], 'invalid @page selector')
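+
+
+# Usage sketch (illustrative):
+#
+#     parser = CSSPage3Parser()
+#     sheet = parser.parse_stylesheet('@page chapter:first { margin: 1em }')
+#     sheet.rules[0].selector              # ('chapter', 'first')
+#     sheet.rules[0].specificity           # (1, 1, 0)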
diff --git a/src/tinycss/parsing.py b/src/tinycss/parsing.py
new file mode 100644
index 000000000000..86e93c07f7b4
--- /dev/null
+++ b/src/tinycss/parsing.py
@@ -0,0 +1,165 @@
+# coding: utf8
+"""
+ tinycss.parsing
+ ---------------
+
+ Utilities for parsing lists of tokens.
+
+ :copyright: (c) 2012 by Simon Sapin.
+ :license: BSD, see LICENSE for more details.
+"""
+
+from __future__ import unicode_literals
+
+
+# TODO: unit tests
+
+def split_on_comma(tokens):
+ """Split a list of tokens on commas, ie ``,`` DELIM tokens.
+
+ Only "top-level" comma tokens are splitting points, not commas inside a
+ function or other :class:`ContainerToken`.
+
+ :param tokens:
+ An iterable of :class:`~.token_data.Token` or
+ :class:`~.token_data.ContainerToken`.
+ :returns:
+ A list of lists of tokens
+
+ """
+ parts = []
+ this_part = []
+ for token in tokens:
+ if token.type == 'DELIM' and token.value == ',':
+ parts.append(this_part)
+ this_part = []
+ else:
+ this_part.append(token)
+ parts.append(this_part)
+ return parts
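+
+# For example, the tokens of ``screen, print`` split into two parts, one per
+# media type: [[IDENT 'screen'], [S, IDENT 'print']]. White space inside
+# each part is preserved.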
+
+
+def strip_whitespace(tokens):
+    """Remove whitespace at the beginning and end of a token list.
+
+ Whitespace tokens in-between other tokens in the list are preserved.
+
+ :param tokens:
+ A list of :class:`~.token_data.Token` or
+ :class:`~.token_data.ContainerToken`.
+ :return:
+ A new sub-sequence of the list.
+
+ """
+ for i, token in enumerate(tokens):
+ if token.type != 'S':
+ break
+ else:
+ return [] # only whitespace
+ tokens = tokens[i:]
+ while tokens and tokens[-1].type == 'S':
+ tokens.pop()
+ return tokens
+
+
+def remove_whitespace(tokens):
+ """Remove any top-level whitespace in a token list.
+
+ Whitespace tokens inside recursive :class:`~.token_data.ContainerToken`
+ are preserved.
+
+ :param tokens:
+ A list of :class:`~.token_data.Token` or
+ :class:`~.token_data.ContainerToken`.
+ :return:
+ A new sub-sequence of the list.
+
+ """
+ return [token for token in tokens if token.type != 'S']
+
+
+def validate_value(tokens):
+ """Validate a property value.
+
+ :param tokens:
+ an iterable of tokens
+ :raises:
+ :class:`ParseError` if there is any invalid token for the 'value'
+ production of the core grammar.
+
+ """
+ for token in tokens:
+ type_ = token.type
+ if type_ == '{':
+ validate_block(token.content, 'property value')
+ else:
+ validate_any(token, 'property value')
+
+
+def validate_block(tokens, context):
+ """
+ :raises:
+ :class:`ParseError` if there is any invalid token for the 'block'
+ production of the core grammar.
+ :param tokens: an iterable of tokens
+ :param context: a string for the 'unexpected in ...' message
+
+ """
+ for token in tokens:
+ type_ = token.type
+ if type_ == '{':
+ validate_block(token.content, context)
+ elif type_ not in (';', 'ATKEYWORD'):
+ validate_any(token, context)
+
+
+def validate_any(token, context):
+ """
+ :raises:
+ :class:`ParseError` if this is an invalid token for the
+ 'any' production of the core grammar.
+ :param token: a single token
+ :param context: a string for the 'unexpected in ...' message
+
+ """
+ type_ = token.type
+ if type_ in ('FUNCTION', '(', '['):
+ for token in token.content:
+ validate_any(token, type_)
+ elif type_ not in ('S', 'IDENT', 'DIMENSION', 'PERCENTAGE', 'NUMBER',
+ 'INTEGER', 'URI', 'DELIM', 'STRING', 'HASH', ':',
+ 'UNICODE-RANGE'):
+ if type_ in ('}', ')', ']'):
+ adjective = 'unmatched'
+ else:
+ adjective = 'unexpected'
+ raise ParseError(token,
+ '{0} {1} token in {2}'.format(adjective, type_, context))
+
+
+class ParseError(ValueError):
+ """Details about a CSS syntax error. Usually indicates that something
+ (a rule or a declaration) was ignored and will not appear as a parsed
+ object.
+
+ This exception is typically logged in a list rather than being propagated
+ to the user API.
+
+ .. attribute:: line
+
+        Source line where the error occurred.
+
+ .. attribute:: column
+
+        Column in the source line where the error occurred.
+
+ .. attribute:: reason
+
+        What happened (a string).
+
+ """
+ def __init__(self, subject, reason):
+ self.line = subject.line
+ self.column = subject.column
+ self.reason = reason
+ super(ParseError, self).__init__(
+ 'Parse error at {0.line}:{0.column}, {0.reason}'.format(self))
diff --git a/src/tinycss/token_data.py b/src/tinycss/token_data.py
new file mode 100644
index 000000000000..dcd923229997
--- /dev/null
+++ b/src/tinycss/token_data.py
@@ -0,0 +1,441 @@
+# coding: utf8
+"""
+ tinycss.token_data
+ ------------------
+
+ Shared data for both implementations (Cython and Python) of the tokenizer.
+
+ :copyright: (c) 2012 by Simon Sapin.
+ :license: BSD, see LICENSE for more details.
+"""
+
+from __future__ import unicode_literals
+
+import re
+import sys
+import operator
+import functools
+import string
+
+
+# * Raw strings with the r'' notation are used so that \ do not need
+# to be escaped.
+# * Names and regexps are separated by a tabulation.
+# * Macros are re-ordered so that only previous definitions are needed.
+# * {} are used for macro substitution with ``string.Formatter``,
+# so other uses of { or } have been doubled.
+# * The syntax is otherwise compatible with re.compile.
+# * Some parentheses were added to add capturing groups.
+# (in unicode, DIMENSION and URI)
+
+# *** Willful violation: ***
+# Numbers can take a + or - sign, but the sign is a separate DELIM token.
+# Since comments are allowed anywhere between tokens, this makes
+# the following this is valid. It means 10 negative pixels:
+# margin-top: -/**/10px
+
+# This makes parsing numbers a pain, so instead we’ll do the same as Firefox
+# and make the sign part of the 'num' macro. The above CSS will be invalid.
+# See discussion:
+# http://lists.w3.org/Archives/Public/www-style/2011Oct/0028.html
+MACROS = r'''
+ nl \n|\r\n|\r|\f
+ w [ \t\r\n\f]*
+ nonascii [^\0-\237]
+ unicode \\([0-9a-f]{{1,6}})(\r\n|[ \n\r\t\f])?
+ simple_escape [^\n\r\f0-9a-f]
+ escape {unicode}|\\{simple_escape}
+ nmstart [_a-z]|{nonascii}|{escape}
+ nmchar [_a-z0-9-]|{nonascii}|{escape}
+ name {nmchar}+
+ ident [-]?{nmstart}{nmchar}*
+ num [-+]?(?:[0-9]*\.[0-9]+|[0-9]+)
+ string1 \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
+ string2 \'([^\n\r\f\\']|\\{nl}|{escape})*\'
+ string {string1}|{string2}
+ badstring1 \"([^\n\r\f\\"]|\\{nl}|{escape})*\\?
+ badstring2 \'([^\n\r\f\\']|\\{nl}|{escape})*\\?
+ badstring {badstring1}|{badstring2}
+ badcomment1 \/\*[^*]*\*+([^/*][^*]*\*+)*
+ badcomment2 \/\*[^*]*(\*+[^/*][^*]*)*
+ badcomment {badcomment1}|{badcomment2}
+ baduri1 url\({w}([!#$%&*-~]|{nonascii}|{escape})*{w}
+ baduri2 url\({w}{string}{w}
+ baduri3 url\({w}{badstring}
+ baduri {baduri1}|{baduri2}|{baduri3}
+'''.replace(r'\0', '\0').replace(r'\237', '\237')
+
+# Removed these tokens. Instead, they’re tokenized as two DELIM each.
+# INCLUDES ~=
+# DASHMATCH |=
+# They are only used in selectors but selectors3 also have ^=, *= and $=.
+# We don’t actually parse selectors anyway
+
+# Re-ordered so that the longest match is always the first.
+# For example, "url('foo')" matches URI, BAD_URI, FUNCTION and IDENT,
+# but URI would always be a longer match than the others.
+TOKENS = r'''
+ S [ \t\r\n\f]+
+
+ URI url\({w}({string}|([!#$%&*-\[\]-~]|{nonascii}|{escape})*){w}\)
+ BAD_URI {baduri}
+ FUNCTION {ident}\(
+ UNICODE-RANGE u\+[0-9a-f?]{{1,6}}(-[0-9a-f]{{1,6}})?
+ IDENT {ident}
+
+ ATKEYWORD @{ident}
+ HASH #{name}
+
+ DIMENSION ({num})({ident})
+ PERCENTAGE {num}%
+ NUMBER {num}
+
+ STRING {string}
+ BAD_STRING {badstring}
+
+ COMMENT \/\*[^*]*\*+([^/*][^*]*\*+)*\/
+ BAD_COMMENT {badcomment}
+
+ : :
+ ; ;
+ { \{{
+ } \}}
+ ( \(
+ ) \)
+ [ \[
+ ] \]
+ CDO <!--
+ CDC -->
+'''
+
+
+# Strings with {macro} expanded
+COMPILED_MACROS = {}
+
+
+COMPILED_TOKEN_REGEXPS = [] # [(name, regexp.match)] ordered
+COMPILED_TOKEN_INDEXES = {} # {name: i} helper for the C speedups
+
+
+# Indexed by codepoint value of the first character of a token.
+# Codepoints >= 160 (aka nonascii) all use the index 160.
+# values are (i, name, regexp.match)
+TOKEN_DISPATCH = []
+
+
+try:
+ unichr
+except NameError:
+ # Python 3
+ unichr = chr
+ unicode = str
+
+
+def _init():
+ """Import-time initialization."""
+ COMPILED_MACROS.clear()
+ for line in MACROS.splitlines():
+ if line.strip():
+ name, value = line.split('\t')
+ COMPILED_MACROS[name.strip()] = '(?:%s)' \
+ % value.format(**COMPILED_MACROS)
+
+ COMPILED_TOKEN_REGEXPS[:] = (
+ (
+ name.strip(),
+ re.compile(
+ value.format(**COMPILED_MACROS),
+ # Case-insensitive when matching eg. uRL(foo)
+ # but preserve the case in extracted groups
+ re.I
+ ).match
+ )
+ for line in TOKENS.splitlines()
+ if line.strip()
+ for name, value in [line.split('\t')]
+ )
+
+ COMPILED_TOKEN_INDEXES.clear()
+ for i, (name, regexp) in enumerate(COMPILED_TOKEN_REGEXPS):
+ COMPILED_TOKEN_INDEXES[name] = i
+
+ dispatch = [[] for i in range(161)]
+ for chars, names in [
+ (' \t\r\n\f', ['S']),
+ ('uU', ['URI', 'BAD_URI', 'UNICODE-RANGE']),
+ # \ is an escape outside of another token
+ (string.ascii_letters + '\\_-' + unichr(160), ['FUNCTION', 'IDENT']),
+ (string.digits + '.+-', ['DIMENSION', 'PERCENTAGE', 'NUMBER']),
+ ('@', ['ATKEYWORD']),
+ ('#', ['HASH']),
+ ('\'"', ['STRING', 'BAD_STRING']),
+ ('/', ['COMMENT', 'BAD_COMMENT']),
+ ('<', ['CDO']),
+ ('-', ['CDC']),
+ ]:
+ for char in chars:
+ dispatch[ord(char)].extend(names)
+ for char in ':;{}()[]':
+ dispatch[ord(char)] = [char]
+
+ TOKEN_DISPATCH[:] = (
+ [
+ (index,) + COMPILED_TOKEN_REGEXPS[index]
+ for name in names
+ for index in [COMPILED_TOKEN_INDEXES[name]]
+ ]
+ for names in dispatch
+ )
+
+_init()
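+
+# Illustration: a token starting with a digit can only be DIMENSION,
+# PERCENTAGE or NUMBER, so only those three regexps are tried for it:
+#   >>> [name for _index, name, _match in TOKEN_DISPATCH[ord('4')]]
+#   ['DIMENSION', 'PERCENTAGE', 'NUMBER']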
+
+
+def _unicode_replace(match, int=int, unichr=unichr, maxunicode=sys.maxunicode):
+ codepoint = int(match.group(1), 16)
+ if codepoint <= maxunicode:
+ return unichr(codepoint)
+ else:
+ return '\N{REPLACEMENT CHARACTER}' # U+FFFD
+
+UNICODE_UNESCAPE = functools.partial(
+ re.compile(COMPILED_MACROS['unicode'], re.I).sub,
+ _unicode_replace)
+
+NEWLINE_UNESCAPE = functools.partial(
+ re.compile(r'()\\' + COMPILED_MACROS['nl']).sub,
+ '')
+
+SIMPLE_UNESCAPE = functools.partial(
+ re.compile(r'\\(%s)' % COMPILED_MACROS['simple_escape'], re.I).sub,
+ # Same as r'\1', but faster on CPython
+ operator.methodcaller('group', 1))
+
+FIND_NEWLINES = re.compile(COMPILED_MACROS['nl']).finditer
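+
+# Examples (illustrative) of the unescaping helpers:
+#   UNICODE_UNESCAPE(r'\26 B') == '&B'    (\26 is U+0026; the whitespace
+#                                          after a hex escape is consumed)
+#   SIMPLE_UNESCAPE(r'\"a\"') == '"a"'    (backslash before a non-hex char)
+#   NEWLINE_UNESCAPE('a\\\nb') == 'ab'    (escaped newlines are removed)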
+
+
+class Token(object):
+ """A single atomic token.
+
+ .. attribute:: is_container
+
+ Always ``False``.
+ Helps to tell :class:`Token` apart from :class:`ContainerToken`.
+
+ .. attribute:: type
+
+ The type of token as a string:
+
+ ``S``
+ A sequence of white space
+
+ ``IDENT``
+ An identifier: a name that does not start with a digit.
+ A name is a sequence of letters, digits, ``_``, ``-``, escaped
+ characters and non-ASCII characters. Eg: ``margin-left``
+
+ ``HASH``
+ ``#`` followed immediately by a name. Eg: ``#ff8800``
+
+ ``ATKEYWORD``
+ ``@`` followed immediately by an identifier. Eg: ``@page``
+
+ ``URI``
+ Eg: ``url(foo)``. The content may or may not be quoted.
+
+ ``UNICODE-RANGE``
+ ``U+`` followed by one or two hexadecimal
+ Unicode codepoints. Eg: ``U+20-00FF``
+
+ ``INTEGER``
+ An integer with an optional ``+`` or ``-`` sign
+
+ ``NUMBER``
+ A non-integer number with an optional ``+`` or ``-`` sign
+
+ ``DIMENSION``
+ An integer or number followed immediately by an
+ identifier (the unit). Eg: ``12px``
+
+ ``PERCENTAGE``
+ An integer or number followed immediately by ``%``
+
+ ``STRING``
+ A string, quoted with ``"`` or ``'``
+
+ ``:`` or ``;``
+ That character.
+
+ ``DELIM``
+ A single character not matched in another token. Eg: ``,``
+
+ See the source of the :mod:`.token_data` module for the precise
+ regular expressions that match various tokens.
+
+ Note that other token types exist in the early tokenization steps,
+ but these are ignored, are syntax errors, or are later transformed
+ into :class:`ContainerToken` or :class:`FunctionToken`.
+
+ .. attribute:: value
+
+ The parsed value:
+
+ * INTEGER, NUMBER, PERCENTAGE or DIMENSION tokens: the numeric value
+ as an int or float.
+ * STRING tokens: the unescaped string without quotes
+ * URI tokens: the unescaped URI without quotes or
+ ``url(`` and ``)`` markers.
+ * IDENT, ATKEYWORD or HASH tokens: the unescaped token,
+ with ``@`` or ``#`` markers left as-is
+ * Other tokens: same as :attr:`as_css`
+
+ *Unescaped* refers to the various escaping methods based on the
+ backslash ``\`` character in CSS syntax.
+
+ .. attribute:: unit
+
+ * DIMENSION tokens: the normalized (unescaped, lower-case)
+ unit name as a string. eg. ``'px'``
+ * PERCENTAGE tokens: the string ``'%'``
+ * Other tokens: ``None``
+
+ .. attribute:: line
+
+ The line number in the CSS source of the start of this token.
+
+ .. attribute:: column
+
+ The column number (inside a source line) of the start of this token.
+
+ """
+ is_container = False
+ __slots__ = 'type', '_as_css', 'value', 'unit', 'line', 'column'
+
+ def __init__(self, type_, css_value, value, unit, line, column):
+ self.type = type_
+ self._as_css = css_value
+ self.value = value
+ self.unit = unit
+ self.line = line
+ self.column = column
+
+ def as_css(self):
+ """
+ Return as a Unicode string the CSS representation of the token,
+ as parsed in the source.
+ """
+ return self._as_css
+
+ def __repr__(self):
+ return ('<Token {0.type} at {0.line}:{0.column} {0.value!r}{1}>'
+ .format(self, self.unit or ''))
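+
+# Illustration (hypothetical session; tokenize_flat lives in
+# tinycss.tokenizer):
+#   >>> t = tokenize_flat('12px')[0]
+#   >>> t.type, t.value, t.unit
+#   ('DIMENSION', 12, 'px')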
+
+
+class ContainerToken(object):
+ """A token that contains other (nested) tokens.
+
+ .. attribute:: is_container
+
+ Always ``True``.
+ Helps to tell :class:`ContainerToken` apart from :class:`Token`.
+
+ .. attribute:: type
+
+ The type of token as a string. One of ``{``, ``(``, ``[`` or
+ ``FUNCTION``. For ``FUNCTION``, the object is actually a
+ :class:`FunctionToken`.
+
+ .. attribute:: unit
+
+ Always ``None``. Included to make :class:`ContainerToken` behave
+ more like :class:`Token`.
+
+ .. attribute:: content
+
+ A list of :class:`Token` or nested :class:`ContainerToken`,
+ not including the opening or closing token.
+
+ .. attribute:: line
+
+ The line number in the CSS source of the start of this token.
+
+ .. attribute:: column
+
+ The column number (inside a source line) of the start of this token.
+
+ """
+ is_container = True
+ unit = None
+ __slots__ = 'type', '_css_start', '_css_end', 'content', 'line', 'column'
+
+ def __init__(self, type_, css_start, css_end, content, line, column):
+ self.type = type_
+ self._css_start = css_start
+ self._css_end = css_end
+ self.content = content
+ self.line = line
+ self.column = column
+
+ def as_css(self):
+ """
+ Return as a Unicode string the CSS representation of the token,
+ as parsed in the source.
+ """
+ parts = [self._css_start]
+ parts.extend(token.as_css() for token in self.content)
+ parts.append(self._css_end)
+ return ''.join(parts)
+
+ format_string = '<ContainerToken {0.type} at {0.line}:{0.column}>'
+
+ def __repr__(self):
+ return (self.format_string + ' {0.content}').format(self)
+
+
+class FunctionToken(ContainerToken):
+ """A specialized :class:`ContainerToken` for a ``FUNCTION`` group.
+ Has an additional attribute:
+
+ .. attribute:: function_name
+
+ The unescaped name of the function, with the ``(`` marker removed.
+
+ """
+ __slots__ = 'function_name',
+
+ def __init__(self, type_, css_start, css_end, function_name, content,
+ line, column):
+ super(FunctionToken, self).__init__(
+ type_, css_start, css_end, content, line, column)
+ # Remove the ( marker:
+ self.function_name = function_name[:-1]
+
+ format_string = '<FunctionToken {0.function_name}() at {0.line}:{0.column}>'
+
+
+class TokenList(list):
+ """
+ A mixed list of :class:`~.token_data.Token` and
+ :class:`~.token_data.ContainerToken` objects.
+
+ This is a subclass of the builtin :class:`~builtins.list` type.
+ It can be iterated, indexed and sliced as usual, but also has some
+ additional API:
+
+ """
+ @property
+ def line(self):
+ """The line number in the CSS source of the first token."""
+ return self[0].line
+
+ @property
+ def column(self):
+ """The column number (inside a source line) of the first token."""
+ return self[0].column
+
+ def as_css(self):
+ """
+ Return as a Unicode string the CSS representation of the tokens,
+ as parsed in the source.
+ """
+ return ''.join(token.as_css() for token in self)
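+
+# Note (a property of the tokenizer, stated here for reference): tokenization
+# is lossless. Concatenating ``as_css()`` over the tokens returned by
+# ``tokenize_flat(source, ignore_comments=False)`` reproduces ``source``
+# exactly, since every character ends up in some token (DELIM as a fallback).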
diff --git a/src/tinycss/tokenizer.py b/src/tinycss/tokenizer.py
new file mode 100644
index 000000000000..eba44c1b933b
--- /dev/null
+++ b/src/tinycss/tokenizer.py
@@ -0,0 +1,216 @@
+# coding: utf8
+"""
+ tinycss.tokenizer
+ -----------------
+
+ Tokenizer for the CSS core syntax:
+ http://www.w3.org/TR/CSS21/syndata.html#tokenization
+
+ This is the pure-python implementation. See also speedups.pyx
+
+ :copyright: (c) 2012 by Simon Sapin.
+ :license: BSD, see LICENSE for more details.
+"""
+
+from __future__ import unicode_literals
+
+from . import token_data
+
+
+def tokenize_flat(css_source, ignore_comments=True,
+ # Make these local variables to avoid global lookups in the loop
+ tokens_dispatch=token_data.TOKEN_DISPATCH,
+ unicode_unescape=token_data.UNICODE_UNESCAPE,
+ newline_unescape=token_data.NEWLINE_UNESCAPE,
+ simple_unescape=token_data.SIMPLE_UNESCAPE,
+ find_newlines=token_data.FIND_NEWLINES,
+ Token=token_data.Token,
+ len=len,
+ int=int,
+ float=float,
+ list=list,
+ _None=None,
+):
+ """
+ :param css_source:
+ CSS as a Unicode string
+ :param ignore_comments:
+ if true (the default) comments will not be included in the
+ return value
+ :return:
+ A list of :class:`Token`
+
+ """
+
+ pos = 0
+ line = 1
+ column = 1
+ source_len = len(css_source)
+ tokens = []
+ while pos < source_len:
+ char = css_source[pos]
+ if char in ':;{}()[]':
+ type_ = char
+ css_value = char
+ else:
+ codepoint = min(ord(char), 160)
+ for _index, type_, regexp in tokens_dispatch[codepoint]:
+ match = regexp(css_source, pos)
+ if match:
+ # First match is the longest. See comments on TOKENS above.
+ css_value = match.group()
+ break
+ else:
+ # No match.
+ # "Any other character not matched by the above rules,
+ # and neither a single nor a double quote."
+ # ... but quotes at the start of a token are always matched
+ # by STRING or BAD_STRING. So DELIM is any single character.
+ type_ = 'DELIM'
+ css_value = char
+ length = len(css_value)
+ next_pos = pos + length
+
+ # A BAD_COMMENT is a comment at EOF. Ignore it too.
+ if not (ignore_comments and type_ in ('COMMENT', 'BAD_COMMENT')):
+ # Parse numbers, extract strings and URIs, unescape
+ unit = _None
+ if type_ == 'DIMENSION':
+ value = match.group(1)
+ value = float(value) if '.' in value else int(value)
+ unit = match.group(2)
+ unit = simple_unescape(unit)
+ unit = unicode_unescape(unit)
+ unit = unit.lower() # normalize
+ elif type_ == 'PERCENTAGE':
+ value = css_value[:-1]
+ value = float(value) if '.' in value else int(value)
+ unit = '%'
+ elif type_ == 'NUMBER':
+ value = css_value
+ if '.' in value:
+ value = float(value)
+ else:
+ value = int(value)
+ type_ = 'INTEGER'
+ elif type_ in ('IDENT', 'ATKEYWORD', 'HASH', 'FUNCTION'):
+ value = simple_unescape(css_value)
+ value = unicode_unescape(value)
+ elif type_ == 'URI':
+ value = match.group(1)
+ if value and value[0] in '"\'':
+ value = value[1:-1] # Remove quotes
+ value = newline_unescape(value)
+ value = simple_unescape(value)
+ value = unicode_unescape(value)
+ elif type_ == 'STRING':
+ value = css_value[1:-1] # Remove quotes
+ value = newline_unescape(value)
+ value = simple_unescape(value)
+ value = unicode_unescape(value)
+ # BAD_STRING can only be one of:
+ # * Unclosed string at the end of the stylesheet:
+ # Close the string, but this is not an error.
+ # Make it a "good" STRING token.
+ # * Unclosed string at the (unescaped) end of the line:
+ # Close the string, but this is an error.
+ # Leave it as a BAD_STRING, don’t bother parsing it.
+ # See http://www.w3.org/TR/CSS21/syndata.html#parsing-errors
+ elif type_ == 'BAD_STRING' and next_pos == source_len:
+ type_ = 'STRING'
+ value = css_value[1:] # Remove quote
+ value = newline_unescape(value)
+ value = simple_unescape(value)
+ value = unicode_unescape(value)
+ else:
+ value = css_value
+ tokens.append(Token(type_, css_value, value, unit, line, column))
+
+ pos = next_pos
+ newlines = list(find_newlines(css_value))
+ if newlines:
+ line += len(newlines)
+ # Add 1 so that columns start at 1, not 0
+ column = length - newlines[-1].end() + 1
+ else:
+ column += length
+ return tokens
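+
+# Illustration (hypothetical session with this pure-Python version, which
+# builds a list):
+#   >>> [(t.type, t.value) for t in tokenize_flat('a:b 4px')]
+#   [('IDENT', 'a'), (':', ':'), ('IDENT', 'b'), ('S', ' '), ('DIMENSION', 4)]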
+
+
+def regroup(tokens):
+ """
+ Match pairs of tokens: () [] {} function()
+ (Strings in "" or '' are taken care of by the tokenizer.)
+
+ Opening tokens are replaced by a :class:`ContainerToken`.
+ Closing tokens are removed. Unmatched closing tokens are invalid
+ but left as-is. All nested structures that are still open at
+ the end of the stylesheet are implicitly closed.
+
+ :param tokens:
+ a *flat* iterable of tokens, as returned by :func:`tokenize_flat`.
+ :return:
+ A tree of tokens.
+
+ """
+ # "global" objects for the inner recursion
+ pairs = {'FUNCTION': ')', '(': ')', '[': ']', '{': '}'}
+ tokens = iter(tokens)
+ eof = [False]
+
+ def _regroup_inner(stop_at=None,
+ tokens=tokens, pairs=pairs, eof=eof,
+ ContainerToken=token_data.ContainerToken,
+ FunctionToken=token_data.FunctionToken):
+ for token in tokens:
+ type_ = token.type
+ if type_ == stop_at:
+ return
+
+ end = pairs.get(type_)
+ if end is None:
+ yield token # Not a grouping token
+ else:
+ assert not isinstance(token, ContainerToken), (
+ 'Token looks already grouped: {0}'.format(token))
+ content = list(_regroup_inner(end))
+ if eof[0]:
+ end = '' # Implicit end of structure at EOF.
+ if type_ == 'FUNCTION':
+ yield FunctionToken(token.type, token.as_css(), end,
+ token.value, content,
+ token.line, token.column)
+ else:
+ yield ContainerToken(token.type, token.as_css(), end,
+ content,
+ token.line, token.column)
+ else:
+ eof[0] = True # end of file/stylesheet
+ return _regroup_inner()
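+
+# Illustration (hypothetical session): '[a]' becomes a single container
+# token; the closing ']' is consumed and the inner tokens go in ``content``:
+#   >>> group = list(regroup(tokenize_flat('[a]')))[0]
+#   >>> group.type, [t.type for t in group.content]
+#   ('[', ['IDENT'])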
+
+
+def tokenize_grouped(css_source, ignore_comments=True):
+ """
+ :param css_source:
+ CSS as a Unicode string
+ :param ignore_comments:
+ if true (the default) comments will not be included in the
+ return value
+ :return:
+ An iterator of :class:`Token` and :class:`ContainerToken`
+
+ """
+ return regroup(tokenize_flat(css_source, ignore_comments))
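+
+# Illustration (hypothetical session): a function call groups into a
+# FunctionToken with the name unescaped and the '(' marker removed:
+#   >>> token = list(tokenize_grouped('rgb(0,0,0)'))[0]
+#   >>> token.type, token.function_name
+#   ('FUNCTION', 'rgb')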
+
+
+# Optional Cython version of tokenize_flat
+# Make both versions available with explicit names for tests.
+python_tokenize_flat = tokenize_flat
+try:
+ from . import speedups
+except ImportError:
+ cython_tokenize_flat = None
+else:
+ cython_tokenize_flat = speedups.tokenize_flat
+ # Default to the Cython version if available
+ tokenize_flat = cython_tokenize_flat
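+
+# Illustration: tests can exercise both implementations by name, e.g.
+#   for impl in (python_tokenize_flat, cython_tokenize_flat):
+#       if impl is not None:
+#           assert [t.type for t in impl('a {}')] == ['IDENT', 'S', '{', '}']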
diff --git a/src/tinycss/version.py b/src/tinycss/version.py
new file mode 100644
index 000000000000..014a8e46baa2
--- /dev/null
+++ b/src/tinycss/version.py
@@ -0,0 +1 @@
+VERSION = '0.3'