diff --git a/.coveragerc b/.coveragerc
index 398ff08..a79ad05 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,2 +1,10 @@
 [run]
 branch = True
+
+[report]
+exclude_lines =
+    pragma: no cover
+    def __repr__
+    except ImportError
+omit =
+    tinycss/tests/speed.py
diff --git a/docs/parsing.rst b/docs/parsing.rst
index 6efe9f0..56b52b2 100644
--- a/docs/parsing.rst
+++ b/docs/parsing.rst
@@ -83,14 +83,13 @@ are not parsed by tinycss. They appear as tokens instead.
 
 .. module:: tinycss.token_data
 
+.. autoclass:: TokenList()
+    :member-order: bysource
+    :members:
+
 .. autoclass:: Token()
+    :members:
 
 .. autoclass:: tinycss.speedups.CToken()
 
 .. autoclass:: ContainerToken()
-
-    .. autoattribute:: as_css
-
-    .. method:: __iter__, __len__
-
-        Shortcuts for accessing :attr:`content`.
+    :members:
 
 .. autoclass:: FunctionToken()
diff --git a/tinycss/css21.py b/tinycss/css21.py
index 01a0ccc..f9cb193 100644
--- a/tinycss/css21.py
+++ b/tinycss/css21.py
@@ -14,9 +14,10 @@
 from itertools import chain, islice
 
 from .decoding import decode
+from .token_data import TokenList
 from .tokenizer import tokenize_grouped
-from .parsing import (strip_whitespace, validate_value, validate_block,
-                      validate_any, ParseError)
+from .parsing import (strip_whitespace, remove_whitespace, split_on_comma,
+                      validate_value, validate_block, validate_any, ParseError)
 
 
 #  stylesheet  : [ CDO | CDC | S | statement ]*;
@@ -63,7 +64,7 @@ def __init__(self, rules, errors, encoding):
         self.errors = errors
         self.encoding = encoding
 
-    def __repr__(self):  # pragma: no cover
+    def __repr__(self):
         return '<{0.__class__.__name__} {1} rules {2} errors>'.format(
             self, len(self.rules), len(self.errors))
 
@@ -78,13 +79,15 @@ class AtRule(object):
 
     .. attribute:: head
 
-        The "head" of the at-rule until ``;`` or ``{``: a list of tokens
-        (:class:`~.token_data.Token` or :class:`~.token_data.ContainerToken`)
+        The part of the at-rule between the at-keyword and the ``{``
+        marking the body, or the ``;`` marking the end of an at-rule without
+        a body.  A :class:`~.token_data.TokenList`.
 
     .. attribute:: body
 
-        A block as a :class:`~.token_data.ContainerToken` with
-        ``token.type == '{'``, or ``None`` if the at-rule ends with ``;``.
+        The content of the body between ``{`` and ``}`` as a
+        :class:`~.token_data.TokenList`, or ``None`` if there is no body
+        (i.e. if the rule ends with ``;``).
 
     The head was validated against the core grammar but **not** the body,
     as the body might contain declarations. In case of an error in a
@@ -98,12 +101,12 @@ def __init__(self, at_keyword, head, body, line, column):
         self.at_keyword = at_keyword
-        self.head = head
-        self.body = body
+        self.head = TokenList(head)
+        self.body = TokenList(body) if body is not None else body
         self.line = line
         self.column = column
 
-    def __repr__(self):  # pragma: no cover
+    def __repr__(self):
         return ('<{0.__class__.__name__} {0.line}:{0.column} {0.at_keyword}>'
                 .format(self))
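With `head` and `body` now typed as `TokenList`, at-rule fragments can be serialized back to CSS without joining tokens by hand. A minimal sketch, assuming a hypothetical `AtRuleKeepingParser` subclass whose `parse_at_rule()` hook (the hook itself is updated further down in this patch) simply keeps unknown at-rules; the printed values are illustrative:

```python
from tinycss.css21 import CSS21Parser

class AtRuleKeepingParser(CSS21Parser):
    # Hypothetical subclass: keep unknown at-rules instead of rejecting them.
    def parse_at_rule(self, rule, previous_rules, errors, context):
        return rule  # rule.head and rule.body are TokenList objects

sheet = AtRuleKeepingParser().parse_stylesheet('@foo "bar" { baz: 42 }')
rule = sheet.rules[0]
print(rule.at_keyword)     # '@foo'
print(rule.head.as_css())  # '"bar"' (the head is whitespace-stripped)
print(rule.body.as_css())  # ' baz: 42 ' (raw tokens between the braces)
```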
@@ -117,8 +120,7 @@ class RuleSet(object):
 
     .. attribute:: selector
 
-        The selector as a list of :class:`~.token_data.Token` or
-        :class:`~.token_data.ContainerToken`.
+        The selector as a :class:`~.token_data.TokenList`.
 
         In CSS 3, this is actually called a selector group.
 
     .. attribute:: declarations
@@ -130,12 +132,12 @@ class RuleSet(object):
     at_keyword = None
 
     def __init__(self, selector, declarations, line, column):
-        self.selector = selector
+        self.selector = TokenList(selector)
         self.declarations = declarations
         self.line = line
         self.column = column
 
-    def __repr__(self):  # pragma: no cover
+    def __repr__(self):
         return ('<{0.__class__.__name__} at {0.line}:{0.column}'
-                ' {0.selector.as_css}>'.format(self))
+                ' {1}>'.format(self, self.selector.as_css()))
@@ -149,8 +151,7 @@ class Declaration(object):
 
     .. attribute:: value
 
-        The property value as a list of :class:`~.token_data.Token` or
-        :class:`~.token_data.ContainerToken`.
+        The property value as a :class:`~.token_data.TokenList`.
 
         The value is not parsed. UAs using tinycss may only support
         some properties or some values and tinycss does not know which.
@@ -168,12 +169,12 @@
     """
     def __init__(self, name, value, priority, line, column):
         self.name = name
-        self.value = value
+        self.value = TokenList(value)
         self.priority = priority
         self.line = line
         self.column = column
 
-    def __repr__(self):  # pragma: no cover
+    def __repr__(self):
         priority = ' !' + self.priority if self.priority else ''
         return ('<{0.__class__.__name__} {0.line}:{0.column}'
-                ' {0.name}: {0.value.as_css}{1}>'.format(self, priority))
+                ' {0.name}: {1}{2}>'.format(self, self.value.as_css(), priority))
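Since `Declaration.value` is now a `TokenList`, a value serializes back to CSS in one call. A short sketch using the existing `parse_style_attr()` entry point (the input string is an arbitrary example):

```python
from tinycss.css21 import CSS21Parser

declarations, errors = CSS21Parser().parse_style_attr('color: red !important')
decl = declarations[0]
print(decl.name)            # 'color'
print(decl.value.as_css())  # 'red', serialized straight from the value tokens
print(decl.priority)        # 'important'
```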
@@ -219,7 +220,7 @@ def __init__(self, selector, specificity, declarations, at_rules,
         self.line = line
         self.column = column
 
-    def __repr__(self):  # pragma: no cover
+    def __repr__(self):
         return ('<{0.__class__.__name__} {0.line}:{0.column}'
                 ' {0.selector}>'.format(self))
 
@@ -250,7 +251,7 @@ def __init__(self, media, rules, line, column):
         self.line = line
         self.column = column
 
-    def __repr__(self):  # pragma: no cover
+    def __repr__(self):
         return ('<{0.__class__.__name__} {0.line}:{0.column}'
                 ' {0.media}>'.format(self))
 
@@ -283,7 +284,7 @@ def __init__(self, uri, media, line, column):
         self.line = line
         self.column = column
 
-    def __repr__(self):  # pragma: no cover
+    def __repr__(self):
         return ('<{0.__class__.__name__} {0.line}:{0.column}'
                 ' {0.uri}>'.format(self))
 
@@ -303,7 +304,7 @@ def _remove_at_charset(tokens):
     if [t.type for t in header] == ['ATKEYWORD', 'S', 'STRING', ';']:
         atkw, space, string, semicolon = header
         if ((atkw.value, space.value) == ('@charset', ' ')
-                and string.as_css[0] == '"'):
+                and string.as_css()[0] == '"'):
             # Found a valid @charset rule, only keep what’s after it.
             return tokens
     return chain(header, tokens)
@@ -469,7 +470,7 @@ def read_at_rule(self, at_keyword_token, tokens):
         for head_token in head:
             validate_any(head_token, 'at-rule head')
         if token.type == '{':
-            body = token
+            body = token.content
         else:
             body = None
         return AtRule(at_keyword, head, body,
@@ -512,8 +513,7 @@ def parse_at_rule(self, rule, previous_rules, errors, context):
                 raise ParseError(rule,
                     'invalid {0} rule: missing block'.format(rule.at_keyword))
             declarations, at_rules, rule_errors = \
-                self.parse_declarations_and_at_rules(
-                    rule.body.content, '@page')
+                self.parse_declarations_and_at_rules(rule.body, '@page')
             errors.extend(rule_errors)
             return PageRule(selector, specificity, declarations, at_rules,
                             rule.line, rule.column)
@@ -522,12 +522,12 @@ def parse_at_rule(self, rule, previous_rules, errors, context):
             if context != 'stylesheet':
                 raise ParseError(rule, '@media rule not allowed in ' + context)
             if not rule.head:
-                raise ParseError(rule.body, 'expected media types for @media')
+                raise ParseError(rule, 'expected media types for @media')
             media = self.parse_media(rule.head)
             if rule.body is None:
                 raise ParseError(rule,
                     'invalid {0} rule: missing block'.format(rule.at_keyword))
-            rules, rule_errors = self.parse_rules(rule.body.content, '@media')
+            rules, rule_errors = self.parse_rules(rule.body, '@media')
             errors.extend(rule_errors)
             return MediaRule(media, rules, rule.line, rule.column)
 
@@ -558,7 +558,9 @@ def parse_at_rule(self, rule, previous_rules, errors, context):
             else:
                 media = ['all']
             if rule.body is not None:
-                raise ParseError(rule.body, "expected ';', got a block")
+                # The position of the ';' token would be best, but we don’t
+                # have it anymore here.
+                raise ParseError(head[-1], "expected ';', got a block")
             return ImportRule(uri, media, rule.line, rule.column)
 
         elif rule.at_keyword == '@charset':
@@ -574,34 +576,21 @@ def parse_media(self, tokens):
         """Parse a media types list.
 
         Media Queries are expected to override this.
 
         :param tokens:
-            An non-empty iterable of tokens
+            A non-empty list of tokens
         :raises:
             :class:`~.parsing.ParseError` on invalid media types/queries
         :returns:
             For CSS 2.1, a list of media types as strings
         """
         media_types = []
-        tokens = iter(tokens)
-        token = next(tokens)
-        while 1:
-            if token.type == 'IDENT':
-                media_types.append(token.value.lower())
+        for part in split_on_comma(remove_whitespace(tokens)):
+            types = [token.type for token in part]
+            if types == ['IDENT']:
+                media_types.append(part[0].value)
             else:
-                raise ParseError(token,
-                    'expected a media type, got {0}'.format(token.type))
-            token = next(tokens, None)
-            if not token:
-                return media_types
-            if not (token.type == 'DELIM' and token.value == ','):
-                raise ParseError(token,
-                    'expected a comma, got {0}'.format(token.type))
-            while 1:
-                next_token = next(tokens, None)
-                if not next_token:
-                    raise ParseError(token, 'expected a media type')
-                token = next_token
-                if token.type != 'S':
-                    break
+                raise ParseError(tokens[0], 'expected a media type' +
+                    ((', got ' + ', '.join(types)) if types else ''))
+        return media_types
 
     def parse_page_selector(self, tokens):
         """Parse an @page selector.
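The rewritten `parse_media()` is easier to reason about: drop whitespace tokens, split the head on top-level commas, and require each part to be exactly one `IDENT`. A sketch of the observable behaviour, matching the expectations encoded in the updated tests further down:

```python
from tinycss.css21 import CSS21Parser

parser = CSS21Parser()

sheet = parser.parse_stylesheet('@media screen, print { a { color: red } }')
print(sheet.rules[0].media)    # ['screen', 'print']

sheet = parser.parse_stylesheet('@media screen print { a { color: red } }')
print(sheet.errors[0].reason)  # 'expected a media type, got IDENT, IDENT'
```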
diff --git a/tinycss/decoding.py b/tinycss/decoding.py
index 7102455..6303e1a 100644
--- a/tinycss/decoding.py
+++ b/tinycss/decoding.py
@@ -21,14 +21,6 @@
 __all__ = ['decode']  # Everything else is implementation detail
 
 
-if sys.version_info[0] < 3:  # pragma: no cover
-    def _unicode_to_native(string):
-        return string.encode('utf8')
-else:  # pragma: no cover
-    def _unicode_to_native(string):
-        return string
-
-
 def decode(css_bytes, protocol_encoding=None,
            linking_encoding=None, document_encoding=None):
     """
@@ -63,11 +55,17 @@ def decode(css_bytes, protocol_encoding=None,
     if has_at_charset:
         extract, endianness = encoding
         encoding = extract(match.group(1))
+        # Get an ASCII-only unicode value.
+        # This is the only thing that works on both Python 2 and 3
+        # for bytes.decode()
+        # Non-ASCII encoding names are invalid anyway,
+        # but make sure they stay invalid.
         encoding = encoding.decode('ascii', 'replace')
+        encoding = encoding.replace('\ufffd', '?')
         if encoding.replace('-', '').replace('_', '').lower() in [
                 'utf16', 'utf32']:
             encoding += endianness
-        encoding = _unicode_to_native(encoding)
+        encoding = encoding.encode('ascii', 'replace').decode('ascii')
     css_unicode = try_encoding(css_bytes, encoding)
     if css_unicode and not (has_at_charset and not
                             css_unicode.startswith('@charset "')):
@@ -85,13 +83,14 @@ def decode(css_bytes, protocol_encoding=None,
 
 
 def try_encoding(css_bytes, encoding, fallback=True):
-    try:
+    if fallback:
+        try:
+            css_unicode = css_bytes.decode(encoding)
+        # LookupError means unknown encoding
+        except (UnicodeDecodeError, LookupError):
+            return None
+    else:
         css_unicode = css_bytes.decode(encoding)
-    # LookupError means unknown encoding
-    except (UnicodeDecodeError, LookupError):
-        if not fallback:
-            raise
-        return None
     if css_unicode and css_unicode[0] == '\ufeff':
         # Remove any Byte Order Mark
         css_unicode = css_unicode[1:]
diff --git a/tinycss/page3.py b/tinycss/page3.py
index 7537ccf..05f665c 100644
--- a/tinycss/page3.py
+++ b/tinycss/page3.py
@@ -114,8 +114,7 @@ def parse_at_rule(self, rule, previous_rules, errors, context):
             raise ParseError(rule.head[0],
                 'unexpected %s token in %s rule header'
                 % (rule.head[0].type, rule.at_keyword))
-        declarations, body_errors = self.parse_declaration_list(
-            rule.body.content)
+        declarations, body_errors = self.parse_declaration_list(rule.body)
         errors.extend(body_errors)
         return MarginRule(rule.at_keyword, declarations,
                           rule.line, rule.column)
diff --git a/tinycss/parsing.py b/tinycss/parsing.py
index e6f0c97..fd50d2e 100644
--- a/tinycss/parsing.py
+++ b/tinycss/parsing.py
@@ -165,5 +165,5 @@ def __init__(self, subject, reason):
             'Parse error at {0.line}:{0.column}, {0.reason}'.format(self))
         super(ParseError, self).__init__(self.message)
 
-    def __repr__(self):  # pragma: no cover
+    def __repr__(self):
         return ('<{0.__class__.__name__}: {0.message}>'.format(self))
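For context, the decoding change only normalizes encoding names extracted from an ``@charset`` rule; the public API does not move. A rough usage sketch of `decode()`; the 2-tuple return shape is an assumption based on how `css21.py` calls it:

```python
from tinycss.decoding import decode

css_bytes = '@charset "utf-8"; a::before { content: "é" }'.encode('utf8')
css_unicode, encoding = decode(css_bytes)
print(encoding)  # 'utf-8', taken from the @charset rule
assert css_unicode.startswith('@charset "')
```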
Please install lxml ' 'with "pip install lxml" or from http://lxml.de/') @@ -116,7 +116,7 @@ def parse_selector_group_string(css_string): def _parse_selector_group_tokens(group_tokens): - return [parse_selector_string(''.join(t.as_css for t in tokens)) + return [parse_selector_string(''.join(t.as_css() for t in tokens)) for tokens in split_on_comma(group_tokens)] @@ -236,5 +236,5 @@ def parse_ruleset(self, first_token, tokens): except InvalidSelectorError as exc: # Invalidate the whole ruleset even if some selectors # in the selector group are valid. - raise ParseError(ruleset, exc.args[0]) + raise ParseError(ruleset.selector, exc.args[0]) return ruleset, errors diff --git a/tinycss/speedups.pyx b/tinycss/speedups.pyx index 2d887a4..d90a09f 100644 --- a/tinycss/speedups.pyx +++ b/tinycss/speedups.pyx @@ -29,18 +29,25 @@ cdef class CToken: """ is_container = False - cdef public object type, as_css, value, unit + cdef public object type, _as_css, value, unit cdef public Py_ssize_t line, column def __init__(self, type_, css_value, value, unit, line, column): self.type = type_ - self.as_css = css_value + self._as_css = css_value self.value = value self.unit = unit self.line = line self.column = column - def __repr__(self): # pragma: no cover + def as_css(self): + """ + Return as an Unicode string the CSS representation of the token, + as parsed in the source. + """ + return self._as_css + + def __repr__(self): return ('' .format(self, self.unit or '')) diff --git a/tinycss/tests/speed.py b/tinycss/tests/speed.py index 817c351..2777d4b 100644 --- a/tinycss/tests/speed.py +++ b/tinycss/tests/speed.py @@ -57,7 +57,7 @@ def parse(tokenizer_name): stylesheet = CSS21Parser().parse_stylesheet_bytes(CSS) result = [] for rule in stylesheet.rules: - selector = ''.join(s.as_css for s in rule.selector) + selector = rule.selector.as_css() declarations = [ (declaration.name, len(list(remove_whitespace(declaration.value)))) for declaration in rule.declarations] diff --git a/tinycss/tests/test_css21.py b/tinycss/tests/test_css21.py index ebe68c9..c90a5f7 100644 --- a/tinycss/tests/test_css21.py +++ b/tinycss/tests/test_css21.py @@ -114,7 +114,7 @@ def test_at_rules(css_source, expected_rules, expected_errors): ('foo @page {} bar {}', [('bar', [])], ['unexpected ATKEYWORD token in selector']), - ('foo { content: "unclosed string;\n color:red; ; margin/**/: 2cm; }', + ('foo { content: "unclosed string;\n color:red; ; margin/**/\n: 2cm; }', [('foo', [('margin', [('DIMENSION', 2)])])], ['unexpected BAD_STRING token in property value']), @@ -151,10 +151,10 @@ def parse_at_rule(self, rule, stylesheet_rules, errors, context): assert_errors(stylesheet.errors, expected_errors) result = [ (rule.at_keyword, list(jsonify(rule.head)), - list(jsonify(rule.body.content)) + list(jsonify(rule.body)) if rule.body is not None else None) if rule.at_keyword else - (''.join(s.as_css for s in rule.selector), [ + (rule.selector.as_css(), [ (decl.name, list(jsonify(decl.value))) for decl in rule.declarations]) for rule in stylesheet.rules @@ -318,9 +318,10 @@ def test_at_page(css, expected_result, expected_errors): ('@media all;', [], ['invalid @media rule: missing block']), ('@media {}', [], ['expected media types for @media']), ('@media 4 {}', [], ['expected a media type, got INTEGER']), - ('@media , screen {}', [], ['expected a media type, got DELIM']), + ('@media , screen {}', [], ['expected a media type']), ('@media screen, {}', [], ['expected a media type']), - ('@media screen print {}', [], ['expected a comma, got S']), + 
diff --git a/tinycss/tests/test_css21.py b/tinycss/tests/test_css21.py
index ebe68c9..c90a5f7 100644
--- a/tinycss/tests/test_css21.py
+++ b/tinycss/tests/test_css21.py
@@ -114,7 +114,7 @@ def test_at_rules(css_source, expected_rules, expected_errors):
     ('foo @page {} bar {}',
         [('bar', [])],
         ['unexpected ATKEYWORD token in selector']),
 
-    ('foo { content: "unclosed string;\n  color:red; ; margin/**/: 2cm; }',
+    ('foo { content: "unclosed string;\n  color:red; ; margin/**/\n: 2cm; }',
         [('foo', [('margin', [('DIMENSION', 2)])])],
         ['unexpected BAD_STRING token in property value']),
@@ -151,10 +151,10 @@ def parse_at_rule(self, rule, stylesheet_rules, errors, context):
     assert_errors(stylesheet.errors, expected_errors)
     result = [
         (rule.at_keyword, list(jsonify(rule.head)),
-            list(jsonify(rule.body.content))
+            list(jsonify(rule.body))
             if rule.body is not None else None)
         if rule.at_keyword else
-        (''.join(s.as_css for s in rule.selector), [
+        (rule.selector.as_css(), [
             (decl.name, list(jsonify(decl.value)))
             for decl in rule.declarations])
         for rule in stylesheet.rules
@@ -318,9 +318,10 @@ def test_at_page(css, expected_result, expected_errors):
     ('@media all;', [], ['invalid @media rule: missing block']),
     ('@media {}', [], ['expected media types for @media']),
     ('@media 4 {}', [], ['expected a media type, got INTEGER']),
-    ('@media , screen {}', [], ['expected a media type, got DELIM']),
+    ('@media , screen {}', [], ['expected a media type']),
     ('@media screen, {}', [], ['expected a media type']),
-    ('@media screen print {}', [], ['expected a comma, got S']),
+    ('@media screen print {}', [],
+        ['expected a media type, got IDENT, IDENT']),
     ('@media all { @page { a: 1 } @media; @import; foo { a: 1 } }',
         [(['all'], [('foo', [('a', [('INTEGER', 1)])])])],
@@ -337,7 +338,7 @@ def test_at_media(css_source, expected_rules, expected_errors):
         assert rule.at_keyword == '@media'
     result = [
         (rule.media, [
-            (''.join(s.as_css for s in sub_rule.selector), [
+            (sub_rule.selector.as_css(), [
                 (decl.name, list(jsonify(decl.value)))
                 for decl in sub_rule.declarations])
             for sub_rule in rule.rules
diff --git a/tinycss/tests/test_decoding.py b/tinycss/tests/test_decoding.py
index fe78929..42df0c3 100644
--- a/tinycss/tests/test_decoding.py
+++ b/tinycss/tests/test_decoding.py
@@ -32,7 +32,7 @@ def params(css, encoding, use_bom=False, expect_error=False, **kwargs):
     params('£', 'ShiftJIS', protocol_encoding='utf8',
            document_encoding='ShiftJIS'),
     params('@charset "utf8"; £', 'ShiftJIS', expect_error=True),
-    params('@charset "utf£"; £', 'ShiftJIS', expect_error=True),
+    params('@charset "utf£8"; £', 'ShiftJIS', expect_error=True),
     params('@charset "unknown-encoding"; £', 'ShiftJIS', expect_error=True),
     params('@charset "utf8"; £', 'ShiftJIS', document_encoding='ShiftJIS'),
     params('£', 'ShiftJIS', linking_encoding='utf8',
diff --git a/tinycss/tests/test_selectors3.py b/tinycss/tests/test_selectors3.py
index 5959ef0..3af7f9e 100644
--- a/tinycss/tests/test_selectors3.py
+++ b/tinycss/tests/test_selectors3.py
@@ -17,7 +17,7 @@
 
 try:
     import lxml.cssselect
-except ImportError:  # pragma: no cover
+except ImportError:
     LXML_INSTALLED = False
 else:
     LXML_INSTALLED = True
diff --git a/tinycss/tests/test_tokenizer.py b/tinycss/tests/test_tokenizer.py
index 8821707..8c8d1ef 100644
--- a/tinycss/tests/test_tokenizer.py
+++ b/tinycss/tests/test_tokenizer.py
@@ -277,7 +277,7 @@ def test_token_serialize_css(tokenize, css_source):
         pytest.skip('Speedups not available')
     for _regroup in [regroup, lambda x: x]:
         tokens = _regroup(tokenize(css_source, ignore_comments=False))
-        result = ''.join(token.as_css for token in tokens)
+        result = ''.join(token.as_css() for token in tokens)
         assert result == css_source
 
@@ -295,7 +295,4 @@ def test_token_api(tokenize, css_source):
     assert len(tokens) == 1
     token = tokens[0]
     expected_len = 7  # 2 spaces, 2 commas, 3 others.
-    assert len(token) == expected_len
     assert len(token.content) == expected_len
-    for a, b in zip(iter(token), token.content):
-        assert a is b
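With the `__iter__`/`__len__` shortcuts gone, grouped tokens are traversed explicitly through `.content`, as the trimmed `test_token_api` above shows. For example (the `rgba(...)` input is an arbitrary choice):

```python
from tinycss.tokenizer import tokenize_grouped

tokens = list(tokenize_grouped('rgba(10, 20, 30)'))
function = tokens[0]  # a single FUNCTION container token
# len(function) and iter(function) no longer work; use .content:
print(len(function.content))  # 7: two spaces, two commas, three integers
print(''.join(token.as_css() for token in function.content))  # '10, 20, 30'
```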
""" is_container = False - __slots__ = 'type', 'as_css', 'value', 'unit', 'line', 'column' + __slots__ = 'type', '_as_css', 'value', 'unit', 'line', 'column' def __init__(self, type_, css_value, value, unit, line, column): self.type = type_ - self.as_css = css_value + self._as_css = css_value self.value = value self.unit = unit self.line = line self.column = column - def __repr__(self): # pragma: no cover + def as_css(self): + """ + Return as an Unicode string the CSS representation of the token, + as parsed in the source. + """ + return self._as_css + + def __repr__(self): return ('' .format(self, self.unit or '')) @@ -340,15 +343,10 @@ class ContainerToken(object): ``FUNCTION``. For ``FUNCTION``, the object is actually a :class:`FunctionToken`. - .. attribute:: css_start - - The string for the opening token as it was read from the CSS source. - Eg: ``{`` - - .. attribute:: css_end + .. attribute:: unit - The string for the closing token as it was read from the CSS source - Eg: ``}`` + Always ``None``. Included to make :class:`ContainerToken` behave + more like :class:`Token`. .. attribute:: content @@ -357,47 +355,40 @@ class ContainerToken(object): .. attribute:: line - The line number of the opening token in the CSS source + The line number in the CSS source of the start of this token. .. attribute:: column - The column number inside a line of the opening token in the CSS source + The column number (inside a source line) of the start of this token. """ is_container = True - __slots__ = 'type', 'css_start', 'css_end', 'content', 'line', 'column' + unit = None + __slots__ = 'type', '_css_start', '_css_end', 'content', 'line', 'column' def __init__(self, type_, css_start, css_end, content, line, column): self.type = type_ - self.css_start = css_start - self.css_end = css_end + self._css_start = css_start + self._css_end = css_end self.content = content self.line = line self.column = column - @property def as_css(self): - """The (recursive) CSS representation of the token, + """ + Return as an Unicode string the CSS representation of the token, as parsed in the source. """ - parts = [self.css_start] - parts.extend(token.as_css for token in self.content) - parts.append(self.css_end) + parts = [self._css_start] + parts.extend(token.as_css() for token in self.content) + parts.append(self._css_end) return ''.join(parts) format_string = '' - def __repr__(self): # pragma: no cover + def __repr__(self): return (self.format_string + ' {0.content}').format(self) - # Sequence-like API (not the full collections.Sequence ABC, though) - - def __iter__(self): - return iter(self.content) - - def __len__(self): - return len(self.content) - class FunctionToken(ContainerToken): """A specialized :class:`ContainerToken` for a ``FUNCTION`` group. @@ -419,3 +410,31 @@ def __init__(self, type_, css_start, css_end, function_name, content, format_string = ('') + + +class TokenList(list): + """ + A mixed list of :class:`~.token_data.Token` and + :class:`~.token_data.ContainerToken` objects. + + This is a subclass of the builtin :class:`~builtins.list` type. + It can be iterated, indexed and sliced as usual, but also has some + additional API: + + """ + @property + def line(self): + """The line number in the CSS source of the first token.""" + return self[0].line + + @property + def column(self): + """The column number (inside a source line) of the first token.""" + return self[0].column + + def as_css(self): + """ + Return as an Unicode string the CSS representation of the tokens, + as parsed in the source. 
+ """ + return ''.join(token.as_css() for token in self) diff --git a/tinycss/tokenizer.py b/tinycss/tokenizer.py index 029a632..5540027 100644 --- a/tinycss/tokenizer.py +++ b/tinycss/tokenizer.py @@ -177,11 +177,11 @@ def _regroup_inner(stop_at=None, if eof[0]: end = '' # Implicit end of structure at EOF. if type_ == 'FUNCTION': - yield FunctionToken(token.type, token.as_css, end, + yield FunctionToken(token.type, token.as_css(), end, token.value, content, token.line, token.column) else: - yield ContainerToken(token.type, token.as_css, end, + yield ContainerToken(token.type, token.as_css(), end, content, token.line, token.column) else: @@ -208,7 +208,7 @@ def tokenize_grouped(css_source, ignore_comments=True): python_tokenize_flat = tokenize_flat try: from . import speedups -except ImportError: # pragma: no cover +except ImportError: cython_tokenize_flat = None else: cython_tokenize_flat = speedups.tokenize_flat