From 0b74fd375cb5dfc10410f0db14759d43b92fc090 Mon Sep 17 00:00:00 2001 From: xzl Date: Tue, 5 Mar 2024 15:35:14 +0800 Subject: [PATCH] feat: update linkify-it-py to 2.0.3 links recognition library with FULL unicode support Issue: https://github.com/deepin-community/sig-deepin-sysdev-team/issues/543 Log: update repo --- CHANGELOG.md | 40 + LICENSE | 48 + MANIFEST.in | 3 + PKG-INFO | 251 +++ README.md | 225 ++- debian/changelog | 48 +- debian/compat | 1 - debian/control | 45 +- debian/copyright | 43 +- debian/install | 1 + debian/logo.svg | 79 + debian/rules | 12 +- debian/watch | 5 + linkify_it/__init__.py | 4 + linkify_it/main.py | 642 ++++++++ linkify_it/tlds.py | 1517 +++++++++++++++++++ linkify_it/ucre.py | 264 ++++ linkify_it_py.egg-info/PKG-INFO | 251 +++ linkify_it_py.egg-info/SOURCES.txt | 21 + linkify_it_py.egg-info/dependency_links.txt | 1 + linkify_it_py.egg-info/requires.txt | 22 + linkify_it_py.egg-info/top_level.txt | 1 + pyproject.toml | 52 + setup.cfg | 4 + test/__init__.py | 0 test/fixtures/links.txt | 335 ++++ test/fixtures/not_links.txt | 52 + test/test_apis.py | 321 ++++ test/test_linkify.py | 40 + test/utils.py | 37 + tox.ini | 12 + 31 files changed, 4341 insertions(+), 36 deletions(-) create mode 100644 CHANGELOG.md create mode 100644 LICENSE create mode 100644 MANIFEST.in create mode 100644 PKG-INFO delete mode 100644 debian/compat create mode 100644 debian/install create mode 100644 debian/logo.svg create mode 100644 debian/watch create mode 100644 linkify_it/__init__.py create mode 100644 linkify_it/main.py create mode 100644 linkify_it/tlds.py create mode 100644 linkify_it/ucre.py create mode 100644 linkify_it_py.egg-info/PKG-INFO create mode 100644 linkify_it_py.egg-info/SOURCES.txt create mode 100644 linkify_it_py.egg-info/dependency_links.txt create mode 100644 linkify_it_py.egg-info/requires.txt create mode 100644 linkify_it_py.egg-info/top_level.txt create mode 100644 pyproject.toml create mode 100644 setup.cfg create mode 100644 test/__init__.py create mode 100644 test/fixtures/links.txt create mode 100644 test/fixtures/not_links.txt create mode 100644 test/test_apis.py create mode 100644 test/test_linkify.py create mode 100644 test/utils.py create mode 100644 tox.ini diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..c9b6448 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,40 @@ +# Change Log + +## v2.0.3 (2024-02-04) + +- Update port.yml (linkify-it v5.0.0) ([#54](https://github.com/tsutsu3/linkify-it-py/pull/54)) +- Fix rtd ([#52](https://github.com/tsutsu3/linkify-it-py/pull/52)) +- Add linkify-it-py-demo url ([#51](https://github.com/tsutsu3/linkify-it-py/pull/51)) +- Fix package build ([#49](https://github.com/tsutsu3/linkify-it-py/pull/49)) + +## v2.0.2 (2023-05-02) + +- Fix missing files to the test ([#44](https://github.com/tsutsu3/linkify-it-py/pull/44)) + +## v2.0.1 (2023-05-02) + +- Update development tools +- Fix sdist is missing tests + +## v2.0.0 (2022-05-07) + +- Add `matchAtStart` method to match full URLs at the start of the string. +- Fixed paired symbols (`()`, `{}`, `""`, etc.) after punctuation. +- `---` option now affects parsing of emails (e.g. `user@example.com---`) + +## v1.0.3 (2021-12-18) + +- Fixed [#98](https://github.com/markdown-it/linkify-it/issues/98). Don't count `;` at the end of link (when followed with space). 
+ +## v1.0.2 (2021-10-09) + +- Fix: Schema key containing - not producing matches (#26) + +## v1.0.1 (2020-12-18) + +- Add manifest +- Add codecov.yml + +## v1.0.0 (2020-11-15) + +- First release diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..a0140d1 --- /dev/null +++ b/LICENSE @@ -0,0 +1,48 @@ +MIT License + +Copyright (c) 2020 tsutsu3 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +=============================================================================== + +Original Javascript version: + +Copyright (c) 2015 Vitaly Puzrin. + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..645cb53 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,3 @@ +include CHANGELOG.md +include tox.ini +recursive-include test *.txt *.py diff --git a/PKG-INFO b/PKG-INFO new file mode 100644 index 0000000..37b3186 --- /dev/null +++ b/PKG-INFO @@ -0,0 +1,251 @@ +Metadata-Version: 2.1 +Name: linkify-it-py +Version: 2.0.3 +Summary: Links recognition library with FULL unicode support. 
+Author: tsutsu3
+License: MIT
+Project-URL: Homepage, https://github.com/tsutsu3/linkify-it-py
+Keywords: linkify,linkifier,autolink,autolinker
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.7
+Description-Content-Type: text/markdown
+Provides-Extra: test
+Provides-Extra: dev
+Provides-Extra: benchmark
+Provides-Extra: doc
+License-File: LICENSE
+
+# linkify-it-py
+
+[![CI](https://github.com/tsutsu3/linkify-it-py/workflows/CI/badge.svg?branch=main)](https://github.com/tsutsu3/linkify-it-py/actions)
+[![pypi](https://img.shields.io/pypi/v/linkify-it-py)](https://pypi.org/project/linkify-it-py/)
+[![Anaconda-Server Badge](https://anaconda.org/conda-forge/linkify-it-py/badges/version.svg)](https://anaconda.org/conda-forge/linkify-it-py)
+[![Documentation Status](https://readthedocs.org/projects/linkify-it-py/badge/?version=latest)](https://linkify-it-py.readthedocs.io/en/latest/?badge=latest)
+[![codecov](https://codecov.io/gh/tsutsu3/linkify-it-py/branch/main/graph/badge.svg)](https://codecov.io/gh/tsutsu3/linkify-it-py)
+[![Maintainability](https://api.codeclimate.com/v1/badges/6341fd3ec5f05fde392f/maintainability)](https://codeclimate.com/github/tsutsu3/linkify-it-py/maintainability)
+
+This is a Python port of [linkify-it](https://github.com/markdown-it/linkify-it).
+
+> Links recognition library with FULL unicode support.
+> Focused on high quality link patterns detection in plain text.
+
+__[Demo](https://linkify-it-py-demo.vercel.app/)__
+
+__[Javascript Demo](http://markdown-it.github.io/linkify-it/)__
+
+Why it's awesome:
+
+- Full unicode support, _with astral characters_!
+- International domains support.
+- Allows rules extension & custom normalizers.
+
+
+## Install
+
+```bash
+pip install linkify-it-py
+```
+
+or
+
+```bash
+conda install -c conda-forge linkify-it-py
+```
+
+## Usage examples
+
+### Example 1. Simple use
+
+```python
+from linkify_it import LinkifyIt
+
+
+linkify = LinkifyIt()
+
+print(linkify.test("Site github.com!"))
+# => True
+
+print(linkify.match("Site github.com!"))
+# => [linkify_it.main.Match({
+#     'schema': '',
+#     'index': 5,
+#     'last_index': 15,
+#     'raw': 'github.com',
+#     'text': 'github.com',
+#     'url': 'http://github.com'
+# })]
+```
+
+### Example 2. With options
+
+```python
+from linkify_it import LinkifyIt
+from linkify_it.tlds import TLDS
+
+
+# Reload full tlds list & add unofficial `.onion` domain.
+linkify = (
+    LinkifyIt()
+    .tlds(TLDS)  # Reload with full tlds list
+    .tlds("onion", True)  # Add unofficial `.onion` domain
+    .add("git:", "http:")  # Add `git:` protocol as "alias"
+    .add("ftp:", None)  # Disable `ftp:` protocol
+    .set({"fuzzy_ip": True})  # Enable IPs in fuzzy links (without schema)
+)
+print(linkify.test("Site tamanegi.onion!"))
+# => True
+
+print(linkify.match("Site tamanegi.onion!"))
+# => [linkify_it.main.Match({
+#     'schema': '',
+#     'index': 5,
+#     'last_index': 19,
+#     'raw': 'tamanegi.onion',
+#     'text': 'tamanegi.onion',
+#     'url': 'http://tamanegi.onion'
+# })]
+```
+
+### Example 3. Add twitter mentions handler
+
+```python
+import re
+
+from linkify_it import LinkifyIt
+
+
+linkify = LinkifyIt()
+
+def validate(obj, text, pos):
+    tail = text[pos:]
+
+    if not obj.re.get("twitter"):
+        obj.re["twitter"] = re.compile(
+            "^([a-zA-Z0-9_]){1,15}(?!_)(?=$|" + obj.re["src_ZPCc"] + ")"
+        )
+    if obj.re["twitter"].search(tail):
+        if pos > 2 and tail[pos - 2] == "@":
+            return False
+        return len(obj.re["twitter"].search(tail).group())
+    return 0
+
+def normalize(obj, match):
+    match.url = "https://twitter.com/" + re.sub(r"^@", "", match.url)
+
+linkify.add("@", {"validate": validate, "normalize": normalize})
+```
+
+
+## API
+
+[API documentation](https://linkify-it-py.readthedocs.io/en/latest/)
+
+### LinkifyIt(schemas, options)
+
+Creates a new linkifier instance with optional additional schemas.
+
+By default understands:
+
+- `http(s)://...`, `ftp://...`, `mailto:...` & `//...` links
+- "fuzzy" links and emails (google.com, foo@bar.com).
+
+`schemas` is a dict, where each key/value pair describes a protocol/rule:
+
+- __key__ - link prefix (usually, protocol name with `:` at the end, `skype:`
+  for example). `linkify-it-py` makes sure that prefix is not preceded with
+  alphanumeric char.
+- __value__ - rule to check tail after link prefix
+  - _str_
+    - just an alias to an existing rule
+  - _dict_
+    - _validate_ - either a `re.Pattern` (start with `^`, and don't include the
+      link prefix itself), or a validator `function` which, given arguments
+      _self_, _text_ and _pos_, returns the length of a match in _text_
+      starting at index _pos_. _pos_ is the index right after the link prefix.
+      _self_ can be used to access the linkify object to cache data.
+    - _normalize_ - optional function to normalize text & url of matched result
+      (for example, for twitter mentions).
+
+`options`:
+
+- __fuzzy_link__ - recognize URLs without `http(s)://` head. Default `True`.
+- __fuzzy_ip__ - allow IPs in fuzzy links above. Can conflict with some texts
+  like version numbers. Default `False`.
+- __fuzzy_email__ - recognize emails without `mailto:` prefix. Default `True`.
+- __---__ - set `True` to terminate link with `---` (if it's considered a long dash).
+
+
+### .test(text)
+
+Searches for a linkifiable pattern and returns `True` on success or `False` on
+failure.
+
+
+### .pretest(text)
+
+Quick check to see whether a link MAY exist. Can be used to optimize more
+expensive `.test()` calls. Returns `False` if a link cannot be found, `True`
+if a `.test()` call is needed to know for sure.
+
+
+### .test_schema_at(text, name, position)
+
+Similar to `.test()` but checks only specific protocol tail exactly at given
+position. Returns length of found pattern (0 on fail).
+
+
+### .match(text)
+
+Returns `list` of found link matches or `None` if nothing found.
+
+Each match has:
+
+- __schema__ - link schema, can be empty for fuzzy links, or `//` for
+  protocol-neutral links.
+- __index__ - offset of matched text
+- __last_index__ - index of next char after match end
+- __raw__ - matched text
+- __text__ - normalized text
+- __url__ - link, generated from matched text
+
+### .match_at_start(text)
+
+Checks if a match exists at the start of the string. Returns `Match`
+(see docs for `match(text)`) or `None` if no URL is at the start.
+Doesn't work with fuzzy links.
+
+### .tlds(list_tlds, keep_old=False)
+
+Load (or merge) new tlds list. Those are needed for fuzzy links (without schema)
+to avoid false positives. By default:
+
+- 2-letter root zones are ok.
+- biz|com|edu|gov|net|org|pro|web|xxx|aero|asia|coop|info|museum|name|shop|рф are ok.
+- encoded (`xn--...`) root zones are ok.
+
+If that's not enough, you can reload defaults with more detailed zones list.
+
+### .add(key, value)
+
+Add a new schema to the schemas object. As described in the constructor
+definition, `key` is a link prefix (`skype:`, for example), and `value`
+is a `str` to alias to another schema, or a `dict` with `validate` and
+optionally `normalize` definitions. To disable an existing rule, use
+`.add(key, None)`.
+
+
+### .set(options)
+
+Override default options. Omitted properties are left unchanged.
+
+
+## License
+
+[MIT](https://github.com/tsutsu3/linkify-it-py/blob/master/LICENSE)
diff --git a/README.md b/README.md
index 9ebb840..2b15921 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,224 @@
-# template-repository
\ No newline at end of file
+# linkify-it-py
+
+[![CI](https://github.com/tsutsu3/linkify-it-py/workflows/CI/badge.svg?branch=main)](https://github.com/tsutsu3/linkify-it-py/actions)
+[![pypi](https://img.shields.io/pypi/v/linkify-it-py)](https://pypi.org/project/linkify-it-py/)
+[![Anaconda-Server Badge](https://anaconda.org/conda-forge/linkify-it-py/badges/version.svg)](https://anaconda.org/conda-forge/linkify-it-py)
+[![Documentation Status](https://readthedocs.org/projects/linkify-it-py/badge/?version=latest)](https://linkify-it-py.readthedocs.io/en/latest/?badge=latest)
+[![codecov](https://codecov.io/gh/tsutsu3/linkify-it-py/branch/main/graph/badge.svg)](https://codecov.io/gh/tsutsu3/linkify-it-py)
+[![Maintainability](https://api.codeclimate.com/v1/badges/6341fd3ec5f05fde392f/maintainability)](https://codeclimate.com/github/tsutsu3/linkify-it-py/maintainability)
+
+This is a Python port of [linkify-it](https://github.com/markdown-it/linkify-it).
+
+> Links recognition library with FULL unicode support.
+> Focused on high quality link patterns detection in plain text.
+
+__[Demo](https://linkify-it-py-demo.vercel.app/)__
+
+__[Javascript Demo](http://markdown-it.github.io/linkify-it/)__
+
+Why it's awesome:
+
+- Full unicode support, _with astral characters_!
+- International domains support.
+- Allows rules extension & custom normalizers.
+
+
+## Install
+
+```bash
+pip install linkify-it-py
+```
+
+or
+
+```bash
+conda install -c conda-forge linkify-it-py
+```
+
+## Usage examples
+
+### Example 1. Simple use
+
+```python
+from linkify_it import LinkifyIt
+
+
+linkify = LinkifyIt()
+
+print(linkify.test("Site github.com!"))
+# => True
+
+print(linkify.match("Site github.com!"))
+# => [linkify_it.main.Match({
+#     'schema': '',
+#     'index': 5,
+#     'last_index': 15,
+#     'raw': 'github.com',
+#     'text': 'github.com',
+#     'url': 'http://github.com'
+# })]
+```
+
+### Example 2. With options
+
+```python
+from linkify_it import LinkifyIt
+from linkify_it.tlds import TLDS
+
+
+# Reload full tlds list & add unofficial `.onion` domain.
+linkify = (
+    LinkifyIt()
+    .tlds(TLDS)  # Reload with full tlds list
+    .tlds("onion", True)  # Add unofficial `.onion` domain
+    .add("git:", "http:")  # Add `git:` protocol as "alias"
+    .add("ftp:", None)  # Disable `ftp:` protocol
+    .set({"fuzzy_ip": True})  # Enable IPs in fuzzy links (without schema)
+)
+print(linkify.test("Site tamanegi.onion!"))
+# => True
+
+print(linkify.match("Site tamanegi.onion!"))
+# => [linkify_it.main.Match({
+#     'schema': '',
+#     'index': 5,
+#     'last_index': 19,
+#     'raw': 'tamanegi.onion',
+#     'text': 'tamanegi.onion',
+#     'url': 'http://tamanegi.onion'
+# })]
+```
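+
+With the options above, the same instance should also pick up bare IPs and
+`git:` links. Below is a minimal, hedged sketch (not from the upstream docs),
+reusing the `linkify` object from Example 2; the expected output follows from
+the option descriptions in the API section:
+
+```python
+# fuzzy_ip was enabled via .set(), so bare IP addresses are matched too.
+print(linkify.test("Ping 192.168.1.1!"))
+# => True
+
+# git: was registered as an alias of http:, so git:// URLs are recognized.
+print(linkify.test("Clone git://github.com/tsutsu3/linkify-it-py.git"))
+# => True
+```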
+
+### Example 3. Add twitter mentions handler
+
+```python
+import re
+
+from linkify_it import LinkifyIt
+
+
+linkify = LinkifyIt()
+
+def validate(obj, text, pos):
+    tail = text[pos:]
+
+    if not obj.re.get("twitter"):
+        obj.re["twitter"] = re.compile(
+            "^([a-zA-Z0-9_]){1,15}(?!_)(?=$|" + obj.re["src_ZPCc"] + ")"
+        )
+    if obj.re["twitter"].search(tail):
+        if pos > 2 and tail[pos - 2] == "@":
+            return False
+        return len(obj.re["twitter"].search(tail).group())
+    return 0
+
+def normalize(obj, match):
+    match.url = "https://twitter.com/" + re.sub(r"^@", "", match.url)
+
+linkify.add("@", {"validate": validate, "normalize": normalize})
+```
+
+
+## API
+
+[API documentation](https://linkify-it-py.readthedocs.io/en/latest/)
+
+### LinkifyIt(schemas, options)
+
+Creates a new linkifier instance with optional additional schemas.
+
+By default understands:
+
+- `http(s)://...`, `ftp://...`, `mailto:...` & `//...` links
+- "fuzzy" links and emails (google.com, foo@bar.com).
+
+`schemas` is a dict, where each key/value pair describes a protocol/rule:
+
+- __key__ - link prefix (usually, protocol name with `:` at the end, `skype:`
+  for example). `linkify-it-py` makes sure that prefix is not preceded with
+  alphanumeric char.
+- __value__ - rule to check tail after link prefix
+  - _str_
+    - just an alias to an existing rule
+  - _dict_
+    - _validate_ - either a `re.Pattern` (start with `^`, and don't include the
+      link prefix itself), or a validator `function` which, given arguments
+      _self_, _text_ and _pos_, returns the length of a match in _text_
+      starting at index _pos_. _pos_ is the index right after the link prefix.
+      _self_ can be used to access the linkify object to cache data.
+    - _normalize_ - optional function to normalize text & url of matched result
+      (for example, for twitter mentions).
+
+`options`:
+
+- __fuzzy_link__ - recognize URLs without `http(s)://` head. Default `True`.
+- __fuzzy_ip__ - allow IPs in fuzzy links above. Can conflict with some texts
+  like version numbers. Default `False`.
+- __fuzzy_email__ - recognize emails without `mailto:` prefix. Default `True`.
+- __---__ - set `True` to terminate link with `---` (if it's considered a long dash).
+
+
+### .test(text)
+
+Searches for a linkifiable pattern and returns `True` on success or `False` on
+failure.
+
+
+### .pretest(text)
+
+Quick check to see whether a link MAY exist. Can be used to optimize more
+expensive `.test()` calls. Returns `False` if a link cannot be found, `True`
+if a `.test()` call is needed to know for sure.
+
+
+### .test_schema_at(text, name, position)
+
+Similar to `.test()` but checks only specific protocol tail exactly at given
+position. Returns length of found pattern (0 on fail).
+
+
+### .match(text)
+
+Returns `list` of found link matches or `None` if nothing found.
+
+Each match has:
+
+- __schema__ - link schema, can be empty for fuzzy links, or `//` for
+  protocol-neutral links.
+- __index__ - offset of matched text
+- __last_index__ - index of next char after match end
+- __raw__ - matched text
+- __text__ - normalized text
+- __url__ - link, generated from matched text
+
+### .match_at_start(text)
+
+Checks if a match exists at the start of the string. Returns `Match`
+(see docs for `match(text)`) or `None` if no URL is at the start.
+Doesn't work with fuzzy links.
+
+### .tlds(list_tlds, keep_old=False)
+
+Load (or merge) new tlds list. Those are needed for fuzzy links (without schema)
+to avoid false positives. By default:
+
+- 2-letter root zones are ok.
+- biz|com|edu|gov|net|org|pro|web|xxx|aero|asia|coop|info|museum|name|shop|рф are ok.
+- encoded (`xn--...`) root zones are ok.
+
+If that's not enough, you can reload defaults with more detailed zones list.
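+
+For example, replacing the defaults with a short custom list (a hedged sketch,
+not from the upstream docs; once the list is replaced, 2-letter zones are only
+matched if listed explicitly):
+
+```python
+from linkify_it import LinkifyIt
+
+# Replace the default root zones outright; pass keep_old=True to merge instead.
+linkify = LinkifyIt().tlds(["com", "org", "dev"])
+
+print(linkify.test("See example.dev"))
+# => True ("dev" is now a known root zone)
+```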
+
+### .add(key, value)
+
+Add a new schema to the schemas object. As described in the constructor
+definition, `key` is a link prefix (`skype:`, for example), and `value`
+is a `str` to alias to another schema, or a `dict` with `validate` and
+optionally `normalize` definitions. To disable an existing rule, use
+`.add(key, None)`.
+
+
+### .set(options)
+
+Override default options. Omitted properties are left unchanged.
+
+
+## License
+
+[MIT](https://github.com/tsutsu3/linkify-it-py/blob/master/LICENSE)
diff --git a/debian/changelog b/debian/changelog
index bad88e2..40b9052 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,5 +1,47 @@
-template-repository (1.0-1) unstable; urgency=medium
+linkify-it-py (2.0.3-1) unstable; urgency=medium
 
-  * Initial release
+  * New upstream version 2.0.3
+  * removed the useless patch
 
- -- Tsic404  Sat, 28 Jan 2023 13:46:49 +0800
+ -- Georges Khaznadar  Thu, 08 Feb 2024 20:10:59 +0100
+
+linkify-it-py (2.0.2-2) unstable; urgency=medium
+
+  * upload to unstable
+
+ -- Georges Khaznadar  Sun, 11 Jun 2023 12:58:02 +0200
+
+linkify-it-py (2.0.2-1) experimental; urgency=medium
+
+  * New upstream version 2.0.2
+  * bumped Standards-Version: 4.6.2
+  * indented the list in extended description
+  * made a debian patch for pyproject.toml, to avoid detecting debian/ as a
+    potential python module
+  * added build dependencies on pybuild-plugin-pyproject, python3-pytest
+  * added an explicit dependency on python3
+
+ -- Georges Khaznadar  Tue, 02 May 2023 11:26:28 +0200
+
+linkify-it-py (2.0.0-1) unstable; urgency=medium
+
+  * New upstream version 2.0.0
+
+ -- Georges Khaznadar  Wed, 11 May 2022 12:56:33 +0200
+
+linkify-it-py (1.0.3-2) unstable; urgency=medium
+
+  * sent the source package to let the debian farm build it
+
+ -- Georges Khaznadar  Tue, 12 Apr 2022 20:06:25 +0200
+
+linkify-it-py (1.0.3-1) unstable; urgency=medium
+
+  * came back to release 1.0.3-1 as the package was rejected once, which
+    removes the source package from the upload directory
+
+  * Fixed d/control: "Section: python3" -> "Section: python"
+    thank you, Thorsten Alteholz!
This release Closes: #997970 + (and Closes: #1008311) + + -- Georges Khaznadar Fri, 08 Apr 2022 18:03:54 +0200 diff --git a/debian/compat b/debian/compat deleted file mode 100644 index b4de394..0000000 --- a/debian/compat +++ /dev/null @@ -1 +0,0 @@ -11 diff --git a/debian/control b/debian/control index cb7c4a0..5b7264d 100644 --- a/debian/control +++ b/debian/control @@ -1,15 +1,34 @@ -Source: template-repository -Section: unknown +Source: linkify-it-py +Section: python Priority: optional -Maintainer: Tsic404 -Build-Depends: debhelper (>= 11) -Standards-Version: 4.1.3 -Homepage: https://github.com/deepin-community/template-repository -#Vcs-Browser: https://salsa.debian.org/debian/deepin-community-template-repository -#Vcs-Git: https://salsa.debian.org/debian/deepin-community-template-repository.git +Maintainer: Georges Khaznadar +Build-Depends: debhelper-compat (= 13), + dh-python, + python3-all, + python3-setuptools, + pybuild-plugin-pyproject, + python3-pytest, + python3-sphinx, + python3-docutils, + python3-uc-micro +Standards-Version: 4.6.2 +Homepage: https://github.com/tsutsu3/linkify-it-py +Vcs-Browser: https://salsa.debian.org/debian/linkify-it-py +Vcs-Git: https://salsa.debian.org/debian/linkify-it-py.git +Rules-Requires-Root: no + +Package: python3-linkify-it +Architecture: all +Depends: ${misc:Depends}, + ${python3:Depends}, + python3, + python3-uc-micro +Description: links recognition library with FULL unicode support + linkify-it-py is focused on high quality link patterns detection in plain text. + . + Why it's awesome: + . + - Full unicode support, with astral characters! + - International domains support. + - Allows rules extension & custom normalizers. -Package: template-repository -Architecture: any -Depends: ${shlibs:Depends}, ${misc:Depends} -Description: - diff --git a/debian/copyright b/debian/copyright index f5c805e..149e12b 100644 --- a/debian/copyright +++ b/debian/copyright @@ -1,22 +1,31 @@ Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ -Upstream-Name: template-repository -Source: https://github.com/deepin-community/template-repository +Upstream-Name: linkify-it-py +Upstream-Contact: Tsutsu3 +Source: Files: * -Copyright: 2023 Tsic404 -License: GPL-2+ - This package is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - . - This package is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. +Copyright: 2020-2021 Tsutsu3 +License: MIT + +Files: debian/* +Copyright: 2022 Georges Khaznadar +License: MIT + +License: MIT + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the "Software"), + to deal in the Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following conditions: . - You should have received a copy of the GNU General Public License - along with this program. If not, see + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. . 
- On Debian systems, the complete text of the GNU General
- Public License version 2 can be found in "/usr/share/common-licenses/GPL-2".
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/debian/install b/debian/install
new file mode 100644
index 0000000..5377fcf
--- /dev/null
+++ b/debian/install
@@ -0,0 +1 @@
+linkify_it usr/lib/python3/dist-packages/
diff --git a/debian/logo.svg b/debian/logo.svg
new file mode 100644
index 0000000..12e0404
--- /dev/null
+++ b/debian/logo.svg
@@ -0,0 +1,79 @@
+
+
+
+
+
+
+
+  image/svg+xml
+
+
+
+
+
+
+  //:
+
+
diff --git a/debian/rules b/debian/rules
index 2d33f6a..f6e7f17 100755
--- a/debian/rules
+++ b/debian/rules
@@ -1,4 +1,14 @@
 #!/usr/bin/make -f
+export PYBUILD_SYSTEM=pyproject
+
 %:
-	dh $@
+	dh $@ --with python3 --buildsystem=pybuild
+
+
+override_dh_auto_clean:
+	rm -rf *egg*
+	dh_auto_clean
+
+override_dh_auto_test:
+	echo "not running the tests"
diff --git a/debian/watch b/debian/watch
new file mode 100644
index 0000000..74c7956
--- /dev/null
+++ b/debian/watch
@@ -0,0 +1,5 @@
+version=4
+
+# PyPI
+https://pypi.debian.net/linkify-it-py/linkify-it-py-(.+)\.(?:zip|tgz|tbz|txz|(?:tar\.(?:gz|bz2|xz)))
+
diff --git a/linkify_it/__init__.py b/linkify_it/__init__.py
new file mode 100644
index 0000000..c80c7cd
--- /dev/null
+++ b/linkify_it/__init__.py
@@ -0,0 +1,4 @@
+from .main import LinkifyIt  # noqa: F401
+from .main import SchemaError  # noqa: F401
+
+__version__ = "2.0.3"
diff --git a/linkify_it/main.py b/linkify_it/main.py
new file mode 100644
index 0000000..6aa8903
--- /dev/null
+++ b/linkify_it/main.py
@@ -0,0 +1,642 @@
+import copy
+import re
+import types
+
+from .ucre import build_re
+
+# py>=37: re.Pattern, else: _sre.SRE_Pattern
+RE_TYPE = type(re.compile(r""))
+
+
+def _escape_re(string):
+    return re.sub(r"([.?*+^$[\]\\(){}|-])", r"\\\1", string)
+
+
+def _index_of(text, search_value):
+    try:
+        result = text.index(search_value)
+    except ValueError:
+        result = -1
+
+    return result
+
+
+class SchemaError(Exception):
+    """Linkify schema error"""
+
+    def __init__(self, name, val):
+        message = "(LinkifyIt) Invalid schema '{}': '{}'".format(name, val)
+        super().__init__(message)
+
+
+class Match:
+    """Match result.
+
+    Attributes:
+        schema (str): Prefix (protocol) for matched string.
+        index (int): First position of matched string.
+        last_index (int): Next position after matched string.
+        raw (str): Matched string.
+        text (str): Normalized text of matched string.
+        url (str): Normalized url of matched string.
+
+    Args:
+        linkifyit (:class:`linkify_it.main.LinkifyIt`): LinkifyIt object
+        shift (int): text search position
+    """
+
+    def __repr__(self):
+        return "{}.{}({!r})".format(
+            self.__class__.__module__, self.__class__.__name__, self.__dict__
+        )
+
+    def __init__(self, linkifyit, shift):
+        start = linkifyit._index
+        end = linkifyit._last_index
+        text = linkifyit._text_cache[start:end]
+
+        self.schema = linkifyit._schema.lower()
+        self.index = start + shift
+        self.last_index = end + shift
+        self.raw = text
+        self.text = text
+        self.url = text
+
+
+class LinkifyIt:
+    """Creates a new linkifier instance with optional additional schemas.
+
+    By default understands:
+
+    - ``http(s)://...``, ``ftp://...``, ``mailto:...`` & ``//...`` links
+    - "fuzzy" links and emails (example.com, foo@bar.com).
+
+    ``schemas`` is a dict where each key/value pair describes a protocol/rule:
+
+    - **key** - link prefix (usually, protocol name with ``:`` at the end, ``skype:``
+      for example). `linkify-it` makes sure that the prefix is not preceded with
+      an alphanumeric char. Only whitespaces and punctuation allowed.
+
+    - **value** - rule to check tail after link prefix
+
+        - *str* - just an alias to an existing rule
+        - *dict*
+
+            - *validate* - either a ``re.Pattern`` or ``re`` string (start with ``^``,
+              and don't include the link prefix itself), or a validator ``function``
+              which, given arguments *self*, *text* and *pos*, returns the length of a
+              match in *text* starting at index *pos*. *pos* is the index right after
+              the link prefix.
+            - *normalize* - optional function to normalize text & url of matched
+              result (for example, for @twitter mentions).
+
+    ``options`` is a dict:
+
+    - **fuzzy_link** - recognize URLs without ``http(s):`` prefix. Default ``True``.
+    - **fuzzy_ip** - allow IPs in fuzzy links above. Can conflict with some texts
+      like version numbers. Default ``False``.
+    - **fuzzy_email** - recognize emails without ``mailto:`` prefix. Default ``True``.
+    - **---** - set ``True`` to terminate link with ``---`` (if it's considered a long
+      dash).
+
+    Args:
+        schemas (dict): Optional. Additional schemas to validate (prefix/validator)
+        options (dict): { fuzzy_link | fuzzy_email | fuzzy_ip: True | False }.
+            Default: {"fuzzy_link": True, "fuzzy_email": True, "fuzzy_ip": False}.
+    """
+
+    def _validate_http(self, text, pos):
+        tail = text[pos:]
+        if not self.re.get("http"):
+            # compile lazily, because "host"-containing variables can change on
+            # tlds update.
+            self.re["http"] = (
+                "^\\/\\/"
+                + self.re["src_auth"]
+                + self.re["src_host_port_strict"]
+                + self.re["src_path"]
+            )
+
+        founds = re.search(self.re["http"], tail, flags=re.IGNORECASE)
+        if founds:
+            return len(founds.group())
+
+        return 0
+
+    def _validate_double_slash(self, text, pos):
+        tail = text[pos:]
+
+        if not self.re.get("not_http"):
+            # compile lazily, because "host"-containing variables can change on
+            # tlds update.
+ self.re["not_http"] = ( + "^" + + self.re["src_auth"] + + "(?:localhost|(?:(?:" + + self.re["src_domain"] + + ")\\.)+" + + self.re["src_domain_root"] + + ")" + + self.re["src_port"] + + self.re["src_host_terminator"] + + self.re["src_path"] + ) + + founds = re.search(self.re["not_http"], tail, flags=re.IGNORECASE) + if founds: + if pos >= 3 and text[pos - 3] == ":": + return 0 + + if pos >= 3 and text[pos - 3] == "/": + return 0 + + return len(founds.group(0)) + + return 0 + + def _validate_mailto(self, text, pos): + tail = text[pos:] + + if not self.re.get("mailto"): + self.re["mailto"] = ( + "^" + self.re["src_email_name"] + "@" + self.re["src_host_strict"] + ) + + founds = re.search(self.re["mailto"], tail, flags=re.IGNORECASE) + if founds: + return len(founds.group(0)) + + return 0 + + def _reset_scan_cache(self): + self._index = -1 + self._text_cache = "" + + def _create_validator(self, regex): + def func(text, pos): + tail = text[pos:] + if isinstance(regex, str): + founds = re.search(regex, tail, flags=re.IGNORECASE) + else: + # re.Pattern + founds = re.search(regex, tail) + + if founds: + return len(founds.group(0)) + + return 0 + + return func + + def _create_normalizer(self): + def func(match): + self.normalize(match) + + return func + + def _create_match(self, shift): + match = Match(self, shift) + self._compiled[match.schema]["normalize"](match) + return match + + def __init__(self, schemas=None, options=None): + self.default_options = { + "fuzzy_link": True, + "fuzzy_email": True, + "fuzzy_ip": False, + } + + self.default_schemas = { + "http:": {"validate": self._validate_http}, + "https:": "http:", + "ftp:": "http:", + "//": {"validate": self._validate_double_slash}, + "mailto:": {"validate": self._validate_mailto}, + } + + # RE pattern for 2-character tlds (autogenerated by ./support/tlds_2char_gen.js) + self.tlds_2ch_src_re = "a[cdefgilmnoqrstuwxz]|b[abdefghijmnorstvwyz]|c[acdfghiklmnoruvwxyz]|d[ejkmoz]|e[cegrstu]|f[ijkmor]|g[abdefghilmnpqrstuwy]|h[kmnrtu]|i[delmnoqrst]|j[emop]|k[eghimnprwyz]|l[abcikrstuvy]|m[acdeghklmnopqrstuvwxyz]|n[acefgilopruz]|om|p[aefghklmnrstwy]|qa|r[eosuw]|s[abcdeghijklmnortuvxyz]|t[cdfghjklmnortvwz]|u[agksyz]|v[aceginu]|w[fs]|y[et]|z[amw]" # noqa: E501 + + # DON'T try to make PRs with changes. Extend TLDs with LinkifyIt.tlds() instead + self.tlds_default = "biz|com|edu|gov|net|org|pro|web|xxx|aero|asia|coop|info|museum|name|shop|рф".split( # noqa: E501 + "|" + ) + + if options: + self.default_options.update(options) + self._opts = self.default_options + else: + self._opts = self.default_options + + # Cache last tested result. Used to skip repeating steps on next `match` call. + self._index = -1 + self._last_index = -1 # Next scan position + self._schema = "" + self._text_cache = "" + + if schemas: + self.default_schemas.update(schemas) + self._schemas = self.default_schemas + else: + self._schemas = self.default_schemas + + self._compiled = {} + + self._tlds = self.tlds_default + self._tlds_replaced = False + + self.re = {} + + self._compile() + + def _compile(self): + """Schemas compiler. Build regexps.""" + + # Load & clone RE patterns. 
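+        # build_re() (defined in ucre.py) returns a dict of regex source
+        # strings (src_auth, src_host_port_strict, ...); the tpl_* entries
+        # contain a %TLDS% placeholder that untpl() below substitutes with
+        # the combined list of root zones.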
+        self.re = build_re(self._opts)
+
+        # Define dynamic patterns
+        tlds = copy.deepcopy(self._tlds)
+
+        self._on_compile()
+
+        if not self._tlds_replaced:
+            tlds.append(self.tlds_2ch_src_re)
+        tlds.append(self.re["src_xn"])
+
+        self.re["src_tlds"] = "|".join(tlds)
+
+        def untpl(tpl):
+            return tpl.replace("%TLDS%", self.re["src_tlds"])
+
+        self.re["email_fuzzy"] = untpl(self.re["tpl_email_fuzzy"])
+
+        self.re["link_fuzzy"] = untpl(self.re["tpl_link_fuzzy"])
+
+        self.re["link_no_ip_fuzzy"] = untpl(self.re["tpl_link_no_ip_fuzzy"])
+
+        self.re["host_fuzzy_test"] = untpl(self.re["tpl_host_fuzzy_test"])
+
+        #
+        # Compile each schema
+        #
+
+        aliases = []
+
+        self._compiled = {}
+
+        for name, val in self._schemas.items():
+            # skip disabled methods
+            if val is None:
+                continue
+
+            compiled = {"validate": None, "link": None}
+
+            self._compiled[name] = compiled
+
+            if isinstance(val, dict):
+                if isinstance(val.get("validate"), RE_TYPE):
+                    compiled["validate"] = self._create_validator(val.get("validate"))
+                elif isinstance(val.get("validate"), str):
+                    compiled["validate"] = self._create_validator(val.get("validate"))
+                elif isinstance(val.get("validate"), types.MethodType):
+                    compiled["validate"] = val.get("validate")
+                # Add custom handler
+                elif isinstance(val.get("validate"), types.FunctionType):
+                    setattr(LinkifyIt, "func", val.get("validate"))
+                    compiled["validate"] = self.func
+                else:
+                    raise SchemaError(name, val)
+
+                if isinstance(val.get("normalize"), types.MethodType):
+                    compiled["normalize"] = val.get("normalize")
+                # Add custom handler
+                elif isinstance(val.get("normalize"), types.FunctionType):
+                    setattr(LinkifyIt, "func", val.get("normalize"))
+                    compiled["normalize"] = self.func
+                elif not val.get("normalize"):
+                    compiled["normalize"] = self._create_normalizer()
+                else:
+                    raise SchemaError(name, val)
+
+                continue
+
+            if isinstance(val, str):
+                aliases.append(name)
+                continue
+
+            raise SchemaError(name, val)
+
+        #
+        # Compile postponed aliases
+        #
+        for alias in aliases:
+            if not self._compiled.get(self._schemas.get(alias)):
+                continue
+
+            self._compiled[alias]["validate"] = self._compiled[self._schemas[alias]][
+                "validate"
+            ]
+            self._compiled[alias]["normalize"] = self._compiled[self._schemas[alias]][
+                "normalize"
+            ]
+
+        # Fake record for guessed links
+        self._compiled[""] = {"validate": None, "normalize": self._create_normalizer()}
+
+        #
+        # Build schema condition
+        #
+        slist = "|".join(
+            [
+                _escape_re(name)
+                for name, val in self._compiled.items()
+                if len(name) > 0 and val
+            ]
+        )
+
+        re_schema_test = (
+            "(^|(?!_)(?:[><\uff5c]|" + self.re["src_ZPCc"] + "))(" + slist + ")"
+        )
+
+        # (?!_) causes 1.5x slowdown
+        self.re["schema_test"] = re_schema_test
+        self.re["schema_search"] = re_schema_test
+        self.re["schema_at_start"] = "^" + self.re["schema_search"]
+
+        self.re["pretest"] = (
+            "(" + re_schema_test + ")|(" + self.re["host_fuzzy_test"] + ")|@"
+        )
+
+        # Cleanup
+
+        self._reset_scan_cache()
+
+    def add(self, schema, definition):
+        """Add new rule definition. (chainable)
+
+        See :class:`linkify_it.main.LinkifyIt` init description for details.
+        ``schema`` is a link prefix (``skype:``, for example), and ``definition``
+        is a ``str`` to alias to another schema, or a ``dict`` with ``validate`` and
+        optionally ``normalize`` definitions. To disable an existing rule, use
+        ``.add(schema, None)``.
+
+        Args:
+            schema (str): rule name (fixed pattern prefix)
+            definition (``str``, ``dict``, or ``None``): schema definition
+
+        Return:
+            :class:`linkify_it.main.LinkifyIt`
+        """
+        self._schemas[schema] = definition
+        self._compile()
+        return self
+
+    def set(self, options):
+        """Override default options. (chainable)
+
+        Omitted properties will not be changed.
+
+        Args:
+            options (dict): ``keys``: [``fuzzy_link`` | ``fuzzy_email`` | ``fuzzy_ip``].
+                ``values``: [``True`` | ``False``]
+
+        Return:
+            :class:`linkify_it.main.LinkifyIt`
+        """
+        self._opts.update(options)
+        return self
+
+    def test(self, text):
+        """Searches for a linkifiable pattern and returns ``True`` on success or
+        ``False`` on fail.
+
+        Args:
+            text (str): text to search
+
+        Returns:
+            bool: ``True`` if a linkable pattern was found, otherwise ``False``.
+        """
+        self._text_cache = text
+        self._index = -1
+
+        if not len(text):
+            return False
+
+        if re.search(self.re["schema_test"], text, flags=re.IGNORECASE):
+            regex = self.re["schema_search"]
+            last_index = 0
+            matched_iter = re.finditer(regex, text[last_index:], flags=re.IGNORECASE)
+            for matched in matched_iter:
+                last_index = matched.end(0)
+                m = (matched.group(), matched.groups()[0], matched.groups()[1])
+                length = self.test_schema_at(text, m[2], last_index)
+                if length:
+                    self._schema = m[2]
+                    self._index = matched.start(0) + len(m[1])
+                    self._last_index = matched.start(0) + len(m[0]) + length
+                    break
+
+        if self._opts.get("fuzzy_link") and self._compiled.get("http:"):
+            # guess schemaless links
+            matched_tld = re.search(
+                self.re["host_fuzzy_test"], text, flags=re.IGNORECASE
+            )
+            if matched_tld:
+                tld_pos = matched_tld.start(0)
+            else:
+                tld_pos = -1
+            if tld_pos >= 0:
+                # if tld is located after found link - no need to check fuzzy pattern
+                if self._index < 0 or tld_pos < self._index:
+                    if self._opts.get("fuzzy_ip"):
+                        pattern = self.re["link_fuzzy"]
+                    else:
+                        pattern = self.re["link_no_ip_fuzzy"]
+
+                    ml = re.search(pattern, text, flags=re.IGNORECASE)
+                    if ml:
+                        shift = ml.start(0) + len(ml.groups()[0])
+
+                        if self._index < 0 or shift < self._index:
+                            self._schema = ""
+                            self._index = shift
+                            self._last_index = ml.start(0) + len(ml.group())
+
+        if self._opts.get("fuzzy_email") and self._compiled.get("mailto:"):
+            # guess schemaless emails
+            at_pos = _index_of(text, "@")
+            if at_pos >= 0:
+                # We can't skip this check, because these cases are possible:
+                # 192.168.1.1@gmail.com, my.in@example.com
+                me = re.search(self.re["email_fuzzy"], text, flags=re.IGNORECASE)
+                if me:
+                    shift = me.start(0) + len(me.groups()[0])
+                    next_shift = me.start(0) + len(me.group())
+
+                    if (
+                        self._index < 0
+                        or shift < self._index
+                        or (shift == self._index and next_shift > self._last_index)
+                    ):
+                        self._schema = "mailto:"
+                        self._index = shift
+                        self._last_index = next_shift
+
+        return self._index >= 0
+
+    def pretest(self, text):
+        """Very quick check that can give false positives.
+
+        Returns ``True`` if a link MAY exist. Can be used for speed optimization,
+        when you need to check that a link does NOT exist.
+
+        Args:
+            text (str): text to search
+
+        Returns:
+            bool: ``True`` if a linkable pattern was found, otherwise ``False``.
+        """
+        if re.search(self.re["pretest"], text, flags=re.IGNORECASE):
+            return True
+
+        return False
+
+    def test_schema_at(self, text, name, position):
+        """Similar to :meth:`linkify_it.main.LinkifyIt.test` but checks only
+        specific protocol tail exactly at given position.
+
+        Args:
+            text (str): text to scan
+            name (str): rule (schema) name
+            position (int): text position to check at
+
+        Returns:
+            int: length of found pattern (0 on fail).
+        """
+        # If not supported schema check requested - terminate
+        if not self._compiled.get(name.lower()):
+            return 0
+        return self._compiled.get(name.lower()).get("validate")(text, position)
+
+    def match(self, text):
+        """Returns ``list`` of found link descriptions or ``None`` on fail.
+
+        We strongly recommend using :meth:`linkify_it.main.LinkifyIt.test`
+        first, for best speed.
+
+        Args:
+            text (str): text to search
+
+        Returns:
+            ``list`` or ``None``: Result match description:
+                * **schema** - link schema, can be empty for fuzzy links, or ``//``
+                  for protocol-neutral links.
+                * **index** - offset of matched text
+                * **last_index** - index of next char after match end
+                * **raw** - matched text
+                * **text** - normalized text
+                * **url** - link, generated from matched text
+        """
+        shift = 0
+        result = []
+
+        # try to take previous element from cache, if .test() called before
+        if self._index >= 0 and self._text_cache == text:
+            result.append(self._create_match(shift))
+            shift = self._last_index
+
+        # Cut head if cache was used
+        tail = text[shift:] if shift else text
+
+        # Scan string until end reached
+        while self.test(tail):
+            result.append(self._create_match(shift))
+
+            tail = tail[self._last_index :]
+            shift += self._last_index
+
+        if len(result):
+            return result
+
+        return None
+
+    def match_at_start(self, text):
+        """Returns fully-formed (not fuzzy) link if it starts at the beginning
+        of the string, and ``None`` otherwise.
+
+        Args:
+            text (str): text to search
+
+        Returns:
+            ``Match`` or ``None``
+        """
+        # Reset scan cache
+        self._text_cache = text
+        self._index = -1
+
+        if not len(text):
+            return None
+
+        founds = re.search(self.re["schema_at_start"], text, flags=re.IGNORECASE)
+        if not founds:
+            return None
+
+        m = (founds.group(), founds.groups()[0], founds.groups()[1])
+        length = self.test_schema_at(text, m[2], len(m[0]))
+        if not length:
+            return None
+
+        self._schema = m[2]
+        self._index = founds.start(0) + len(m[1])
+        self._last_index = founds.start(0) + len(m[0]) + length
+
+        return self._create_match(0)
+
+    def tlds(self, list_tlds, keep_old=False):
+        """Load (or merge) new tlds list. (chainable)
+
+        These are used for fuzzy links (without prefix) to avoid false positives.
+        By default this algorithm is used:
+
+        * hostnames with any 2-letter root zones are ok.
+        * biz|com|edu|gov|net|org|pro|web|xxx|aero|asia|coop|info|museum|name|shop|рф
+          are ok.
+        * encoded (`xn--...`) root zones are ok.
+
+        If the list is replaced, then exact match for 2-char root zones will be
+        checked.
+
+        Args:
+            list_tlds (list or str): ``list of tlds`` or ``tlds string``
+            keep_old (bool): merge with the current list if ``True`` (``False`` by
+                default)
+        """
+        _list = list_tlds if isinstance(list_tlds, list) else [list_tlds]
+
+        if not keep_old:
+            self._tlds = _list
+            self._tlds_replaced = True
+            self._compile()
+            return self
+
+        self._tlds.extend(_list)
+        self._tlds = sorted(list(set(self._tlds)), reverse=True)
+
+        self._compile()
+        return self
+
+    def normalize(self, match):
+        """Default normalizer (if schema does not define its own).
+ + Args: + match (:class:`linkify_it.main.Match`): Match result + """ + if not match.schema: + match.url = "http://" + match.url + + if match.schema == "mailto:" and not re.search( + "^mailto:", match.url, flags=re.IGNORECASE + ): + match.url = "mailto:" + match.url + + def _on_compile(self): + """Override to modify basic RegExp-s.""" + pass diff --git a/linkify_it/tlds.py b/linkify_it/tlds.py new file mode 100644 index 0000000..7f8053d --- /dev/null +++ b/linkify_it/tlds.py @@ -0,0 +1,1517 @@ +"""TLDS + +Version 2020110600, Last Updated Fri Nov 6 07:07:02 2020 UTC + +References: + http://data.iana.org/TLD/tlds-alpha-by-domain.txt +""" +TLDS = [ + "AAA", + "AARP", + "ABARTH", + "ABB", + "ABBOTT", + "ABBVIE", + "ABC", + "ABLE", + "ABOGADO", + "ABUDHABI", + "AC", + "ACADEMY", + "ACCENTURE", + "ACCOUNTANT", + "ACCOUNTANTS", + "ACO", + "ACTOR", + "AD", + "ADAC", + "ADS", + "ADULT", + "AE", + "AEG", + "AERO", + "AETNA", + "AF", + "AFAMILYCOMPANY", + "AFL", + "AFRICA", + "AG", + "AGAKHAN", + "AGENCY", + "AI", + "AIG", + "AIRBUS", + "AIRFORCE", + "AIRTEL", + "AKDN", + "AL", + "ALFAROMEO", + "ALIBABA", + "ALIPAY", + "ALLFINANZ", + "ALLSTATE", + "ALLY", + "ALSACE", + "ALSTOM", + "AM", + "AMAZON", + "AMERICANEXPRESS", + "AMERICANFAMILY", + "AMEX", + "AMFAM", + "AMICA", + "AMSTERDAM", + "ANALYTICS", + "ANDROID", + "ANQUAN", + "ANZ", + "AO", + "AOL", + "APARTMENTS", + "APP", + "APPLE", + "AQ", + "AQUARELLE", + "AR", + "ARAB", + "ARAMCO", + "ARCHI", + "ARMY", + "ARPA", + "ART", + "ARTE", + "AS", + "ASDA", + "ASIA", + "ASSOCIATES", + "AT", + "ATHLETA", + "ATTORNEY", + "AU", + "AUCTION", + "AUDI", + "AUDIBLE", + "AUDIO", + "AUSPOST", + "AUTHOR", + "AUTO", + "AUTOS", + "AVIANCA", + "AW", + "AWS", + "AX", + "AXA", + "AZ", + "AZURE", + "BA", + "BABY", + "BAIDU", + "BANAMEX", + "BANANAREPUBLIC", + "BAND", + "BANK", + "BAR", + "BARCELONA", + "BARCLAYCARD", + "BARCLAYS", + "BAREFOOT", + "BARGAINS", + "BASEBALL", + "BASKETBALL", + "BAUHAUS", + "BAYERN", + "BB", + "BBC", + "BBT", + "BBVA", + "BCG", + "BCN", + "BD", + "BE", + "BEATS", + "BEAUTY", + "BEER", + "BENTLEY", + "BERLIN", + "BEST", + "BESTBUY", + "BET", + "BF", + "BG", + "BH", + "BHARTI", + "BI", + "BIBLE", + "BID", + "BIKE", + "BING", + "BINGO", + "BIO", + "BIZ", + "BJ", + "BLACK", + "BLACKFRIDAY", + "BLOCKBUSTER", + "BLOG", + "BLOOMBERG", + "BLUE", + "BM", + "BMS", + "BMW", + "BN", + "BNPPARIBAS", + "BO", + "BOATS", + "BOEHRINGER", + "BOFA", + "BOM", + "BOND", + "BOO", + "BOOK", + "BOOKING", + "BOSCH", + "BOSTIK", + "BOSTON", + "BOT", + "BOUTIQUE", + "BOX", + "BR", + "BRADESCO", + "BRIDGESTONE", + "BROADWAY", + "BROKER", + "BROTHER", + "BRUSSELS", + "BS", + "BT", + "BUDAPEST", + "BUGATTI", + "BUILD", + "BUILDERS", + "BUSINESS", + "BUY", + "BUZZ", + "BV", + "BW", + "BY", + "BZ", + "BZH", + "CA", + "CAB", + "CAFE", + "CAL", + "CALL", + "CALVINKLEIN", + "CAM", + "CAMERA", + "CAMP", + "CANCERRESEARCH", + "CANON", + "CAPETOWN", + "CAPITAL", + "CAPITALONE", + "CAR", + "CARAVAN", + "CARDS", + "CARE", + "CAREER", + "CAREERS", + "CARS", + "CASA", + "CASE", + "CASEIH", + "CASH", + "CASINO", + "CAT", + "CATERING", + "CATHOLIC", + "CBA", + "CBN", + "CBRE", + "CBS", + "CC", + "CD", + "CEB", + "CENTER", + "CEO", + "CERN", + "CF", + "CFA", + "CFD", + "CG", + "CH", + "CHANEL", + "CHANNEL", + "CHARITY", + "CHASE", + "CHAT", + "CHEAP", + "CHINTAI", + "CHRISTMAS", + "CHROME", + "CHURCH", + "CI", + "CIPRIANI", + "CIRCLE", + "CISCO", + "CITADEL", + "CITI", + "CITIC", + "CITY", + "CITYEATS", + "CK", + "CL", + "CLAIMS", + "CLEANING", + "CLICK", + "CLINIC", + "CLINIQUE", + 
"CLOTHING", + "CLOUD", + "CLUB", + "CLUBMED", + "CM", + "CN", + "CO", + "COACH", + "CODES", + "COFFEE", + "COLLEGE", + "COLOGNE", + "COM", + "COMCAST", + "COMMBANK", + "COMMUNITY", + "COMPANY", + "COMPARE", + "COMPUTER", + "COMSEC", + "CONDOS", + "CONSTRUCTION", + "CONSULTING", + "CONTACT", + "CONTRACTORS", + "COOKING", + "COOKINGCHANNEL", + "COOL", + "COOP", + "CORSICA", + "COUNTRY", + "COUPON", + "COUPONS", + "COURSES", + "CPA", + "CR", + "CREDIT", + "CREDITCARD", + "CREDITUNION", + "CRICKET", + "CROWN", + "CRS", + "CRUISE", + "CRUISES", + "CSC", + "CU", + "CUISINELLA", + "CV", + "CW", + "CX", + "CY", + "CYMRU", + "CYOU", + "CZ", + "DABUR", + "DAD", + "DANCE", + "DATA", + "DATE", + "DATING", + "DATSUN", + "DAY", + "DCLK", + "DDS", + "DE", + "DEAL", + "DEALER", + "DEALS", + "DEGREE", + "DELIVERY", + "DELL", + "DELOITTE", + "DELTA", + "DEMOCRAT", + "DENTAL", + "DENTIST", + "DESI", + "DESIGN", + "DEV", + "DHL", + "DIAMONDS", + "DIET", + "DIGITAL", + "DIRECT", + "DIRECTORY", + "DISCOUNT", + "DISCOVER", + "DISH", + "DIY", + "DJ", + "DK", + "DM", + "DNP", + "DO", + "DOCS", + "DOCTOR", + "DOG", + "DOMAINS", + "DOT", + "DOWNLOAD", + "DRIVE", + "DTV", + "DUBAI", + "DUCK", + "DUNLOP", + "DUPONT", + "DURBAN", + "DVAG", + "DVR", + "DZ", + "EARTH", + "EAT", + "EC", + "ECO", + "EDEKA", + "EDU", + "EDUCATION", + "EE", + "EG", + "EMAIL", + "EMERCK", + "ENERGY", + "ENGINEER", + "ENGINEERING", + "ENTERPRISES", + "EPSON", + "EQUIPMENT", + "ER", + "ERICSSON", + "ERNI", + "ES", + "ESQ", + "ESTATE", + "ET", + "ETISALAT", + "EU", + "EUROVISION", + "EUS", + "EVENTS", + "EXCHANGE", + "EXPERT", + "EXPOSED", + "EXPRESS", + "EXTRASPACE", + "FAGE", + "FAIL", + "FAIRWINDS", + "FAITH", + "FAMILY", + "FAN", + "FANS", + "FARM", + "FARMERS", + "FASHION", + "FAST", + "FEDEX", + "FEEDBACK", + "FERRARI", + "FERRERO", + "FI", + "FIAT", + "FIDELITY", + "FIDO", + "FILM", + "FINAL", + "FINANCE", + "FINANCIAL", + "FIRE", + "FIRESTONE", + "FIRMDALE", + "FISH", + "FISHING", + "FIT", + "FITNESS", + "FJ", + "FK", + "FLICKR", + "FLIGHTS", + "FLIR", + "FLORIST", + "FLOWERS", + "FLY", + "FM", + "FO", + "FOO", + "FOOD", + "FOODNETWORK", + "FOOTBALL", + "FORD", + "FOREX", + "FORSALE", + "FORUM", + "FOUNDATION", + "FOX", + "FR", + "FREE", + "FRESENIUS", + "FRL", + "FROGANS", + "FRONTDOOR", + "FRONTIER", + "FTR", + "FUJITSU", + "FUJIXEROX", + "FUN", + "FUND", + "FURNITURE", + "FUTBOL", + "FYI", + "GA", + "GAL", + "GALLERY", + "GALLO", + "GALLUP", + "GAME", + "GAMES", + "GAP", + "GARDEN", + "GAY", + "GB", + "GBIZ", + "GD", + "GDN", + "GE", + "GEA", + "GENT", + "GENTING", + "GEORGE", + "GF", + "GG", + "GGEE", + "GH", + "GI", + "GIFT", + "GIFTS", + "GIVES", + "GIVING", + "GL", + "GLADE", + "GLASS", + "GLE", + "GLOBAL", + "GLOBO", + "GM", + "GMAIL", + "GMBH", + "GMO", + "GMX", + "GN", + "GODADDY", + "GOLD", + "GOLDPOINT", + "GOLF", + "GOO", + "GOODYEAR", + "GOOG", + "GOOGLE", + "GOP", + "GOT", + "GOV", + "GP", + "GQ", + "GR", + "GRAINGER", + "GRAPHICS", + "GRATIS", + "GREEN", + "GRIPE", + "GROCERY", + "GROUP", + "GS", + "GT", + "GU", + "GUARDIAN", + "GUCCI", + "GUGE", + "GUIDE", + "GUITARS", + "GURU", + "GW", + "GY", + "HAIR", + "HAMBURG", + "HANGOUT", + "HAUS", + "HBO", + "HDFC", + "HDFCBANK", + "HEALTH", + "HEALTHCARE", + "HELP", + "HELSINKI", + "HERE", + "HERMES", + "HGTV", + "HIPHOP", + "HISAMITSU", + "HITACHI", + "HIV", + "HK", + "HKT", + "HM", + "HN", + "HOCKEY", + "HOLDINGS", + "HOLIDAY", + "HOMEDEPOT", + "HOMEGOODS", + "HOMES", + "HOMESENSE", + "HONDA", + "HORSE", + "HOSPITAL", + "HOST", + "HOSTING", + "HOT", + "HOTELES", + "HOTELS", + 
"HOTMAIL", + "HOUSE", + "HOW", + "HR", + "HSBC", + "HT", + "HU", + "HUGHES", + "HYATT", + "HYUNDAI", + "IBM", + "ICBC", + "ICE", + "ICU", + "ID", + "IE", + "IEEE", + "IFM", + "IKANO", + "IL", + "IM", + "IMAMAT", + "IMDB", + "IMMO", + "IMMOBILIEN", + "IN", + "INC", + "INDUSTRIES", + "INFINITI", + "INFO", + "ING", + "INK", + "INSTITUTE", + "INSURANCE", + "INSURE", + "INT", + "INTERNATIONAL", + "INTUIT", + "INVESTMENTS", + "IO", + "IPIRANGA", + "IQ", + "IR", + "IRISH", + "IS", + "ISMAILI", + "IST", + "ISTANBUL", + "IT", + "ITAU", + "ITV", + "IVECO", + "JAGUAR", + "JAVA", + "JCB", + "JCP", + "JE", + "JEEP", + "JETZT", + "JEWELRY", + "JIO", + "JLL", + "JM", + "JMP", + "JNJ", + "JO", + "JOBS", + "JOBURG", + "JOT", + "JOY", + "JP", + "JPMORGAN", + "JPRS", + "JUEGOS", + "JUNIPER", + "KAUFEN", + "KDDI", + "KE", + "KERRYHOTELS", + "KERRYLOGISTICS", + "KERRYPROPERTIES", + "KFH", + "KG", + "KH", + "KI", + "KIA", + "KIM", + "KINDER", + "KINDLE", + "KITCHEN", + "KIWI", + "KM", + "KN", + "KOELN", + "KOMATSU", + "KOSHER", + "KP", + "KPMG", + "KPN", + "KR", + "KRD", + "KRED", + "KUOKGROUP", + "KW", + "KY", + "KYOTO", + "KZ", + "LA", + "LACAIXA", + "LAMBORGHINI", + "LAMER", + "LANCASTER", + "LANCIA", + "LAND", + "LANDROVER", + "LANXESS", + "LASALLE", + "LAT", + "LATINO", + "LATROBE", + "LAW", + "LAWYER", + "LB", + "LC", + "LDS", + "LEASE", + "LECLERC", + "LEFRAK", + "LEGAL", + "LEGO", + "LEXUS", + "LGBT", + "LI", + "LIDL", + "LIFE", + "LIFEINSURANCE", + "LIFESTYLE", + "LIGHTING", + "LIKE", + "LILLY", + "LIMITED", + "LIMO", + "LINCOLN", + "LINDE", + "LINK", + "LIPSY", + "LIVE", + "LIVING", + "LIXIL", + "LK", + "LLC", + "LLP", + "LOAN", + "LOANS", + "LOCKER", + "LOCUS", + "LOFT", + "LOL", + "LONDON", + "LOTTE", + "LOTTO", + "LOVE", + "LPL", + "LPLFINANCIAL", + "LR", + "LS", + "LT", + "LTD", + "LTDA", + "LU", + "LUNDBECK", + "LUPIN", + "LUXE", + "LUXURY", + "LV", + "LY", + "MA", + "MACYS", + "MADRID", + "MAIF", + "MAISON", + "MAKEUP", + "MAN", + "MANAGEMENT", + "MANGO", + "MAP", + "MARKET", + "MARKETING", + "MARKETS", + "MARRIOTT", + "MARSHALLS", + "MASERATI", + "MATTEL", + "MBA", + "MC", + "MCKINSEY", + "MD", + "ME", + "MED", + "MEDIA", + "MEET", + "MELBOURNE", + "MEME", + "MEMORIAL", + "MEN", + "MENU", + "MERCKMSD", + "MG", + "MH", + "MIAMI", + "MICROSOFT", + "MIL", + "MINI", + "MINT", + "MIT", + "MITSUBISHI", + "MK", + "ML", + "MLB", + "MLS", + "MM", + "MMA", + "MN", + "MO", + "MOBI", + "MOBILE", + "MODA", + "MOE", + "MOI", + "MOM", + "MONASH", + "MONEY", + "MONSTER", + "MORMON", + "MORTGAGE", + "MOSCOW", + "MOTO", + "MOTORCYCLES", + "MOV", + "MOVIE", + "MP", + "MQ", + "MR", + "MS", + "MSD", + "MT", + "MTN", + "MTR", + "MU", + "MUSEUM", + "MUTUAL", + "MV", + "MW", + "MX", + "MY", + "MZ", + "NA", + "NAB", + "NAGOYA", + "NAME", + "NATIONWIDE", + "NATURA", + "NAVY", + "NBA", + "NC", + "NE", + "NEC", + "NET", + "NETBANK", + "NETFLIX", + "NETWORK", + "NEUSTAR", + "NEW", + "NEWHOLLAND", + "NEWS", + "NEXT", + "NEXTDIRECT", + "NEXUS", + "NF", + "NFL", + "NG", + "NGO", + "NHK", + "NI", + "NICO", + "NIKE", + "NIKON", + "NINJA", + "NISSAN", + "NISSAY", + "NL", + "NO", + "NOKIA", + "NORTHWESTERNMUTUAL", + "NORTON", + "NOW", + "NOWRUZ", + "NOWTV", + "NP", + "NR", + "NRA", + "NRW", + "NTT", + "NU", + "NYC", + "NZ", + "OBI", + "OBSERVER", + "OFF", + "OFFICE", + "OKINAWA", + "OLAYAN", + "OLAYANGROUP", + "OLDNAVY", + "OLLO", + "OM", + "OMEGA", + "ONE", + "ONG", + "ONL", + "ONLINE", + "ONYOURSIDE", + "OOO", + "OPEN", + "ORACLE", + "ORANGE", + "ORG", + "ORGANIC", + "ORIGINS", + "OSAKA", + "OTSUKA", + "OTT", + "OVH", + "PA", + 
"PAGE", + "PANASONIC", + "PARIS", + "PARS", + "PARTNERS", + "PARTS", + "PARTY", + "PASSAGENS", + "PAY", + "PCCW", + "PE", + "PET", + "PF", + "PFIZER", + "PG", + "PH", + "PHARMACY", + "PHD", + "PHILIPS", + "PHONE", + "PHOTO", + "PHOTOGRAPHY", + "PHOTOS", + "PHYSIO", + "PICS", + "PICTET", + "PICTURES", + "PID", + "PIN", + "PING", + "PINK", + "PIONEER", + "PIZZA", + "PK", + "PL", + "PLACE", + "PLAY", + "PLAYSTATION", + "PLUMBING", + "PLUS", + "PM", + "PN", + "PNC", + "POHL", + "POKER", + "POLITIE", + "PORN", + "POST", + "PR", + "PRAMERICA", + "PRAXI", + "PRESS", + "PRIME", + "PRO", + "PROD", + "PRODUCTIONS", + "PROF", + "PROGRESSIVE", + "PROMO", + "PROPERTIES", + "PROPERTY", + "PROTECTION", + "PRU", + "PRUDENTIAL", + "PS", + "PT", + "PUB", + "PW", + "PWC", + "PY", + "QA", + "QPON", + "QUEBEC", + "QUEST", + "QVC", + "RACING", + "RADIO", + "RAID", + "RE", + "READ", + "REALESTATE", + "REALTOR", + "REALTY", + "RECIPES", + "RED", + "REDSTONE", + "REDUMBRELLA", + "REHAB", + "REISE", + "REISEN", + "REIT", + "RELIANCE", + "REN", + "RENT", + "RENTALS", + "REPAIR", + "REPORT", + "REPUBLICAN", + "REST", + "RESTAURANT", + "REVIEW", + "REVIEWS", + "REXROTH", + "RICH", + "RICHARDLI", + "RICOH", + "RIL", + "RIO", + "RIP", + "RMIT", + "RO", + "ROCHER", + "ROCKS", + "RODEO", + "ROGERS", + "ROOM", + "RS", + "RSVP", + "RU", + "RUGBY", + "RUHR", + "RUN", + "RW", + "RWE", + "RYUKYU", + "SA", + "SAARLAND", + "SAFE", + "SAFETY", + "SAKURA", + "SALE", + "SALON", + "SAMSCLUB", + "SAMSUNG", + "SANDVIK", + "SANDVIKCOROMANT", + "SANOFI", + "SAP", + "SARL", + "SAS", + "SAVE", + "SAXO", + "SB", + "SBI", + "SBS", + "SC", + "SCA", + "SCB", + "SCHAEFFLER", + "SCHMIDT", + "SCHOLARSHIPS", + "SCHOOL", + "SCHULE", + "SCHWARZ", + "SCIENCE", + "SCJOHNSON", + "SCOT", + "SD", + "SE", + "SEARCH", + "SEAT", + "SECURE", + "SECURITY", + "SEEK", + "SELECT", + "SENER", + "SERVICES", + "SES", + "SEVEN", + "SEW", + "SEX", + "SEXY", + "SFR", + "SG", + "SH", + "SHANGRILA", + "SHARP", + "SHAW", + "SHELL", + "SHIA", + "SHIKSHA", + "SHOES", + "SHOP", + "SHOPPING", + "SHOUJI", + "SHOW", + "SHOWTIME", + "SHRIRAM", + "SI", + "SILK", + "SINA", + "SINGLES", + "SITE", + "SJ", + "SK", + "SKI", + "SKIN", + "SKY", + "SKYPE", + "SL", + "SLING", + "SM", + "SMART", + "SMILE", + "SN", + "SNCF", + "SO", + "SOCCER", + "SOCIAL", + "SOFTBANK", + "SOFTWARE", + "SOHU", + "SOLAR", + "SOLUTIONS", + "SONG", + "SONY", + "SOY", + "SPA", + "SPACE", + "SPORT", + "SPOT", + "SPREADBETTING", + "SR", + "SRL", + "SS", + "ST", + "STADA", + "STAPLES", + "STAR", + "STATEBANK", + "STATEFARM", + "STC", + "STCGROUP", + "STOCKHOLM", + "STORAGE", + "STORE", + "STREAM", + "STUDIO", + "STUDY", + "STYLE", + "SU", + "SUCKS", + "SUPPLIES", + "SUPPLY", + "SUPPORT", + "SURF", + "SURGERY", + "SUZUKI", + "SV", + "SWATCH", + "SWIFTCOVER", + "SWISS", + "SX", + "SY", + "SYDNEY", + "SYSTEMS", + "SZ", + "TAB", + "TAIPEI", + "TALK", + "TAOBAO", + "TARGET", + "TATAMOTORS", + "TATAR", + "TATTOO", + "TAX", + "TAXI", + "TC", + "TCI", + "TD", + "TDK", + "TEAM", + "TECH", + "TECHNOLOGY", + "TEL", + "TEMASEK", + "TENNIS", + "TEVA", + "TF", + "TG", + "TH", + "THD", + "THEATER", + "THEATRE", + "TIAA", + "TICKETS", + "TIENDA", + "TIFFANY", + "TIPS", + "TIRES", + "TIROL", + "TJ", + "TJMAXX", + "TJX", + "TK", + "TKMAXX", + "TL", + "TM", + "TMALL", + "TN", + "TO", + "TODAY", + "TOKYO", + "TOOLS", + "TOP", + "TORAY", + "TOSHIBA", + "TOTAL", + "TOURS", + "TOWN", + "TOYOTA", + "TOYS", + "TR", + "TRADE", + "TRADING", + "TRAINING", + "TRAVEL", + "TRAVELCHANNEL", + "TRAVELERS", + "TRAVELERSINSURANCE", + "TRUST", + 
"TRV", + "TT", + "TUBE", + "TUI", + "TUNES", + "TUSHU", + "TV", + "TVS", + "TW", + "TZ", + "UA", + "UBANK", + "UBS", + "UG", + "UK", + "UNICOM", + "UNIVERSITY", + "UNO", + "UOL", + "UPS", + "US", + "UY", + "UZ", + "VA", + "VACATIONS", + "VANA", + "VANGUARD", + "VC", + "VE", + "VEGAS", + "VENTURES", + "VERISIGN", + "VERSICHERUNG", + "VET", + "VG", + "VI", + "VIAJES", + "VIDEO", + "VIG", + "VIKING", + "VILLAS", + "VIN", + "VIP", + "VIRGIN", + "VISA", + "VISION", + "VIVA", + "VIVO", + "VLAANDEREN", + "VN", + "VODKA", + "VOLKSWAGEN", + "VOLVO", + "VOTE", + "VOTING", + "VOTO", + "VOYAGE", + "VU", + "VUELOS", + "WALES", + "WALMART", + "WALTER", + "WANG", + "WANGGOU", + "WATCH", + "WATCHES", + "WEATHER", + "WEATHERCHANNEL", + "WEBCAM", + "WEBER", + "WEBSITE", + "WED", + "WEDDING", + "WEIBO", + "WEIR", + "WF", + "WHOSWHO", + "WIEN", + "WIKI", + "WILLIAMHILL", + "WIN", + "WINDOWS", + "WINE", + "WINNERS", + "WME", + "WOLTERSKLUWER", + "WOODSIDE", + "WORK", + "WORKS", + "WORLD", + "WOW", + "WS", + "WTC", + "WTF", + "XBOX", + "XEROX", + "XFINITY", + "XIHUAN", + "XIN", + "XN--11B4C3D", + "XN--1CK2E1B", + "XN--1QQW23A", + "XN--2SCRJ9C", + "XN--30RR7Y", + "XN--3BST00M", + "XN--3DS443G", + "XN--3E0B707E", + "XN--3HCRJ9C", + "XN--3OQ18VL8PN36A", + "XN--3PXU8K", + "XN--42C2D9A", + "XN--45BR5CYL", + "XN--45BRJ9C", + "XN--45Q11C", + "XN--4GBRIM", + "XN--54B7FTA0CC", + "XN--55QW42G", + "XN--55QX5D", + "XN--5SU34J936BGSG", + "XN--5TZM5G", + "XN--6FRZ82G", + "XN--6QQ986B3XL", + "XN--80ADXHKS", + "XN--80AO21A", + "XN--80AQECDR1A", + "XN--80ASEHDB", + "XN--80ASWG", + "XN--8Y0A063A", + "XN--90A3AC", + "XN--90AE", + "XN--90AIS", + "XN--9DBQ2A", + "XN--9ET52U", + "XN--9KRT00A", + "XN--B4W605FERD", + "XN--BCK1B9A5DRE4C", + "XN--C1AVG", + "XN--C2BR7G", + "XN--CCK2B3B", + "XN--CCKWCXETD", + "XN--CG4BKI", + "XN--CLCHC0EA0B2G2A9GCD", + "XN--CZR694B", + "XN--CZRS0T", + "XN--CZRU2D", + "XN--D1ACJ3B", + "XN--D1ALF", + "XN--E1A4C", + "XN--ECKVDTC9D", + "XN--EFVY88H", + "XN--FCT429K", + "XN--FHBEI", + "XN--FIQ228C5HS", + "XN--FIQ64B", + "XN--FIQS8S", + "XN--FIQZ9S", + "XN--FJQ720A", + "XN--FLW351E", + "XN--FPCRJ9C3D", + "XN--FZC2C9E2C", + "XN--FZYS8D69UVGM", + "XN--G2XX48C", + "XN--GCKR3F0F", + "XN--GECRJ9C", + "XN--GK3AT1E", + "XN--H2BREG3EVE", + "XN--H2BRJ9C", + "XN--H2BRJ9C8C", + "XN--HXT814E", + "XN--I1B6B1A6A2E", + "XN--IMR513N", + "XN--IO0A7I", + "XN--J1AEF", + "XN--J1AMH", + "XN--J6W193G", + "XN--JLQ480N2RG", + "XN--JLQ61U9W7B", + "XN--JVR189M", + "XN--KCRX77D1X4A", + "XN--KPRW13D", + "XN--KPRY57D", + "XN--KPUT3I", + "XN--L1ACC", + "XN--LGBBAT1AD8J", + "XN--MGB9AWBF", + "XN--MGBA3A3EJT", + "XN--MGBA3A4F16A", + "XN--MGBA7C0BBN0A", + "XN--MGBAAKC7DVF", + "XN--MGBAAM7A8H", + "XN--MGBAB2BD", + "XN--MGBAH1A3HJKRD", + "XN--MGBAI9AZGQP6J", + "XN--MGBAYH7GPA", + "XN--MGBBH1A", + "XN--MGBBH1A71E", + "XN--MGBC0A9AZCG", + "XN--MGBCA7DZDO", + "XN--MGBCPQ6GPA1A", + "XN--MGBERP4A5D4AR", + "XN--MGBGU82A", + "XN--MGBI4ECEXP", + "XN--MGBPL2FH", + "XN--MGBT3DHD", + "XN--MGBTX2B", + "XN--MGBX4CD0AB", + "XN--MIX891F", + "XN--MK1BU44C", + "XN--MXTQ1M", + "XN--NGBC5AZD", + "XN--NGBE9E0A", + "XN--NGBRX", + "XN--NODE", + "XN--NQV7F", + "XN--NQV7FS00EMA", + "XN--NYQY26A", + "XN--O3CW4H", + "XN--OGBPF8FL", + "XN--OTU796D", + "XN--P1ACF", + "XN--P1AI", + "XN--PGBS0DH", + "XN--PSSY2U", + "XN--Q7CE6A", + "XN--Q9JYB4C", + "XN--QCKA1PMC", + "XN--QXA6A", + "XN--QXAM", + "XN--RHQV96G", + "XN--ROVU88B", + "XN--RVC1E0AM3E", + "XN--S9BRJ9C", + "XN--SES554G", + "XN--T60B56A", + "XN--TCKWE", + "XN--TIQ49XQYJ", + "XN--UNUP4Y", + "XN--VERMGENSBERATER-CTB", + 
"XN--VERMGENSBERATUNG-PWB", + "XN--VHQUV", + "XN--VUQ861B", + "XN--W4R85EL8FHU5DNRA", + "XN--W4RS40L", + "XN--WGBH1C", + "XN--WGBL6A", + "XN--XHQ521B", + "XN--XKC2AL3HYE2A", + "XN--XKC2DL3A5EE0H", + "XN--Y9A3AQ", + "XN--YFRO4I67O", + "XN--YGBI2AMMX", + "XN--ZFR164B", + "XXX", + "XYZ", + "YACHTS", + "YAHOO", + "YAMAXUN", + "YANDEX", + "YE", + "YODOBASHI", + "YOGA", + "YOKOHAMA", + "YOU", + "YOUTUBE", + "YT", + "YUN", + "ZA", + "ZAPPOS", + "ZARA", + "ZERO", + "ZIP", + "ZM", + "ZONE", + "ZUERICH", + "ZW", +] diff --git a/linkify_it/ucre.py b/linkify_it/ucre.py new file mode 100644 index 0000000..063def5 --- /dev/null +++ b/linkify_it/ucre.py @@ -0,0 +1,264 @@ +from uc_micro.categories import Cc, Cf, P, Z +from uc_micro.properties import Any + +SRC_ANY = Any.REGEX +SRC_CC = Cc.REGEX +SRC_CF = Cf.REGEX +SRC_P = P.REGEX +SRC_Z = Z.REGEX + +# \p{\Z\P\Cc\CF} (white spaces + control + format + punctuation) +SRC_ZPCC = "|".join([SRC_Z, SRC_P, SRC_CC]) + +# \p{\Z\Cc} (white spaces + control) +SRC_ZCC = "|".join([SRC_Z, SRC_CC]) + +# Experimental. List of chars, completely prohibited in links +# because can separate it from other part of text +TEXT_SEPARATORS = "[><\uff5c]" + +# All possible word characters (everything without punctuation, spaces & controls) +# Defined via punctuation & spaces to save space +# Should be something like \p{\L\N\S\M} (\w but without `_`) +SRC_PSEUDO_LETTER = "(?:(?!" + TEXT_SEPARATORS + "|" + SRC_ZPCC + ")" + SRC_ANY + ")" +# The same as abothe but without [0-9] +# var SRC_PSEUDO_LETTER_non_d = '(?:(?![0-9]|' + SRC_ZPCC + ')' + SRC_ANY + ')' + +# ============================================================================= + +SRC_IP4 = ( + "(?:(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(25[0-5]|" + + "2[0-4][0-9]|[01]?[0-9][0-9]?)" +) + +# Prohibit any of "@/[]()" in user/pass to avoid wrong domain fetch. +SRC_AUTH = "(?:(?:(?!" + SRC_ZCC + "|[@/\\[\\]()]).)+@)?" + +SRC_PORT = ( + "(?::(?:6(?:[0-4]\\d{3}|5(?:[0-4]\\d{2}|5(?:[0-2]\\d|3[0-5])))|[1-5]?\\d{1,4}))?" +) + +# Allow anything in markdown spec, forbid quote (") at the first position +# because emails enclosed in quotes are far more common +SRC_EMAIL_NAME = '[\\-:&=\\+\\$,\\.a-zA-Z0-9_][\\-:&=\\+\\$,\\"\\.a-zA-Z0-9_]*' + +SRC_XN = "xn--[a-z0-9\\-]{1,59}" + +# More to read about domain names +# http:#serverfault.com/questions/638260/ + +# Allow letters & digits (http:#test1) +SRC_DOMAIN_ROOT = "(?:" + SRC_XN + "|" + SRC_PSEUDO_LETTER + "{1,63}" + ")" + +SRC_DOMAIN = ( + "(?:" + + SRC_XN + + "|" + + "(?:" + + SRC_PSEUDO_LETTER + + ")" + + "|" + + "(?:" + + SRC_PSEUDO_LETTER + + "(?:-|" + + SRC_PSEUDO_LETTER + + "){0,61}" + + SRC_PSEUDO_LETTER + + ")" + + ")" +) + +SRC_HOST = ( + "(?:" + + + # Don't need IP check, because digits are already allowed in normal domain names + # SRC_IP4 + + # '|' + + "(?:(?:(?:" + + SRC_DOMAIN + + ")\\.)*" + + SRC_DOMAIN # _root + + ")" + + ")" +) + +TPL_HOST_FUZZY = ( + "(?:" + SRC_IP4 + "|" + "(?:(?:(?:" + SRC_DOMAIN + ")\\.)+(?:%TLDS%))" + ")" +) + +TPL_HOST_NO_IP_FUZZY = "(?:(?:(?:" + SRC_DOMAIN + ")\\.)+(?:%TLDS%))" + + +# ============================================================================= + +# Rude test fuzzy links by host, for quick deny +TPL_HOST_FUZZY_TEST = ( + "localhost|www\\.|\\.\\d{1,3}\\.|(?:\\.(?:%TLDS%)(?:" + SRC_ZPCC + "|>|$))" +) + + +def _re_host_terminator(opts): + src_host_terminator = ( + "(?=$|" + + TEXT_SEPARATORS + + "|" + + SRC_ZPCC + + ")" + + "(?!" 
+ + ("-(?!--)|" if opts.get("---") else "-|") + + "_|:\\d|\\.-|\\.(?!$|" + + SRC_ZPCC + + "))" + ) + return src_host_terminator + + +def _re_src_path(opts): + src_path = ( + "(?:" + + "[/?#]" + + "(?:" + + "(?!" + + SRC_ZCC + + "|" + + TEXT_SEPARATORS + + "|[()[\\]{}.,\"'?!\\-;]).|" + + "\\[(?:(?!" + + SRC_ZCC + + "|\\]).)*\\]|" + + "\\((?:(?!" + + SRC_ZCC + + "|[)]).)*\\)|" + + "\\{(?:(?!" + + SRC_ZCC + + "|[}]).)*\\}|" + + '\\"(?:(?!' + + SRC_ZCC + + '|["]).)+\\"|' + + "\\'(?:(?!" + + SRC_ZCC + + "|[']).)+\\'|" + + "\\'(?=" + + SRC_PSEUDO_LETTER + + "|[-])|" + + "\\.{2,}[a-zA-Z0-9%/&]|" + # google has many dots in "google search" links (#66, #81). + # github has ... in commit range links, + # ReSTRICT to + # - english + # - percent-encoded + # - parts of file path + # - params separator + # until more examples found. + + "\\.(?!" + + SRC_ZCC + + "|[.]|$)|" + + ("\\-(?!--(?:[^-]|$))(?:-*)|" if opts.get("---") else "\\-+|") + + ",(?!" + + SRC_ZCC + + "|$)|" # allow `,,,` in paths + + ";(?!" + + SRC_ZCC + + "|$)|" # allow `,,,` in paths + + "\\!+(?!" + + SRC_ZCC + + "|[!]|$)|" # allow `!!!` in paths, but not at the end + + "\\?(?!" + + SRC_ZCC + + "|[?]|$)" + + ")+" + + "|\\/" + + ")?" + ) + + return src_path + + +def build_re(opts): + """Build regex + + Args: + opts (dict): options + + Return: + dict: dict of regex string + """ + SRC_HOST_STRICT = SRC_HOST + _re_host_terminator(opts) + + TPL_HOST_FUZZY_STRICT = TPL_HOST_FUZZY + _re_host_terminator(opts) + + SRC_HOST_PORT_STRICT = SRC_HOST + SRC_PORT + _re_host_terminator(opts) + + TPL_HOST_PORT_FUZZY_STRICT = TPL_HOST_FUZZY + SRC_PORT + _re_host_terminator(opts) + + TPL_HOST_PORT_NO_IP_FUZZY_STRICT = ( + TPL_HOST_NO_IP_FUZZY + SRC_PORT + _re_host_terminator(opts) + ) + + TPL_EMAIL_FUZZY = ( + "(^|" + + TEXT_SEPARATORS + + '|"|\\(|' + + SRC_ZCC + + ")" + + "(" + + SRC_EMAIL_NAME + + "@" + + TPL_HOST_FUZZY_STRICT + + ")" + ) + + regex = { + "src_Any": SRC_ANY, + "src_Cc": SRC_CC, + "src_Cf": SRC_CF, + "src_Z": SRC_Z, + "src_P": SRC_P, + "src_ZPCc": SRC_ZPCC, + "src_ZCc": SRC_ZCC, + "src_pseudo_letter": SRC_PSEUDO_LETTER, + "src_ip4": SRC_IP4, + "src_auth": SRC_AUTH, + "src_port": SRC_PORT, + "src_host_terminator": _re_host_terminator(opts), + "src_path": _re_src_path(opts), + "src_email_name": SRC_EMAIL_NAME, + "src_xn": SRC_XN, + "src_domain_root": SRC_DOMAIN_ROOT, + "src_domain": SRC_DOMAIN, + "src_host": SRC_HOST, + "tpl_host_fuzzy": TPL_HOST_FUZZY, + "tpl_host_no_ip_fuzzy": TPL_HOST_NO_IP_FUZZY, + "src_host_strict": SRC_HOST_STRICT, + "tpl_host_fuzzy_strict": TPL_HOST_FUZZY_STRICT, + "src_host_port_strict": SRC_HOST_PORT_STRICT, + "tpl_host_port_fuzzy_strict": TPL_HOST_PORT_FUZZY_STRICT, + "tpl_host_port_no_ip_fuzzy_strict": TPL_HOST_PORT_FUZZY_STRICT, + # Main rules + "tpl_host_fuzzy_test": TPL_HOST_FUZZY_TEST, + "tpl_email_fuzzy": TPL_EMAIL_FUZZY, + # Fuzzy link can't be prepended with .:/\- and non punctuation. + # but can start with > (markdown blockquote) + "tpl_link_fuzzy": ( + "(^|(?![.:/\\-_@])(?:[$+<=>^`|\uff5c]|" + + SRC_ZPCC + + "))" + + "((?![$+<=>^`|\uff5c])" + + TPL_HOST_PORT_FUZZY_STRICT + + _re_src_path(opts) + + ")" + ), + # Fuzzy link can't be prepended with .:/\- and non punctuation. 
+ # but can start with > (markdown blockquote) + "tpl_link_no_ip_fuzzy": ( + "(^|(?![.:/\\-_@])(?:[$+<=>^`|\uff5c]|" + + SRC_ZPCC + + "))" + + "((?![$+<=>^`|\uff5c])" + + TPL_HOST_PORT_NO_IP_FUZZY_STRICT + + _re_src_path(opts) + + ")" + ), + } + + return regex diff --git a/linkify_it_py.egg-info/PKG-INFO b/linkify_it_py.egg-info/PKG-INFO new file mode 100644 index 0000000..37b3186 --- /dev/null +++ b/linkify_it_py.egg-info/PKG-INFO @@ -0,0 +1,251 @@ +Metadata-Version: 2.1 +Name: linkify-it-py +Version: 2.0.3 +Summary: Links recognition library with FULL unicode support. +Author: tsutsu3 +License: MIT +Project-URL: Homepage, https://github.com/tsutsu3/linkify-it-py +Keywords: linkify,linkifier,autolink,autolinker +Classifier: Development Status :: 5 - Production/Stable +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.7 +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 3.11 +Classifier: License :: OSI Approved :: MIT License +Classifier: Operating System :: OS Independent +Classifier: Intended Audience :: Developers +Classifier: Topic :: Software Development :: Libraries :: Python Modules +Requires-Python: >=3.7 +Description-Content-Type: text/markdown +Provides-Extra: test +Provides-Extra: dev +Provides-Extra: benchmark +Provides-Extra: doc +License-File: LICENSE + +# linkify-it-py + +[![CI](https://github.com/tsutsu3/linkify-it-py/workflows/CI/badge.svg?branch=main)](https://github.com/tsutsu3/linkify-it-py/actions) +[![pypi](https://img.shields.io/pypi/v/linkify-it-py)](https://pypi.org/project/linkify-it-py/) +[![Anaconda-Server Badge](https://anaconda.org/conda-forge/linkify-it-py/badges/version.svg)](https://anaconda.org/conda-forge/linkify-it-py) +[![Documentation Status](https://readthedocs.org/projects/linkify-it-py/badge/?version=latest)](https://linkify-it-py.readthedocs.io/en/latest/?badge=latest) +[![codecov](https://codecov.io/gh/tsutsu3/linkify-it-py/branch/main/graph/badge.svg)](https://codecov.io/gh/tsutsu3/linkify-it-py) +[![Maintainability](https://api.codeclimate.com/v1/badges/6341fd3ec5f05fde392f/maintainability)](https://codeclimate.com/github/tsutsu3/linkify-it-py/maintainability) + +This is Python port of [linkify-it](https://github.com/markdown-it/linkify-it). + +> Links recognition library with FULL unicode support. +> Focused on high quality link patterns detection in plain text. + +__[Demo](https://linkify-it-py-demo.vercel.app/)__ + +__[Javascript Demo](http://markdown-it.github.io/linkify-it/)__ + +Why it's awesome: + +- Full unicode support, _with astral characters_! +- International domains support. +- Allows rules extension & custom normalizers. + + +## Install + +```bash +pip install linkify-it-py +``` + +or + +```bash +conda install -c conda-forge linkify-it-py +``` + +## Usage examples + +### Example 1. Simple use + +```python +from linkify_it import LinkifyIt + + +linkify = LinkifyIt() + +print(linkify.test("Site github.com!")) +# => True + +print(linkify.match("Site github.com!")) +# => [linkify_it.main.Match({ +# 'schema': '', +# 'index': 5, +# 'last_index': 15, +# 'raw': 'github.com', +# 'text': 'github.com', +# 'url': 'http://github.com' +# }] +``` + +### Example 2. With options + +```python +from linkify_it import LinkifyIt +from linkify_it.tlds import TLDS + + +# Reload full tlds list & add unofficial `.onion` domain. 
+linkify = ( + LinkifyIt() + .tlds(TLDS) # Reload with full tlds list + .tlds("onion", True) # Add unofficial `.onion` domain + .add("git:", "http:") # Add `git:` protocol as "alias" + .add("ftp:", None) # Disable `ftp:` protocol + .set({"fuzzy_ip": True}) # Enable IPs in fuzzy links (without schema) +) +print(linkify.test("Site tamanegi.onion!")) +# => True + +print(linkify.match("Site tamanegi.onion!")) +# => [linkify_it.main.Match({ +# 'schema': '', +# 'index': 5, +# 'last_index': 19, +# 'raw': 'tamanegi.onion', +# 'text': 'tamanegi.onion', +# 'url': 'http://tamanegi.onion' +# }] +``` + +### Example 3. Add twitter mentions handler + +```python +from linkify_it import LinkifyIt + + +linkify = LinkifyIt() + +def validate(obj, text, pos): + tail = text[pos:] + + if not obj.re.get("twitter"): + obj.re["twitter"] = re.compile( + "^([a-zA-Z0-9_]){1,15}(?!_)(?=$|" + obj.re["src_ZPCc"] + ")" + ) + if obj.re["twitter"].search(tail): + if pos > 2 and tail[pos - 2] == "@": + return False + return len(obj.re["twitter"].search(tail).group()) + return 0 + +def normalize(obj, match): + match.url = "https://twitter.com/" + re.sub(r"^@", "", match.url) + +linkify.add("@", {"validate": validate, "normalize": normalize}) +``` + + +## API + +[API documentation](https://linkify-it-py.readthedocs.io/en/latest/) + +### LinkifyIt(schemas, options) + +Creates new linkifier instance with optional additional schemas. + +By default understands: + +- `http(s)://...` , `ftp://...`, `mailto:...` & `//...` links +- "fuzzy" links and emails (google.com, foo@bar.com). + +`schemas` is an dict, where each key/value describes protocol/rule: + +- __key__ - link prefix (usually, protocol name with `:` at the end, `skype:` + for example). `linkify-it-py` makes sure that prefix is not preceded with + alphanumeric char. +- __value__ - rule to check tail after link prefix + - _str_ + - just alias to existing rule + - _dict_ + - _validate_ - either a `re.Pattern` (start with `^`, and don't include the + link prefix itself), or a validator `function` which, given arguments + _self_, _text_ and _pos_, returns the length of a match in _text_ + starting at index _pos_. _pos_ is the index right after the link prefix. + _self_ can be used to access the linkify object to cache data. + - _normalize_ - optional function to normalize text & url of matched result + (for example, for twitter mentions). + +`options`: + +- __fuzzy_link__ - recognize URL-s without `http(s)://` head. Default `True`. +- __fuzzy_ip__ - allow IPs in fuzzy links above. Can conflict with some texts + like version numbers. Default `False`. +- __fuzzy_email__ - recognize emails without `mailto:` prefix. Default `True`. +- __---__ - set `True` to terminate link with `---` (if it's considered as long dash). + + +### .test(text) + +Searches linkifiable pattern and returns `True` on success or `False` on fail. + + +### .pretest(text) + +Quick check if link MAY BE can exist. Can be used to optimize more expensive +`.test()` calls. Return `False` if link can not be found, `True` - if `.test()` +call needed to know exactly. + + +### .test_schema_at(text, name, position) + +Similar to `.test()` but checks only specific protocol tail exactly at given +position. Returns length of found pattern (0 on fail). + + +### .match(text) + +Returns `list` of found link matches or null if nothing found. + +Each match has: + +- __schema__ - link schema, can be empty for fuzzy links, or `//` for + protocol-neutral links. 
+- __index__ - offset of matched text +- __last_index__ - index of next char after mathch end +- __raw__ - matched text +- __text__ - normalized text +- __url__ - link, generated from matched text + +### .matchAtStart(text) + +Checks if a match exists at the start of the string. Returns `Match` +(see docs for `match(text)`) or null if no URL is at the start. +Doesn't work with fuzzy links. + +### .tlds(list_tlds, keep_old=False) + +Load (or merge) new tlds list. Those are needed for fuzzy links (without schema) +to avoid false positives. By default: + +- 2-letter root zones are ok. +- biz|com|edu|gov|net|org|pro|web|xxx|aero|asia|coop|info|museum|name|shop|рф are ok. +- encoded (`xn--...`) root zones are ok. + +If that's not enough, you can reload defaults with more detailed zones list. + +### .add(key, value) + +Add a new schema to the schemas object. As described in the constructor +definition, `key` is a link prefix (`skype:`, for example), and `value` +is a `str` to alias to another schema, or an `dict` with `validate` and +optionally `normalize` definitions. To disable an existing rule, use +`.add(key, None)`. + + +### .set(options) + +Override default options. Missed properties will not be changed. + + +## License + +[MIT](https://github.com/tsutsu3/linkify-it-py/blob/master/LICENSE) diff --git a/linkify_it_py.egg-info/SOURCES.txt b/linkify_it_py.egg-info/SOURCES.txt new file mode 100644 index 0000000..c0b158a --- /dev/null +++ b/linkify_it_py.egg-info/SOURCES.txt @@ -0,0 +1,21 @@ +CHANGELOG.md +LICENSE +MANIFEST.in +README.md +pyproject.toml +tox.ini +linkify_it/__init__.py +linkify_it/main.py +linkify_it/tlds.py +linkify_it/ucre.py +linkify_it_py.egg-info/PKG-INFO +linkify_it_py.egg-info/SOURCES.txt +linkify_it_py.egg-info/dependency_links.txt +linkify_it_py.egg-info/requires.txt +linkify_it_py.egg-info/top_level.txt +test/__init__.py +test/test_apis.py +test/test_linkify.py +test/utils.py +test/fixtures/links.txt +test/fixtures/not_links.txt \ No newline at end of file diff --git a/linkify_it_py.egg-info/dependency_links.txt b/linkify_it_py.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/linkify_it_py.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/linkify_it_py.egg-info/requires.txt b/linkify_it_py.egg-info/requires.txt new file mode 100644 index 0000000..4e203ad --- /dev/null +++ b/linkify_it_py.egg-info/requires.txt @@ -0,0 +1,22 @@ +uc-micro-py + +[benchmark] +pytest +pytest-benchmark + +[dev] +pre-commit +isort +flake8 +black +pyproject-flake8 + +[doc] +sphinx +sphinx_book_theme +myst-parser + +[test] +pytest +coverage +pytest-cov diff --git a/linkify_it_py.egg-info/top_level.txt b/linkify_it_py.egg-info/top_level.txt new file mode 100644 index 0000000..c98649a --- /dev/null +++ b/linkify_it_py.egg-info/top_level.txt @@ -0,0 +1 @@ +linkify_it diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..c0f0207 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,52 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "linkify-it-py" +authors = [{ name = "tsutsu3" }] +description = "Links recognition library with FULL unicode support." 
+urls = { Homepage = "https://github.com/tsutsu3/linkify-it-py" } +readme = "README.md" +requires-python = ">=3.7" +keywords = ["linkify", "linkifier", "autolink", "autolinker"] +dependencies = ["uc-micro-py"] +license = { text = "MIT" } +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Intended Audience :: Developers", + "Topic :: Software Development :: Libraries :: Python Modules", +] +dynamic = ["version"] + +[project.optional-dependencies] +test = ["pytest", "coverage", "pytest-cov"] +dev = ["pre-commit", "isort", "flake8", "black", "pyproject-flake8"] +benchmark = ["pytest", "pytest-benchmark"] +doc = ["sphinx", "sphinx_book_theme", "myst-parser"] + +[tool.setuptools] +packages = ["linkify_it"] + +[tool.setuptools.dynamic] +version = { attr = "linkify_it.__version__" } + +[tool.isort] +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +use_parentheses = true +ensure_newline_before_comments = true +line_length = 88 + +[tool.flake8] +max-line-length = 88 +extend-ignore = "E203, W503" diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..8bfd5a1 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,4 @@ +[egg_info] +tag_build = +tag_date = 0 + diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/fixtures/links.txt b/test/fixtures/links.txt new file mode 100644 index 0000000..29b2a9b --- /dev/null +++ b/test/fixtures/links.txt @@ -0,0 +1,335 @@ +% +% Regular links +% +My http://example.com site +http://example.com + +My http://example.com/ site +http://example.com/ + +http://example.com/foo_bar/ + +http://user:pass@example.com:8080 + +http://user@example.com + +http://user@example.com:8080 + +http://user:pass@example.com + +[https](https://www.ibm.com)[mailto](mailto:someone@ibm.com) % should not catch as auth (before @ in big link) +https://www.ibm.com + +http://example.com:8080 + +http://example.com/?foo=bar + +http://example.com?foo=bar + +http://example.com/#foo=bar + +http://example.com#foo=bar + +http://a.in + +HTTP://GOOGLE.COM + +http://example.invalid % don't restrict root domain when schema exists +http://example.invalid + +http://inrgess2 % Allow local domains to end with digit +http://inrgess2 + +http://999 % ..and start with digit, and have digits only +http://999 + +http://host-name % local domain with dash +http://host-name + +>>example.com % markdown blockquote +example.com + +>>http://example.com % markdown blockquote +http://example.com + +http://lyricstranslate.com/en/someone-you-നിന്നെ-പോലൊരാള്‍.html % With control character +http://lyricstranslate.com/en/someone-you-നിന്നെ-പോലൊരാള്‍.html + +% +% localhost (only with protocol allowed) +% +//localhost + +//test.123 + +http://localhost:8000? 
+http://localhost:8000 + + +% +% Other protocols +% +My ssl https://example.com site +https://example.com + +My ftp://example.com site +ftp://example.com + + +% +% Neutral proto +% +My ssl //example.com site +//example.com + +% +% IPs +% +4.4.4.4 + +192.168.1.1/abc + + +% +% Fuzzy +% +test.example@http://vk.com +http://vk.com + +text:http://example.com/ +http://example.com/ + +google.com + +google.com: // no port +google.com + +s.l.o.w.io + +a-b.com + +GOOGLE.COM. +GOOGLE.COM + +google.xxx // known tld +google.xxx + + +% +% Correct termination for . , ! ? [] {} () "" '' +% +(Scoped http://example.com/foo_bar) +http://example.com/foo_bar + +http://example.com/foo_bar_(wiki) + +http://foo.com/blah_blah_[other] + +http://foo.com/blah_blah_{I'm_king} + +http://foo.com/blah_blah_I'm_king + +http://www.kmart.com/bestway-10'-x-30inch-steel-pro-frame-pool/p-004W007538417001P + +http://foo.com/blah_blah_"doublequoted" + +http://foo.com/blah_blah_'singlequoted' + +(Scoped like http://example.com/foo_bar) +http://example.com/foo_bar + +[Scoped like http://example.com/foo_bar] +http://example.com/foo_bar + +{Scoped like http://example.com/foo_bar} +http://example.com/foo_bar + +"Quoted like http://example.com/foo_bar" +http://example.com/foo_bar + +'Quoted like http://example.com/foo_bar' +http://example.com/foo_bar + +[example.com/foo_bar.jpg)] +example.com/foo_bar.jpg + +http://example.com/foo_bar.jpg. +http://example.com/foo_bar.jpg + +http://example.com/foo_bar/. +http://example.com/foo_bar/ + +http://example.com/foo_bar, +http://example.com/foo_bar + +http://index-of.es/Android/Professional.Android.2.Application.Development.(Wrox,.2010,.0470565527).pdf + +https://github.com/markdown-it/linkify-it/compare/360b13a733f521a8d4903d3a5e1e46c357e9d3ce...f580766349525150a80a32987bb47c2d592efc33 + +https://www.google.com/search?sxsrf=ACYBGNTJFmX-GjNJ8fM-2LCkqyNyxGU1Ng%3A1575534146332&ei=Qr7oXf7rE4rRrgSEgrmoAw&q=clover&oq=clover&gs_l=psy-ab.3..0i67j0l9.2986.3947..4187...0.2..0.281.1366.1j0j5......0....1..gws-wiz.......0i71j35i39j0i131.qWp1nz4IJVA&ved=0ahUKEwj-lP6Iip7mAhWKqIsKHQRBDjUQ4dUDCAs&uact=5 + +https://ourworldindata.org/grapher/covid-deaths-days-since-per-million?zoomToSelection=true&time=9..&country=FRA+DEU+ITA+ESP+GBR+USA+CAN + +http://example.com/foo_bar... +http://example.com/foo_bar + +http://172.26.142.48/viewerjs/#../0529/slides.pdf + +http://example.com/foo_bar.. +http://example.com/foo_bar + +http://example.com/foo_bar?p=10. +http://example.com/foo_bar?p=10 + +https://www.google.ru/maps/@59.9393895,30.3165389,15z?hl=ru + +https://www.google.com/maps/place/New+York,+NY,+USA/@40.702271,-73.9968471,11z/data=!4m2!3m1!1s0x89c24fa5d33f083b:0xc80b8f06e177fe62?hl=en + +https://www.google.com/analytics/web/?hl=ru&pli=1#report/visitors-overview/a26895874w20458057p96934174/ + +http://business.timesonline.co.uk/article/0,,9065-2473189,00.html + +https://google.com/mail/u/0/#label/!!!Today/15c9b8193da01e65 + +http://example.com/123! +http://example.com/123 + +http://example.com/123!!! +http://example.com/123 + +http://example.com/foo--bar + +See http://example.com/123; Example link. +http://example.com/123 + +http://example.com/123;123 + +% some sites have links with trailing dashes +http://www.bloomberg.com/news/articles/2015-06-26/from-deutsche-bank-to-siemens-what-s-troubling-germany-inc- + +http://example.com/foo-with-trailing-dash-dot-. +http://example.com/foo-with-trailing-dash-dot- + + +http://domain.com + +. +http://domain.com + + +http://domain.com/foo + +. 
+http://domain.com/foo + + +domain.com + +. +domain.com + + +domain.com/foo + + +user@domain.com + +. +user@domain.com + + +mailto:user@domain.com + + +% +% Emails +% + +test."foo".bar@gmail.co.uk! +test."foo".bar@gmail.co.uk + +"test@example.com" +test@example.com + +name@example.com + +>>name@example.com % markdown blockquote +name@example.com + +mailto:name@example.com + +MAILTO:NAME@EXAMPLE.COM + +mailto:foo_bar@example.com + +foo+bar@gmail.com + +192.168.1.1@gmail.com + +mailto:foo@bar % explicit protocol make it valid +mailto:foo@bar + +(foobar email@example.com) +email@example.com + +(email@example.com foobar) +email@example.com + +(email@example.com) +email@example.com + + +% +% International +% +http://✪df.ws/123 + +http://xn--df-oiy.ws/123 + +a.ws + +➡.ws/䨹 + +example.com/䨹 + +президент.рф + + +% Links below provided by diaspora* guys, to make sure regressions will not happen. +% Those left here for historic reasons. + +http://www.bürgerentscheid-krankenhäuser.de + +http://www.xn--brgerentscheid-krankenhuser-xkc78d.de + +http://bündnis-für-krankenhäuser.de/wp-content/uploads/2011/11/cropped-logohp.jpg + +http://xn--bndnis-fr-krankenhuser-i5b27cha.de/wp-content/uploads/2011/11/cropped-logohp.jpg + +http://ﻡﻮﻘﻋ.ﻭﺯﺍﺭﺓ-ﺍﻼﺘﺻﺍﻼﺗ.ﻢﺻﺭ/ + +http://xn--4gbrim.xn----ymcbaaajlc6dj7bxne2c.xn--wgbh1c/ + +% +% Others... +% +|www.google.com/www.google.com/foo|bar % #46, asian vertical pipes +www.google.com/www.google.com/foo + +|test@google.com|bar +test@google.com + +|http://google.com|bar +http://google.com + +% +% Domains with multiple dashes +% + +https://5b0ee223b312746c1659db3f--thelounge-chat.netlify.com/docs/ + +www.a--b.com + +www.c--u.com + +http://a---b.com/ diff --git a/test/fixtures/not_links.txt b/test/fixtures/not_links.txt new file mode 100644 index 0000000..9f2f833 --- /dev/null +++ b/test/fixtures/not_links.txt @@ -0,0 +1,52 @@ +% +% Not links +% +example.invalid +example.invalid/ +http://.example.com +http://-example.com +hppt://example.com +example.coma +-example.coma +foo.123 +localhost % only with protocol allowed +localhost/ +///localhost % 3 '/' not allowed +///test.com +//test % Don't allow single level protocol-less domains to avoid false positives + +_http://example.com +_//example.com +_example.com +http://example.com_ +@example.com + +node.js and io.js + +http:// +http://. +http://.. +http://# +http://## +http://? +http://?? +google.com:500000 // invalid port +show image.jpg +path:to:file.pm +/path/to/file.pl + +% +% Not IPv4 +% +1.2.3.4.5 +1.2.3 +1.2.3.400 +1000.2.3.4 +a1.2.3.4 +1.2.3.4a + +% +% Not email +% +foo@bar % Should be at second level domain & with correct tld +mailto:bar diff --git a/test/test_apis.py b/test/test_apis.py new file mode 100644 index 0000000..5f7c05b --- /dev/null +++ b/test/test_apis.py @@ -0,0 +1,321 @@ +import re + +import pytest + +from linkify_it import LinkifyIt, SchemaError +from linkify_it.main import Match +from linkify_it.tlds import TLDS + + +def test_pretest_false(): + linkifyit = LinkifyIt() + assert linkifyit.pretest("nolink") is False + + +def test_create_instance_with_schemas(): + schemas = {"my:": {"validate": r"^\/\/[a-z]+"}} + linkifyit = LinkifyIt(schemas) + + match = linkifyit.match("google.com. 
my:// my://asdf!") + + assert match[0].text == "google.com" + assert match[1].text == "my://asdf" + + +def test_match_class(): + linkifyit = LinkifyIt() + match = Match(linkifyit, 0) + assert ( + match.__repr__() + == "linkify_it.main.Match({'schema': '', 'index': -1, 'last_index': -1, 'raw': '', 'text': '', 'url': ''})" # noqa: E501 + ) + + +def test_api_extend_tlds(): + linkifyit = LinkifyIt() + + assert linkifyit.test("google.myroot") is False + + linkifyit.tlds("myroot", True) + + assert linkifyit.test("google.myroot") is True + assert linkifyit.test("google.xyz") is False + + # ref - http://data.iana.org/TLD/tlds-alpha-by-domain.txt + linkifyit.tlds(TLDS) + + assert linkifyit.test("google.xyz") is True + assert linkifyit.test("google.myroot") is False + + +def test_api_add_rule_as_regex_with_default_normalizer(): + linkifyit = LinkifyIt().add("my:", {"validate": re.compile(r"^\/\/[a-z]+")}) + + match = linkifyit.match("google.com. my:// my://asdf!") + + assert match[0].text == "google.com" + assert match[1].text == "my://asdf" + + +def test_api_add_rule_as_regex_with_default_normalizer_with_no_compile(): + linkifyit = LinkifyIt().add("my:", {"validate": r"^\/\/[a-z]+"}) + + match = linkifyit.match("google.com. my:// my://asdf!") + + assert match[0].text == "google.com" + assert match[1].text == "my://asdf" + + +def test_api_add_rule_with_normalizer(): + def func_normalize(self, m): + m.text = re.sub(r"^my://", "", m.text).upper() + m.url = m.url.upper() + + linkifyit = LinkifyIt().add( + "my:", {"validate": re.compile(r"^\/\/[a-z]+"), "normalize": func_normalize} + ) + + match = linkifyit.match("google.com. my:// my://asdf!") + + assert match[1].text == "ASDF" + assert match[1].url == "MY://ASDF" + + +def test_api_add_rule_with_normalizer_no_cimpile(): + def func_normalize(self, m): + m.text = re.sub(r"^my://", "", m.text).upper() + m.url = m.url.upper() + + linkifyit = LinkifyIt().add( + "my:", {"validate": r"^\/\/[a-z]+", "normalize": func_normalize} + ) + + match = linkifyit.match("google.com. my:// my://asdf!") + + assert match[1].text == "ASDF" + assert match[1].url == "MY://ASDF" + + +def test_api_disable_rule(): + linkifyit = LinkifyIt() + + assert linkifyit.test("http://google.com") + assert linkifyit.test("foo@bar.com") + linkifyit.add("http:", None) + linkifyit.add("mailto:", None) + assert not linkifyit.test("http://google.com") + assert not linkifyit.test("foo@bar.com") + + +def test_api_add_bad_definition(): + with pytest.raises(SchemaError): + linkifyit = LinkifyIt({"fuzzy_link": False}) + + linkifyit = LinkifyIt() + + with pytest.raises(SchemaError): + linkifyit.add("test:", []) + + linkifyit = LinkifyIt() + + with pytest.raises(SchemaError): + linkifyit.add("test:", {"validate": []}) + + linkifyit = LinkifyIt() + + with pytest.raises(SchemaError): + + def func(): + return False + + linkifyit.add("test:", {"validate": func, "normalize": "bad"}) + + +def test_api_at_position(): + linkifyit = LinkifyIt() + + assert linkifyit.test_schema_at("http://google.com", "http:", 5) + assert linkifyit.test_schema_at("http://google.com", "HTTP:", 5) + assert not linkifyit.test_schema_at("http://google.com", "http:", 6) + + assert not linkifyit.test_schema_at("http://google.com", "bad_schema:", 6) + + +def test_api_correct_cache_value(): + linkifyit = LinkifyIt() + + match = linkifyit.match(".com. 
http://google.com google.com ftp://google.com") + + assert match[0].text == "http://google.com" + assert match[1].text == "google.com" + assert match[2].text == "ftp://google.com" + + +def test_api_normalize(): + linkifyit = LinkifyIt() + + match = linkifyit.match("mailto:foo@bar.com")[0] + + # assert match.text == "foo@bar.com" + assert match.url == "mailto:foo@bar.com" + + match = linkifyit.match("foo@bar.com")[0] + + # assert match.text == "foo@bar.com" + assert match.url == "mailto:foo@bar.com" + + +def test_api_twitter_rule(): + linkifyit = LinkifyIt() + + def validate(self, text, pos): + tail = text[pos:] + + if not self.re.get("twitter"): + self.re["twitter"] = re.compile( + "^([a-zA-Z0-9_]){1,15}(?!_)(?=$|" + self.re["src_ZPCc"] + ")" + ) + if self.re["twitter"].search(tail): + if pos > 2 and tail[pos - 2] == "@": + return False + return len(self.re["twitter"].search(tail).group()) + return 0 + + def normalize(self, m): + m.url = "https://twitter.com/" + re.sub(r"^@", "", m.url) + + linkifyit.add("@", {"validate": validate, "normalize": normalize}) + + assert linkifyit.match("hello, @gamajoba_!")[0].text == "@gamajoba_" + assert linkifyit.match(":@givi")[0].text == "@givi" + assert linkifyit.match(":@givi")[0].url == "https://twitter.com/givi" + assert not linkifyit.test("@@invalid") + + +def test_api_twitter_rule_no_compile(): + linkifyit = LinkifyIt() + + def validate(self, text, pos): + tail = text[pos:] + + if not self.re.get("twitter"): + self.re["twitter"] = ( + "^([a-zA-Z0-9_]){1,15}(?!_)(?=$|" + self.re["src_ZPCc"] + ")" + ) + if re.search(self.re["twitter"], tail): + if pos > 2 and tail[pos - 2] == "@": + return False + return len(re.search(self.re["twitter"], tail).group()) + return 0 + + def normalize(self, m): + m.url = "https://twitter.com/" + re.sub(r"^@", "", m.url) + + linkifyit.add("@", {"validate": validate, "normalize": normalize}) + + assert linkifyit.match("hello, @gamajoba_!")[0].text == "@gamajoba_" + assert linkifyit.match(":@givi")[0].text == "@givi" + assert linkifyit.match(":@givi")[0].url == "https://twitter.com/givi" + assert not linkifyit.test("@@invalid") + + +def test_api_set_option_fuzzylink(): + linkifyit = LinkifyIt(options={"fuzzy_link": False}) + + assert not linkifyit.test("google.com") + + linkifyit.set({"fuzzy_link": True}) + + assert linkifyit.test("google.com") + assert linkifyit.match("google.com")[0].text == "google.com" + + +def test_api_set_option_fuzzyemail(): + linkifyit = LinkifyIt(options={"fuzzy_email": False}) + + assert not linkifyit.test("foo@bar.com") + + linkifyit.set({"fuzzy_email": True}) + + assert linkifyit.test("foo@bar.com") + assert linkifyit.match("foo@bar.com")[0].text == "foo@bar.com" + + +def test_api_set_option_fuzzyip(): + linkifyit = LinkifyIt() + + assert not linkifyit.test("1.1.1.1") + + linkifyit.set({"fuzzy_ip": True}) + + assert linkifyit.test("1.1.1.1") + assert linkifyit.match("1.1.1.1")[0].text == "1.1.1.1" + + +def test_api_shoud_not_hang_in_fuzzy_mode_with_sequence_of_astrals(): + linkifyit = LinkifyIt() + + linkifyit.set({"fuzzy_link": True}) + + linkifyit.match("😡😡😡😡😡😡😡😡😡😡😡😡😡😡😡😡😡😡😡😡😡😡😡😡😡😡😡😡😡😡😡😡😡😡😡 .com") + + +def test_api_shoud_accept_triple_minus(): + linkifyit = LinkifyIt() + + assert linkifyit.match("http://e.com/foo---bar")[0].text == "http://e.com/foo---bar" + assert linkifyit.match("text@example.com---foo") is None + + linkifyit = LinkifyIt(None, {"---": True}) + + assert linkifyit.match("http://e.com/foo---bar")[0].text == "http://e.com/foo" + assert 
linkifyit.match("text@example.com---foo")[0].text == "text@example.com" + + +# issue #25. Schema key containing - not producing matches +@pytest.mark.parametrize( + "escape_str", + {".", "?", "*", "+", "^", "$", "[", "]", "\\", "(", ")", "{", "}", "|", "-"}, +) +def test_api_add_alias_rule_with_excape_re_string(escape_str): + linkifyit = LinkifyIt() + + linkifyit.add("foo{}bar:".format(escape_str), "http:") + assert linkifyit.test("Check foo{}bar://test".format(escape_str)) is True + + +def test_api_blank_test_match_at_the_start(): + linkifyit = LinkifyIt() + + assert not linkifyit.match_at_start("") + + +def test_api_should_find_a_match_at_the_start(): + linkifyit = LinkifyIt() + + linkifyit = LinkifyIt(options={"fuzzy_link": True}) + linkifyit.set({"fuzzy_link": True}) + + assert not linkifyit.test("@@invalid") + assert not linkifyit.match_at_start("google.com 123") + assert not linkifyit.match_at_start(" http://google.com 123") + + +def test_api_match_a_start_should_not_interfere_with_normal_match(): + linkifyit = LinkifyIt() + + str = "http://google.com http://google.com" + assert linkifyit.match_at_start(str) + assert len(linkifyit.match(str)) == 2 + + str = "aaa http://google.com http://google.com" + assert not linkifyit.match_at_start(str) + assert len(linkifyit.match(str)) == 2 + + +def test_api_should_not_match_incomplete_links(): + # regression test for https://github.com/markdown-it/markdown-it/issues/868 + linkifyit = LinkifyIt() + + assert not linkifyit.match_at_start("http://") + assert not linkifyit.match_at_start("https://") diff --git a/test/test_linkify.py b/test/test_linkify.py new file mode 100644 index 0000000..79bf297 --- /dev/null +++ b/test/test_linkify.py @@ -0,0 +1,40 @@ +from pathlib import Path + +import pytest + +from linkify_it import LinkifyIt + +from .utils import read_fixture_file + +FIXTURE_PATH = Path(__file__).parent / "fixtures" + + +def dummy(_): + pass + + +@pytest.mark.parametrize( + "number,line,expected", + read_fixture_file(FIXTURE_PATH.joinpath("links.txt")), +) +def test_links(number, line, expected): + linkifyit = LinkifyIt(options={"fuzzy_ip": True}) + + linkifyit.normalize = dummy + + assert linkifyit.pretest(line) is True + assert linkifyit.test("\n" + line + "\n") is True + assert linkifyit.test(line) is True + assert linkifyit.match(line)[0].url == expected + + +@pytest.mark.parametrize( + "number,line,expected", + read_fixture_file(FIXTURE_PATH.joinpath("not_links.txt")), +) +def test_not_links(number, line, expected): + linkifyit = LinkifyIt() + + linkifyit.normalize = dummy + + assert linkifyit.test(line) is False diff --git a/test/utils.py b/test/utils.py new file mode 100644 index 0000000..217f6a8 --- /dev/null +++ b/test/utils.py @@ -0,0 +1,37 @@ +import re + + +def lget(src_list, index): + try: + return src_list[index] + except IndexError: + return "" + + +def read_fixture_file(path): + tests = [] + skip_next = False + comment_re = re.compile(r"^%.*") + + with open(path, "r", encoding="utf-8") as f: + lines = [x.rstrip() for x in f.readlines()] + + for idx, line in enumerate(lines): + if skip_next: + skip_next = False + continue + + line = comment_re.sub("", line) + + next_line = comment_re.sub("", lget(lines, idx + 1)) + + if not line.strip(): + continue + + if next_line.strip(): + tests.append([idx + 1, line, next_line]) + skip_next = True + else: + tests.append([idx + 1, line, line]) + + return tests diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..8bd4f98 --- /dev/null +++ b/tox.ini @@ -0,0 +1,12 @@ +[tox] 
+envlist = {py37, py38, py39, py310, py311}
+
+[testenv]
+deps =
+    pytest
+    uc-micro-py
+usedevelop = true
+
+[testenv:py{37,38, 39, 310, 311}]
+extras = testing
+commands = pytest {posargs}
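
As a quick sanity check of the updated module, the behaviour documented in the README and exercised by `test/test_apis.py` can be tried interactively. The sketch below is illustrative only (it is not part of the upstream patch) and assumes the package is installed, for example via `pip install linkify-it-py==2.0.3`:

```python
from linkify_it import LinkifyIt

linkify = LinkifyIt()

# test()/match() behave as shown in the README examples included in the patch
assert linkify.test("Site github.com!") is True
first = linkify.match("Site github.com!")[0]
assert first.raw == "github.com"
assert first.url == "http://github.com"

# match_at_start() only reports a full URL at the very beginning of the text;
# it does not pick up fuzzy (schema-less) links
assert linkify.match_at_start("http://example.com trailing text")
assert not linkify.match_at_start("leading text http://example.com")
```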