From 3f7dc398394d85653c59e19feb77bd282472cb70 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Fri, 10 Nov 2023 14:19:07 +0100 Subject: [PATCH 1/4] prepare release 1.6.0 --- CHANGELOG.md | 6 ++++++ htmldate/__init__.py | 2 +- setup.py | 11 +++++------ 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 46ccea5f..7b1540c5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ ## Changelog + +## 1.6.0 +- focus on precision, stricter extraction patterns (#103, #105, #106) +- simplified code base (#108, #109) +- replaced lxml.html.Cleaner (#104) + ## 1.5.2 - fix for missing months keys in custom extractor (#100) - fix for None in `try_date_expr()` (#101) diff --git a/htmldate/__init__.py b/htmldate/__init__.py index 83b1c2af..72d3db4b 100644 --- a/htmldate/__init__.py +++ b/htmldate/__init__.py @@ -7,7 +7,7 @@ __author__ = "Adrien Barbaresi" __license__ = "GNU GPL v3" __copyright__ = "Copyright 2017-2023, Adrien Barbaresi" -__version__ = "1.5.2" +__version__ = "1.6.0" import logging diff --git a/setup.py b/setup.py index d2a2de78..b4589051 100644 --- a/setup.py +++ b/setup.py @@ -13,9 +13,8 @@ # some problems with installation solved this way extras = { "speed": [ - "backports-datetime-fromisoformat; python_version < '3.11'", - "cchardet >= 2.1.7; python_version < '3.11'", # build issue - "faust-cchardet >= 2.1.19; python_version >= '3.11'", # fix for build + "backports-datetime-fromisoformat >= 2.0.1; python_version < '3.11'", + "faust-cchardet >= 2.1.19", "urllib3[brotli]", ], } @@ -34,7 +33,7 @@ def get_long_description(): def get_version(package): "Return package version as listed in `__version__` in `init.py`" - initfile = Path(package, "__init__.py").read_text() # Python >= 3.5 + initfile = Path(package, "__init__.py").read_text() return re.search("__version__ = ['\"]([^'\"]+)['\"]", initfile)[1] @@ -115,9 +114,9 @@ def get_version(package): include_package_data=True, python_requires=">=3.6", install_requires=[ - "backports-datetime-fromisoformat; python_version < '3.7'", + "backports-datetime-fromisoformat >= 2.0.1; python_version < '3.7'", "charset_normalizer >= 3.0.1; python_version < '3.7'", - "charset_normalizer >= 3.3.0; python_version >= '3.7'", + "charset_normalizer >= 3.3.2; python_version >= '3.7'", "dateparser >= 1.1.2", # 1.1.3+ slower "lxml >= 4.9.3 ; platform_system != 'Darwin'", "lxml == 4.9.2 ; platform_system == 'Darwin'", From 7dd9533ccc34619ea6416ec4ac60480d2161502a Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Fri, 10 Nov 2023 14:36:21 +0100 Subject: [PATCH 2/4] fix setup --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index b4589051..27eb26a1 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ # some problems with installation solved this way extras = { "speed": [ - "backports-datetime-fromisoformat >= 2.0.1; python_version < '3.11'", + "backports-datetime-fromisoformat; python_version < '3.11'", "faust-cchardet >= 2.1.19", "urllib3[brotli]", ], @@ -114,7 +114,7 @@ def get_version(package): include_package_data=True, python_requires=">=3.6", install_requires=[ - "backports-datetime-fromisoformat >= 2.0.1; python_version < '3.7'", + "backports-datetime-fromisoformat; python_version < '3.7'", "charset_normalizer >= 3.0.1; python_version < '3.7'", "charset_normalizer >= 3.3.2; python_version >= '3.7'", "dateparser >= 1.1.2", # 1.1.3+ slower From 161467957291305f921f5746928278f3df1a8b33 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Mon, 13 Nov 2023 18:09:47 +0100 Subject: [PATCH 3/4] update benchmark --- tests/comparison.py | 7 +++++-- tests/eval-requirements.txt | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/comparison.py b/tests/comparison.py index cfcc46ef..ca88b22e 100644 --- a/tests/comparison.py +++ b/tests/comparison.py @@ -129,11 +129,14 @@ def run_dateguesser(htmlstring): def run_goose(htmlstring): """try with the goose algorithm""" - article = G.extract(raw_html=htmlstring) + try: + article = G.extract(raw_html=htmlstring) + except (AttributeError, UnicodeDecodeError): + return None if article.publish_date is None: return None - datematch = re.match(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", article.publish_date) try: + datematch = re.match(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", article.publish_date) return datematch[0] # illogical result except TypeError: diff --git a/tests/eval-requirements.txt b/tests/eval-requirements.txt index 09833779..6ce9379f 100644 --- a/tests/eval-requirements.txt +++ b/tests/eval-requirements.txt @@ -1,5 +1,5 @@ # package -htmldate>=1.5.0 +htmldate>=1.6.0 # alternatives articleDateExtractor==0.20 From 78f4c894fb43dcb4bc3b9eff5ab44c34074b46f5 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Tue, 21 Nov 2023 16:46:18 +0100 Subject: [PATCH 4/4] update evaluation --- CHANGELOG.md | 3 ++- README.rst | 16 ++++++++-------- docs/evaluation.rst | 31 +++++++++++++++++++++++-------- tests/comparison.py | 11 ++++------- 4 files changed, 37 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b1540c5..58cabc45 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,9 +2,10 @@ ## 1.6.0 -- focus on precision, stricter extraction patterns (#103, #105, #106) +- focus on precision, stricter extraction patterns (#103, #105, #106, #112) - simplified code base (#108, #109) - replaced lxml.html.Cleaner (#104) +- extended evaluation ## 1.5.2 - fix for missing months keys in custom extractor (#100) diff --git a/README.rst b/README.rst index 6720bc26..5443aa98 100644 --- a/README.rst +++ b/README.rst @@ -97,17 +97,17 @@ Performance ----------- =============================== ========= ========= ========= ========= ======= -500 web pages containing identifiable dates (as of 2022-11-28 on Python 3.8) +1000 web pages containing identifiable dates (as of 2023-11-13 on Python 3.10) ------------------------------------------------------------------------------- Python Package Precision Recall Accuracy F-Score Time =============================== ========= ========= ========= ========= ======= -articleDateExtractor 0.20 0.769 0.691 0.572 0.728 4x -date_guesser 2.1.4 0.738 0.544 0.456 0.626 16x -goose3 3.1.12 0.821 0.453 0.412 0.584 14x -htmldate[all] 1.4.0 (fast) **0.856** 0.921 0.798 0.888 **1x** -htmldate[all] 1.4.0 (extensive) 0.847 **0.991** **0.840** **0.913** 2.2x -newspaper3k 0.2.8 0.729 0.630 0.510 0.675 13x -news-please 1.5.22 0.769 0.691 0.572 0.728 38x +articleDateExtractor 0.20 0.803 0.734 0.622 0.767 5x +date_guesser 2.1.4 0.781 0.600 0.514 0.679 18x +goose3 3.1.17 0.869 0.532 0.493 0.660 15x +htmldate[all] 1.6.0 (fast) **0.883** 0.924 0.823 0.903 **1x** +htmldate[all] 1.6.0 (extensive) 0.870 **0.993** **0.865** **0.928** 1.7x +newspaper3k 0.2.8 0.769 0.667 0.556 0.715 15x +news-please 1.5.35 0.801 0.768 0.645 0.784 34x =============================== ========= ========= ========= ========= ======= For complete results and explanations see the `evaluation page `_. diff --git a/docs/evaluation.rst b/docs/evaluation.rst index 51dedbdd..a3638b82 100644 --- a/docs/evaluation.rst +++ b/docs/evaluation.rst @@ -42,17 +42,17 @@ The results below show that **date extraction is not a completely solved task** =============================== ========= ========= ========= ========= ======= -500 web pages containing identifiable dates (as of 2022-11-28 on Python 3.8) +1000 web pages containing identifiable dates (as of 2023-11-13 on Python 3.10) ------------------------------------------------------------------------------- Python Package Precision Recall Accuracy F-Score Time =============================== ========= ========= ========= ========= ======= -articleDateExtractor 0.20 0.769 0.691 0.572 0.728 4x -date_guesser 2.1.4 0.738 0.544 0.456 0.626 16x -goose3 3.1.12 0.821 0.453 0.412 0.584 14x -htmldate[all] 1.4.0 (fast) **0.856** 0.921 0.798 0.888 **1x** -htmldate[all] 1.4.0 (extensive) 0.847 **0.991** **0.840** **0.913** 2.2x -newspaper3k 0.2.8 0.729 0.630 0.510 0.675 13x -news-please 1.5.22 0.769 0.691 0.572 0.728 38x +articleDateExtractor 0.20 0.803 0.734 0.622 0.767 5x +date_guesser 2.1.4 0.781 0.600 0.514 0.679 18x +goose3 3.1.17 0.869 0.532 0.493 0.660 15x +htmldate[all] 1.6.0 (fast) **0.883** 0.924 0.823 0.903 **1x** +htmldate[all] 1.6.0 (extensive) 0.870 **0.993** **0.865** **0.928** 1.7x +newspaper3k 0.2.8 0.769 0.667 0.556 0.715 15x +news-please 1.5.35 0.801 0.768 0.645 0.784 34x =============================== ========= ========= ========= ========= ======= @@ -72,6 +72,21 @@ Note on the different versions: Older Results ------------- +=============================== ========= ========= ========= ========= ======= +500 web pages containing identifiable dates (as of 2022-11-28 on Python 3.8) +------------------------------------------------------------------------------- +Python Package Precision Recall Accuracy F-Score Time +=============================== ========= ========= ========= ========= ======= +articleDateExtractor 0.20 0.769 0.691 0.572 0.728 4x +date_guesser 2.1.4 0.738 0.544 0.456 0.626 16x +goose3 3.1.12 0.821 0.453 0.412 0.584 14x +htmldate[all] 1.4.0 (fast) **0.856** 0.921 0.798 0.888 **1x** +htmldate[all] 1.4.0 (extensive) 0.847 **0.991** **0.840** **0.913** 2.2x +newspaper3k 0.2.8 0.729 0.630 0.510 0.675 13x +news-please 1.5.22 0.769 0.691 0.572 0.728 38x +=============================== ========= ========= ========= ========= ======= + + =============================== ========= ========= ========= ========= ======= 500 web pages containing identifiable dates (as of 2022-03-23 on Python 3.8) diff --git a/tests/comparison.py b/tests/comparison.py index ca88b22e..ea4b54a7 100644 --- a/tests/comparison.py +++ b/tests/comparison.py @@ -86,17 +86,14 @@ def run_newspaper(htmlstring): # throws error on the eval_default dataset try: myarticle = Article(htmlstring) - except (TypeError, UnicodeDecodeError): - return None - myarticle.html = htmlstring - myarticle.download_state = ArticleDownloadState.SUCCESS - try: + myarticle.html = htmlstring + myarticle.download_state = ArticleDownloadState.SUCCESS myarticle.parse() - except UnicodeEncodeError: + except (UnicodeDecodeError, UnicodeEncodeError): return None if myarticle.publish_date is None or myarticle.publish_date == "": return None - return convert_date(myarticle.publish_date, "%Y-%m-%d %H:%M:%S", "%Y-%m-%d") + return str(myarticle.publish_date)[0:10] def run_newsplease(htmlstring):