fix failing test, version bump for release

mediacloud · Dec 3, 2023 · 680f67a · 680f67a
1 parent 44b29a8
commit 680f67a
Show file tree

Hide file tree

Showing 3 changed files with 20 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -72,6 +72,8 @@ When adding new tests, re-run 'scripts/get-test-web-content.py'
 Version History
 ---------------
 
+* __v0.10.0__: Support defaults and overrides in `extract`, return execution time stats, update requirements,
+               handle more malformed URLs
 * __v0.9.5__: Updated requirements, update non-news site list, fix failing unit tests, tweak title parsing logic
 * __v0.9.4__: Updated requirements to use faust-cchardet for py >3.9 support
 * __v0.9.3__: Updated content extractor dependencies, added py.typing for typing support
@@ -117,7 +119,7 @@ Version History
 Contributors
 ------------
 
-Created as part of the Media Cloud Project:
+Created as part of the Media Cloud Project. Contributors include:
 * Rahul Bhargava (Media Cloud, Northeastern University)
 * Paige Gulley (Media Cloud)
 * Phil Budne (Media Cloud)

diff --git a/mcmetadata/__init__.py b/mcmetadata/__init__.py
@@ -10,7 +10,7 @@
 from . import languages
 from . import dates
 
-__version__ = "0.9.5"
+__version__ = "0.10.0"
 
 logger = logging.getLogger(__name__)
 
@@ -23,7 +23,7 @@
 
 def extract(url: str, html_text: Optional[str] = None, include_other_metadata: Optional[bool] = False,
             defaults: Mapping[str, Any] = {}, overrides: Mapping[str, Any] = {},
-            stats_accumulator: Mapping[str, int] = stats) -> Dict:
+            stats_accumulator: Mapping[str, int] = None) -> Dict:
     """
     The core method of this library - returns all the useful information extracted from the HTML of the next
     article at the supplied URL.
@@ -46,6 +46,8 @@ def extract(url: str, html_text: Optional[str] = None, include_other_metadata: O
                  timings for the call will _not_ be added to the module-level `stats` counter. Should contain keys
                  for `STAT_NAMES` (see above).
     """
+    if stats_accumulator is None:  # can't default to global because of Python reference handling in defaults
+        stats_accumulator = stats
     t0 = time.monotonic()
     # first fetch the real content (if we need to)
     t1 = t0
@@ -149,3 +151,8 @@ def extract(url: str, html_text: Optional[str] = None, include_other_metadata: O
         )
 
     return results
+
+
+def reset_stats():
+    global stats
+    stats = {s: 0 for s in STAT_NAMES}
diff --git a/mcmetadata/test/test_extract.py b/mcmetadata/test/test_extract.py
@@ -175,6 +175,13 @@ def test_default_pub_date(self):
 
 class TestStats(unittest.TestCase):
 
+    def test_reset(self):
+        url = "https://web.archive.org/web/http://entretenimento.uol.com.br/noticias/redacao/2019/08/25/sem-feige-sem-stark-o-sera-do-homem-aranha-longe-do-mcu.htm"
+        _ = extract(url)
+        assert mcmetadata.stats.get('total') > 0
+        mcmetadata.reset_stats()
+        assert mcmetadata.stats.get('total') == 0
+
     def test_total_works(self):
         url = "https://web.archive.org/web/http://entretenimento.uol.com.br/noticias/redacao/2019/08/25/sem-feige-sem-stark-o-sera-do-homem-aranha-longe-do-mcu.htm"
         _ = extract(url)
@@ -186,6 +193,7 @@ def test_total_works(self):
                 assert mcmetadata.stats.get('url') < mcmetadata.stats.get('total')
 
     def test_passed_in_accumulator(self):
+        mcmetadata.reset_stats()
         local_stats = {s: 0 for s in mcmetadata.STAT_NAMES}
         url = "https://web.archive.org/web/http://entretenimento.uol.com.br/noticias/redacao/2019/08/25/sem-feige-sem-stark-o-sera-do-homem-aranha-longe-do-mcu.htm"
         _ = extract(url, stats_accumulator=local_stats)