Skip to content

Commit

Permalink
fix failing test, version bump for release
Browse files Browse the repository at this point in the history
  • Loading branch information
rahulbot committed Dec 3, 2023
1 parent 44b29a8 commit 680f67a
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 3 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ When adding new tests, re-run 'scripts/get-test-web-content.py'
Version History
---------------

* __v0.10.0__: Support defaults and overrides in `extract`, returning execution time stats, requirements updates, more
handling of malformed URLs
* __v0.9.5__: Updated requirements, update non-news site list, fix failing unit tests, tweak title parsing logic
* __v0.9.4__: Updated requirements to use faust-cchardet for py >3.9 support
* __v0.9.3__: Updated content extractor dependencies, added py.typing for typing support
Expand Down Expand Up @@ -117,7 +119,7 @@ Version History
Contributors
------------

Created as part of the Media Cloud Project:
Created as part of the Media Cloud Project. Contributors include:
* Rahul Bhargava (Media Cloud, Northeastern University)
* Paige Gulley (Media Cloud)
* Phil Budne (Media Cloud)
Expand Down
11 changes: 9 additions & 2 deletions mcmetadata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from . import languages
from . import dates

__version__ = "0.9.5"
__version__ = "0.10.0"

logger = logging.getLogger(__name__)

Expand All @@ -23,7 +23,7 @@

def extract(url: str, html_text: Optional[str] = None, include_other_metadata: Optional[bool] = False,
defaults: Mapping[str, Any] = {}, overrides: Mapping[str, Any] = {},
stats_accumulator: Mapping[str, int] = stats) -> Dict:
stats_accumulator: Mapping[str, int] = None) -> Dict:
"""
The core method of this library - returns all the useful information extracted from the HTML of the next
article at the supplied URL.
Expand All @@ -46,6 +46,8 @@ def extract(url: str, html_text: Optional[str] = None, include_other_metadata: O
timings for the call will _not_ be added to the module-level `stats` counter. Should contain keys
for `STAT_NAMES` (see above).
"""
if stats_accumulator is None: # can't default to global because of Python reference handling in defaults
stats_accumulator = stats
t0 = time.monotonic()
# first fetch the real content (if we need to)
t1 = t0
Expand Down Expand Up @@ -149,3 +151,8 @@ def extract(url: str, html_text: Optional[str] = None, include_other_metadata: O
)

return results


def reset_stats():
    """Reset the module-level `stats` counter so every `STAT_NAMES` entry is zero."""
    # Rebinds the global name to a brand-new dict (the previous dict object is
    # left untouched), so callers should read `stats` through the module.
    global stats
    stats = dict.fromkeys(STAT_NAMES, 0)
8 changes: 8 additions & 0 deletions mcmetadata/test/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,13 @@ def test_default_pub_date(self):

class TestStats(unittest.TestCase):

def test_reset(self):
    """The module-level stats are non-zero after an extract and zero again after reset_stats()."""
    # No stats_accumulator is passed, so extract() falls back to mcmetadata.stats.
    extract("https://web.archive.org/web/http://entretenimento.uol.com.br/noticias/redacao/2019/08/25/sem-feige-sem-stark-o-sera-do-homem-aranha-longe-do-mcu.htm")
    assert mcmetadata.stats.get('total') > 0
    mcmetadata.reset_stats()
    assert mcmetadata.stats.get('total') == 0

def test_total_works(self):
url = "https://web.archive.org/web/http://entretenimento.uol.com.br/noticias/redacao/2019/08/25/sem-feige-sem-stark-o-sera-do-homem-aranha-longe-do-mcu.htm"
_ = extract(url)
Expand All @@ -186,6 +193,7 @@ def test_total_works(self):
assert mcmetadata.stats.get('url') < mcmetadata.stats.get('total')

def test_passed_in_accumulator(self):
mcmetadata.reset_stats()
local_stats = {s: 0 for s in mcmetadata.STAT_NAMES}
url = "https://web.archive.org/web/http://entretenimento.uol.com.br/noticias/redacao/2019/08/25/sem-feige-sem-stark-o-sera-do-homem-aranha-longe-do-mcu.htm"
_ = extract(url, stats_accumulator=local_stats)
Expand Down

0 comments on commit 680f67a

Please sign in to comment.