Merge pull request #68 from mediacloud/feature-extract-stats-param
allow capturing stats from individual extract calls
rahulbot authored Dec 3, 2023
2 parents 69b8491 + 28d045d commit 44b29a8
Showing 2 changed files with 38 additions and 22 deletions.
49 changes: 27 additions & 22 deletions mcmetadata/__init__.py
@@ -22,7 +22,8 @@
 
 
 def extract(url: str, html_text: Optional[str] = None, include_other_metadata: Optional[bool] = False,
-            defaults: Mapping[str, Any] = {}, overrides: Mapping[str, Any] = {}) -> Dict:
+            defaults: Mapping[str, Any] = {}, overrides: Mapping[str, Any] = {},
+            stats_accumulator: Mapping[str, int] = stats) -> Dict:
     """
     The core method of this library - returns all the useful information extracted from the HTML of the news
     article at the supplied URL.
@@ -40,10 +41,14 @@ def extract(url: str, html_text: Optional[str] = None, include_other_metadata: Optional[bool] = False,
         Keys should be the same as the keys in the returned dictionary (supports `publication_date`,
         `article_title`, and `language`). This can be useful for speed optimizations,
         since if an override is provided then that extraction method won't be called.
+    :param stats_accumulator: (optional) A dictionary of stats to accumulate. This is useful if you want to track the
+        stats yourself _instead_ of in the module-level `stats` counter. If you pass this in then the
+        timings for the call will _not_ be added to the module-level `stats` counter. Should contain keys
+        for `STAT_NAMES` (see above).
     """
-    t0 = time.time()
+    t0 = time.monotonic()
     # first fetch the real content (if we need to)
-    t1 = time.time()
+    t1 = t0
     if html_text is None:
         raw_html, response = webpages.fetch(url)
         # check for archived URLs
@@ -59,64 +64,64 @@ def extract(url: str, html_text: Optional[str] = None, include_other_metadata: Optional[bool] = False,
     else:
         final_url = url  # trust that the user knows which URL the content actually came from
         raw_html = html_text
-    fetch_duration = time.time() - t1
-    stats['fetch'] += fetch_duration
+    fetch_duration = time.monotonic() - t1
+    stats_accumulator['fetch'] += fetch_duration
 
     # url
-    t1 = time.time()
+    t1 = time.monotonic()
     normalized_url = urls.normalize_url(final_url)
     canonical_domain = urls.canonical_domain(final_url)
     is_homepage_url = urls.is_homepage_url(url)
     is_shortened_url = urls.is_shortened_url(url)
-    url_duration = time.time() - t1
-    stats['url'] += url_duration
+    url_duration = time.monotonic() - t1
+    stats_accumulator['url'] += url_duration
 
     # pub date stuff
-    t1 = time.time()
+    t1 = time.monotonic()
     max_pub_date = dt.datetime.now() + dt.timedelta(days=+MAX_FUTURE_PUB_DATE)
     if 'publication_date' in overrides:
         pub_date = overrides['publication_date']
     else:
         default_date = defaults.get('publication_date') if defaults else None
         pub_date = dates.guess_publication_date(raw_html, final_url, max_date=max_pub_date, default_date=default_date)
-    pub_date_duration = time.time() - t1
-    stats['pub_date'] += pub_date_duration
+    pub_date_duration = time.monotonic() - t1
+    stats_accumulator['pub_date'] += pub_date_duration
 
     # content
-    t1 = time.time()
+    t1 = time.monotonic()
     if 'text_content' in overrides:
         article = dict(extraction_method = content.METHOD_OVERRIDEN,
                        text=overrides['text_content'])
     else:
         article = content.from_html(final_url, raw_html, include_other_metadata)
-    content_duration = time.time() - t1
-    stats['content'] += content_duration
+    content_duration = time.monotonic() - t1
+    stats_accumulator['content'] += content_duration
 
     # title
-    t1 = time.time()
+    t1 = time.monotonic()
     if 'article_title' in overrides:
         article_title = overrides['article_title']
     else:
         article_title = titles.from_html(raw_html, article['title'])
         if article_title is None:
             article_title = defaults.get('article_title') if defaults else None
     normalized_title = titles.normalize_title(article_title)
-    title_duration = time.time() - t1
-    stats['title'] += title_duration
+    title_duration = time.monotonic() - t1
+    stats_accumulator['title'] += title_duration
 
     # language
-    t1 = time.time()
+    t1 = time.monotonic()
     if 'language' in overrides:
         full_language = overrides['language']
     else:
         full_language = languages.from_html(raw_html, article['text'])  # could be something like "pt-br"
         if full_language is None:
             full_language = defaults.get('language') if defaults else None
-    language_duration = time.time() - t1
-    stats['language'] += language_duration
+    language_duration = time.monotonic() - t1
+    stats_accumulator['language'] += language_duration
 
-    total_duration = time.time() - t0
-    stats['total'] += total_duration
+    total_duration = time.monotonic() - t0
+    stats_accumulator['total'] += total_duration
 
     results = dict(
         original_url=url,
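Usage note (not part of the commit itself): a minimal caller-side sketch of the new stats_accumulator parameter, mirroring the added test below. The URL is a placeholder, and it assumes mcmetadata is installed.

import mcmetadata

# Seed a per-call accumulator with zeros for every stat the library tracks.
local_stats = {s: 0 for s in mcmetadata.STAT_NAMES}

# Timings for this call accumulate in local_stats instead of the shared
# module-level mcmetadata.stats counter.
results = mcmetadata.extract("https://example.com/some-article",
                             stats_accumulator=local_stats)
print(local_stats['fetch'], local_stats['content'], local_stats['total'])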
11 changes: 11 additions & 0 deletions mcmetadata/test/test_extract.py
@@ -185,6 +185,17 @@ def test_total_works(self):
             if s != 'total':  # is less than total
                 assert mcmetadata.stats.get('url') < mcmetadata.stats.get('total')
 
+    def test_passed_in_accumulator(self):
+        local_stats = {s: 0 for s in mcmetadata.STAT_NAMES}
+        url = "https://web.archive.org/web/http://entretenimento.uol.com.br/noticias/redacao/2019/08/25/sem-feige-sem-stark-o-sera-do-homem-aranha-longe-do-mcu.htm"
+        _ = extract(url, stats_accumulator=local_stats)
+        for s in mcmetadata.STAT_NAMES:  # verify global counter didn't count
+            assert s in mcmetadata.stats
+            assert mcmetadata.stats.get(s) == 0
+        for s in mcmetadata.STAT_NAMES:  # verify local counter did count
+            assert s in local_stats
+            assert local_stats.get(s) > 0
+
 
 if __name__ == "__main__":
     unittest.main()
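Related usage note: the updated docstring also describes how the pre-existing defaults and overrides parameters can skip extraction steps entirely. A sketch of that fast path under the same assumptions (placeholder URL and values):

import mcmetadata

results = mcmetadata.extract(
    url="https://example.com/article",
    html_text="<html><body><p>already-fetched content</p></body></html>",  # skips the network fetch
    overrides={"language": "en"},              # language detection won't be called
    defaults={"article_title": "Untitled"},    # fallback used only if title extraction finds nothing
)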
