Commit
add new unique_url_hash helper to centralize logic
rahulbot committed Dec 13, 2023
1 parent 82ede7a commit 3f4c348
Showing 2 changed files with 36 additions and 1 deletion.
26 changes: 25 additions & 1 deletion mcmetadata/test/test_urls.py
@@ -1,4 +1,5 @@
 import unittest
+import hashlib
 from parameterized import parameterized
 import time

@@ -47,7 +48,7 @@ def test_ref_removal(self):
         normalized_url = urls.normalize_url(url)
         assert normalized_url == normalized
         url = "https://upstract.com/x/fdf95bf448e1f2a8?ref=rss"
-        normalized_url = urls.normalize_url(normalized)
+        normalized_url = urls.normalize_url(url)
         assert normalized_url == normalized

     def test_reuters_example(self):
@@ -181,5 +182,28 @@ def test_inclusion(self):
         assert 'cnn.com' not in urls.NON_NEWS_DOMAINS


+class TestUniqueUrlHash(unittest.TestCase):
+
+    def test_basic_uniqueness(self):
+        url1 = "https://www.cnn.com/2023/12/12/politics/takeaways-volodymyr-zelensky-washington/index.html"
+        url2 = "https://www.cnn.com/2023/12/12/entertainment/andre-braugher-obit/index.html"
+        assert urls.unique_url_hash(url1) != urls.unique_url_hash(url2)
+
+    def test_already_normalized_url(self):
+        test_url = "http://thekenyatimes.com/counties/kisumu-traders-alerted-about-cartels-in-stalls-issuance/"
+        expected_hash = hashlib.sha256(test_url.encode('utf-8')).hexdigest()
+        assert urls.unique_url_hash(test_url) == expected_hash
+
+    def test_normalizing(self):
+        bad_hash = hashlib.sha256("https://upstract.com/x/fdf95bf448e1f2a8?ref=rss".encode('utf-8')).hexdigest()
+        good_hash = hashlib.sha256(urls.normalize_url("https://upstract.com/x/fdf95bf448e1f2a8").encode('utf-8'))\
+            .hexdigest()
+        assert good_hash != bad_hash
+        test_url = "https://upstract.com/x/fdf95bf448e1f2a8?ref=rss"
+        unique_hash = urls.unique_url_hash(test_url)
+        assert unique_hash == good_hash
+        assert unique_hash != bad_hash
+
+
 if __name__ == "__main__":
     unittest.main()
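
For reference, a minimal sketch of the behavior test_normalizing pins down, assuming mcmetadata is installed and exposes its urls module as the tests above import it: two variants of the same story URL, one carrying a ?ref=rss tracking parameter, normalize to the same canonical form and therefore share a single hash.

from mcmetadata import urls

raw = "https://upstract.com/x/fdf95bf448e1f2a8?ref=rss"
clean = "https://upstract.com/x/fdf95bf448e1f2a8"

# normalize_url strips the tracking parameter, so both variants collapse
# to the same canonical URL...
assert urls.normalize_url(raw) == urls.normalize_url(clean)
# ...and unique_url_hash therefore yields the same 64-character hex digest.
assert urls.unique_url_hash(raw) == urls.unique_url_hash(clean)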
11 changes: 11 additions & 0 deletions mcmetadata/urls.py
@@ -6,6 +6,7 @@
 from furl import furl
 import ipaddress
 import pathlib
+import hashlib

 from .urlshortners import URL_SHORTENER_HOSTNAMES

@@ -303,3 +304,13 @@ def is_shortened_url(raw_url: str) -> bool:
         return True

     return False
+
+
+def unique_url_hash(url: str) -> str:
+    """
+    A central method to determine how to generate a unique hash used to check for duplicate articles across our
+    system. Do not fiddle with this unless you _really_ know what you are doing.
+    """
+    normalized_url = normalize_url(url)
+    hashed_url = hashlib.sha256(normalized_url.encode("utf-8")).hexdigest()
+    return hashed_url
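
A minimal sketch of how a caller might use the new helper to de-duplicate incoming articles; the is_new_article function and the in-memory seen_hashes set are hypothetical stand-ins for whatever store the wider system actually uses.

from mcmetadata import urls

seen_hashes = set()  # hypothetical in-memory store; a real system would persist this

def is_new_article(url: str) -> bool:
    """Return True only the first time a URL (after normalization) is seen."""
    url_hash = urls.unique_url_hash(url)
    if url_hash in seen_hashes:
        return False
    seen_hashes.add(url_hash)
    return True

assert is_new_article("https://upstract.com/x/fdf95bf448e1f2a8")
# The same story with a tracking parameter appended hashes identically,
# so it is flagged as a duplicate.
assert not is_new_article("https://upstract.com/x/fdf95bf448e1f2a8?ref=rss")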
