From 3f4c348bc3714a1b7d80b90096316e6e47ed2180 Mon Sep 17 00:00:00 2001
From: Rahul Bhargava
Date: Tue, 12 Dec 2023 21:48:12 -0500
Subject: [PATCH] add new unique_url_hash helper to centralize logic

---
 mcmetadata/test/test_urls.py | 26 +++++++++++++++++++++++++-
 mcmetadata/urls.py           | 11 +++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/mcmetadata/test/test_urls.py b/mcmetadata/test/test_urls.py
index a3dea3b..7b42c3d 100644
--- a/mcmetadata/test/test_urls.py
+++ b/mcmetadata/test/test_urls.py
@@ -1,4 +1,5 @@
 import unittest
+import hashlib
 from parameterized import parameterized
 import time
 
@@ -47,7 +48,7 @@ def test_ref_removal(self):
         normalized_url = urls.normalize_url(url)
         assert normalized_url == normalized
         url = "https://upstract.com/x/fdf95bf448e1f2a8?ref=rss"
-        normalized_url = urls.normalize_url(normalized)
+        normalized_url = urls.normalize_url(url)
         assert normalized_url == normalized
 
     def test_reuters_example(self):
@@ -181,5 +182,28 @@ def test_inclusion(self):
         assert 'cnn.com' not in urls.NON_NEWS_DOMAINS
 
+
+class TestUniqueUrlHash(unittest.TestCase):
+
+    def test_basic_uniqueness(self):
+        url1 = "https://www.cnn.com/2023/12/12/politics/takeaways-volodymyr-zelensky-washington/index.html"
+        url2 = "https://www.cnn.com/2023/12/12/entertainment/andre-braugher-obit/index.html"
+        assert urls.unique_url_hash(url1) != urls.unique_url_hash(url2)
+
+    def test_already_normalized_url(self):
+        test_url = "http://thekenyatimes.com/counties/kisumu-traders-alerted-about-cartels-in-stalls-issuance/"
+        expected_hash = hashlib.sha256(test_url.encode('utf-8')).hexdigest()
+        assert urls.unique_url_hash(test_url) == expected_hash
+
+    def test_normalizing(self):
+        bad_hash = hashlib.sha256("https://upstract.com/x/fdf95bf448e1f2a8?ref=rss".encode('utf-8')).hexdigest()
+        good_hash = hashlib.sha256(urls.normalize_url("https://upstract.com/x/fdf95bf448e1f2a8").encode('utf-8'))\
+            .hexdigest()
+        assert good_hash != bad_hash
+        test_url = "https://upstract.com/x/fdf95bf448e1f2a8?ref=rss"
+        unique_hash = urls.unique_url_hash(test_url)
+        assert unique_hash == good_hash
+        assert unique_hash != bad_hash
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/mcmetadata/urls.py b/mcmetadata/urls.py
index 5789d65..0b5c835 100644
--- a/mcmetadata/urls.py
+++ b/mcmetadata/urls.py
@@ -6,6 +6,7 @@
 from furl import furl
 import ipaddress
 import pathlib
+import hashlib
 
 from .urlshortners import URL_SHORTENER_HOSTNAMES
 
@@ -303,3 +304,13 @@ def is_shortened_url(raw_url: str) -> bool:
             return True
 
     return False
+
+
+def unique_url_hash(url: str) -> str:
+    """
+    A central method to determine how to generate a unique hash used to check for duplicate articles across our
+    system. Do not fiddle with this unless you _really_ know what you are doing.
+    """
+    normalized_url = normalize_url(url)
+    hashed_url = hashlib.sha256(normalized_url.encode("utf-8")).hexdigest()
+    return hashed_url