ignore ports & handle IP domains in normalize_url #72

mediacloud · Jan 17, 2024 · c9d1de7 · c9d1de7
1 parent aecc319
commit c9d1de7
Show file tree

Hide file tree

Showing 2 changed files with 31 additions and 1 deletion.
diff --git a/mcmetadata/test/test_urls.py b/mcmetadata/test/test_urls.py
@@ -116,6 +116,20 @@ def test_utm_removal(self):
         normalized_url = urls.normalize_url(url)
         assert normalized_url == "http://uniradioinforma.com/noticias/coronavirus/677354/covid-segunda-causa-de-muerte-en-mexico-en-2021.html"
 
+    def test_remove_port(self):
+        """Explicit ports are dropped, so both inputs collapse to one normalized URL."""
+        expected_url = "http://do.ma.in/what/ever"
+        urls_with_ports = (
+            "https://do.ma.in:442/what/ever",  # non-default port on https
+            "http://do.ma.in:80/what/ever",  # default port spelled out on http
+        )
+        for url in urls_with_ports:
+            assert urls.normalize_url(url) == expected_url
+
+    def test_ip_based_url(self):
+        """A URL whose host is an IPv4 address must come back from normalization unchanged."""
+        ip_url = "http://10.2.3.4/hello/world.html"
+        assert urls.normalize_url(ip_url) == ip_url
+
 
 class TestIsHomepageUrl(unittest.TestCase):
 

diff --git a/mcmetadata/urls.py b/mcmetadata/urls.py
@@ -206,6 +206,21 @@ def _remove_query_params(url: str) -> str:
 trailing_slash_url_pattern = re.compile(r'https?://[^/]*$', re.I)
 
 
+def _url_domain_is_ip_address(url: str) -> bool:
+    """
+    Report whether the host of a URL is a literal IP address rather than a domain name.
+
+    :param url: the URL to inspect
+    :return: True if the domain tldextract finds parses as an IPv4 or IPv6 address, False otherwise
+    """
+    host = tldextract.extract(url).domain
+    # An IPv6 literal in a URL is wrapped in square brackets (e.g. "[::1]"), and
+    # ipaddress.ip_address rejects the bracketed form, so unwrap it before validating.
+    if host.startswith('[') and host.endswith(']'):
+        host = host[1:-1]
+    try:
+        ipaddress.ip_address(host)
+    except ValueError:
+        return False
+    return True
+
+
+def _remove_port_from_url(url: str) -> str:
+    """
+    Return the URL as re-serialized by furl after its port has been cleared.
+
+    :param url: the URL to strip the port from
+    :return: the URL string furl produces once the port is set to None
+    """
+    parsed = furl(url)
+    # NOTE(review): clearing the port is expected to make furl omit it from the output,
+    # which is what test_remove_port relies on - confirm against the furl docs
+    parsed.port = None
+    return parsed.url
+
+
 def normalize_url(url: str) -> Optional[str]:
     """
     Support later deduplication of URLs by applying a simple set of transformations on a URL to make it match other
@@ -218,6 +233,7 @@ def normalize_url(url: str) -> Optional[str]:
     if len(url) == 0:
         return None
     url = _fix_common_url_mistakes(url)
+    url = _remove_port_from_url(url)
     if yt_generic_pattern.match(url):  # YouTube URLs video IDs are case sensitive
         url = normalize_youtube_url(url)
     else:
@@ -227,7 +243,7 @@ def normalize_url(url: str) -> Optional[str]:
     if not url.startswith('http'):
         url = 'http://' + url
     # r2.ly redirects through the hostname, ala http://543.r2.ly
-    if 'r2.ly' not in url:
+    if (not _url_domain_is_ip_address(url)) and ('r2.ly' not in url):
         url = another_url_pattern.sub(r"\1\3", url)
     # collapse the vast array of http://pronkraymond83483.podomatic.com/ urls into http://pronkpops.podomatic.com/
     url = podomatic_url_pattern.sub('http://pronkpops.podomatic.com', url)