Skip to content

Commit

Permalink
ignore ports & handle IP domains in normalize_url #72
Browse files Browse the repository at this point in the history
  • Loading branch information
rahulbot committed Jan 17, 2024
1 parent aecc319 commit c9d1de7
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 1 deletion.
14 changes: 14 additions & 0 deletions mcmetadata/test/test_urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,20 @@ def test_utm_removal(self):
normalized_url = urls.normalize_url(url)
assert normalized_url == "http://uniradioinforma.com/noticias/coronavirus/677354/covid-segunda-causa-de-muerte-en-mexico-en-2021.html"

def test_remove_port(self):
    # Both an unusual port and an explicit default port should vanish from the normalized form.
    expected_url = "http://do.ma.in/what/ever"
    urls_with_ports = (
        "https://do.ma.in:442/what/ever",
        "http://do.ma.in:80/what/ever",
    )
    for url_with_port in urls_with_ports:
        assert urls.normalize_url(url_with_port) == expected_url

def test_ip_based_url(self):
    # A URL whose host is a bare IPv4 address should come back from normalization unchanged.
    ip_url = "http://10.2.3.4/hello/world.html"
    assert urls.normalize_url(ip_url) == ip_url


class TestIsHomepageUrl(unittest.TestCase):

Expand Down
18 changes: 17 additions & 1 deletion mcmetadata/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,21 @@ def _remove_query_params(url: str) -> str:
trailing_slash_url_pattern = re.compile(r'https?://[^/]*$', re.I)


def _url_domain_is_ip_address(url: str) -> bool:
    """
    Report whether the host of a URL is a literal IP address rather than a domain name.

    :param url: the URL to inspect
    :return: True if the `domain` part that tldextract pulls out of the URL parses as an IPv4 or IPv6 address,
        False otherwise
    """
    parsed_domain = tldextract.extract(url)
    # IPv6 hosts are written inside square brackets in URLs (http://[::1]/), but ipaddress.ip_address() rejects
    # the bracketed form, so drop any surrounding brackets before trying to parse.
    # NOTE(review): assumes tldextract keeps the brackets on `domain` for IPv6 hosts - confirm against the
    # tldextract version this project pins.
    candidate = parsed_domain.domain.strip('[]')
    try:
        ipaddress.ip_address(candidate)
    except ValueError:
        return False
    return True


def _remove_port_from_url(url: str) -> str:
    """
    Return the URL with its port cleared, by resetting the `port` attribute on a furl parse of it.

    :param url: the URL to strip the port from
    :return: the URL string that furl rebuilds once the port has been set to None
    """
    parsed = furl(url)
    parsed.port = None
    return parsed.url


def normalize_url(url: str) -> Optional[str]:
"""
Support later deduplication of URLs by applying a simple set of transformations on a URL to make it match other
Expand All @@ -218,6 +233,7 @@ def normalize_url(url: str) -> Optional[str]:
if len(url) == 0:
return None
url = _fix_common_url_mistakes(url)
url = _remove_port_from_url(url)
if yt_generic_pattern.match(url): # YouTube URLs video IDs are case sensitive
url = normalize_youtube_url(url)
else:
Expand All @@ -227,7 +243,7 @@ def normalize_url(url: str) -> Optional[str]:
if not url.startswith('http'):
url = 'http://' + url
# r2.ly redirects through the hostname, ala http://543.r2.ly
if 'r2.ly' not in url:
if (not _url_domain_is_ip_address(url)) and ('r2.ly' not in url):
url = another_url_pattern.sub(r"\1\3", url)
# collapse the vast array of http://pronkraymond83483.podomatic.com/ urls into http://pronkpops.podomatic.com/
url = podomatic_url_pattern.sub('http://pronkpops.podomatic.com', url)
Expand Down

0 comments on commit c9d1de7

Please sign in to comment.