Skip to content

Commit

Permalink
remove whitespace from URLs #58
Browse files Browse the repository at this point in the history
  • Loading branch information
rahulbot committed Nov 28, 2023
1 parent 72c4493 commit 1c89285
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 3 deletions.
7 changes: 6 additions & 1 deletion mcmetadata/test/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,12 +109,17 @@ def test_other_metadata(self):
assert results['other']['top_image_url'] == "https://im.indiatimes.in/content/2022/Aug/flag_62f496a5df908.jpg"
assert len(results['other']['authors']) == 1

def test_whitespace_removal(self):
def test_content_whitespace_removal(self):
    """Check that extracted text content has its surrounding whitespace removed."""
    url = "https://observador.vsports.pt/embd/75404/m/9812/obsrv/53a58b677b53143428e47d43d5887139?autostart=false"
    extracted = extract(url, include_other_metadata=True)
    text_content = extracted['text_content']
    # this page is padded with tons of junk whitespace before and after the
    # real text; the expected length only holds once all of it is stripped
    assert len(text_content) == 110

def test_url_whitespace_removal(self):
    """Check that a URL padded with spaces still yields an extraction result."""
    padded_url = ' https://www.letras.com.br/banda-n-drive/eden '
    assert extract(padded_url) is not None

def test_memento_without_original_url(self):
try:
url = "https://web.archive.org/web/20210412063445id_/https://ehp.niehs.nih.gov/action/doUpdateAlertSettings?action=addJournal&journalCode=ehp&referrer=/action/doSearch?ContribAuthorRaw=Davis%2C+Jacquelyn&ContentItemType=research-article&startPage=&ContribRaw=Martin%2C+Denny"
Expand Down
4 changes: 4 additions & 0 deletions mcmetadata/test/test_urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,10 @@ def test_shortened(self):
url = "https://bit.ly/my-url"
assert urls.is_homepage_url(url) is False

def test_whitespace(self):
    """Check that a space-padded article URL is not reported as a homepage."""
    # a real URL from an RSS we fetched
    padded_url = ' https://www.letras.com.br/banda-n-drive/eden '
    assert urls.is_homepage_url(padded_url) is False


class TestIsShortenedUrl(unittest.TestCase):

Expand Down
6 changes: 4 additions & 2 deletions mcmetadata/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,8 +271,9 @@ def normalize_url(url: str) -> Optional[str]:
]


def is_homepage_url(url: str) -> bool:
def is_homepage_url(raw_url: str) -> bool:
"""Returns true if URL is a homepage-like URL (ie. not an article)."""
url = raw_url.strip() # remove whitespace
if is_shortened_url(url): # if it is shortened then it should get a free pass because we have to resolve it later
return False
uri = furl(url)
Expand All @@ -283,8 +284,9 @@ def is_homepage_url(url: str) -> bool:
return False


def is_shortened_url(url: str) -> bool:
def is_shortened_url(raw_url: str) -> bool:
"""Returns true if URL is a shortened URL (e.g. with Bit.ly)."""
url = raw_url.strip()
uri = furl(url)
if str(uri.path) is not None and str(uri.path) in ['', '/']:
# Assume that most of the URL shorteners use something like
Expand Down

0 comments on commit 1c89285

Please sign in to comment.