Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Small tweaks to handle whitespace in URLs #65

Merged
merged 1 commit into from
Nov 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion mcmetadata/test/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,12 +109,17 @@ def test_other_metadata(self):
assert results['other']['top_image_url'] == "https://im.indiatimes.in/content/2022/Aug/flag_62f496a5df908.jpg"
assert len(results['other']['authors']) == 1

def test_whitespace_removal(self):
def test_content_whitespace_removal(self):
    """Check that extracted text content has leading/trailing whitespace removed."""
    embed_url = "https://observador.vsports.pt/embd/75404/m/9812/obsrv/53a58b677b53143428e47d43d5887139?autostart=false"
    extracted = extract(embed_url, include_other_metadata=True)
    # This page is padded with tons of junk whitespace before and after the
    # real text; once that is stripped, exactly 110 characters should remain.
    text = extracted['text_content']
    assert len(text) == 110

def test_url_whitespace_removal(self):
    """Check that extract() still returns a result when the URL is wrapped in spaces."""
    padded_url = ' https://www.letras.com.br/banda-n-drive/eden '
    extracted = extract(padded_url)
    assert extracted is not None

def test_memento_without_original_url(self):
try:
url = "https://web.archive.org/web/20210412063445id_/https://ehp.niehs.nih.gov/action/doUpdateAlertSettings?action=addJournal&journalCode=ehp&referrer=/action/doSearch?ContribAuthorRaw=Davis%2C+Jacquelyn&ContentItemType=research-article&startPage=&ContribRaw=Martin%2C+Denny"
Expand Down
4 changes: 4 additions & 0 deletions mcmetadata/test/test_urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,10 @@ def test_shortened(self):
url = "https://bit.ly/my-url"
assert urls.is_homepage_url(url) is False

def test_whitespace(self):
    """Check that a URL wrapped in spaces is not reported as a homepage URL."""
    # a real URL from an RSS we fetched, surrounding spaces included
    padded_url = ' https://www.letras.com.br/banda-n-drive/eden '
    assert urls.is_homepage_url(padded_url) is False


class TestIsShortenedUrl(unittest.TestCase):

Expand Down
6 changes: 4 additions & 2 deletions mcmetadata/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,8 +271,9 @@ def normalize_url(url: str) -> Optional[str]:
]


def is_homepage_url(url: str) -> bool:
def is_homepage_url(raw_url: str) -> bool:
"""Returns true if URL is a homepage-like URL (ie. not an article)."""
url = raw_url.strip() # remove whitespace
if is_shortened_url(url): # if it is shortened then it should get a free pass because we have to resolve it later
return False
uri = furl(url)
Expand All @@ -283,8 +284,9 @@ def is_homepage_url(url: str) -> bool:
return False


def is_shortened_url(url: str) -> bool:
def is_shortened_url(raw_url: str) -> bool:
"""Returns true if URL is a shortened URL (e.g. with Bit.ly)."""
url = raw_url.strip()
uri = furl(url)
if str(uri.path) is not None and str(uri.path) in ['', '/']:
# Assume that most of the URL shorteners use something like
Expand Down