mediacloud · rahulbot · Nov 30, 2023 · Nov 28, 2023
diff --git a/mcmetadata/test/test_extract.py b/mcmetadata/test/test_extract.py
@@ -109,12 +109,17 @@ def test_other_metadata(self):
         assert results['other']['top_image_url'] == "https://im.indiatimes.in/content/2022/Aug/flag_62f496a5df908.jpg"
         assert len(results['other']['authors']) == 1
 
-    def test_whitespace_removal(self):
+    def test_content_whitespace_removal(self):
         url = "https://observador.vsports.pt/embd/75404/m/9812/obsrv/53a58b677b53143428e47d43d5887139?autostart=false"
         results = extract(url, include_other_metadata=True)
         # the point here is that it removes all pre and post whitespace - tons of junk
         assert len(results['text_content']) == 110
 
+    def test_url_whitespace_removal(self):
+        url = ' https://www.letras.com.br/banda-n-drive/eden '
+        results = extract(url)
+        assert results is not None
+
     def test_memento_without_original_url(self):
         try:
             url = "https://web.archive.org/web/20210412063445id_/https://ehp.niehs.nih.gov/action/doUpdateAlertSettings?action=addJournal&journalCode=ehp&referrer=/action/doSearch?ContribAuthorRaw=Davis%2C+Jacquelyn&ContentItemType=research-article&startPage=&ContribRaw=Martin%2C+Denny"

diff --git a/mcmetadata/test/test_urls.py b/mcmetadata/test/test_urls.py
@@ -139,6 +139,10 @@ def test_shortened(self):
         url = "https://bit.ly/my-url"
         assert urls.is_homepage_url(url) is False
 
+    def test_whitespace(self):
+        url = ' https://www.letras.com.br/banda-n-drive/eden '  # a real URL from an RSS feed we fetched
+        assert urls.is_homepage_url(url) is False
+
 
 class TestIsShortenedUrl(unittest.TestCase):
 

diff --git a/mcmetadata/urls.py b/mcmetadata/urls.py
@@ -271,8 +271,9 @@ def normalize_url(url: str) -> Optional[str]:
 ]
 
 
-def is_homepage_url(url: str) -> bool:
+def is_homepage_url(raw_url: str) -> bool:
     """Returns true if URL is a homepage-like URL (ie. not an article)."""
+    url = raw_url.strip()  # remove leading/trailing whitespace
     if is_shortened_url(url):  # if it is shortened than it should get a free pass becasue we have to resolve it later
         return False
     uri = furl(url)
@@ -283,8 +284,9 @@ def is_homepage_url(url: str) -> bool:
     return False
 
 
-def is_shortened_url(url: str) -> bool:
+def is_shortened_url(raw_url: str) -> bool:
     """Returns true if URL is a shortened URL (e.g. with Bit.ly)."""
+    url = raw_url.strip()
     uri = furl(url)
     if str(uri.path) is not None and str(uri.path) in ['', '/']:
         # Assume that most of the URL shorteners use something like