Skip to content

Commit

Permalink
refactor(misc): 🚑 deprecated config.verbose. debug output moved to st…
Browse files Browse the repository at this point in the history
…andard logging module.
  • Loading branch information
AndyTheFactory committed Dec 30, 2023
1 parent 8ca3d40 commit 152ff92
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 26 deletions.
2 changes: 1 addition & 1 deletion newspaper/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ def __init__(self):
# Number of threads to use for mthreaded downloads
self.number_threads = 10

# If true, it will output debugging information
# Deprecated, use standard python logging module instead (debug level)
self.verbose = False # for debugging

self.thread_timeout_seconds = 10
Expand Down
40 changes: 15 additions & 25 deletions newspaper/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,13 +135,13 @@ def prepare_url(url, source_url=None):
# proper_url = remove_args(url)
proper_url = url
except ValueError as e:
log.critical("url %s failed on err %s" % (url, str(e)))
log.error("url %s failed on err %s", url, str(e))
proper_url = ""

return proper_url


def valid_url(url, verbose=False, test=False):
def valid_url(url, test=False):
"""
Is this URL a valid news-article url?
Expand Down Expand Up @@ -182,16 +182,14 @@ def valid_url(url, verbose=False, test=False):

# 11 chars is shortest valid url length, eg: http://x.co
if url is None or len(url) < 11:
if verbose:
print("\t%s rejected because len of url is less than 11" % url)
log.debug("url %s rejected due to short length < 11", url)
return False

r1 = "mailto:" in url # TODO not sure if these rules are redundant
r2 = ("http://" not in url) and ("https://" not in url)

if r1 or r2:
if verbose:
print("\t%s rejected because len of url structure" % url)
log.debug("url %s rejected due to mailto in link or no http(s) schema", url)
return False

path = urlparse(url).path
Expand All @@ -213,8 +211,7 @@ def valid_url(url, verbose=False, test=False):

# if the file type is a media type, reject instantly
if file_type and file_type not in ALLOWED_TYPES:
if verbose:
print("\t%s rejected due to bad filetype" % url)
log.debug("url %s rejected due to bad filetype (%s)", url, file_type)
return False

last_chunk = path_chunks[-1].split(".")
Expand All @@ -234,8 +231,7 @@ def valid_url(url, verbose=False, test=False):
url_slug = path_chunks[-1] if path_chunks else ""

if tld in BAD_DOMAINS:
if verbose:
print("%s caught for a bad tld" % url)
log.debug("url %s rejected due to bad domain (%s)", url, tld)
return False

if len(path_chunks) == 0:
Expand All @@ -248,46 +244,40 @@ def valid_url(url, verbose=False, test=False):
if url_slug and (dash_count > 4 or underscore_count > 4):
if dash_count >= underscore_count:
if tld not in [x.lower() for x in url_slug.split("-")]:
if verbose:
print("%s verified for being a slug" % url)
log.debug("url %s accepted due to title slug (%s)", url, url_slug)
return True

if underscore_count > dash_count:
if tld not in [x.lower() for x in url_slug.split("_")]:
if verbose:
print("%s verified for being a slug" % url)
log.debug("url %s accepted due to title slug (%s)", url, url_slug)
return True

# There must be at least 2 subpaths
if len(path_chunks) <= 1:
if verbose:
print("%s caught for path chunks too small" % url)
log.debug(
"url %s rejected due to less than two path_chunks (%s)", url, path_chunks
)
return False

# Check for subdomain & path red flags
# Eg: http://cnn.com/careers.html or careers.cnn.com --> BAD
for b in BAD_CHUNKS:
if b in path_chunks or b == subd:
if verbose:
print("%s caught for bad chunks" % url)
log.debug("url %s rejected due to bad chunk (%s)", url, b)
return False

match_date = re.search(DATE_REGEX, url)

# if we caught the verified date above, it's an article
if match_date is not None:
if verbose:
print("%s verified for date" % url)
log.debug("url %s accepted for date in path", url)
return True

for GOOD in GOOD_PATHS:
if GOOD.lower() in [p.lower() for p in path_chunks]:
if verbose:
print("%s verified for good path" % url)
log.debug("url %s accepted for good path", url)
return True

if verbose:
print("%s caught for default false" % url)
log.debug("url %s rejected for default false", url)
return False


Expand Down

0 comments on commit 152ff92

Please sign in to comment.