From 152ff92de28e6728781a28f875a5e5935b829b18 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sat, 30 Dec 2023 02:14:12 +0200 Subject: [PATCH] refactor(misc): :ambulance: deprecated config.verbose. debug output moved to standard logging module. --- newspaper/configuration.py | 2 +- newspaper/urls.py | 40 ++++++++++++++------------------------ 2 files changed, 16 insertions(+), 26 deletions(-) diff --git a/newspaper/configuration.py b/newspaper/configuration.py index 3ad3b389..a7d56679 100644 --- a/newspaper/configuration.py +++ b/newspaper/configuration.py @@ -195,7 +195,7 @@ def __init__(self): # Number of threads to use for mthreaded downloads self.number_threads = 10 - # If true, it will output debugging information + # Deprecated, use standard python logging module instead (debug level) self.verbose = False # for debugging self.thread_timeout_seconds = 10 diff --git a/newspaper/urls.py b/newspaper/urls.py index 4e4ddb68..f7614384 100644 --- a/newspaper/urls.py +++ b/newspaper/urls.py @@ -135,13 +135,13 @@ def prepare_url(url, source_url=None): # proper_url = remove_args(url) proper_url = url except ValueError as e: - log.critical("url %s failed on err %s" % (url, str(e))) + log.error("url %s failed on err %s", url, str(e)) proper_url = "" return proper_url -def valid_url(url, verbose=False, test=False): +def valid_url(url, test=False): """ Is this URL a valid news-article url? @@ -182,16 +182,14 @@ def valid_url(url, verbose=False, test=False): # 11 chars is shortest valid url length, eg: http://x.co if url is None or len(url) < 11: - if verbose: - print("\t%s rejected because len of url is less than 11" % url) + log.debug("url %s rejected due to short length < 11", url) return False r1 = "mailto:" in url # TODO not sure if these rules are redundant r2 = ("http://" not in url) and ("https://" not in url) if r1 or r2: - if verbose: - print("\t%s rejected because len of url structure" % url) + log.debug("url %s rejected due to mailto in link or no http(s) schema", url) return False path = urlparse(url).path @@ -213,8 +211,7 @@ def valid_url(url, verbose=False, test=False): # if the file type is a media type, reject instantly if file_type and file_type not in ALLOWED_TYPES: - if verbose: - print("\t%s rejected due to bad filetype" % url) + log.debug("url %s rejected due to bad filetype (%s)", url, file_type) return False last_chunk = path_chunks[-1].split(".") @@ -234,8 +231,7 @@ def valid_url(url, verbose=False, test=False): url_slug = path_chunks[-1] if path_chunks else "" if tld in BAD_DOMAINS: - if verbose: - print("%s caught for a bad tld" % url) + log.debug("url %s rejected due to bad domain (%s)", url, tld) return False if len(path_chunks) == 0: @@ -248,46 +244,40 @@ def valid_url(url, verbose=False, test=False): if url_slug and (dash_count > 4 or underscore_count > 4): if dash_count >= underscore_count: if tld not in [x.lower() for x in url_slug.split("-")]: - if verbose: - print("%s verified for being a slug" % url) + log.debug("url %s accepted due to title slug (%s)", url, url_slug) return True if underscore_count > dash_count: if tld not in [x.lower() for x in url_slug.split("_")]: - if verbose: - print("%s verified for being a slug" % url) + log.debug("url %s accepted due to title slug (%s)", url, url_slug) return True # There must be at least 2 subpaths if len(path_chunks) <= 1: - if verbose: - print("%s caught for path chunks too small" % url) + log.debug( + "url %s rejected due to less than two path_chunks (%s)", url, path_chunks + ) return False # Check for subdomain & path red flags # Eg: http://cnn.com/careers.html or careers.cnn.com --> BAD for b in BAD_CHUNKS: if b in path_chunks or b == subd: - if verbose: - print("%s caught for bad chunks" % url) + log.debug("url %s rejected due to bad chunk (%s)", url, b) return False match_date = re.search(DATE_REGEX, url) # if we caught the verified date above, it's an article if match_date is not None: - if verbose: - print("%s verified for date" % url) + log.debug("url %s accepted for date in path", url) return True for GOOD in GOOD_PATHS: if GOOD.lower() in [p.lower() for p in path_chunks]: - if verbose: - print("%s verified for good path" % url) + log.debug("url %s accepted for good path", url) return True - - if verbose: - print("%s caught for default false" % url) + log.debug("url %s rejected for default false", url) return False