From 152ff92de28e6728781a28f875a5e5935b829b18 Mon Sep 17 00:00:00 2001
From: Andrei <andrei@thephpfactory.com>
Date: Sat, 30 Dec 2023 02:14:12 +0200
Subject: [PATCH] refactor(misc): :ambulance: deprecate config.verbose; move
 debug output to the standard logging module.

---
 newspaper/configuration.py |  2 +-
 newspaper/urls.py          | 40 ++++++++++++++------------------------
 2 files changed, 16 insertions(+), 26 deletions(-)

diff --git a/newspaper/configuration.py b/newspaper/configuration.py
index 3ad3b389..a7d56679 100644
--- a/newspaper/configuration.py
+++ b/newspaper/configuration.py
@@ -195,7 +195,7 @@ def __init__(self):
         # Number of threads to use for mthreaded downloads
         self.number_threads = 10
 
-        # If true, it will output debugging information
+        # Deprecated: use the standard Python logging module instead (DEBUG level)
         self.verbose = False  # for debugging
 
         self.thread_timeout_seconds = 10
diff --git a/newspaper/urls.py b/newspaper/urls.py
index 4e4ddb68..f7614384 100644
--- a/newspaper/urls.py
+++ b/newspaper/urls.py
@@ -135,13 +135,13 @@ def prepare_url(url, source_url=None):
             # proper_url = remove_args(url)
             proper_url = url
     except ValueError as e:
-        log.critical("url %s failed on err %s" % (url, str(e)))
+        log.error("url %s failed on err %s", url, str(e))
         proper_url = ""
 
     return proper_url
 
 
-def valid_url(url, verbose=False, test=False):
+def valid_url(url, test=False):
     """
     Is this URL a valid news-article url?
 
@@ -182,16 +182,14 @@ def valid_url(url, verbose=False, test=False):
 
     # 11 chars is shortest valid url length, eg: http://x.co
     if url is None or len(url) < 11:
-        if verbose:
-            print("\t%s rejected because len of url is less than 11" % url)
+        log.debug("url %s rejected due to short length < 11", url)
         return False
 
     r1 = "mailto:" in url  # TODO not sure if these rules are redundant
     r2 = ("http://" not in url) and ("https://" not in url)
 
     if r1 or r2:
-        if verbose:
-            print("\t%s rejected because len of url structure" % url)
+        log.debug("url %s rejected due to mailto in link or no http(s) schema", url)
         return False
 
     path = urlparse(url).path
@@ -213,8 +211,7 @@ def valid_url(url, verbose=False, test=False):
 
         # if the file type is a media type, reject instantly
         if file_type and file_type not in ALLOWED_TYPES:
-            if verbose:
-                print("\t%s rejected due to bad filetype" % url)
+            log.debug("url %s rejected due to bad filetype (%s)", url, file_type)
             return False
 
         last_chunk = path_chunks[-1].split(".")
@@ -234,8 +231,7 @@ def valid_url(url, verbose=False, test=False):
     url_slug = path_chunks[-1] if path_chunks else ""
 
     if tld in BAD_DOMAINS:
-        if verbose:
-            print("%s caught for a bad tld" % url)
+        log.debug("url %s rejected due to bad domain (%s)", url, tld)
         return False
 
     if len(path_chunks) == 0:
@@ -248,46 +244,40 @@ def valid_url(url, verbose=False, test=False):
     if url_slug and (dash_count > 4 or underscore_count > 4):
         if dash_count >= underscore_count:
             if tld not in [x.lower() for x in url_slug.split("-")]:
-                if verbose:
-                    print("%s verified for being a slug" % url)
+                log.debug("url %s accepted due to title slug (%s)", url, url_slug)
                 return True
 
         if underscore_count > dash_count:
             if tld not in [x.lower() for x in url_slug.split("_")]:
-                if verbose:
-                    print("%s verified for being a slug" % url)
+                log.debug("url %s accepted due to title slug (%s)", url, url_slug)
                 return True
 
     # There must be at least 2 subpaths
     if len(path_chunks) <= 1:
-        if verbose:
-            print("%s caught for path chunks too small" % url)
+        log.debug(
+            "url %s rejected due to less than two path_chunks (%s)", url, path_chunks
+        )
         return False
 
     # Check for subdomain & path red flags
     # Eg: http://cnn.com/careers.html or careers.cnn.com --> BAD
     for b in BAD_CHUNKS:
         if b in path_chunks or b == subd:
-            if verbose:
-                print("%s caught for bad chunks" % url)
+            log.debug("url %s rejected due to bad chunk (%s)", url, b)
             return False
 
     match_date = re.search(DATE_REGEX, url)
 
     # if we caught the verified date above, it's an article
     if match_date is not None:
-        if verbose:
-            print("%s verified for date" % url)
+        log.debug("url %s accepted for date in path", url)
         return True
 
     for GOOD in GOOD_PATHS:
         if GOOD.lower() in [p.lower() for p in path_chunks]:
-            if verbose:
-                print("%s verified for good path" % url)
+            log.debug("url %s accepted for good path", url)
             return True
-
-    if verbose:
-        print("%s caught for default false" % url)
+    log.debug("url %s rejected for default false", url)
     return False