Final checks before launch
datawizard1337 committed Sep 16, 2020
1 parent aa3a6ee commit 1c6d405
Showing 21 changed files with 65 additions and 754 deletions.
29 changes: 25 additions & 4 deletions ARGUS.py
@@ -42,6 +42,7 @@
dependencies_2 = ["tkinter import filedialog", "tkinter import messagebox", "tkinter import ttk", "PIL import Image", "PIL import ImageTk", "twisted.internet.error import DNSLookupError, TimeoutError, TCPTimedOutError", "io import BytesIO"]



for library in dependencies_1:
try:
exec("import {module}".format(module=library))
@@ -233,7 +234,7 @@ def change_dropdown6(*args):


# Short URLs
tk.Label(master, text="Prefer Short URLs:", font=("Calibri", 12)).grid(row=4, column=1, sticky=tk.W)
tk.Label(master, text="Prefer Short URLs*:", font=("Calibri", 12)).grid(row=4, column=1, sticky=tk.W)

e9 = tk.Entry(master)
e9.insert(tk.END, "off") # set default option
@@ -255,7 +256,7 @@ def change_dropdown9(*args):

# Language

tk.Label(master, text="Preferred Language:", font=("Calibri", 12)).grid(row=5, column=1, sticky=tk.W)
tk.Label(master, text="Preferred Language*:", font=("Calibri", 12)).grid(row=5, column=1, sticky=tk.W)

e10 = tk.Entry(master)
e10.insert(tk.END, "None") # set default option
@@ -280,6 +281,25 @@ def change_dropdown10(*args):
tkvar10.trace('w', change_dropdown10)


+# PDF scraping
+tk.Label(master, text="Scrape PDFs*:", font=("Calibri", 12)).grid(row=6, column=1, sticky=tk.W)
+
+e15 = tk.Entry(master)
+e15.insert(tk.END, "off") # set default option
+
+tkvar15 = tk.StringVar(master)
+tkvar15.set("Select") # set the default option
+preferences = ["on", "off"]
+popupMenu15 = tk.OptionMenu(master, tkvar15, *preferences)
+popupMenu15.grid(row=6, column=1, stick=tk.E)
+popupMenu15.config(font=("Calibri", 12))
+
+def change_dropdown15(*args):
+    e15.delete(0, 'end')
+    preference = tkvar15.get()
+    e15.insert (tk.END, preference)
+
+tkvar15.trace('w', change_dropdown15)



@@ -347,6 +367,7 @@ def change_dropdown11(*args):
log_level = {}
maxsize = {}
timeout = {}
+pdfscrape = {}
"""

scrapyd_file = """
@@ -387,9 +408,9 @@ def start_scraping():
settings_txt = open(script_dir + r"\bin\settings.txt", "w", encoding="utf-8")
settings_txt.truncate()
byte_size = int(e13.get())*1000000 # convert from MB to B
-settings_txt.write(settings_file.format(e1.get(), e2.get(), e3.get(), e4.get(), e5.get(), e6.get(), e8.get(), e9.get(), e10.get(), e11.get(), byte_size, e14.get()))
+settings_txt.write(settings_file.format(e1.get(), e2.get(), e3.get(), e4.get(), e5.get(), e6.get(), e8.get(), e9.get(), e10.get(), e11.get(), byte_size, e14.get(), e15.get()))
settings_txt.close()
-scrapyd_txt = open(script_dir + r"scrapyd.conf", "w", encoding="utf-8")
+scrapyd_txt = open(script_dir + r"/scrapyd.conf", "w", encoding="utf-8")
scrapyd_txt.truncate()
scrapyd_txt.write(scrapyd_file.format(e6.get()))
scrapyd_txt.close()
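The new block follows the same tkinter pattern ARGUS.py uses for its other options: an OptionMenu selection is mirrored into an Entry, and start_scraping() later reads that Entry when writing settings.txt. Below is a minimal, standalone sketch of that wiring; the widget and callback names are illustrative, not the ones in ARGUS.py.

import tkinter as tk

master = tk.Tk()

entry = tk.Entry(master)              # holds the value the settings writer reads
entry.insert(tk.END, "off")           # default option
entry.grid(row=0, column=0)

choice = tk.StringVar(master)
choice.set("Select")                  # placeholder shown before a selection is made
menu = tk.OptionMenu(master, choice, "on", "off")
menu.grid(row=0, column=1, sticky=tk.E)

def sync_entry(*args):
    # mirror the dropdown selection into the entry
    entry.delete(0, "end")
    entry.insert(tk.END, choice.get())

choice.trace_add("write", sync_entry)  # fire whenever the selection changes
master.mainloop()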
Binary file added ARGUS/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file added ARGUS/__pycache__/items.cpython-37.pyc
Binary file not shown.
Binary file added ARGUS/__pycache__/pipelines.cpython-37.pyc
Binary file not shown.
Binary file added ARGUS/__pycache__/settings.cpython-37.pyc
Binary file not shown.
Binary file added ARGUS/spiders/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
39 changes: 30 additions & 9 deletions ARGUS/spiders/dualspider.py
@@ -33,7 +33,7 @@ class DualSpider(scrapy.Spider):
##################################################################

#load URLs from text file defined in given parameter
-def __init__(self, url_chunk="", limit=5, ID="ID", url_col="url", language="", prefer_short_urls="on", *args, **kwargs):
+def __init__(self, url_chunk="", limit=5, ID="ID", url_col="url", language="", prefer_short_urls="on", pdfscrape="off", *args, **kwargs):
super(DualSpider, self).__init__(*args, **kwargs)
#loads urls and IDs from text file
data = pd.read_csv(url_chunk, delimiter="\t", encoding="utf-8", error_bad_lines=False, engine="python")
@@ -44,6 +44,7 @@ def __init__(self, url_chunk="", limit=5, ID="ID", url_col="url", language="", p
self.url_chunk = url_chunk
self.language = language.split("_")
self.prefer_short_urls = prefer_short_urls
+self.pdfscrape = pdfscrape


##################################################################
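The new pdfscrape keyword travels the same route as the existing spider arguments: whatever is passed as a spider argument (for example through scrapyd's scheduling endpoint or a crawl() call) is forwarded to __init__ above. The following is a hedged sketch of starting the spider directly with the new option; ARGUS itself schedules jobs through scrapyd, and the chunk file path below is purely illustrative.

from scrapy.crawler import CrawlerProcess
from ARGUS.spiders.dualspider import DualSpider

process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
# Keyword arguments are forwarded to DualSpider.__init__;
# "chunks/chunk_0.txt" stands in for a tab-separated ID/url file.
process.crawl(
    DualSpider,
    url_chunk="chunks/chunk_0.txt",
    language="en_de",
    prefer_short_urls="on",
    pdfscrape="off",   # the option added in this commit
)
process.start()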
@@ -366,18 +367,29 @@ def processURLstack(self, response):
domain = self.subdomainGetter(urlstack[0])
if domain not in self.allowed_domains:
    urlstack.pop(0)
+    continue
#pop some unwanted urls
-elif urlstack[0].startswith("mail"):
+if urlstack[0].startswith("mail"):
    urlstack.pop(0)
-elif urlstack[0].startswith("tel"):
+    continue
+if urlstack[0].startswith("tel"):
    urlstack.pop(0)
+    continue
+if urlstack[0].startswith("javascript"):
+    urlstack.pop(0)
+    continue
#pop unwanted filetypes
-elif urlstack[0].split(".")[-1].lower() in self.filetypes:
+if urlstack[0].split(".")[-1].lower() in self.filetypes:
    urlstack.pop(0)
+    continue
+if self.pdfscrape == "off":
+    if urlstack[0].split(".")[-1].lower() == "pdf":
+        urlstack.pop(0)
+        continue
#pop visited urls.
#also pop urls that cannot be requested
#(potential bottleneck: Request has to be sent to generate fingerprint from)
-elif request_fingerprint(scrapy.Request(urlstack[0], callback=None)) in fingerprints:
+if request_fingerprint(scrapy.Request(urlstack[0], callback=None)) in fingerprints:
    urlstack.pop(0)
else:
    break
@@ -416,18 +428,25 @@ def processURLstack(self, response):
domain = self.subdomainGetter(urlstack[0])
if domain not in self.allowed_domains:
    urlstack.pop(0)
+    continue
#pop some unwanted urls
-elif urlstack[0].startswith("mail"):
+if urlstack[0].startswith("mail"):
    urlstack.pop(0)
-elif urlstack[0].startswith("tel"):
+    continue
+if urlstack[0].startswith("tel"):
    urlstack.pop(0)
+    continue
+if urlstack[0].startswith("javascript"):
+    urlstack.pop(0)
+    continue
#pop unwanted filetypes
-elif urlstack[0].split(".")[-1].lower() in self.filetypes:
+if urlstack[0].split(".")[-1].lower() in self.filetypes:
    urlstack.pop(0)
+    continue
#pop visited urls.
#also pop urls that cannot be requested
#(potential bottleneck: Request has to be sent to generate fingerprint from)
-elif request_fingerprint(scrapy.Request(urlstack[0], callback=None)) in fingerprints:
+if request_fingerprint(scrapy.Request(urlstack[0], callback=None)) in fingerprints:
    urlstack.pop(0)
else:
    break
@@ -446,9 +465,11 @@ def processURLstack(self, response):

#if there are no urls left in the urlstack, the website was scraped completely and the item can be sent to the pipeline
else:
print("ITEM TYPE: ", type(loader))
yield loader.load_item()



##################################################################
# PARSE SUB PAGE
##################################################################
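Taken together, the hunks above replace the single elif chain with independent if/continue checks and add the PDF gate, so each unwanted URL is popped off the head of the stack and the loop restarts cleanly. The following self-contained sketch mirrors that logic; next_scrapable_url and FILETYPES are illustrative stand-ins, not names from the spider.

FILETYPES = {"jpg", "png", "zip", "mp4"}   # stand-in for self.filetypes

def next_scrapable_url(urlstack, pdfscrape="off"):
    # Pop filtered URLs until one survives, mirroring the
    # if/continue chain in processURLstack.
    while urlstack:
        url = urlstack[0]
        if url.startswith(("mail", "tel", "javascript")):
            urlstack.pop(0)
            continue
        ext = url.split(".")[-1].lower()
        if ext in FILETYPES:
            urlstack.pop(0)
            continue
        if pdfscrape == "off" and ext == "pdf":
            urlstack.pop(0)
            continue
        return urlstack.pop(0)
    return None

stack = ["mailto:info@example.com",
         "https://example.com/report.pdf",
         "https://example.com/about"]
print(next_scrapable_url(list(stack), pdfscrape="off"))  # https://example.com/about
print(next_scrapable_url(list(stack), pdfscrape="on"))   # https://example.com/report.pdf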