Final checks before launch
datawizard1337 committed Sep 16, 2020
1 parent aa3a6ee commit 1c6d405
Showing 21 changed files with 65 additions and 754 deletions.
29 changes: 25 additions & 4 deletions ARGUS.py
@@ -42,6 +42,7 @@
dependencies_2 = ["tkinter import filedialog", "tkinter import messagebox", "tkinter import ttk", "PIL import Image", "PIL import ImageTk", "twisted.internet.error import DNSLookupError, TimeoutError, TCPTimedOutError", "io import BytesIO"]



for library in dependencies_1:
try:
exec("import {module}".format(module=library))
@@ -233,7 +234,7 @@ def change_dropdown6(*args):


# Short URLs
tk.Label(master, text="Prefer Short URLs:", font=("Calibri", 12)).grid(row=4, column=1, sticky=tk.W)
tk.Label(master, text="Prefer Short URLs*:", font=("Calibri", 12)).grid(row=4, column=1, sticky=tk.W)

e9 = tk.Entry(master)
e9.insert(tk.END, "off") # set default option
@@ -255,7 +256,7 @@ def change_dropdown9(*args):

# Language

tk.Label(master, text="Preferred Language:", font=("Calibri", 12)).grid(row=5, column=1, sticky=tk.W)
tk.Label(master, text="Preferred Language*:", font=("Calibri", 12)).grid(row=5, column=1, sticky=tk.W)

e10 = tk.Entry(master)
e10.insert(tk.END, "None") # set default option
@@ -280,6 +281,25 @@ def change_dropdown10(*args):
tkvar10.trace('w', change_dropdown10)


+# PDF scraping
+tk.Label(master, text="Scrape PDFs*:", font=("Calibri", 12)).grid(row=6, column=1, sticky=tk.W)
+
+e15 = tk.Entry(master)
+e15.insert(tk.END, "off") # set default option
+
+tkvar15 = tk.StringVar(master)
+tkvar15.set("Select") # set the default option
+preferences = ["on", "off"]
+popupMenu15 = tk.OptionMenu(master, tkvar15, *preferences)
+popupMenu15.grid(row=6, column=1, stick=tk.E)
+popupMenu15.config(font=("Calibri", 12))
+
+def change_dropdown15(*args):
+    e15.delete(0, 'end')
+    preference = tkvar15.get()
+    e15.insert (tk.END, preference)
+
+tkvar15.trace('w', change_dropdown15)



@@ -347,6 +367,7 @@ def change_dropdown11(*args):
log_level = {}
maxsize = {}
timeout = {}
+pdfscrape = {}
"""

scrapyd_file = """
@@ -387,9 +408,9 @@ def start_scraping():
settings_txt = open(script_dir + r"\bin\settings.txt", "w", encoding="utf-8")
settings_txt.truncate()
byte_size = int(e13.get())*1000000 # convert from MB to B
-settings_txt.write(settings_file.format(e1.get(), e2.get(), e3.get(), e4.get(), e5.get(), e6.get(), e8.get(), e9.get(), e10.get(), e11.get(), byte_size, e14.get()))
+settings_txt.write(settings_file.format(e1.get(), e2.get(), e3.get(), e4.get(), e5.get(), e6.get(), e8.get(), e9.get(), e10.get(), e11.get(), byte_size, e14.get(), e15.get()))
settings_txt.close()
-scrapyd_txt = open(script_dir + r"scrapyd.conf", "w", encoding="utf-8")
+scrapyd_txt = open(script_dir + r"/scrapyd.conf", "w", encoding="utf-8")
scrapyd_txt.truncate()
scrapyd_txt.write(scrapyd_file.format(e6.get()))
scrapyd_txt.close()
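The new block follows the same tkinter pattern ARGUS.py uses for its other options: an OptionMenu selection is mirrored into an Entry, and start_scraping() later reads that Entry when writing settings.txt. Below is a minimal, standalone sketch of that wiring; the widget and callback names are illustrative, not the ones in ARGUS.py.

import tkinter as tk

master = tk.Tk()

entry = tk.Entry(master)              # holds the value the settings writer reads
entry.insert(tk.END, "off")           # default option
entry.grid(row=0, column=0)

choice = tk.StringVar(master)
choice.set("Select")                  # placeholder shown before a selection is made
menu = tk.OptionMenu(master, choice, "on", "off")
menu.grid(row=0, column=1, sticky=tk.E)

def sync_entry(*args):
    # mirror the dropdown selection into the entry
    entry.delete(0, "end")
    entry.insert(tk.END, choice.get())

choice.trace_add("write", sync_entry)  # fire whenever the selection changes
master.mainloop()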
Binary file added ARGUS/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file added ARGUS/__pycache__/items.cpython-37.pyc
Binary file not shown.
Binary file added ARGUS/__pycache__/pipelines.cpython-37.pyc
Binary file not shown.
Binary file added ARGUS/__pycache__/settings.cpython-37.pyc
Binary file not shown.
Binary file added ARGUS/spiders/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
39 changes: 30 additions & 9 deletions ARGUS/spiders/dualspider.py
@@ -33,7 +33,7 @@ class DualSpider(scrapy.Spider):
##################################################################

#load URLs from text file defined in given parameter
-def __init__(self, url_chunk="", limit=5, ID="ID", url_col="url", language="", prefer_short_urls="on", *args, **kwargs):
+def __init__(self, url_chunk="", limit=5, ID="ID", url_col="url", language="", prefer_short_urls="on", pdfscrape="off", *args, **kwargs):
super(DualSpider, self).__init__(*args, **kwargs)
#loads urls and IDs from text file
data = pd.read_csv(url_chunk, delimiter="\t", encoding="utf-8", error_bad_lines=False, engine="python")
@@ -44,6 +44,7 @@ def __init__(self, url_chunk="", limit=5, ID="ID", url_col="url", language="", p
self.url_chunk = url_chunk
self.language = language.split("_")
self.prefer_short_urls = prefer_short_urls
+self.pdfscrape = pdfscrape


##################################################################
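The new pdfscrape keyword travels the same route as the existing spider arguments: whatever is passed as a spider argument (for example through scrapyd's scheduling endpoint or a crawl() call) is forwarded to __init__ above. The following is a hedged sketch of starting the spider directly with the new option; ARGUS itself schedules jobs through scrapyd, and the chunk file path below is purely illustrative.

from scrapy.crawler import CrawlerProcess
from ARGUS.spiders.dualspider import DualSpider

process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
# Keyword arguments are forwarded to DualSpider.__init__;
# "chunks/chunk_0.txt" stands in for a tab-separated ID/url file.
process.crawl(
    DualSpider,
    url_chunk="chunks/chunk_0.txt",
    language="en_de",
    prefer_short_urls="on",
    pdfscrape="off",   # the option added in this commit
)
process.start()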
@@ -366,18 +367,29 @@ def processURLstack(self, response):
domain = self.subdomainGetter(urlstack[0])
if domain not in self.allowed_domains:
    urlstack.pop(0)
+    continue
#pop some unwanted urls
-elif urlstack[0].startswith("mail"):
+if urlstack[0].startswith("mail"):
    urlstack.pop(0)
-elif urlstack[0].startswith("tel"):
+    continue
+if urlstack[0].startswith("tel"):
    urlstack.pop(0)
+    continue
+if urlstack[0].startswith("javascript"):
+    urlstack.pop(0)
+    continue
#pop unwanted filetypes
-elif urlstack[0].split(".")[-1].lower() in self.filetypes:
+if urlstack[0].split(".")[-1].lower() in self.filetypes:
    urlstack.pop(0)
+    continue
+if self.pdfscrape == "off":
+    if urlstack[0].split(".")[-1].lower() == "pdf":
+        urlstack.pop(0)
+        continue
#pop visited urls.
#also pop urls that cannot be requested
#(potential bottleneck: Request has to be sent to generate fingerprint from)
-elif request_fingerprint(scrapy.Request(urlstack[0], callback=None)) in fingerprints:
+if request_fingerprint(scrapy.Request(urlstack[0], callback=None)) in fingerprints:
    urlstack.pop(0)
else:
    break
@@ -416,18 +428,25 @@ def processURLstack(self, response):
domain = self.subdomainGetter(urlstack[0])
if domain not in self.allowed_domains:
    urlstack.pop(0)
+    continue
#pop some unwanted urls
-elif urlstack[0].startswith("mail"):
+if urlstack[0].startswith("mail"):
    urlstack.pop(0)
-elif urlstack[0].startswith("tel"):
+    continue
+if urlstack[0].startswith("tel"):
    urlstack.pop(0)
+    continue
+if urlstack[0].startswith("javascript"):
+    urlstack.pop(0)
+    continue
#pop unwanted filetypes
-elif urlstack[0].split(".")[-1].lower() in self.filetypes:
+if urlstack[0].split(".")[-1].lower() in self.filetypes:
    urlstack.pop(0)
+    continue
#pop visited urls.
#also pop urls that cannot be requested
#(potential bottleneck: Request has to be sent to generate fingerprint from)
-elif request_fingerprint(scrapy.Request(urlstack[0], callback=None)) in fingerprints:
+if request_fingerprint(scrapy.Request(urlstack[0], callback=None)) in fingerprints:
    urlstack.pop(0)
else:
    break
@@ -446,9 +465,11 @@ def processURLstack(self, response):

#if there are no urls left in the urlstack, the website was scraped completely and the item can be sent to the pipeline
else:
print("ITEM TYPE: ", type(loader))
yield loader.load_item()



##################################################################
# PARSE SUB PAGE
##################################################################
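Taken together, the hunks above replace the single elif chain with independent if/continue checks and add the PDF gate, so each unwanted URL is popped off the head of the stack and the loop restarts cleanly. The following self-contained sketch mirrors that logic; next_scrapable_url and FILETYPES are illustrative stand-ins, not names from the spider.

FILETYPES = {"jpg", "png", "zip", "mp4"}   # stand-in for self.filetypes

def next_scrapable_url(urlstack, pdfscrape="off"):
    # Pop filtered URLs until one survives, mirroring the
    # if/continue chain in processURLstack.
    while urlstack:
        url = urlstack[0]
        if url.startswith(("mail", "tel", "javascript")):
            urlstack.pop(0)
            continue
        ext = url.split(".")[-1].lower()
        if ext in FILETYPES:
            urlstack.pop(0)
            continue
        if pdfscrape == "off" and ext == "pdf":
            urlstack.pop(0)
            continue
        return urlstack.pop(0)
    return None

stack = ["mailto:info@example.com",
         "https://example.com/report.pdf",
         "https://example.com/about"]
print(next_scrapable_url(list(stack), pdfscrape="off"))  # https://example.com/about
print(next_scrapable_url(list(stack), pdfscrape="on"))   # https://example.com/report.pdf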