
Commit

dualspider update
istari.ai dualspider version
datawizard1337 committed Apr 22, 2020
1 parent 26beccb commit ef2f92f
Showing 103 changed files with 1,231 additions and 86 deletions.
1 change: 1 addition & 0 deletions ARGUS.egg-info/SOURCES.txt
@@ -11,5 +11,6 @@ ARGUS.egg-info/dependency_links.txt
ARGUS.egg-info/entry_points.txt
ARGUS.egg-info/top_level.txt
ARGUS/spiders/__init__.py
ARGUS/spiders/dualspider.py
ARGUS/spiders/linkspider.py
ARGUS/spiders/textspider.py
2 changes: 1 addition & 1 deletion ARGUS.py
@@ -209,7 +209,7 @@ def change_dropdown6(*args):

tkvar7 = tk.StringVar(master)
tkvar7.set("Select") # set the default option
spiders = ["text", "link"]
spiders = ["text", "link", "dual"]
popupMenu7 = tk.OptionMenu(master, tkvar7, *spiders)
popupMenu7.grid(row=3, column=1, sticky=tk.E)
popupMenu7.config(font=("Calibri", 12))
Binary file modified ARGUS/__pycache__/__init__.cpython-36.pyc
Binary file modified ARGUS/__pycache__/items.cpython-36.pyc
Binary file modified ARGUS/__pycache__/pipelines.cpython-36.pyc
Binary file modified ARGUS/__pycache__/settings.cpython-36.pyc
36 changes: 35 additions & 1 deletion ARGUS/items.py
@@ -60,4 +60,38 @@ class LinkExporter(scrapy.Item):
error = scrapy.Field()
links = scrapy.Field()
alias = scrapy.Field()
pass
pass

class DualCollector(scrapy.Item):
ID = scrapy.Field()
dl_slot = scrapy.Field()
start_page = scrapy.Field()
start_domain = scrapy.Field()
redirect = scrapy.Field()
scraped_urls = scrapy.Field()
scraped_text = scrapy.Field()
title = scrapy.Field()
description = scrapy.Field()
keywords = scrapy.Field()
scrape_counter = scrapy.Field()
error = scrapy.Field()
links = scrapy.Field()
alias = scrapy.Field()
pass

class DualExporter(scrapy.Item):
ID = scrapy.Field()
dl_slot = scrapy.Field()
redirect = scrapy.Field()
start_page = scrapy.Field()
url = scrapy.Field()
timestamp = scrapy.Field()
text = scrapy.Field()
error = scrapy.Field()
dl_rank = scrapy.Field()
title = scrapy.Field()
description = scrapy.Field()
keywords = scrapy.Field()
links = scrapy.Field()
alias = scrapy.Field()
pass
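
DualCollector and DualExporter split responsibilities the same way the existing Collector/Exporter pair does: the spider fills a DualCollector incrementally (list-valued fields, one entry per scraped page), and DualPipeline later flattens it into one DualExporter row per URL. The new ARGUS/spiders/dualspider.py itself is not loaded in this view, so the sketch below is only an illustration inferred from the fields DualPipeline reads (item["ID"][0], item["scraped_urls"][c], item["title"][c], and so on); the class name, spider name, selectors, and every concrete value are placeholders.

    import scrapy
    from ARGUS.items import DualCollector

    class ExampleDualSpider(scrapy.Spider):
        # Hypothetical stand-in for the real dualspider; it only illustrates the
        # item shape.  DualPipeline indexes dl_slot/start_page/redirect/ID/alias
        # with [0] and the per-page fields with a running counter, so the
        # scalar-like fields are wrapped in one-element lists and the per-page
        # fields grow in parallel with scraped_urls.
        name = "dual-example"

        def parse(self, response):
            collector = DualCollector()
            collector["ID"] = ["42"]                      # placeholder firm ID
            collector["dl_slot"] = [response.meta.get("download_slot", "")]
            collector["start_page"] = [response.url]
            collector["redirect"] = [None]
            collector["alias"] = [""]
            collector["error"] = "None"
            collector["scrape_counter"] = 1
            collector["scraped_urls"] = [response.url]
            collector["title"] = [response.xpath("//title/text()").get(default="")]
            collector["description"] = [response.xpath("//meta[@name='description']/@content").get(default="")]
            collector["keywords"] = [response.xpath("//meta[@name='keywords']/@content").get(default="")]
            collector["links"] = response.xpath("//a/@href").getall()
            collector["scraped_text"] = []                # per-page tag-text chunks, consumed by DualPipeline
            yield collector
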
102 changes: 97 additions & 5 deletions ARGUS/pipelines.py
@@ -6,10 +6,11 @@
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.exporters import CsvItemExporter
from ARGUS.items import Exporter, LinkExporter
from ARGUS.items import Exporter, LinkExporter, DualExporter
import time
import datetime
import os
import re

class TextPipeline(object):

@@ -18,17 +19,19 @@ def open_spider(self, spider):
url_chunk = spider.url_chunk
chunk = url_chunk.split(".")[0].split("_")[-1]
try:
self.fileobj = open(os.getcwd() +"\\chunks\\output_" + chunk + ".csv", "ab")
self.fileobj = open("DARGUS_chunk_" + chunk + ".csv", "ab")
except:
self.fileobj = open(os.getcwd() +"\\chunks\\output_" + chunk + ".csv", "wb")
self.fileobj = open("DARGUS_chunk_" + chunk + ".csv", "wb")
self.exporter = CsvItemExporter(self.fileobj, encoding='utf-8', delimiter="\t")
self.exporter.fields_to_export = ["ID", "dl_rank", "dl_slot", "error", "redirect", "start_page", "title", "keywords", "description", "text", "timestamp", "url"]
self.exporter.start_exporting()

#close file when finished
def close_spider(self, spider):
self.exporter.finish_exporting()
# open("finished", "wb")
self.fileobj.close()



def process_item(self, item, spider):
@@ -80,8 +83,6 @@ def process_item(self, item, spider):


class LinkPipeline(object):


def open_spider(self, spider):
url_chunk = spider.url_chunk
chunk = url_chunk.split(".")[0].split("_")[-1]
@@ -122,3 +123,94 @@ def process_item(self, item, spider):
self.exporter.export_item(site)

return


class DualPipeline(object):
def open_spider(self, spider):
url_chunk = spider.url_chunk
chunk = url_chunk.split(".")[0].split("_")[-1]
try:
self.fileobj = open(os.getcwd() +"\\chunks\\ARGUS_chunk_" + chunk + ".csv", "ab")
except:
self.fileobj = open(os.getcwd() +"\\chunks\\ARGUS_chunk_" + chunk + ".csv", "wb")
self.exporter = CsvItemExporter(self.fileobj, encoding='utf-8', delimiter="\t")
self.exporter.fields_to_export = ["ID", "dl_rank", "dl_slot", "alias", "error", "redirect", "start_page", "title", "keywords", "description", "text", "links", "timestamp", "url"]
self.exporter.start_exporting()


#close file when finished
def close_spider(self, spider):
self.exporter.finish_exporting()
self.fileobj.close()


def process_item(self, item, spider):

tag_pattern = re.compile(r"(\[->.+?<-\] ?)+?")

#get scraped text from collector item
scraped_text = item["scraped_text"]
c=0
#iterate webpage chunks
for webpage in scraped_text:
#initialise one exporter item per url and fill with info from collector item
webpage_exporter = DualExporter()
webpage_exporter["dl_slot"] = item["dl_slot"][0]
webpage_exporter["start_page"] = item["start_page"][0]
webpage_exporter["url"] = item["scraped_urls"][c]
webpage_exporter["redirect"] = item["redirect"][0]
webpage_exporter["error"] = item["error"]
webpage_exporter["ID"] = item["ID"][0]

# add title, description, and keywords to the output
title = item["title"][c]
description = item["description"][c]
keywords = item["keywords"][c]
webpage_exporter["title"] = title.replace("\n", "").replace("\t", "").replace("\r\n", "").replace("\r", "")
webpage_exporter["description"] = description.replace("\n", "").replace("\t", "").replace("\r\n", "").replace("\r", "")
webpage_exporter["keywords"] = keywords.replace("\n", "").replace("\t", "").replace("\r\n", "").replace("\r", "")

webpage_exporter["alias"] = item["alias"][0]
links = []
#iterate site chunks
for link in item["links"]:
#add collected links to link list if not included yet
if link != "":
if link not in links:
links.append(link)
#add links and export
webpage_exporter["links"] = links

#generate webpage text
webpage_text = ""
#iterate extracted tag texts, clean them and merge them
for tagchunk in webpage:
text_piece = tagchunk[-1]
text_piece = " ".join(text_piece[0].split())
text_piece = text_piece.replace("\n", "").replace("\t", "").replace("\r\n", "").replace("\r", "")

#filter empty tag pieces
splitted_text_piece = re.split(tag_pattern, text_piece)
text_piece = ""

for i, tag_element in enumerate(splitted_text_piece):
if (i % 2) == 0:
if tag_element.strip().strip('"') != "":
text_piece = text_piece + splitted_text_piece[i-1] + splitted_text_piece[i]

#if empty skip
if text_piece == "":
continue

webpage_text = webpage_text + ". " + text_piece

#add text and timestamp to exporter item and export it
webpage_exporter["text"] = webpage_text[2:] #index to get rid of ". " at beginning of string
webpage_exporter["timestamp"] = datetime.datetime.fromtimestamp(time.time()).strftime("%c")
webpage_exporter["dl_rank"] = c
self.exporter.export_item(webpage_exporter)

c+=1


return
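
DualPipeline only runs if Scrapy is told to use it, and that wiring is not part of the hunks shown above. A minimal sketch of the conventional approach follows, with the priority value 300 chosen arbitrarily; an equivalent alternative would be a custom_settings dict on the dual spider itself so the other spiders keep their own pipelines.

    # Illustrative only -- the project's actual settings change is not shown in this diff.
    # ARGUS/settings.py
    ITEM_PIPELINES = {
        "ARGUS.pipelines.DualPipeline": 300,
    }
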
Binary file modified ARGUS/spiders/__pycache__/__init__.cpython-36.pyc
Binary file modified ARGUS/spiders/__pycache__/linkspider.cpython-36.pyc
Binary file modified ARGUS/spiders/__pycache__/textspider.cpython-36.pyc
(Remaining changed files not loaded in this view.)
