
Commit

dualspider update
istari.ai dualspider version
datawizard1337 committed Apr 22, 2020
1 parent 26beccb commit ef2f92f
Showing 103 changed files with 1,231 additions and 86 deletions.
1 change: 1 addition & 0 deletions ARGUS.egg-info/SOURCES.txt
@@ -11,5 +11,6 @@ ARGUS.egg-info/dependency_links.txt
ARGUS.egg-info/entry_points.txt
ARGUS.egg-info/top_level.txt
ARGUS/spiders/__init__.py
ARGUS/spiders/dualspider.py
ARGUS/spiders/linkspider.py
ARGUS/spiders/textspider.py
2 changes: 1 addition & 1 deletion ARGUS.py
@@ -209,7 +209,7 @@ def change_dropdown6(*args):

tkvar7 = tk.StringVar(master)
tkvar7.set("Select") # set the default option
spiders = ["text", "link"]
spiders = ["text", "link", "dual"]
popupMenu7 = tk.OptionMenu(master, tkvar7, *spiders)
popupMenu7.grid(row=3, column=1, sticky=tk.E)
popupMenu7.config(font=("Calibri", 12))
Binary file modified ARGUS/__pycache__/__init__.cpython-36.pyc
Binary file modified ARGUS/__pycache__/items.cpython-36.pyc
Binary file modified ARGUS/__pycache__/pipelines.cpython-36.pyc
Binary file modified ARGUS/__pycache__/settings.cpython-36.pyc
36 changes: 35 additions & 1 deletion ARGUS/items.py
@@ -60,4 +60,38 @@ class LinkExporter(scrapy.Item):
error = scrapy.Field()
links = scrapy.Field()
alias = scrapy.Field()
pass
pass

class DualCollector(scrapy.Item):
ID = scrapy.Field()
dl_slot = scrapy.Field()
start_page = scrapy.Field()
start_domain = scrapy.Field()
redirect = scrapy.Field()
scraped_urls = scrapy.Field()
scraped_text = scrapy.Field()
title = scrapy.Field()
description = scrapy.Field()
keywords = scrapy.Field()
scrape_counter = scrapy.Field()
error = scrapy.Field()
links = scrapy.Field()
alias = scrapy.Field()
pass

class DualExporter(scrapy.Item):
ID = scrapy.Field()
dl_slot = scrapy.Field()
redirect = scrapy.Field()
start_page = scrapy.Field()
url = scrapy.Field()
timestamp = scrapy.Field()
text = scrapy.Field()
error = scrapy.Field()
dl_rank = scrapy.Field()
title = scrapy.Field()
description = scrapy.Field()
keywords = scrapy.Field()
links = scrapy.Field()
alias = scrapy.Field()
pass
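
DualCollector and DualExporter split responsibilities the same way the existing Collector/Exporter pair does: the spider fills a DualCollector incrementally (list-valued fields, one entry per scraped page), and DualPipeline later flattens it into one DualExporter row per URL. The new ARGUS/spiders/dualspider.py itself is not loaded in this view, so the sketch below is only an illustration inferred from the fields DualPipeline reads (item["ID"][0], item["scraped_urls"][c], item["title"][c], and so on); the class name, spider name, selectors, and every concrete value are placeholders.

    import scrapy
    from ARGUS.items import DualCollector

    class ExampleDualSpider(scrapy.Spider):
        # Hypothetical stand-in for the real dualspider; it only illustrates the
        # item shape.  DualPipeline indexes dl_slot/start_page/redirect/ID/alias
        # with [0] and the per-page fields with a running counter, so the
        # scalar-like fields are wrapped in one-element lists and the per-page
        # fields grow in parallel with scraped_urls.
        name = "dual-example"

        def parse(self, response):
            collector = DualCollector()
            collector["ID"] = ["42"]                      # placeholder firm ID
            collector["dl_slot"] = [response.meta.get("download_slot", "")]
            collector["start_page"] = [response.url]
            collector["redirect"] = [None]
            collector["alias"] = [""]
            collector["error"] = "None"
            collector["scrape_counter"] = 1
            collector["scraped_urls"] = [response.url]
            collector["title"] = [response.xpath("//title/text()").get(default="")]
            collector["description"] = [response.xpath("//meta[@name='description']/@content").get(default="")]
            collector["keywords"] = [response.xpath("//meta[@name='keywords']/@content").get(default="")]
            collector["links"] = response.xpath("//a/@href").getall()
            collector["scraped_text"] = []                # per-page tag-text chunks, consumed by DualPipeline
            yield collector
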
102 changes: 97 additions & 5 deletions ARGUS/pipelines.py
@@ -6,10 +6,11 @@
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.exporters import CsvItemExporter
from ARGUS.items import Exporter, LinkExporter
from ARGUS.items import Exporter, LinkExporter, DualExporter
import time
import datetime
import os
import re

class TextPipeline(object):

@@ -18,17 +19,19 @@ def open_spider(self, spider):
url_chunk = spider.url_chunk
chunk = url_chunk.split(".")[0].split("_")[-1]
try:
self.fileobj = open(os.getcwd() +"\\chunks\\output_" + chunk + ".csv", "ab")
self.fileobj = open("DARGUS_chunk_" + chunk + ".csv", "ab")
except:
self.fileobj = open(os.getcwd() +"\\chunks\\output_" + chunk + ".csv", "wb")
self.fileobj = open("DARGUS_chunk_" + chunk + ".csv", "wb")
self.exporter = CsvItemExporter(self.fileobj, encoding='utf-8', delimiter="\t")
self.exporter.fields_to_export = ["ID", "dl_rank", "dl_slot", "error", "redirect", "start_page", "title", "keywords", "description", "text", "timestamp", "url"]
self.exporter.start_exporting()

#close file when finished
def close_spider(self, spider):
self.exporter.finish_exporting()
# open("finished", "wb")
self.fileobj.close()



def process_item(self, item, spider):
@@ -80,8 +83,6 @@ def process_item(self, item, spider):


class LinkPipeline(object):


def open_spider(self, spider):
url_chunk = spider.url_chunk
chunk = url_chunk.split(".")[0].split("_")[-1]
@@ -122,3 +123,94 @@ def process_item(self, item, spider):
self.exporter.export_item(site)

return


class DualPipeline(object):
def open_spider(self, spider):
url_chunk = spider.url_chunk
chunk = url_chunk.split(".")[0].split("_")[-1]
try:
self.fileobj = open(os.getcwd() +"\\chunks\\ARGUS_chunk_" + chunk + ".csv", "ab")
except:
self.fileobj = open(os.getcwd() +"\\chunks\\ARGUS_chunk_" + chunk + ".csv", "wb")
self.exporter = CsvItemExporter(self.fileobj, encoding='utf-8', delimiter="\t")
self.exporter.fields_to_export = ["ID", "dl_rank", "dl_slot", "alias", "error", "redirect", "start_page", "title", "keywords", "description", "text", "links", "timestamp", "url"]
self.exporter.start_exporting()


#close file when finished
def close_spider(self, spider):
self.exporter.finish_exporting()
self.fileobj.close()


def process_item(self, item, spider):

tag_pattern = re.compile(r"(\[->.+?<-\] ?)+?")

#get scraped text from collector item
scraped_text = item["scraped_text"]
c=0
#iterate webpage chunks
for webpage in scraped_text:
#initialise one exporter item per url and fill with info from collector item
webpage_exporter = DualExporter()
webpage_exporter["dl_slot"] = item["dl_slot"][0]
webpage_exporter["start_page"] = item["start_page"][0]
webpage_exporter["url"] = item["scraped_urls"][c]
webpage_exporter["redirect"] = item["redirect"][0]
webpage_exporter["error"] = item["error"]
webpage_exporter["ID"] = item["ID"][0]

# add title, description, and keywords to the output
title = item["title"][c]
description = item["description"][c]
keywords = item["keywords"][c]
webpage_exporter["title"] = title.replace("\n", "").replace("\t", "").replace("\r\n", "").replace("\r", "")
webpage_exporter["description"] = description.replace("\n", "").replace("\t", "").replace("\r\n", "").replace("\r", "")
webpage_exporter["keywords"] = keywords.replace("\n", "").replace("\t", "").replace("\r\n", "").replace("\r", "")

webpage_exporter["alias"] = item["alias"][0]
links = []
#iterate site chunks
for link in item["links"]:
#add collected links to link list if not included yet
if link != "":
if link not in links:
links.append(link)
#add links and export
webpage_exporter["links"] = links

#generate webpage text
webpage_text = ""
#iterate extracted tag texts, clean them and merge them
for tagchunk in webpage:
text_piece = tagchunk[-1]
text_piece = " ".join(text_piece[0].split())
text_piece = text_piece.replace("\n", "").replace("\t", "").replace("\r\n", "").replace("\r", "")

#filter empty tag pieces
splitted_text_piece = re.split(tag_pattern, text_piece)
text_piece = ""

for i, tag_element in enumerate(splitted_text_piece):
if (i % 2) == 0:
if tag_element.strip().strip('"') != "":
text_piece = text_piece + splitted_text_piece[i-1] + splitted_text_piece[i]

#if empty skip
if text_piece == "":
continue

webpage_text = webpage_text + ". " + text_piece

#add text and timestamp to exporter item and export it
webpage_exporter["text"] = webpage_text[2:] #index to get rid of ". " at beginning of string
webpage_exporter["timestamp"] = datetime.datetime.fromtimestamp(time.time()).strftime("%c")
webpage_exporter["dl_rank"] = c
self.exporter.export_item(webpage_exporter)

c+=1


return
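
DualPipeline only runs if Scrapy is told to use it, and that wiring is not part of the hunks shown above. A minimal sketch of the conventional approach follows, with the priority value 300 chosen arbitrarily; an equivalent alternative would be a custom_settings dict on the dual spider itself so the other spiders keep their own pipelines.

    # Illustrative only -- the project's actual settings change is not shown in this diff.
    # ARGUS/settings.py
    ITEM_PIPELINES = {
        "ARGUS.pipelines.DualPipeline": 300,
    }
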
Binary file modified ARGUS/spiders/__pycache__/__init__.cpython-36.pyc
Binary file modified ARGUS/spiders/__pycache__/linkspider.cpython-36.pyc
Binary file modified ARGUS/spiders/__pycache__/textspider.cpython-36.pyc
(Remaining changed files not loaded in this view.)
