Version that can crawl normally as of 2022-10-27 #3

Open

wants to merge 39 commits into base: master
Changes from all commits

Commits (39)
6de6e5e
edit: version
cool9203 Oct 24, 2022
c608d82
edit: Prerequisites and Available spiders
cool9203 Oct 24, 2022
721597d
edit: now can work since 2022-10-24
cool9203 Oct 24, 2022
e995dff
add: utils
cool9203 Oct 25, 2022
04f8f77
edit: add date args
cool9203 Oct 25, 2022
35c3def
add: description
cool9203 Oct 25, 2022
fdaa3cb
add: .vscode
cool9203 Oct 25, 2022
59bd1c8
add: description
cool9203 Oct 25, 2022
04b46ec
edit: now can crawl cna
cool9203 Oct 25, 2022
4edf1e9
edit: Available spiders use check
cool9203 Oct 25, 2022
05f9447
edit: use same var name in parse_news
cool9203 Oct 25, 2022
f1b566a
edit: now can crawl cts json with cts.api
cool9203 Oct 25, 2022
8a02e40
edit: description use og
cool9203 Oct 25, 2022
979ca93
add: YESTERDAY and parse_start_date_and_end_date add default
cool9203 Oct 25, 2022
867ceb9
edit: now can crawl ettoday
cool9203 Oct 25, 2022
b41b974
fixed: check box error
cool9203 Oct 25, 2022
a63942f
edit: add more parse rule
cool9203 Oct 26, 2022
aa22035
edit: use urllib to combine url
cool9203 Oct 26, 2022
259e8ce
edit: liberty now can crawl
cool9203 Oct 26, 2022
ebf5bb7
rm: now use lib
cool9203 Oct 26, 2022
45fd254
rm: liberty realtime spider
cool9203 Oct 26, 2022
ec52af7
edit: pts now can crawl
cool9203 Oct 26, 2022
81c7fcc
edit: now can crawl setn
cool9203 Oct 26, 2022
02cf1bf
edit: change <p> condition at crawl article
cool9203 Oct 26, 2022
e22b6ec
edit: now can crawl TVBS
cool9203 Oct 26, 2022
658804d
add: key_word
cool9203 Oct 27, 2022
0db0e6c
edit: now can crawl udn, and add key_word
cool9203 Oct 27, 2022
136385d
add: stop update since 2022/09/01
cool9203 Oct 27, 2022
45ca29b
add: now can crawl keyword
cool9203 Oct 27, 2022
22e111d
edit: rewrite README
cool9203 Oct 27, 2022
cf7c793
add: test
cool9203 Oct 27, 2022
8186e8a
edit: update README and rewrite test.py
cool9203 Oct 27, 2022
a1991b4
add: get more news with api
cool9203 Oct 28, 2022
4dd61d2
edit&rename: add arg, change save path, and rename
cool9203 Oct 31, 2022
d91a434
add: apple to nextapple, and can crawl (not tested)
cool9203 Nov 11, 2022
9dd5234
fixed: nextapple content error
cool9203 Nov 11, 2022
6271ca9
add: nextapple
cool9203 Nov 11, 2022
26038a9
edit: fit flake8 and black style
cool9203 Nov 11, 2022
c62eda1
edit: flake8 skip to black skip in line 30
cool9203 Nov 11, 2022
3 changes: 3 additions & 0 deletions .flake8
@@ -0,0 +1,3 @@
[flake8]
max-line-length = 120
ignore = E402,F841,F401,E302,E305,E203,W503
3 changes: 3 additions & 0 deletions .gitignore
@@ -91,3 +91,6 @@ ENV/
*.swp

setup.py

# workspace
.vscode
67 changes: 44 additions & 23 deletions README.md
@@ -1,12 +1,12 @@
# Taiwan-news-crawlers

🐞 [Scrapy](https://scrapy.org)-based Crawlers for news of Taiwan including 10 media companies:
1. 蘋果日報
2. 中國時報
3. 中央社
4. 華視
5. 東森新聞雲
6. 自由時報
1. 中國時報
2. 中央社
3. 華視
4. 東森新聞雲
5. 自由時報
6. 壹蘋新聞網(原蘋果日報)
7. 公視
8. 三立
9. TVBS
@@ -16,32 +16,51 @@
## Getting Started

```
$ git clone https://github.com/TaiwanStat/Taiwan-news-crawlers.git
$ git clone https://github.com/cool9203/Taiwan-news-crawlers.git
$ cd Taiwan-news-crawlers
$ pip install -r requirements.txt
$ scrapy crawl apple -o apple_news.json
```
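
Note: the last command above still uses the old `apple` spider name; on this branch the Apple Daily crawler appears to have been renamed to `nextapple` (see the spider table below), so an equivalent run would likely be `scrapy crawl nextapple -o nextapple_news.json`.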

## Prerequisites

- Python3
- Scrapy 1.3.0
- Python3.7+
- Scrapy >= 1.3.0, <= 2.7.0
- Twisted >= 16.6.0, <= 22.8.0
- isort
- flake8
- black

## Usage
```scrapy crawl <spider> -o <output_name>```
### Available spiders
1. apple
2. appleRealtime
3. china
4. cna
5. cts
6. ettoday
7. liberty
8. libertyRealtime
9. pts
10. setn
11. tvbs
12. udn

```python
# normal usage
scrapy crawl <spider> -o <output_name>

# if the spider can crawl a specific day
# example: crawl news from 2022-10-26
scrapy crawl <spider> -o <output_name> -a start_date=2022-10-26 -a end_date=2022-10-26

# if the spider can crawl past days
# example: today is 2022-10-27,
# so this crawls 2022-10-25 through 2022-10-27
scrapy crawl <spider> -o <output_name> -a start_date=2022-10-25
```
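
How these `-a` arguments reach a spider: Scrapy passes every `-a key=value` pair to the spider's constructor as a keyword string argument. Below is a minimal, hypothetical sketch of that pattern; the spider name, URL, and default handling are placeholders, and the real spiders in this branch rely on a `utils` helper (`parse_start_date_and_end_date`, per the commit log) whose exact behavior may differ.

```python
# Hypothetical sketch only: shows how "-a start_date=..." / "-a end_date=..."
# could be consumed by a spider; not the actual implementation in this branch.
import datetime as dt

import scrapy


class ExampleNewsSpider(scrapy.Spider):
    name = "example_news"  # placeholder name, not one of the real spiders

    def __init__(self, start_date=None, end_date=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        today = dt.date.today()
        # Scrapy passes "-a key=value" pairs in here as strings;
        # default to today when a date is not given.
        self.start_date = dt.date.fromisoformat(start_date) if start_date else today
        self.end_date = dt.date.fromisoformat(end_date) if end_date else today

    def start_requests(self):
        # issue one listing request per day in the requested range
        date = self.start_date
        while date <= self.end_date:
            url = f"https://example.com/news/{date:%Y-%m-%d}"  # placeholder URL
            yield scrapy.Request(url, callback=self.parse)
            date += dt.timedelta(days=1)

    def parse(self, response):
        # the real spiders extract article links and fields here
        pass
```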

### Available spiders (all 10)

| Spider name | Rewrite finished and can crawl | Can crawl a specific day | Can crawl past days | Keyword (tag) | Note |
| :--------: | :--------: | :--------: | :--------: | :--------: | :--------: |
| china | :heavy_check_mark: | :x: | :x: | :heavy_check_mark: | |
| cna | :heavy_check_mark: | :x: | :x: | :white_check_mark: | keyword is not always crawled |
| cts | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | always crawls yesterday |
| ettoday | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | |
| liberty | :heavy_check_mark: | :x: | :x: | :heavy_check_mark: | |
| nextapple (formerly apple) | :heavy_check_mark: | :x: | :heavy_check_mark: | :heavy_check_mark: | |
| pts | :heavy_check_mark: | :x: | :x: | :heavy_check_mark: | |
| setn | :heavy_check_mark: | :x: | :x: | :heavy_check_mark: | |
| tvbs | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | |
| udn | :heavy_check_mark: | :x: | :heavy_check_mark: | :heavy_check_mark: | |

## Output
| Key | Value |
Expand All @@ -51,6 +70,8 @@ $ scrapy crawl apple -o apple_news.json
| title | the news title|
| content | the news content |
| category | the category of news |
| description | the news description |
| key_word | the news keywords (tags) |

## License
The MIT License
2 changes: 2 additions & 0 deletions TaiwanNewsCrawler/items.py
@@ -15,3 +15,5 @@ class TaiwannewscrawlerItem(scrapy.Item):
    date = scrapy.Field()
    content = scrapy.Field()
    category = scrapy.Field()
    description = scrapy.Field()
    key_word = scrapy.Field()
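
The two new fields line up with the `description` and `key_word` columns added to the README's Output table. A minimal sketch of how a spider callback might fill them, assuming the description comes from the `og:description` meta tag (as the "description use og" commit suggests) and the keywords from a `keywords` meta tag; the exact selectors differ per site:

```python
# Hypothetical sketch: the selectors and the parse_news signature are assumptions,
# not the exact rules used by every spider in this branch.
from TaiwanNewsCrawler.items import TaiwannewscrawlerItem


def parse_news(response):
    item = TaiwannewscrawlerItem()
    item["url"] = response.url
    item["title"] = response.xpath("//meta[@property='og:title']/@content").get()
    item["description"] = response.xpath("//meta[@property='og:description']/@content").get()
    # key_word holds the article's tags; many news sites expose them in a
    # keywords meta tag, but individual spiders may scrape tag links instead.
    keywords = response.xpath("//meta[@name='keywords']/@content").get() or ""
    item["key_word"] = [k.strip() for k in keywords.split(",") if k.strip()]
    yield item
```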
47 changes: 47 additions & 0 deletions TaiwanNewsCrawler/run.py
@@ -0,0 +1,47 @@
import datetime as dt
import os
import sys

import utils

ENV_PATH = "/home/localadmin/news-crawler-last-ver/Taiwan-news-crawlers/env/bin/python"
CRAWL_TODAY = True
START_DAY = utils.YESTERDAY.strftime("%Y-%m-%d")
END_DAY = utils.YESTERDAY.strftime("%Y-%m-%d")


def run(test):
    if CRAWL_TODAY:
        crawler_name_list = ["china", "cna", "cts", "ettoday", "liberty", "pts", "setn", "tvbs", "udn"]
        start_date = utils.TODAY
        end_date = utils.TODAY
    else:
        crawler_name_list = ["cts", "ettoday", "tvbs"]
        start_date = utils.parse_date(START_DAY)
        end_date = utils.parse_date(END_DAY)

    date = start_date
    while date <= end_date:
        for name in crawler_name_list:
            date_str = date.strftime("%Y-%m-%d")
            if CRAWL_TODAY:
                cmd = f"scrapy crawl {name} -o all-crawl-news/{name}/{name}_{date_str}.json -L ERROR"
            else:
                cmd = f"scrapy crawl {name} -o all-crawl-news/{name}/{name}_{date_str}.json -a start_date={date_str} -a end_date={date_str} -L ERROR"  # fmt: skip
            if len(ENV_PATH) > 0:
                cmd = f"{ENV_PATH} -m {cmd}"
            if test:
                cmd = f"{ENV_PATH} -m scrapy list"
            print(cmd)
            os.system(cmd)
        date += dt.timedelta(days=1)


if __name__ == "__main__":
    # pass "test" as the first command-line argument to run "scrapy list"
    # instead of the real crawl commands
    test = False
    if len(sys.argv) > 1:
        para = sys.argv[1]
        if para == "test":
            test = True
    run(test)
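
For reference, `run.py` shells out one `scrapy crawl` command per spider per day and writes each feed to `all-crawl-news/<spider>/<spider>_<date>.json`. It depends on a `utils` module (added in commit e995dff) for `TODAY`, `YESTERDAY`, and `parse_date`; a minimal sketch of what those helpers could look like, with the caveat that the real module may define them differently:

```python
# Hypothetical sketch of the utils pieces run.py imports; the actual module
# added in this branch may differ.
import datetime as dt

TODAY = dt.date.today()
YESTERDAY = TODAY - dt.timedelta(days=1)


def parse_date(date_str: str) -> dt.date:
    """Parse a 'YYYY-MM-DD' string into a date object."""
    return dt.datetime.strptime(date_str, "%Y-%m-%d").date()
```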
65 changes: 33 additions & 32 deletions TaiwanNewsCrawler/settings.py
@@ -10,90 +10,91 @@
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'TaiwanNewsCrawler'

SPIDER_MODULES = ['TaiwanNewsCrawler.spiders']
NEWSPIDER_MODULE = 'TaiwanNewsCrawler.spiders'
BOT_NAME = "TaiwanNewsCrawler"

SPIDER_MODULES = ["TaiwanNewsCrawler.spiders"]
NEWSPIDER_MODULE = "TaiwanNewsCrawler.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'TaiwanNewsCrawler (+http://www.yourdomain.com)'
# USER_AGENT = 'TaiwanNewsCrawler (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# SPIDER_MIDDLEWARES = {
# 'mediaParser.middlewares.MyCustomSpiderMiddleware': 543,
#}
# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# DOWNLOADER_MIDDLEWARES = {
# 'mediaParser.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# ITEM_PIPELINES = {
# 'mediaParser.pipelines.SomePipeline': 300,
#}
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'


class MyJsonItemExporter(JsonItemExporter):
    def __init__(self, file, **kwargs):
        super(MyJsonItemExporter, self).__init__(file, ensure_ascii=False, **kwargs)


FEED_EXPORTERS = {
'json': 'TaiwanNewsCrawler.settings.MyJsonItemExporter',
"json": "TaiwanNewsCrawler.settings.MyJsonItemExporter",
}
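
The `MyJsonItemExporter` subclass exists to pass `ensure_ascii=False` through to Scrapy's `JsonItemExporter`, so the exported JSON keeps readable Chinese text instead of `\uXXXX` escapes. A standalone illustration of the difference (plain `json`, not Scrapy, purely to show the flag's effect):

```python
import json

print(json.dumps({"title": "台灣新聞"}))
# {"title": "\u53f0\u7063\u65b0\u805e"}

print(json.dumps({"title": "台灣新聞"}, ensure_ascii=False))
# {"title": "台灣新聞"}
```

Newer Scrapy releases also offer the built-in `FEED_EXPORT_ENCODING = "utf-8"` setting as an alternative way to get unescaped output.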
80 changes: 0 additions & 80 deletions TaiwanNewsCrawler/spiders/apple_realtimenews_spider.py

This file was deleted.
