diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..8442283 --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 120 +ignore = E402,F841,F401,E302,E305,E203,W503 \ No newline at end of file diff --git a/.gitignore b/.gitignore index 1726074..7fc1dea 100644 --- a/.gitignore +++ b/.gitignore @@ -91,3 +91,6 @@ ENV/ *.swp setup.py + +# workspace +.vscode \ No newline at end of file diff --git a/README.md b/README.md index 3e53fd2..1856571 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ # Taiwan-news-crawlers 🐞 [Scrapy](https://scrapy.org)-based Crawlers for news of Taiwan including 10 media companies: -1. 蘋果日報 -2. 中國時報 -3. 中央社 -4. 華視 -5. 東森新聞雲 -6. 自由時報 +1. 中國時報 +2. 中央社 +3. 華視 +4. 東森新聞雲 +5. 自由時報 +6. 壹蘋新聞網(原蘋果日報) 7. 公視 8. 三立 9. TVBS @@ -16,7 +16,7 @@ ## Getting Started ``` -$ git clone https://github.com/TaiwanStat/Taiwan-news-crawlers.git +$ git clone https://github.com/cool9203/Taiwan-news-crawlers.git $ cd Taiwan-news-crawlers $ pip install -r requirements.txt $ scrapy crawl apple -o apple_news.json @@ -24,24 +24,43 @@ $ scrapy crawl apple -o apple_news.json ## Prerequisites -- Python3 -- Scrapy 1.3.0 +- Python 3.7+ +- Scrapy >= 1.3.0, <= 2.7.0 +- Twisted >= 16.6.0, <= 22.8.0 +- isort +- flake8 +- black ## Usage -```scrapy crawl <spider_name> -o <output_file>``` -### Available spiders -1. apple -2. appleRealtime -3. china -4. cna -5. cts -6. ettoday -7. liberty -8. libertyRealtime -9. pts -10. setn -11. tvbs -12. udn + +```sh +# normal usage: crawl today's news +scrapy crawl <spider_name> -o <output_file> + +# if the spider can crawl an assigned day +# e.g. crawl only 2022-10-26 +scrapy crawl <spider_name> -o <output_file> -a start_date=2022-10-26 -a end_date=2022-10-26 + +# if the spider can crawl past days +# e.g. today is 2022-10-27 +# this will crawl 2022-10-25 ~ 2022-10-27 +scrapy crawl <spider_name> -o <output_file> -a start_date=2022-10-25 +``` + +### Available spiders (10 in total) + +| Spider name | Rewrite finished and crawlable | Can crawl an assigned day | Can crawl past days | Keyword (tag) | Note | | :--------: | :--------: | :--------: | :--------: | :--------: | :--------: | | china | :heavy_check_mark: | :x: | :x: | :heavy_check_mark: | | | cna | :heavy_check_mark: | :x: | :x: | :white_check_mark: | keywords not always crawled | | cts | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | always crawls yesterday | | ettoday | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | | liberty | :heavy_check_mark: | :x: | :x: | :heavy_check_mark: | | | nextapple (formerly apple) | :heavy_check_mark: | :x: | :heavy_check_mark: | :heavy_check_mark: | | | pts | :heavy_check_mark: | :x: | :x: | :heavy_check_mark: | | | setn | :heavy_check_mark: | :x: | :x: | :heavy_check_mark: | | | tvbs | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | | udn | :heavy_check_mark: | :x: | :heavy_check_mark: | :heavy_check_mark: | | ## Output | Key | Value | @@ -51,6 +70,8 @@ $ scrapy crawl apple -o apple_news.json | title | the news title| | content | the news content | | category | the category of news | +| description | the description of the news | +| key_word | the keywords of the news | ## License The MIT License diff --git a/TaiwanNewsCrawler/items.py b/TaiwanNewsCrawler/items.py index 166f23b..68571be 100644 --- a/TaiwanNewsCrawler/items.py +++ b/TaiwanNewsCrawler/items.py @@ -15,3 +15,5 @@ class TaiwannewscrawlerItem(scrapy.Item): date = scrapy.Field() content = scrapy.Field() category = scrapy.Field() + description = scrapy.Field() + key_word = scrapy.Field() diff --git a/TaiwanNewsCrawler/run.py
b/TaiwanNewsCrawler/run.py new file mode 100644 index 0000000..75f49c5 --- /dev/null +++ b/TaiwanNewsCrawler/run.py @@ -0,0 +1,47 @@ +import datetime as dt +import os +import sys + +import utils + +ENV_PATH = "/home/localadmin/news-crawler-last-ver/Taiwan-news-crawlers/env/bin/python" +CRAWL_TODAY = True +START_DAY = utils.YESTERDAY.strftime("%Y-%m-%d") +END_DAY = utils.YESTERDAY.strftime("%Y-%m-%d") + + +def run(test): + if CRAWL_TODAY: + crawler_name_list = ["china", "cna", "cts", "ettoday", "liberty", "pts", "setn", "tvbs", "udn"] + start_date = utils.TODAY + end_date = utils.TODAY + else: + crawler_name_list = ["cts", "ettoday", "tvbs"] + start_date = utils.parse_date(START_DAY) + end_date = utils.parse_date(END_DAY) + + date = start_date + while date <= end_date: + for name in crawler_name_list: + date_str = date.strftime("%Y-%m-%d") + if CRAWL_TODAY: + cmd = f"scrapy crawl {name} -o all-crawl-news/{name}/{name}_{date_str}.json -L ERROR" + else: + cmd = f"scrapy crawl {name} -o all-crawl-news/{name}/{name}_{date_str}.json -a start_date={date_str} -a end_date={date_str} -L ERROR" # fmt: skip + if len(ENV_PATH) > 0: + cmd = f"{ENV_PATH} -m {cmd}" + if test: + cmd = f"{ENV_PATH} -m scrapy list" + print(cmd) + os.system(cmd) + date += dt.timedelta(days=1) + + +if __name__ == "__main__": + # pass "test" as the first argument for a dry run that only executes "scrapy list" + test = False + if len(sys.argv) > 1: + para = sys.argv[1] + if para == "test": + test = True + run(test) diff --git a/TaiwanNewsCrawler/settings.py b/TaiwanNewsCrawler/settings.py index 3b755be..0df2456 100644 --- a/TaiwanNewsCrawler/settings.py +++ b/TaiwanNewsCrawler/settings.py @@ -10,90 +10,91 @@ # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html -BOT_NAME = 'TaiwanNewsCrawler' - -SPIDER_MODULES = ['TaiwanNewsCrawler.spiders'] -NEWSPIDER_MODULE = 'TaiwanNewsCrawler.spiders' +BOT_NAME = "TaiwanNewsCrawler" +SPIDER_MODULES = ["TaiwanNewsCrawler.spiders"] +NEWSPIDER_MODULE = "TaiwanNewsCrawler.spiders" # Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'TaiwanNewsCrawler (+http://www.yourdomain.com)' +# USER_AGENT = 'TaiwanNewsCrawler (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = False # Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 +# CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 +# DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) -#COOKIES_ENABLED = False +# COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False +# TELNETCONSOLE_ENABLED = False # Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { +# DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', -#} +# } # Enable or disable spider middlewares # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { +# SPIDER_MIDDLEWARES = { # 'mediaParser.middlewares.MyCustomSpiderMiddleware': 543, -#} +# } # Enable or disable downloader middlewares # See
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { +# DOWNLOADER_MIDDLEWARES = { # 'mediaParser.middlewares.MyCustomDownloaderMiddleware': 543, -#} +# } # Enable or disable extensions # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html -#EXTENSIONS = { +# EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, -#} +# } # Configure item pipelines # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html -#ITEM_PIPELINES = { +# ITEM_PIPELINES = { # 'mediaParser.pipelines.SomePipeline': 300, -#} +# } # Enable and configure the AutoThrottle extension (disabled by default) # See http://doc.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True +# AUTOTHROTTLE_ENABLED = True # The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 +# AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 +# AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False +# AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + class MyJsonItemExporter(JsonItemExporter): def __init__(self, file, **kwargs): super(MyJsonItemExporter, self).__init__(file, ensure_ascii=False, **kwargs) + FEED_EXPORTERS = { - 'json': 'TaiwanNewsCrawler.settings.MyJsonItemExporter', + "json": "TaiwanNewsCrawler.settings.MyJsonItemExporter", } diff --git a/TaiwanNewsCrawler/spiders/apple_realtimenews_spider.py b/TaiwanNewsCrawler/spiders/apple_realtimenews_spider.py deleted file mode 100644 index 54d6d7e..0000000 --- a/TaiwanNewsCrawler/spiders/apple_realtimenews_spider.py +++ /dev/null @@ -1,80 +0,0 @@ -""" -蘋果日報即時新聞 -the crawl deal with apple's realtime news -Usage: scrapy crawl appleRealtime -o -""" -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import re -from datetime import datetime -from datetime import date - -import scrapy - -TODAY = date.today() -ROOT_URL = 'https://tw.appledaily.com/new/realtime/' - - -class AppleRealtimenewsSpider(scrapy.Spider): - name = 'appleRealtime' - start_urls = [ROOT_URL + '1'] - - def parse(self, response): - regex = r'realtime\/(\d+)' - current_index = re.search(regex, response.url).group(1) - next_index = int(current_index) + 1 - current_date = response.css('h1 time::text').extract_first() - current_date = datetime.strptime(current_date, "%Y / %m / %d") - if TODAY != current_date.date(): - return - for news_item in response.css('ul.rtddd li'): - category = news_item.css('h2::text').extract_first() - meta = {'category': category} - relative_url = news_item.css('a::attr(href)').extract_first() - abs_url = response.urljoin(relative_url) - yield scrapy.Request(abs_url, callback=self.parse_news, meta=meta) - - next_targe = ROOT_URL + str(next_index) - 
yield scrapy.Request(next_targe, callback=self.parse) - - def parse_news(self, response): - news_date = response.css('.ndArticle_creat::text').extract_first()[5: - -6] - news_date = datetime.strptime(news_date, "%Y/%m/%d") - if TODAY != news_date.date(): - return - title = "" - title_sel_prefix = 'hgroup' - p_sel_prefix = '.ndArticle_margin' - - if 'home' in response.url: - title_sel_prefix = '.ncbox_cont' - p_sel_prefix = '.articulum' - - t_h1 = response.css(title_sel_prefix + '>h1::text') - if t_h1: - title += t_h1.extract_first() - t_h2 = response.css(title_sel_prefix + '>h2::text') - if t_h2: - title += t_h2.extract_first() - - h2 = response.css(title_sel_prefix + '>h2::text').extract() - h2_num = len(h2) - content = "" - counter = 0 - for p in response.css(p_sel_prefix + '>p'): - if p.css('p::text'): - content += ' '.join(p.css('p::text').extract()) - if counter < h2_num: - content += " " + h2[counter] - counter += 1 - - yield { - 'website': "蘋果日報", - 'url': response.url, - 'title': title, - 'date': news_date, - 'content': content, - 'category': response.meta['category'] - } diff --git a/TaiwanNewsCrawler/spiders/apple_spider.py b/TaiwanNewsCrawler/spiders/apple_spider.py deleted file mode 100644 index 5df118e..0000000 --- a/TaiwanNewsCrawler/spiders/apple_spider.py +++ /dev/null @@ -1,83 +0,0 @@ -""" -蘋果日報新聞 -the crawl deal with apple's news -Usage: scrapy crawl apple -o -""" -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import re -import time - -import w3lib.url - -import scrapy - - -class AppleSpider(scrapy.Spider): - name = "apple" - start_urls = [ - 'https://tw.appledaily.com/daily', - ] - - def parse(self, response): - section = response.css('section.nclnbx.slvl.clearmen, article.nclns') - for part in section: - if part.css('header.schh h1::text'): - category = part.css('header.schh h1::text').extract_first() - category = category.strip() - else: - meta = {'category': category} - for news in part.css('ul.fillup li'): - if 'eat-travel' in news.css( - "a::attr(href)").extract_first(): - continue - elif 'entertainment.appledaily' in news.css( - "a::attr(href)").extract_first(): - url = news.css("a::attr(href)").extract_first() - elif 'http' in news.css("a::attr(href)").extract_first(): - url = news.css("a::attr(href)").extract_first() - else: - url = "http://www.appledaily.com.tw{}".format( - news.css("a::attr(href)").extract_first()) - if url: - url = response.urljoin(url) - yield scrapy.Request( - url, callback=self.parse_news, meta=meta) - - def parse_news(self, response): - date = time.strftime('%Y-%m-%d') - title = "" - title_sel_prefix = 'hgroup' - p_sel_prefix = '.ndArticle_margin' - - if 'home' in response.url: - title_sel_prefix = '.ncbox_cont' - p_sel_prefix = '.articulum' - - t_h1 = response.css(title_sel_prefix + '>h1::text') - if t_h1: - title += t_h1.extract_first() - t_h2 = response.css(title_sel_prefix + '>h2::text') - if t_h2: - title += t_h2.extract_first() - - h2 = response.css(title_sel_prefix + '>h2::text').extract() - h2_num = len(h2) - content = "" - counter = 0 - for p in response.css(p_sel_prefix + '>p'): - if p.css('p::text'): - content += ' '.join(p.css('p::text').extract()) - if counter < h2_num: - content += " " + h2[counter] - counter += 1 - - yield { - 'website': "蘋果日報", - 'url': response.url, - 'title': title, - 'date': date, - 'content': content, - 'category': response.meta['category'] - } diff --git a/TaiwanNewsCrawler/spiders/china_spider.py b/TaiwanNewsCrawler/spiders/china_spider.py index 0a598f8..39afe51 100644 --- 
a/TaiwanNewsCrawler/spiders/china_spider.py +++ b/TaiwanNewsCrawler/spiders/china_spider.py @@ -1,56 +1,88 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- """ 中國時報 the crawl deal with chinatimes's news Usage: scrapy crawl china -o """ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -from datetime import datetime +from urllib.parse import urljoin + import scrapy -ROOT_URL = 'http://www.chinatimes.com' -PAGE_URL = 'http://www.chinatimes.com/newspapers/2601' +import TaiwanNewsCrawler.utils as utils + +ROOT_URL = "http://www.chinatimes.com" +PAGE_URL = "http://www.chinatimes.com/newspapers/2601" class ChinaSpider(scrapy.Spider): name = "china" - start_urls = ['http://www.chinatimes.com/newspapers/2601'] + start_urls = ["http://www.chinatimes.com/newspapers/2601"] + + def __init__(self, start_date: str = None, end_date: str = None): + super().__init__(start_date=start_date, end_date=end_date) - def parse(self, response): - news_in_page = response.css('.listRight li h2 a') - if not news_in_page: + def parse(self, response: scrapy.Selector): + start_date, end_date = utils.parse_start_date_and_end_date(self.start_date, self.end_date) + + crawl_next = False + all_news = response.css("ul.vertical-list li") + if not all_news: return - for news in news_in_page: - url = news.css('a::attr(href)').extract_first() - if ROOT_URL not in url: - url = ROOT_URL + url - url = response.urljoin(url) - yield scrapy.Request(url, callback=self.parse_news) - if 'next_page' in response.meta: - meta = {'next_page': response.meta['next_page'] + 1} + for news in all_news: + news_date = utils.parse_date(news.css("time::attr(datetime)").extract_first()) + if news_date is None: + continue + crawl_next = utils.can_crawl(news_date, start_date, end_date) + + if crawl_next: + url = news.css("a::attr(href)").extract_first() + if ROOT_URL not in url: + url = urljoin(ROOT_URL, url) + url = response.urljoin(url) + yield scrapy.Request(url, callback=self.parse_news) + + if "next_page" in response.meta: + meta = {"next_page": response.meta["next_page"] + 1} else: - meta = {'next_page': 2} - next_url = PAGE_URL + '?page=' + str(meta['next_page']) - yield scrapy.Request(next_url, callback=self.parse, meta=meta) - - def parse_news(self, response): - title = response.css('h1::text').extract_first() - date_of_news_str = response.css('time::attr(datetime)').extract_first() - date_of_news = datetime.strptime(date_of_news_str, '%Y/%m/%d %H:%M') + meta = {"next_page": 2} + + if crawl_next: + next_url = PAGE_URL + "?page=" + str(meta["next_page"]) + yield scrapy.Request(next_url, callback=self.parse, meta=meta) + + def parse_news(self, response: scrapy.Selector): + title = response.css("h1::text").extract_first() + date_str = response.css("time::attr(datetime)").extract_first() + date = utils.parse_date(date_str, "%Y-%m-%d %H:%M") content = "" - for p in response.css('article p'): - p_text = p.css('::text') + for p in response.css("div.article-body p"): + p_text = p.css("::text") if p_text: - content += ' '.join(p_text.extract()) + content += " ".join(p_text.extract()) + + category = response.css("meta[name=section]::attr(content)").extract_first() + + # description + try: + description = response.css("meta[property='og:description']::attr(content)").extract_first() + except Exception as e: + description = "" - category = response.css('.page_index span::text').extract()[-1].strip() + # key_word + try: + key_word = response.css("meta[name=keywords]::attr(content)").extract_first() + except Exception as e: + key_word = "" yield { - 'website': 
"中國時報", - 'url': response.url, - 'title': title, - 'date': date_of_news, - 'content': content, - 'category': category + "website": "中國時報", + "url": response.url, + "title": title, + "date": date, + "content": content, + "category": category, + "description": description, + "key_word": key_word, } diff --git a/TaiwanNewsCrawler/spiders/cna_spider.py b/TaiwanNewsCrawler/spiders/cna_spider.py index 990f0ff..2ab098e 100644 --- a/TaiwanNewsCrawler/spiders/cna_spider.py +++ b/TaiwanNewsCrawler/spiders/cna_spider.py @@ -1,62 +1,121 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- """ 中央社 the crawl deal with cna's news Usage: scrapy crawl cna -o """ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -from datetime import datetime + +import json +from urllib.parse import urljoin + import scrapy +import scrapy.http + +import TaiwanNewsCrawler.utils as utils -ROOT_URL = 'http://www.cna.com.tw' -TODAY = datetime.today().date() +ROOT_URL = "https://www.cna.com.tw" +API_URL = "https://www.cna.com.tw/cna2018api/api/WNewsList" +API_POST_DATA = {"action": "0", "category": "aall", "pagesize": "20", "pageidx": 1} class CnaSpider(scrapy.Spider): name = "cna" - start_urls = ['http://www.cna.com.tw/list/aall-1.aspx'] - - def parse(self, response): - current_page_index = int( - response.css('.pagination li.current a::text').extract_first()) - - newses_time_str = response.css('.article_list li span::text').extract() - newses_time = [ - datetime.strptime(i, '%Y/%m/%d %H:%M').date() - for i in newses_time_str - ] - is_over_today = False - - for t in newses_time: - if t < TODAY: - is_over_today = True - - if not is_over_today: - next_url = 'http://www.cna.com.tw/list/aall-' + str( - current_page_index + 1) + '.aspx' - yield scrapy.Request(next_url, callback=self.parse) - - for news in response.css('div.article_list li a'): - url = response.urljoin(news.css('a::attr(href)').extract_first()) - yield scrapy.Request(url, callback=self.parse_news) - - def parse_news(self, response): - title = response.css('h1::text').extract_first() - date = response.css('div.update_times p::text').extract_first()[5:] - content = '' - for p in response.css('div.article_box section p'): - p_text = p.css('::text') + start_urls = ["https://www.cna.com.tw/list/aall.aspx"] + + def __init__(self, start_date: str = None, end_date: str = None): + super().__init__(start_date=start_date, end_date=end_date) + + def parse(self, response: scrapy.Selector): + start_date, end_date = utils.parse_start_date_and_end_date(self.start_date, self.end_date) + + crawl_next = False + all_news = response.css("ul#jsMainList li") + if not all_news: + return + + for news in all_news: + news_date = utils.parse_date(news.css("div.date::text").extract_first()) + if news_date is None: + continue + crawl_next = utils.can_crawl(news_date, start_date, end_date) + + if crawl_next: + url = news.css("a::attr(href)").extract_first() + if ROOT_URL not in url: + url = urljoin(ROOT_URL, url) + yield scrapy.Request(url, callback=self.parse_news) + + if crawl_next: + API_POST_DATA["pageidx"] += 1 + # use api to get more news + yield scrapy.http.Request( + API_URL, + method="POST", + body=json.dumps(API_POST_DATA), + callback=self.parse_api, + headers={"Content-Type": "application/json", "Accept": "application/json"}, + ) + + def parse_api(self, response): + start_date, end_date = utils.parse_start_date_and_end_date(self.start_date, self.end_date) + crawl_next = False + response_data = json.loads(response.text) + + if response_data["Result"].lower() == "y": + for news in 
response_data["ResultData"]["Items"]: + news_date = utils.parse_date(news["CreateTime"]) + if news_date is None: + continue + crawl_next = utils.can_crawl(news_date, start_date, end_date) + + if crawl_next: + url = news["PageUrl"] + if ROOT_URL not in url: + url = urljoin(ROOT_URL, url) + yield scrapy.Request(url, callback=self.parse_news) + + if crawl_next: + API_POST_DATA["pageidx"] += 1 + yield scrapy.http.Request( + API_URL, + method="POST", + body=json.dumps(API_POST_DATA), + callback=self.parse_api, + headers={"Content-Type": "application/json", "Accept": "application/json"}, + ) + + def parse_news(self, response: scrapy.Selector): + title = response.css("h1 span::text").extract_first() + date_str = response.css("div.updatetime span::text").extract_first() + date = utils.parse_date(date_str, "%Y/%m/%d %H:%M") + content = "" + for p in response.css("div.centralContent div.paragraph p"): + p_text = p.css("::text") if p_text: - content += ' '.join(p_text.extract()) + content += " ".join(p_text.extract()) + + category = response.css("article.article::attr(data-origin-type-name)").extract_first() + + # description + try: + description = response.css("meta[property='og:description']::attr(content)").extract_first() + except Exception as e: + description = "" - category_links = response.css('div.breadcrumb span a span') - category = category_links[1].css('::text').extract_first() + # key_word + try: + key_word = response.css("div.Temasname::text").extract_first() + except Exception as e: + key_word = "" yield { - 'website': "中央通訊社", - 'url': response.url, - 'title': title, - 'date': date[:10].replace('/', '-'), - 'content': content, - 'category': category + "website": "中央通訊社", + "url": response.url, + "title": title, + "date": date, + "content": content, + "category": category, + "description": description, + "key_word": key_word, } diff --git a/TaiwanNewsCrawler/spiders/cts_spider.py b/TaiwanNewsCrawler/spiders/cts_spider.py index c8b54ac..3de6dd2 100644 --- a/TaiwanNewsCrawler/spiders/cts_spider.py +++ b/TaiwanNewsCrawler/spiders/cts_spider.py @@ -1,59 +1,81 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- """ 華視 the crawl deal with cts's news Usage: scrapy crawl cts -o """ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -from datetime import date -from datetime import timedelta + +import datetime as dt +import json +from urllib.parse import urljoin + import scrapy +import scrapy.http + +import TaiwanNewsCrawler.utils as utils -YESTERDAY = (date.today() - timedelta(1)).strftime('%Y/%m/%d') +ROOT_URL = "https://news.cts.com.tw" +PAGE_URL = "https://news.cts.com.tw/real/index.html" +API_URL = "https://news.cts.com.tw/api/news/{}/daylist-news.json" class CtsSpider(scrapy.Spider): name = "cts" - start_urls = [ - 'http://news.cts.com.tw/daylist/{}/index.html'.format(YESTERDAY) - ] - def parse(self, response): - for news in response.css('.news_right'): - url = news.css('a::attr(href)').extract_first() + def __init__(self, start_date: str = None, end_date: str = None): + super().__init__(start_date=start_date, end_date=end_date) + + def start_requests(self): + start_date, end_date = utils.parse_start_date_and_end_date( + self.start_date, self.end_date, utils.YESTERDAY, utils.YESTERDAY + ) + date = start_date + + while date < end_date: + url = API_URL.format(date.strftime("%Y/%m/%d")) + yield scrapy.http.Request(url, method="GET", callback=self.parse) + date += dt.timedelta(days=1) + + def parse(self, response: scrapy.Request): + response = json.loads(response.text) + for news in response: + 
url = news["news_url"] + if ROOT_URL not in url: + url = urljoin(ROOT_URL, url) yield scrapy.Request(url, callback=self.parse_news) - page_desc = response.css('.page-desc::text').extract_first() - total_pages = page_desc.split('/')[1] - total_pages = int(total_pages[2:-2]) - url_arr = response.url.split('/') - url_suffix = url_arr[-1] - current_page_index = url_suffix[5:-5] - if current_page_index is '': - current_page_index = 1 - else: - current_page_index = int(current_page_index) - - if current_page_index < total_pages: - next_page = '/'.join(url_arr[:-1]) + '/index' + str( - current_page_index + 1) + '.html' - yield scrapy.Request(next_page, callback=self.parse) - - def parse_news(self, response): - title = response.css('.newsbigtitle::text').extract_first().strip( - ' \t\n\r') - date_of_news = response.css('.timebar::text').extract_first().strip( - ' \t\n\r') - date_of_news = date_of_news[:10] - category = response.css('.active a::text').extract()[-1] - content = response.css('.newscontents p::text').extract() - content = ' '.join(content) + def parse_news(self, response: scrapy.Selector): + title = response.css("div.artical-titlebar h1.artical-title::text").extract_first() + date_str = response.css("div.news-artical div.titlebar-top time.artical-time::text").extract_first() + date = utils.parse_date(date_str, "%Y/%m/%d %H:%M") + content = "" + for p in response.css("artical.news-artical div.artical-content p"): + if len(p.css("::attr(href)")) == 0 and len(p.css("::attr(class)")) == 0: + p_text = p.css("::text") + content += " ".join(p_text.extract()) + + category = response.css("meta[name=section]::attr(content)").extract_first() + + # description + try: + description = response.css("meta[property='og:description']::attr(content)").extract_first() + except Exception as e: + description = "" + + # key_word + try: + key_word = response.css("meta[name=keywords]::attr(content)").extract_first() + except Exception as e: + key_word = "" yield { - 'website': "華視", - 'url': response.url, - 'title': title, - 'date': date_of_news, - 'content': content, - 'category': category + "website": "華視", + "url": response.url, + "title": title, + "date": date, + "content": content, + "category": category, + "description": description, + "key_word": key_word, } diff --git a/TaiwanNewsCrawler/spiders/ettoday_spider.py b/TaiwanNewsCrawler/spiders/ettoday_spider.py index 66bcf75..f5f88cf 100644 --- a/TaiwanNewsCrawler/spiders/ettoday_spider.py +++ b/TaiwanNewsCrawler/spiders/ettoday_spider.py @@ -1,79 +1,112 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- """ the crawl deal with ettoday's news Usage: scrapy crawl ettoday -o """ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import time + +import datetime as dt +from urllib.parse import urljoin + import scrapy +import scrapy.http + +import TaiwanNewsCrawler.utils as utils -TODAY = time.strftime('%Y/%m/%d') -TODAY_URL = time.strftime('%Y-%m-%d') -ROOT_URL = 'https://www.ettoday.net' +ROOT_URL = "https://www.ettoday.net" +PAGE_URL = "https://www.ettoday.net/news/news-list-{}-0.htm" +API_URL = "https://www.ettoday.net/show_roll.php" class EttodaySpider(scrapy.Spider): name = "ettoday" + def __init__(self, start_date: str = None, end_date: str = None): + super().__init__(start_date=start_date, end_date=end_date) + def start_requests(self): - urls = [ - 'https://www.ettoday.net/news/news-list-' + TODAY_URL + '-0.htm' - ] - for url in urls: - meta = {'iter_time': 0} + start_date, end_date = utils.parse_start_date_and_end_date(self.start_date, self.end_date) + 
date = start_date + + while date < end_date: + meta = {"iter_time": 0, "date": date, "start_date": date, "end_date": date + dt.timedelta(days=1)} + url = PAGE_URL.format(date.strftime("%Y-%m-%d")) yield scrapy.Request(url, callback=self.parse_news_list, meta=meta) + date += dt.timedelta(days=1) def parse_news_list(self, response): - has_next_page = True - response.meta['iter_time'] += 1 - is_first_iter = response.meta['iter_time'] == 1 - prefix = '.part_list_2' if is_first_iter else '' - for news_item in response.css(prefix + ' h3'): - url = news_item.css('a::attr(href)').extract_first() - url = ROOT_URL + url - category = news_item.css('em::text').extract_first() - date_time = news_item.css('span::text').extract_first() - - if TODAY not in date_time: - has_next_page = False - continue - - response.meta['category'] = category - yield scrapy.Request( - url, callback=self.parse_news, meta=response.meta) - if has_next_page: - tFile = time.strftime('%Y%m%d') + '.xml' + start_date, end_date = response.meta["start_date"], response.meta["end_date"] + crawl_next = False + response.meta["iter_time"] += 1 + is_first_iter = response.meta["iter_time"] == 1 + prefix = ".part_list_2" if is_first_iter else "" + date_str = response.meta["date"].strftime("%Y/%m/%d") + + for news in response.css(prefix + " h3"): + news_date = utils.parse_date(news.css("span::text").extract_first()) + crawl_next = utils.can_crawl(news_date, start_date, end_date) + + url = news.css("a::attr(href)").extract_first() + if ROOT_URL not in url: + url = urljoin(ROOT_URL, url) + category = news.css("em::text").extract_first() + + if crawl_next: + response.meta["category"] = category + yield scrapy.Request(url, callback=self.parse_news, meta=response.meta) + + if crawl_next: + date_str = response.meta["date"].strftime("%Y%m%d") + tFile = f"{date_str}-1.xml" yield scrapy.FormRequest( - url="https://www.ettoday.net/show_roll.php", + url=API_URL, callback=self.parse_news_list, meta=response.meta, formdata={ - 'offset': str(response.meta['iter_time']), - 'tPage': '3', - 'tFile': tFile, - 'tOt': '0', - 'tSi': '100' - }) - + "offset": str(response.meta["iter_time"]), + "tPage": "3", + "tFile": tFile, + "tOt": "0", + "tSi": "100", + "tAr": "0", + }, + ) def parse_news(self, response): - title = response.css('h1.title::text').extract_first() + title = response.css("h1.title::text").extract_first() + date = response.meta["date"].strftime("%Y-%m-%d") if not title: - title = response.css('h2.title::text').extract_first() + title = response.css("h2.title::text").extract_first() if not title: - title = response.css('h1.title_article::text').extract_first() + title = response.css("h1.title_article::text").extract_first() - p_list = response.css('.story p::text').extract() + p_list = response.css(".story p::text").extract() - content = '' + content = "" for p in p_list: content += p + category = response.meta["category"] + + # description + try: + description = response.css("meta[property='og:description']::attr(content)").extract_first() + except Exception as e: + description = "" + + # key_word + try: + key_word = response.css("meta[name=news_keywords]::attr(content)").extract_first() + except Exception as e: + key_word = "" + yield { - 'website': "東森新聞雲", - 'url': response.url, - 'title': title, - 'date': time.strftime('%Y-%m-%d'), - 'content': content, - 'category': response.meta['category'] + "website": "東森新聞雲", + "url": response.url, + "title": title, + "date": date, + "content": content, + "category": category, + "description": description, 
+ "key_word": key_word, } diff --git a/TaiwanNewsCrawler/spiders/ettoday_tag_spider.py b/TaiwanNewsCrawler/spiders/ettoday_tag_spider.py deleted file mode 100644 index fdc2e3f..0000000 --- a/TaiwanNewsCrawler/spiders/ettoday_tag_spider.py +++ /dev/null @@ -1,74 +0,0 @@ -""" -Ettoday tag -the crawl deal with tags of ettoday's news, which could make the dictionary of jieba -Usage: scrapy crawl ettoday_tag -o -""" -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import datetime -import scrapy - -TODAY = datetime.date.today().strftime('%Y/%m/%d') -TODAY_URL = datetime.date.today().strftime('%Y-%m-%d') -ROOT_URL = 'http://www.ettoday.net' -OLDEST_DATA_YEAR = 2012 -NEWS_DATE_BEGIN = datetime.date(OLDEST_DATA_YEAR, 1, 1) - - -class EttodaySpider(scrapy.Spider): - name = "ettoday_tag" - - def start_requests(self): - day = datetime.timedelta(days=1) - current_time = NEWS_DATE_BEGIN - - while current_time <= datetime.date.today(): - date_str = current_time.strftime('%Y-%m-%d') - url = 'http://www.ettoday.net/news/news-list-' + date_str + '-0.htm' - meta = { - 'iter_time': 0, - 'date_str': current_time.strftime('%Y/%m/%d') - } - current_time += day - yield scrapy.Request(url, callback=self.parse_news_list, meta=meta) - - def parse_news_list(self, response): - has_next_page = True - response.meta['iter_time'] += 1 - current_date_str = response.meta['date_str'] - is_first_iter = response.meta['iter_time'] == 1 - prefix = '.part_list_2' if is_first_iter else '' - for news_item in response.css(prefix + ' h3'): - url = news_item.css('a::attr(href)').extract_first() - if ROOT_URL not in url: - url = ROOT_URL + url - category = news_item.css('em::text').extract_first() - date_time = news_item.css('span::text').extract_first() - - if current_date_str not in date_time: - has_next_page = False - continue - - response.meta['category'] = category - yield scrapy.Request( - url, callback=self.parse_tag_of_news, meta=response.meta) - if has_next_page: - tFile = datetime.date.today().strftime('%Y%m%d') + '.xml' - yield scrapy.FormRequest( - url="http://www.ettoday.net/show_roll.php", - callback=self.parse_news_list, - meta=response.meta, - formdata={ - 'offset': str(response.meta['iter_time']), - 'tPage': '3', - 'tFile': tFile, - 'tOt': '0', - 'tSi': '100' - }) - - - def parse_tag_of_news(self, response): - tag_string = response.css( - 'head meta[name=news_keywords]::attr(content)').extract_first() - tags = tag_string.split(',') - yield {'tag': tags} diff --git a/TaiwanNewsCrawler/spiders/liberty_realtimenews_spider.py b/TaiwanNewsCrawler/spiders/liberty_realtimenews_spider.py deleted file mode 100644 index 633a22b..0000000 --- a/TaiwanNewsCrawler/spiders/liberty_realtimenews_spider.py +++ /dev/null @@ -1,127 +0,0 @@ -""" -自由時報即時新聞 -the crawl deal with liberty's realtime news -Usage: scrapy crawl libertyRealtime -o -""" -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import re -from datetime import datetime, date -import scrapy - -ROOT_URL = 'http://news.ltn.com.tw/' -Realtime_NEWS_URL = 'http://news.ltn.com.tw/list/breakingnews/all/' -today = date.today() - -CATEGORY_DIC = { - 'focus': '焦點', - 'politics': '政治', - 'society': '社會', - 'local': '地方', - 'life': '生活', - 'opinion': '言論', - 'world': '國際', - 'business': '財經', - 'entertainment': '娛樂', - 'consumer': '消費', - 'supplement': '副刊', - 'sports': '體育', - 'car': '汽車', - '3c': '3c', - 'istyle': 'istyle' -} - - -class LibertySpider(scrapy.Spider): - name = "libertyRealtime" - start_urls = ['http://news.ltn.com.tw/list/breakingnews/all'] - - def parse(self, response): - 
regex = r'\/all\/(\d+)' - current_index = re.search(regex, response.url) - if current_index: - next_index = int(current_index.group(1)) + 1 - else: - next_index = 2 - date_of_news = response.css('a.tit span::text').extract() - last_page = False - for d in date_of_news: - if '-' in d: - last_page = True - break - - for news_url in response.css('a.tit::attr(href)').extract(): - yield scrapy.Request(news_url, callback=self.parse_news) - - if not last_page: - next_target = Realtime_NEWS_URL + str(next_index) - yield scrapy.Request(next_target, callback=self.parse) - - def parse_news(self, response): - category = get_news_category(response) - - if category == 'opinion': - title = response.css('h2::text').extract_first() - else: - title = response.css('h1::text').extract_first() - - if category == 'opinion': - content = get_news_content(response, '.cont h4::text', '.cont p') - elif category == 'sports': - content = get_news_content(response, '.news_p h4::text', - '.news_p p') - elif category == 'entertainment': - content = get_news_content(response, '.news_content h4::text', - '.news_content p') - elif category == 'car': - content = get_news_content(response, '.con h4::text', '.con p') - elif category == '3c': - content = get_news_content(response, '.cont h4::text', '.cont p') - elif category == 'istyle': - content = get_news_content(response, '.boxTitle h4::text', - '.boxTitle p') - else: - content = get_news_content(response, '#newstext h4::text', - '.text p') - yield { - 'website': "自由時報", - 'url': response.url, - 'title': title, - 'date': datetime.now().strftime('%Y-%m-%d'), - 'content': content, - 'category': CATEGORY_DIC[category] - } - - -def get_news_category(response): - searched_category = re.search(r'\/news\/([a-z]*)\/breakingnews\/', - response.url) - - if searched_category and searched_category.group(1) != 'paper': - return searched_category.group(1) - elif 'talk' in response.url: - return 'opinion' - elif 'sports' in response.url: - return 'sports' - elif 'ent' in response.url: - return 'entertainment' - elif 'auto' in response.url: - return 'car' - elif '3c' in response.url: - return '3c' - elif 'istyle' in response.url: - return 'istyle' - - -def get_news_content(response, h4_query, p_query): - h4 = response.css(h4_query).extract() - h4_num = len(h4) - counter = 0 - content = "" - for p in response.css(p_query): - if counter < h4_num: - content += " " + h4[counter] - counter += 1 - if p.css("p::text"): - content += ' '.join(p.css("p::text").extract()) - return content diff --git a/TaiwanNewsCrawler/spiders/liberty_spider.py b/TaiwanNewsCrawler/spiders/liberty_spider.py index a2d4323..a56e829 100644 --- a/TaiwanNewsCrawler/spiders/liberty_spider.py +++ b/TaiwanNewsCrawler/spiders/liberty_spider.py @@ -1,131 +1,106 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- """ 自由時報 the crawl deal with liberty's news Usage: scrapy crawl liberty -o """ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import time -import re + +import json +from urllib.parse import urljoin + import scrapy +import scrapy.http + +import TaiwanNewsCrawler.utils as utils -ROOT_URL = 'http://news.ltn.com.tw' -CATEGORY_DIC = { - 'focus': '焦點', - 'politics': '政治', - 'society': '社會', - 'local': '地方', - 'life': '生活', - 'opinion': '言論', - 'world': '國際', - 'business': '財經', - 'entertainment': '娛樂', - 'consumer': '消費', - 'supplement': '副刊', - 'sports': '體育' -} +ROOT_URL = "http://news.ltn.com.tw/" +PAGE_URL = "http://news.ltn.com.tw/list/breakingnews/all/" +API_URL = "https://news.ltn.com.tw/ajax/breakingnews/all/{}" 
class LibertySpider(scrapy.Spider): name = "liberty" + def __init__(self, start_date: str = None, end_date: str = None): + super().__init__(start_date=start_date, end_date=end_date) + def start_requests(self): - urls = [ - 'http://news.ltn.com.tw/list/newspaper/focus/', - 'http://news.ltn.com.tw/list/newspaper/politics/', - 'http://news.ltn.com.tw/list/newspaper/society/', - 'http://news.ltn.com.tw/list/newspaper/local/', - 'http://news.ltn.com.tw/list/newspaper/life/', - 'http://news.ltn.com.tw/list/newspaper/opinion/', - 'http://news.ltn.com.tw/list/newspaper/world/', - 'http://news.ltn.com.tw/list/newspaper/business/', - 'http://news.ltn.com.tw/list/newspaper/sports/', - 'http://news.ltn.com.tw/list/newspaper/entertainment/', - 'http://news.ltn.com.tw/list/newspaper/consumer/', - 'http://news.ltn.com.tw/list/newspaper/supplement/' + meta = {"iter_time": 1} + url = API_URL.format(meta["iter_time"]) + yield scrapy.http.Request(url, method="GET", callback=self.parse_news_list, meta=meta) + + def parse_news_list(self, response: scrapy.Request): + start_date, end_date = utils.parse_start_date_and_end_date(self.start_date, self.end_date) + crawl_next = False + response.meta["iter_time"] += 1 + + response_data = json.loads(response.text) + if int(response_data["code"]) == 200: + for news in response_data["data"]: + if type(news) == str: + news = response_data["data"][news] + news_time = utils.parse_date(news["time"], "%H:%M") + if news_time is None: + news_date = utils.parse_date(news["time"], "%Y/%m/%d %H:%M") + else: + news_date = utils.TODAY + crawl_next = utils.can_crawl(news_date, start_date, end_date) + + if crawl_next: + url = news["url"] + if ROOT_URL not in url: + url = urljoin(ROOT_URL, url) + yield scrapy.Request(url, callback=self.parse_news) + + if crawl_next: + url = API_URL.format(response.meta["iter_time"]) + yield scrapy.http.Request(url, method="GET", callback=self.parse_news_list, meta=response.meta) + + def parse_news(self, response: scrapy.Selector): + title = response.css("h1::text").extract_first() + date_str = response.css("meta[property=pubdate]::attr(content)").extract_first() + if date_str is None: + date_str = response.css("span.time::text").extract_first() + date = utils.parse_date(date_str).replace(tzinfo=None) + + parse_text_list = [ + "div.text p", # normal + "div.text p span", # other ] - date = time.strftime('%Y%m%d') - for url in urls: - target = url + date - yield scrapy.Request(target, callback=self.parse_news_list) + for parse_text in parse_text_list: + article = response.css(parse_text) + if article is not None: + break - def parse_news_list(self, response): - for news_item in response.css('.list li'): - relative_url = news_item.css('a.tit::attr(href)').extract_first() - abs_url = response.urljoin(relative_url) - yield scrapy.Request(abs_url, callback=self.parse_news) + content = "" + for p in article: + if (len(p.css("::attr(href)")) == 0 or len(p.css("::attr(class)")) == 0) or p.css("::attr(lang)") == "zh-TW": # fmt: skip + p_text = p.css("::text") + content += " ".join(p_text.extract()) - page_list = [ - int(p) for p in response.css('.pagination a::text').extract() - if p.isdigit() - ] - current_page_extract = response.css( - '.pagination a.active::text').extract_first() - current_page = int( - current_page_extract) if current_page_extract is True else 1 - if (not page_list) or (current_page >= max(page_list)): - return - - next_page = current_page + 1 - - if next_page in page_list: - prefix = re.search(r'.*\/', response.url).group(0) - relative_url = 
prefix + '/' + str(next_page) - abs_url = response.urljoin(relative_url) - yield scrapy.Request(abs_url, callback=self.parse_news_list) - - def parse_news(self, response): - category = get_news_category(response) - - if category == 'opinion': - title = response.css('h2::text').extract_first() - else: - title = response.css('h1::text').extract_first() - - if category == 'opinion': - content = get_news_content(response, '.cont h4::text', '.cont p') - elif category == 'sports': - content = get_news_content(response, '.news_p h4::text', - '.news_p p') - elif category == 'entertainment': - content = get_news_content(response, '.news_content h4::text', - '.news_content p') - else: - content = get_news_content(response, '.text h4::text', '.text p') + category = response.css("div.breadcrumbs a::text").extract()[-1] - yield { - 'website': "自由時報", - 'url': response.url, - 'title': title, - 'date': time.strftime('%Y-%m-%d'), - 'content': content, - 'category': CATEGORY_DIC[category] - } + # description + try: + description = response.css("meta[property='og:description']::attr(content)").extract_first() + except Exception as e: + description = "" + # key_word + try: + key_word = response.css("meta[name=news_keywords]::attr(content)").extract_first() + except Exception as e: + key_word = "" -def get_news_category(response): - searched_category = re.search(r'\/news\/([a-z]*)\/', response.url) - - if searched_category and searched_category.group(1) != 'paper': - return searched_category.group(1) - elif 'talk' in response.url: - return 'opinion' - elif 'sports' in response.url: - return 'sports' - elif 'ent' in response.url: - return 'entertainment' - - -def get_news_content(response, h4_query, p_query): - h4 = response.css(h4_query).extract() - h4_num = len(h4) - counter = 0 - content = "" - for p in response.css(p_query): - if counter < h4_num: - content += " " + h4[counter] - counter += 1 - if p.css("p::text"): - content += ' '.join(p.css("p::text").extract()) - return content + yield { + "website": "自由時報", + "url": response.url, + "title": title, + "date": date, + "content": content, + "category": category, + "description": description, + "key_word": key_word, + } diff --git a/TaiwanNewsCrawler/spiders/liberty_tag_spider.py b/TaiwanNewsCrawler/spiders/liberty_tag_spider.py deleted file mode 100644 index 70972eb..0000000 --- a/TaiwanNewsCrawler/spiders/liberty_tag_spider.py +++ /dev/null @@ -1,91 +0,0 @@ -""" -自由時報tag -the crawl deal with tags of liberty's news, which could make the dictionary of jieba -Usage: scrapy crawl liberty_tag -o -""" -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import re -import datetime -import scrapy - -ROOT_URL = 'http://news.ltn.com.tw' -OLDEST_DATA_YEAR = 2015 -NEWS_DATE_BEGIN = datetime.date(OLDEST_DATA_YEAR, 1, 1) -TODAY = datetime.date.today() -CATEGORY_DIC = { - 'focus': '焦點', - 'politics': '政治', - 'society': '社會', - 'local': '地方', - 'life': '生活', - 'opinion': '言論', - 'world': '國際', - 'business': '財經', - 'entertainment': '娛樂', - 'consumer': '消費', - 'supplement': '副刊', - 'sports': '體育' -} - - -class LibertySpider(scrapy.Spider): - name = "liberty_tag" - - def start_requests(self): - urls = [ - 'http://news.ltn.com.tw/list/newspaper/focus/', - 'http://news.ltn.com.tw/list/newspaper/politics/', - 'http://news.ltn.com.tw/list/newspaper/society/', - 'http://news.ltn.com.tw/list/newspaper/local/', - 'http://news.ltn.com.tw/list/newspaper/life/', - 'http://news.ltn.com.tw/list/newspaper/opinion/', - 'http://news.ltn.com.tw/list/newspaper/world/', - 
'http://news.ltn.com.tw/list/newspaper/business/', - 'http://news.ltn.com.tw/list/newspaper/sports/', - 'http://news.ltn.com.tw/list/newspaper/entertainment/', - 'http://news.ltn.com.tw/list/newspaper/consumer/', - 'http://news.ltn.com.tw/list/newspaper/supplement/' - ] - - day = datetime.timedelta(days=1) - current_time = NEWS_DATE_BEGIN - - while current_time <= TODAY: - date = current_time.strftime('%Y%m%d') - for url in urls: - target = url + date - yield scrapy.Request(target, callback=self.parse_news_list) - current_time += day - - def parse_news_list(self, response): - for news_item in response.css('.list li'): - relative_url = news_item.css('a.tit::attr(href)').extract_first() - abs_url = response.urljoin(relative_url) - yield scrapy.Request(abs_url, callback=self.parse_tag_of_news) - - page_list = [ - int(p) for p in response.css('.pagination a::text').extract() - if p.isdigit() - ] - current_page_extract = response.css( - '.pagination a.active::text').extract_first() - current_page = int( - current_page_extract) if current_page_extract is True else 1 - if (not page_list) or (current_page >= max(page_list)): - return - - next_page = current_page + 1 - - if next_page in page_list: - prefix = re.search(r'.*\/', response.url).group(0) - relative_url = prefix + '/' + str(next_page) - abs_url = response.urljoin(relative_url) - yield scrapy.Request(abs_url, callback=self.parse_news_list) - - def parse_tag_of_news(self, response): - tag_string = response.css( - 'head meta[name=keywords]::attr(content)').extract_first() - tags = tag_string.split(',') - - yield {'tag': tags} diff --git a/TaiwanNewsCrawler/spiders/next_apple_spider.py b/TaiwanNewsCrawler/spiders/next_apple_spider.py new file mode 100644 index 0000000..989e942 --- /dev/null +++ b/TaiwanNewsCrawler/spiders/next_apple_spider.py @@ -0,0 +1,101 @@ +#! 
/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +# 蘋果日報新聞 +the crawl deal with apple's news +apple's news will not update since 2022/09/01 +Usage: scrapy crawl apple -o +""" + +import datetime as dt +import json +from urllib.parse import urljoin + +import scrapy +import scrapy.http + +import TaiwanNewsCrawler.utils as utils + +ROOT_URL = "https://tw.nextapple.com" +PAGE_URL = "https://tw.nextapple.com/realtime/recommend/{}" + + +class NextAppleSpider(scrapy.Spider): + name = "nextapple" + + def __init__(self, start_date: str = None, end_date: str = None): + super().__init__(start_date=start_date, end_date=end_date) + + def start_requests(self): + meta = {"iter_time": 1} + url = PAGE_URL.format(meta["iter_time"]) + yield scrapy.Request(url, callback=self.parse, meta=meta) + + def parse(self, response: scrapy.Selector): + start_date, end_date = utils.parse_start_date_and_end_date(self.start_date, self.end_date) + crawl_next = False + response.meta["iter_time"] += 1 + + parse_text_list = ["div.post-hot article"] + for parse_text in parse_text_list: + for news in response.css(parse_text): + news_date = utils.parse_date(news.css("div.post-inner div.post-meta time::text").extract_first()) + crawl_next = utils.can_crawl(news_date, start_date, end_date) + + if crawl_next: + url = news.css("a::attr(href)").extract_first() + if ROOT_URL not in url: + url = urljoin(ROOT_URL, url) + yield scrapy.Request(url, callback=self.parse_news) + + if crawl_next: + url = PAGE_URL.format(response.meta["iter_time"]) + yield scrapy.Request(url, callback=self.parse, meta=response.meta) + + def parse_news(self, response: scrapy.Selector): + title = response.css("h1::text").extract_first() + date_str = response.css("meta[property=pubdate]::attr(content)").extract_first() + if date_str is None: + date_str = response.css("time::text").extract_first() + date = utils.parse_date(date_str).replace(tzinfo=None) + + parse_text_list = [ + "div#main-content div.post-content p", + ] + + for parse_text in parse_text_list: + article = response.css(parse_text) + if article is not None: + break + + content = "" + for p in article: + if (len(p.css("::attr(href)")) == 0 and len(p.css("::attr(class)")) == 0) or p.css("::attr(lang)") == "zh-TW": # fmt: skip + p_text = p.css("::text") + content += " ".join(p_text.extract()) + + category = response.css("div.category::text").extract_first() + + # description + try: + description = response.css("meta[property='og:description']::attr(content)").extract_first() + except Exception as e: + description = "" + + # key_word + try: + key_word = response.css("meta[name=keywords]::attr(content)").extract_first() + except Exception as e: + key_word = "" + + yield { + "website": "壹蘋新聞網", + "url": response.url, + "title": title, + "date": date, + "content": content, + "category": category, + "description": description, + "key_word": key_word, + } diff --git a/TaiwanNewsCrawler/spiders/pts_spider.py b/TaiwanNewsCrawler/spiders/pts_spider.py index 30245e3..35b0c8c 100644 --- a/TaiwanNewsCrawler/spiders/pts_spider.py +++ b/TaiwanNewsCrawler/spiders/pts_spider.py @@ -1,75 +1,108 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- """ 公視新聞 the crawl deal with pts's news Usage: scrapy crawl pts -o """ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import json -import time + +from urllib.parse import urljoin import scrapy -TODAY = time.strftime('%Y-%m-%d') -ROOT_URL = 'https://news.pts.org.tw/list/' -ARTICLE_PREFIX = 'http://news.pts.org.tw/article/' +import TaiwanNewsCrawler.utils as utils +ROOT_URL = 
"https://news.pts.org.tw/" +PAGE_URL = "https://news.pts.org.tw/dailynews?page={}" -class EttodaySpider(scrapy.Spider): + +class PtsSpider(scrapy.Spider): name = "pts" + def __init__(self, start_date: str = None, end_date: str = None): + super().__init__(start_date=start_date, end_date=end_date) + def start_requests(self): - url = 'https://news.pts.org.tw/list/0' - meta = {'iter_time': 0} - yield scrapy.Request(url, callback=self.parse_news_list, meta=meta) - - def parse_news_list(self, response): - response.meta['iter_time'] = 1 - for news_item in response.css('ul.list-news li'): - url = news_item.css('h2 a::attr(href)').extract_first() - date_time = news_item.css('.list-news-time::text').extract_first() - title = news_item.css('h2 a::text').extract_first() - content = news_item.css( - '.list-news-description::text').extract_first() - category = news_item.css( - '.list-news-program::text').extract_first() - - if TODAY in date_time: - yield { - 'website': '公視', - 'url': url, - 'title': title, - 'date': date_time, - 'content': content, - 'category': category - } - - yield scrapy.FormRequest( - url='https://news.pts.org.tw/list/getmore.php', - callback=self.get_news, - meta=response.meta, - formdata={ - 'page': '1' - }) - - def get_news(self, response): - response.meta['iter_time'] += 1 - news_items = json.loads(response.text) - - if news_items: - for n in news_items: - yield { - 'website': '公視', - 'url': ARTICLE_PREFIX + n['news_id'], - 'title': n['subject'], - 'date': n['news_date'], - 'content': n['content'], - 'category': n['program_name'] - } - yield scrapy.FormRequest( - url="https://news.pts.org.tw/list/getmore.php", - callback=self.get_news, - meta=response.meta, - formdata={ - 'page': str(response.meta['iter_time']) - }) + meta = {"iter_time": 1} + url = PAGE_URL.format(meta["iter_time"]) + yield scrapy.Request(url, callback=self.parse, meta=meta) + + def parse(self, response: scrapy.Selector): + start_date, end_date = utils.parse_start_date_and_end_date(self.start_date, self.end_date) + crawl_next = False + response.meta["iter_time"] += 1 + + parse_text_list = [ + "div.break-news-container div.breakingnews", + "div.break-news-container ul.news-list li.d-flex", + ] + for parse_text in parse_text_list: + for news in response.css(parse_text): + news_date = utils.parse_date(news.css("time::attr(datetime)").extract_first()) + crawl_next = utils.can_crawl(news_date, start_date, end_date) + + if crawl_next: + url = news.css("h2 a::attr(href)").extract_first() + if ROOT_URL not in url: + url = urljoin(ROOT_URL, url) + yield scrapy.Request(url, callback=self.parse_news) + + if crawl_next: + url = PAGE_URL.format(response.meta["iter_time"]) + yield scrapy.Request(url, callback=self.parse, meta=response.meta) + + def parse_news(self, response: scrapy.Selector): + title = response.css("h1::text").extract_first() + date_str = response.css("meta[property=pubdate]::attr(content)").extract_first() + if date_str is None: + date_str = response.css("time::text").extract_first() + date = utils.parse_date(date_str).replace(tzinfo=None) + + parse_text_list = [ + "article.post-article p", + ] + + for parse_text in parse_text_list: + article = response.css(parse_text) + if article is not None: + break + + content = "" + for p in article: + if (len(p.css("::attr(href)")) == 0 and len(p.css("::attr(class)")) == 0) or p.css("::attr(lang)") == "zh-TW": # fmt: skip + p_text = p.css("::text") + content += " ".join(p_text.extract()) + + category = response.css("ol.breadcrumb 
li.breadcrumb-item")[-1].css("a::text").extract()[-1] + + # description + try: + description = response.css("meta[property='og:description']::attr(content)").extract_first() + except Exception as e: + description = "" + + # key_word + try: + key_word_list = response.css("div.main-info ul.tag-list li.blue-tag") + key_word = "" + for li in key_word_list: + class_list = li.css("::attr(class)").extract_first() + if "more-tag" not in class_list: + text = li.css("a::text").extract_first() + if len(key_word) == 0: + key_word += f"{text}" + else: + key_word += f",{text}" + except Exception as e: + key_word = "" + + yield { + "website": "公視", + "url": response.url, + "title": title, + "date": date, + "content": content, + "category": category, + "description": description, + "key_word": key_word, + } diff --git a/TaiwanNewsCrawler/spiders/setn_spider.py b/TaiwanNewsCrawler/spiders/setn_spider.py index e1fa539..3644dc6 100644 --- a/TaiwanNewsCrawler/spiders/setn_spider.py +++ b/TaiwanNewsCrawler/spiders/setn_spider.py @@ -1,69 +1,106 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- """ 三立新聞 the crawl deal with setn's news Usage: scrapy crawl setn -o -s DOWNLOAD_DELAY=0.1 """ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import re -from datetime import date -from datetime import timedelta + +from urllib.parse import urljoin + import scrapy -YESTERDAY = (date.today() - timedelta(1)).strftime('%m/%d/%Y') +import TaiwanNewsCrawler.utils as utils + +ROOT_URL = "http://www.setn.com" +PAGE_URL = "http://www.setn.com/ViewAll.aspx?p={}" class SetnSpider(scrapy.Spider): name = "setn" - def __init__(self, category=None, *args, **kwargs): - super(SetnSpider, self).__init__(*args, **kwargs) - self.start_urls = [ - 'http://www.setn.com/ViewAll.aspx?date={}&p=1'.format(YESTERDAY) + def __init__(self, start_date: str = None, end_date: str = None): + super().__init__(start_date=start_date, end_date=end_date) + + def start_requests(self): + meta = {"iter_time": 1} + url = PAGE_URL.format(meta["iter_time"]) + yield scrapy.Request(url, callback=self.parse, meta=meta) + + def parse(self, response: scrapy.Selector): + crawl_next = False + response.meta["iter_time"] += 1 + + parse_text_list = ["#NewsList div.newsItems"] + for parse_text in parse_text_list: + for news in response.css(parse_text): + crawl_next = True + + url = news.css("h3 a::attr(href)").extract_first() + if ROOT_URL not in url: + url = urljoin(ROOT_URL, url) + yield scrapy.Request(url, callback=self.parse_news) + + if crawl_next: + url = PAGE_URL.format(response.meta["iter_time"]) + yield scrapy.Request(url, callback=self.parse, meta=response.meta) + + def parse_news(self, response: scrapy.Selector): + start_date, end_date = utils.parse_start_date_and_end_date(self.start_date, self.end_date) + title = response.css("h1::text").extract_first() + date_str = response.css("meta[name=pubdate]::attr(content)").extract_first() + if date_str is None: + date_str = response.css("time::text").extract_first() + if date_str is None: + date_str = response.css("meta[property='article:published_time']::attr(content)").extract_first() + date = utils.parse_date(date_str).replace(tzinfo=None) + + crawl = utils.can_crawl(date, start_date, end_date) + if not crawl: + return + + parse_text_list = [ + "article p", ] - self.last_page_flag = 0 - - def parse(self, response): - - for news in response.css('.box ul li'): - category = news.css('.tab_list_type span::text').extract_first() - meta = {'category': category} - url = news.css('a::attr(href)').extract_first() - url = 
response.urljoin(url) - yield scrapy.Request(url, callback=self.parse_news, meta=meta) - - last_two_pages = response.css('.pager a::attr(href)').extract()[-2:] - page1 = last_two_pages[0].split('&p=')[1] - page2 = last_two_pages[1].split('&p=')[1] - - if page1 == page2: - self.last_page_flag = self.last_page_flag + 1 - - if self.last_page_flag < 2: - url_arr = response.url.split('&p=') - current_page = int(url_arr[1]) - next_page_url = '&p='.join( - url_arr[:-1]) + '&p=' + str(current_page + 1) - yield scrapy.Request(next_page_url, callback=self.parse) - - def parse_news(self, response): - title = response.css('.title h1::text').extract_first() - content = '' - date_of_news = '' - if response.url.split('/')[3] == 'E': - date_of_news = response.css('.time::text').extract_first()[:10] - content = response.css('.Content2 p::text').extract() - else: - date_of_news = response.css('.date::text').extract_first()[:10] - content = response.css('#Content1 p::text').extract() - - content = ''.join(content) + + for parse_text in parse_text_list: + article = response.css(parse_text) + if article is not None: + break + + content = "" + for p in article: + if ( + len(p.css("::attr(href)")) == 0 + and len(p.css("::attr(class)")) == 0 + and len(p.css("::attr(style)")) == 0 + ) or p.css("::attr(lang)") == "zh-TW": + p_text = p.css("::text") + content += " ".join(p_text.extract()) + + category = response.css("meta[name=section]::attr(content)").extract_first() + if category is None: + category = response.css("meta[property='article:section']::attr(content)").extract_first() + + # description + try: + description = response.css("meta[property='og:description']::attr(content)").extract_first() + except Exception as e: + description = "" + + # key_word + try: + key_word = response.css("meta[name=news_keywords]::attr(content)").extract_first() + except Exception as e: + key_word = "" yield { - 'website': "三立新聞", - 'url': response.url, - 'title': title, - 'date': date_of_news, - 'content': content, - 'category': response.meta['category'] + "website": "三立新聞", + "url": response.url, + "title": title, + "date": date, + "content": content, + "category": category, + "description": description, + "key_word": key_word, } diff --git a/TaiwanNewsCrawler/spiders/tvbs_spider.py b/TaiwanNewsCrawler/spiders/tvbs_spider.py index 81cf8d0..c448e4f 100644 --- a/TaiwanNewsCrawler/spiders/tvbs_spider.py +++ b/TaiwanNewsCrawler/spiders/tvbs_spider.py @@ -1,75 +1,91 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- """ TVBS the crawl deal with tvbs's news Usage: scrapy crawl tvbs -o """ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import re -from datetime import date -from datetime import timedelta +import datetime as dt +from urllib.parse import urljoin import scrapy -YESTERDAY = (date.today() - timedelta(1)).strftime('%Y/%m/%d') -YESTERDAY = YESTERDAY.replace('/', '-') +import TaiwanNewsCrawler.utils as utils + +ROOT_URL = "https://news.tvbs.com.tw" +PAGE_URL = "https://news.tvbs.com.tw/realtime/news/{}" class TvbsSpider(scrapy.Spider): name = "tvbs" - start_urls = [ - 'http://news.tvbs.com.tw/news/realtime/all/{}/1'.format(YESTERDAY) - ] + + def __init__(self, start_date: str = None, end_date: str = None): + super().__init__(start_date=start_date, end_date=end_date) + + def start_requests(self): + start_date, end_date = utils.parse_start_date_and_end_date(self.start_date, self.end_date) + date = start_date + + while date < end_date: + url = PAGE_URL.format(date.strftime("%Y-%m-%d")) + yield scrapy.Request(url, method="GET", 
callback=self.parse) + date += dt.timedelta(days=1) def parse(self, response): - for news in response.css('.realtime_news_content_titel'): - category = news.css('p::text').extract_first() - meta = {'category': category} - url = news.css('div a::attr(href)').extract_first() - url = response.urljoin(url) - yield scrapy.Request(url, callback=self.parse_news, meta=meta) - - total_pages = response.css( - '.realtime_news_underbtn li:last-child::text').extract_first() - total_pages_num = int(total_pages[1:-1]) - url_arr = response.url.split('/') - current_page_index = int(url_arr[-1]) - - if current_page_index < total_pages_num: - next_page_url = '/'.join(url_arr[:-1]) + \ - '/' + str(current_page_index + 1) - yield scrapy.Request(next_page_url, callback=self.parse) - - def parse_news(self, response): - title = response.css('.newsdetail-h2 p strong::text').extract_first() - date_of_news = response.css( - '.newsdetail-time1 p::text').extract_first()[:10] - raw_content = response.css('.newsdetail-content').extract_first() - - TAG_RE = re.compile(r'<[^>]+>([^<]*]+>)?') - - content_prefix = '' - content_suffix1 = '' - content_suffix2 = '