From 6de6e5ec95e3c90a9242a499aa280d701978d1fa Mon Sep 17 00:00:00 2001
From: cool9203
Date: Mon, 24 Oct 2022 17:57:15 +0800
Subject: [PATCH 01/39] edit: version

---
 requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 69521a4..3893fc6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
-scrapy==1.3.0
-Twisted==16.6.0
\ No newline at end of file
+scrapy==2.7.0
+Twisted==22.8.0
\ No newline at end of file

From c608d82e37f7fe908f0c17301492246c58bec8f8 Mon Sep 17 00:00:00 2001
From: cool9203
Date: Mon, 24 Oct 2022 17:57:54 +0800
Subject: [PATCH 02/39] edit: Prerequisites and Available spiders

---
 README.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 3e53fd2..d3bd657 100644
--- a/README.md
+++ b/README.md
@@ -25,13 +25,14 @@ $ scrapy crawl apple -o apple_news.json
 ## Prerequisites
 - Python3
-- Scrapy 1.3.0
+- Scrapy >= 1.3.0
+- Twisted >= 16.6.0
 
 ## Usage
 ```scrapy crawl <spider_name> -o <output_file_name>```
 
 ### Available spiders
-1. apple
-2. appleRealtime
+1. apple (not updated since 2022/09/01)
+2. appleRealtime (not updated since 2022/09/01)
 3. china
 4. cna
 5. cts
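The crawl command documented above also has a programmatic equivalent. A minimal sketch, assuming it is run from the project root so that `scrapy.cfg` and the project's `settings.py` are discoverable (`scrapy crawl china -o china_news.json` is the CLI form):

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Look up the spider registered as "china" in TaiwanNewsCrawler/spiders
# and run it; get_project_settings() reads settings.py via scrapy.cfg.
process = CrawlerProcess(get_project_settings())
process.crawl("china")
process.start()  # blocks until the crawl finishes
```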
From 721597d6760725d16efb8231bbd7e868960ba46c Mon Sep 17 00:00:00 2001
From: cool9203
Date: Mon, 24 Oct 2022 17:58:25 +0800
Subject: [PATCH 03/39] edit: now can work since 2022-10-24

---
 TaiwanNewsCrawler/spiders/china_spider.py | 31 +++++++++++++++--------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/TaiwanNewsCrawler/spiders/china_spider.py b/TaiwanNewsCrawler/spiders/china_spider.py
index 0a598f8..2c2c8cd 100644
--- a/TaiwanNewsCrawler/spiders/china_spider.py
+++ b/TaiwanNewsCrawler/spiders/china_spider.py
@@ -6,8 +6,10 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 from datetime import datetime
+import time
 import scrapy
 
+TODAY = time.strftime('%Y-%m-%d')
 ROOT_URL = 'http://www.chinatimes.com'
 PAGE_URL = 'http://www.chinatimes.com/newspapers/2601'
 
@@ -17,34 +19,41 @@ class ChinaSpider(scrapy.Spider):
     start_urls = ['http://www.chinatimes.com/newspapers/2601']
 
     def parse(self, response):
-        news_in_page = response.css('.listRight li h2 a')
+        has_next_page = False
+        news_in_page = response.css('ul.vertical-list li')
         if not news_in_page:
             return
 
         for news in news_in_page:
             url = news.css('a::attr(href)').extract_first()
-            if ROOT_URL not in url:
-                url = ROOT_URL + url
-            url = response.urljoin(url)
-            yield scrapy.Request(url, callback=self.parse_news)
+            date_time_1 = news.css('time::attr(datetime)').extract_first()
+            date_time_2 = news.css('time span.date::text').extract_first()
+            if TODAY in date_time_1 or TODAY in date_time_2:
+                has_next_page = True
+
+                if ROOT_URL not in url:
+                    url = ROOT_URL + url
+                url = response.urljoin(url)
+                yield scrapy.Request(url, callback=self.parse_news)
 
         if 'next_page' in response.meta:
             meta = {'next_page': response.meta['next_page'] + 1}
         else:
             meta = {'next_page': 2}
-        next_url = PAGE_URL + '?page=' + str(meta['next_page'])
-        yield scrapy.Request(next_url, callback=self.parse, meta=meta)
+        if has_next_page:
+            next_url = PAGE_URL + '?page=' + str(meta['next_page'])
+            yield scrapy.Request(next_url, callback=self.parse, meta=meta)
 
     def parse_news(self, response):
         title = response.css('h1::text').extract_first()
         date_of_news_str = response.css('time::attr(datetime)').extract_first()
-        date_of_news = datetime.strptime(date_of_news_str, '%Y/%m/%d %H:%M')
+        date_of_news = datetime.strptime(date_of_news_str, '%Y-%m-%d %H:%M')
         content = ""
-        for p in response.css('article p'):
+        for p in response.css('div.article-body p'):
             p_text = p.css('::text')
             if p_text:
-                content += ' '.join(p_text.extract())
+                content += ''.join(p_text.extract())
 
-        category = response.css('.page_index span::text').extract()[-1].strip()
+        category = response.css('meta[name=section]::attr(content)').extract_first()
 
         yield {
             'website': "中國時報",

From e995dfffe4fefc7edb718e60603868c0df3a8d7e Mon Sep 17 00:00:00 2001
From: cool9203
Date: Tue, 25 Oct 2022 13:39:29 +0800
Subject: [PATCH 04/39] add: utils

---
 TaiwanNewsCrawler/utils.py | 43 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100644 TaiwanNewsCrawler/utils.py

diff --git a/TaiwanNewsCrawler/utils.py b/TaiwanNewsCrawler/utils.py
new file mode 100644
index 0000000..3dad65c
--- /dev/null
+++ b/TaiwanNewsCrawler/utils.py
@@ -0,0 +1,43 @@
+from typing import (Union, Tuple)
+import datetime as dt
+
+TODAY = dt.datetime.strptime(dt.datetime.now().strftime("%Y-%m-%d"), '%Y-%m-%d')
+PARSE_DATE_FORMAT_LIST = ["%Y-%m-%d", "%Y/%m/%d", "%Y %m %d"]
+PARSE_TIME_FORMAT_LIST = ["%H %M", "%H:%M", "%H %M %S", "%H:%M:%S"]
+
+
+def parse_start_date_and_end_date(start_date: Union[str, None], end_date: Union[str, None]) -> Tuple[dt.datetime, dt.datetime]:
+    if (not start_date is None):
+        start_date = dt.datetime.strptime(start_date, '%Y-%m-%d')
+    else:
+        start_date = TODAY
+
+    if (not end_date is None):
+        end_date = dt.datetime.strptime(end_date, '%Y-%m-%d')
+    else:
+        end_date = TODAY
+    end_date += dt.timedelta(days=1)
+    return (start_date, end_date)
+
+
+def parse_date(date_str: str, parse_format: str=None) -> dt.datetime:
+    if (not parse_format is None):
+        return dt.datetime.strptime(date_str, parse_format)
+
+    for date_format in PARSE_DATE_FORMAT_LIST:
+        for time_format in PARSE_TIME_FORMAT_LIST:
+            try:
+                date = dt.datetime.strptime(date_str, f"{date_format} {time_format}")
+                break
+            except:
+                date = None
+        if (not date is None):
+            break
+    return date
+
+
+def can_crawl(date: dt.datetime, start_date: dt.datetime, end_date: dt.datetime) -> bool:
+    if (date >= start_date and date <= end_date):
+        return True
+    else:
+        return False
\ No newline at end of file
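How the new helpers are meant to compose (a usage sketch, not part of the patch). Note that at this stage `parse_date` only tries `"<date> <time>"` pairs, so a bare date such as `"2022-10-24"` still returns `None` until the format lists are widened in a later patch:

```python
import TaiwanNewsCrawler.utils as utils

# No arguments: the crawl window defaults to [today, tomorrow);
# the helper adds one day so end_date is effectively exclusive.
start, end = utils.parse_start_date_and_end_date(None, None)

# "%Y/%m/%d" + "%H:%M" is one of the date/time combinations tried in order.
d = utils.parse_date("2022/10/24 17:57")

utils.can_crawl(d, start, end)  # True only when start <= d <= end
```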
From 04f8f77087a01351a770603a3dd206600a507aa9 Mon Sep 17 00:00:00 2001
From: cool9203
Date: Tue, 25 Oct 2022 13:42:45 +0800
Subject: [PATCH 05/39] edit: add date args

---
 TaiwanNewsCrawler/spiders/china_spider.py | 40 +++++++++++++----------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/TaiwanNewsCrawler/spiders/china_spider.py b/TaiwanNewsCrawler/spiders/china_spider.py
index 2c2c8cd..a6d6f83 100644
--- a/TaiwanNewsCrawler/spiders/china_spider.py
+++ b/TaiwanNewsCrawler/spiders/china_spider.py
@@ -5,48 +5,54 @@
 """
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-from datetime import datetime
-import time
 import scrapy
+import TaiwanNewsCrawler.utils as utils
+
 
-TODAY = time.strftime('%Y-%m-%d')
 ROOT_URL = 'http://www.chinatimes.com'
 PAGE_URL = 'http://www.chinatimes.com/newspapers/2601'
 
-
 class ChinaSpider(scrapy.Spider):
     name = "china"
     start_urls = ['http://www.chinatimes.com/newspapers/2601']
 
-    def parse(self, response):
-        has_next_page = False
+    def __init__(self, start_date: str=None, end_date: str=None):
+        super().__init__(start_date=start_date, end_date=end_date)
+
+    def parse(self, response: scrapy.Selector):
+        start_date, end_date = utils.parse_start_date_and_end_date(self.start_date, self.end_date)
+
+        crawl_next = False
         news_in_page = response.css('ul.vertical-list li')
         if not news_in_page:
             return
 
         for news in news_in_page:
-            url = news.css('a::attr(href)').extract_first()
-            date_time_1 = news.css('time::attr(datetime)').extract_first()
-            date_time_2 = news.css('time span.date::text').extract_first()
-            if TODAY in date_time_1 or TODAY in date_time_2:
-                has_next_page = True
-
-                if ROOT_URL not in url:
+            news_date = utils.parse_date(news.css('time::attr(datetime)').extract_first())
+            if (news_date is None):
+                continue
+            crawl_next = utils.can_crawl(news_date, start_date, end_date)
+
+            if (crawl_next):
+                url = news.css('a::attr(href)').extract_first()
+                if (not ROOT_URL in url):
                     url = ROOT_URL + url
                 url = response.urljoin(url)
                 yield scrapy.Request(url, callback=self.parse_news)
 
+
         if ('next_page' in response.meta):
             meta = {'next_page': response.meta['next_page'] + 1}
         else:
             meta = {'next_page': 2}
 
-        if has_next_page:
+        if (crawl_next):
            next_url = PAGE_URL + '?page=' + str(meta['next_page'])
            yield scrapy.Request(next_url, callback=self.parse, meta=meta)
 
-    def parse_news(self, response):
+    def parse_news(self, response: scrapy.Selector):
         title = response.css('h1::text').extract_first()
         date_of_news_str = response.css('time::attr(datetime)').extract_first()
-        date_of_news = datetime.strptime(date_of_news_str, '%Y-%m-%d %H:%M')
+        date_of_news = utils.parse_date(date_of_news_str, '%Y-%m-%d %H:%M')
         content = ""
         for p in response.css('div.article-body p'):
             p_text = p.css('::text')
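With `__init__` now accepting `start_date`/`end_date`, a date window can be passed straight from the command line via Scrapy's `-a` spider arguments (`scrapy crawl china -a start_date=2022-10-20 -a end_date=2022-10-24 -o china_news.json`), or programmatically; the dates here are illustrative:

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
# Keyword arguments to crawl() are forwarded to ChinaSpider.__init__,
# exactly like -a arguments on the CLI.
process.crawl("china", start_date="2022-10-20", end_date="2022-10-24")
process.start()
```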
From 35c3def2c13570e9e8ad4e6ad38914fcf4ddfbea Mon Sep 17 00:00:00 2001
From: cool9203
Date: Tue, 25 Oct 2022 14:55:50 +0800
Subject: [PATCH 06/39] add: description

---
 TaiwanNewsCrawler/items.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/TaiwanNewsCrawler/items.py b/TaiwanNewsCrawler/items.py
index 166f23b..9d8e00a 100644
--- a/TaiwanNewsCrawler/items.py
+++ b/TaiwanNewsCrawler/items.py
@@ -15,3 +15,4 @@ class TaiwannewscrawlerItem(scrapy.Item):
     date = scrapy.Field()
     content = scrapy.Field()
     category = scrapy.Field()
+    description = scrapy.Field()

From fdaa3cb597c3a12b989ab2926f6d5bd832ac0501 Mon Sep 17 00:00:00 2001
From: cool9203
Date: Tue, 25 Oct 2022 14:56:05 +0800
Subject: [PATCH 07/39] add: .vscode

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index 1726074..7fc1dea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -91,3 +91,6 @@ ENV/
 *.swp
 
 setup.py
+
+# workspace
+.vscode
\ No newline at end of file

From 59bd1c81db95e14f27b7da236205e2c483c32239 Mon Sep 17 00:00:00 2001
From: cool9203
Date: Tue, 25 Oct 2022 14:56:18 +0800
Subject: [PATCH 08/39] add: description

---
 TaiwanNewsCrawler/spiders/china_spider.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/TaiwanNewsCrawler/spiders/china_spider.py b/TaiwanNewsCrawler/spiders/china_spider.py
index a6d6f83..105187f 100644
--- a/TaiwanNewsCrawler/spiders/china_spider.py
+++ b/TaiwanNewsCrawler/spiders/china_spider.py
@@ -23,11 +23,11 @@ def parse(self, response: scrapy.Selector):
         start_date, end_date = utils.parse_start_date_and_end_date(self.start_date, self.end_date)
 
         crawl_next = False
-        news_in_page = response.css('ul.vertical-list li')
-        if not news_in_page:
+        all_news = response.css('ul.vertical-list li')
+        if not all_news:
             return
 
-        for news in news_in_page:
+        for news in all_news:
             news_date = utils.parse_date(news.css('time::attr(datetime)').extract_first())
             if (news_date is None):
                 continue
@@ -61,11 +61,18 @@ def parse_news(self, response: scrapy.Selector):
 
         category = response.css('meta[name=section]::attr(content)').extract_first()
 
+        # description
+        try:
+            description = response.css('meta[property=og:description]::attr(content)').extract_first()
+        except:
+            description = ""
+
         yield {
             'website': "中國時報",
             'url': response.url,
             'title': title,
             'date': date_of_news,
             'content': content,
-            'category': category
+            'category': category,
+            'description': description
         }

From 04b46ec8c8a527b5be9defc392819d2160622b80 Mon Sep 17 00:00:00 2001
From: cool9203
Date: Tue, 25 Oct 2022 14:57:06 +0800
Subject: [PATCH 09/39] edit: now can crawl cna

---
 TaiwanNewsCrawler/spiders/cna_spider.py | 94 +++++++++++++++----------
 1 file changed, 56 insertions(+), 38 deletions(-)

diff --git a/TaiwanNewsCrawler/spiders/cna_spider.py b/TaiwanNewsCrawler/spiders/cna_spider.py
index 990f0ff..47ae13b 100644
--- a/TaiwanNewsCrawler/spiders/cna_spider.py
+++ b/TaiwanNewsCrawler/spiders/cna_spider.py
@@ -5,58 +5,76 @@
 """
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-from datetime import datetime
 import scrapy
+import json
+import TaiwanNewsCrawler.utils as utils
 
-ROOT_URL = 'http://www.cna.com.tw'
-TODAY = datetime.today().date()
+ROOT_URL = 'https://www.cna.com.tw'
+API_URL = 'https://www.cna.com.tw/cna2018api/api/WNewsList'
+API_POST_DATA = {"action": "0", "category": "aall", "pagesize": "20", "pageidx": 1}
 
 
 class CnaSpider(scrapy.Spider):
     name = "cna"
-    start_urls = ['http://www.cna.com.tw/list/aall-1.aspx']
-
-    def parse(self, response):
-        current_page_index = int(
-            response.css('.pagination li.current a::text').extract_first())
-
-        newses_time_str = response.css('.article_list li span::text').extract()
-        newses_time = [
-            datetime.strptime(i, '%Y/%m/%d %H:%M').date()
-            for i in newses_time_str
-        ]
-        is_over_today = False
-
-        for t in newses_time:
-            if t < TODAY:
-                is_over_today = True
-
-        if not is_over_today:
-            next_url = 'http://www.cna.com.tw/list/aall-' + str(
-                current_page_index + 1) + '.aspx'
-            yield scrapy.Request(next_url, callback=self.parse)
-
-        for news in response.css('div.article_list li a'):
-            url = response.urljoin(news.css('a::attr(href)').extract_first())
-            yield scrapy.Request(url, callback=self.parse_news)
-
-    def parse_news(self, response):
-        title = response.css('h1::text').extract_first()
-        date = response.css('div.update_times p::text').extract_first()[5:]
+    start_urls = ['https://www.cna.com.tw/list/aall.aspx']
+
+    def __init__(self, start_date: str=None, end_date: str=None):
+        super().__init__(start_date=start_date, end_date=end_date)
+
+    def parse(self, response: scrapy.Selector):
+        start_date, end_date = utils.parse_start_date_and_end_date(self.start_date, self.end_date)
+
+        crawl_next = False
+        all_news = response.css('ul#jsMainList li')
+        if not all_news:
+            return
+
+        for news in all_news:
+            news_date = utils.parse_date(news.css('div.date::text').extract_first())
+            if (news_date is None):
+                continue
+            crawl_next = utils.can_crawl(news_date, start_date, end_date)
+
+            if (crawl_next):
+                url = news.css('a::attr(href)').extract_first()
+                if (not ROOT_URL in url):
+                    url = ROOT_URL + url
+                url = response.urljoin(url)
+                yield scrapy.Request(url, callback=self.parse_news)
+
+        if (crawl_next):
+            API_POST_DATA["pageidx"] += 1
+            # use api to get more news
+            # yield scrapy.Request(API_URL, method='POST', body=json.dumps(API_POST_DATA), callback=self.parse_api, headers={'Content-Type':'application/json'})
+
+    def parse_news(self, response: scrapy.Selector):
+        title = response.css('h1 span::text').extract_first()
+        date_str = response.css('div.updatetime span::text').extract_first()
+        date = utils.parse_date(date_str, "%Y/%m/%d %H:%M")
         content = ''
-        for p in response.css('div.article_box section p'):
+        for p in response.css('div.centralContent div.paragraph p'):
             p_text = p.css('::text')
             if p_text:
-                content += ' '.join(p_text.extract())
+                content += ''.join(p_text.extract())
 
-        category_links = response.css('div.breadcrumb span a span')
-        category = category_links[1].css('::text').extract_first()
+        category = response.css('article.article::attr(data-origin-type-name)').extract_first()
+
+        # description
+        try:
+            description = response.css('meta.description::attr(content)').extract_first()
+        except:
+            description = ""
 
         yield {
             'website': "中央通訊社",
             'url': response.url,
             'title': title,
-            'date': date[:10].replace('/', '-'),
+            'date': date,
             'content': content,
-            'category': category
+            'category': category,
+            'description': description
        }

+    # TODO: can use api to get more news
+    def parse_api(self, response):
+        pass
\ No newline at end of file
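The patch leaves `parse_api` as a stub and keeps the POST request commented out. If it were ever enabled, the callback would need to walk whatever JSON the WNewsList endpoint returns; the field names below (`ResultData`, `Items`, `PageUrl`) are hypothetical placeholders, not verified against the real API:

```python
# Hypothetical completion of the parse_api stub; the response-shape keys
# are assumptions and would need checking against the live endpoint.
def parse_api(self, response):
    data = json.loads(response.text)
    for item in data.get("ResultData", {}).get("Items", []):
        url = item.get("PageUrl")
        if url:
            yield scrapy.Request(response.urljoin(url), callback=self.parse_news)
```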
From 4edf1e92ffef51a4c033c55d0b40ed9ad29aa1e8 Mon Sep 17 00:00:00 2001
From: cool9203
Date: Tue, 25 Oct 2022 15:07:41 +0800
Subject: [PATCH 10/39] edit: Available spiders use check

---
 README.md | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index d3bd657..bb6f8be 100644
--- a/README.md
+++ b/README.md
@@ -30,19 +30,19 @@ $ scrapy crawl apple -o apple_news.json
 
 ## Usage
 ```scrapy crawl <spider_name> -o <output_file_name>```
 
-### Available spiders
-1. apple (not updated since 2022/09/01)
-2. appleRealtime (not updated since 2022/09/01)
-3. china
-4. cna
-5. cts
-6. ettoday
-7. liberty
-8. libertyRealtime
-9. pts
-10. setn
-11. tvbs
-12. udn
+### Available spiders (all 12)
+[ ] apple (not updated since 2022/09/01)
+[ ] appleRealtime (not updated since 2022/09/01)
+[X] china
+[X] cna
+[ ] cts
+[ ] ettoday
+[ ] liberty
+[ ] libertyRealtime
+[ ] pts
+[ ] setn
+[ ] tvbs
+[ ] udn
 
 ## Output
 | Key | Value |

From 05f9447c0beef30e0f99c6d686e83ee6308cbda7 Mon Sep 17 00:00:00 2001
From: cool9203
Date: Tue, 25 Oct 2022 16:40:17 +0800
Subject: [PATCH 11/39] edit: use same var name in parse_news

---
 TaiwanNewsCrawler/spiders/china_spider.py | 8 ++++----
 TaiwanNewsCrawler/spiders/cna_spider.py   | 5 +++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/TaiwanNewsCrawler/spiders/china_spider.py b/TaiwanNewsCrawler/spiders/china_spider.py
index 105187f..b5a0670 100644
--- a/TaiwanNewsCrawler/spiders/china_spider.py
+++ b/TaiwanNewsCrawler/spiders/china_spider.py
@@ -51,13 +51,13 @@ def parse(self, response: scrapy.Selector):
 
     def parse_news(self, response: scrapy.Selector):
         title = response.css('h1::text').extract_first()
-        date_of_news_str = response.css('time::attr(datetime)').extract_first()
-        date_of_news = utils.parse_date(date_of_news_str, '%Y-%m-%d %H:%M')
+        date_str = response.css('time::attr(datetime)').extract_first()
+        date = utils.parse_date(date_str, '%Y-%m-%d %H:%M')
         content = ""
         for p in response.css('div.article-body p'):
             p_text = p.css('::text')
             if p_text:
-                content += ''.join(p_text.extract())
+                content += ' '.join(p_text.extract())
 
         category = response.css('meta[name=section]::attr(content)').extract_first()
 
@@ -71,7 +71,7 @@ def parse_news(self, response: scrapy.Selector):
             'website': "中國時報",
             'url': response.url,
             'title': title,
-            'date': date_of_news,
+            'date': date,
             'content': content,
             'category': category,
             'description': description
diff --git a/TaiwanNewsCrawler/spiders/cna_spider.py b/TaiwanNewsCrawler/spiders/cna_spider.py
index c9dd7df..fe27c0f 100644
--- a/TaiwanNewsCrawler/spiders/cna_spider.py
+++ b/TaiwanNewsCrawler/spiders/cna_spider.py
@@ -6,6 +6,7 @@ 
#!/usr/bin/env python # -*- coding: utf-8 -*- import scrapy +import scrapy.http import json import TaiwanNewsCrawler.utils as utils @@ -45,7 +46,7 @@ def parse(self, response: scrapy.Selector): if (crawl_next): API_POST_DATA["pageidx"] += 1 # use api to get more news - # yield scrapy.Request(API_URL, method='POST', body=json.dumps(API_POST_DATA), callback=self.parse_api, headers={'Content-Type':'application/json'}) + # yield scrapy.http.Request(API_URL, method='POST', body=json.dumps(API_POST_DATA), callback=self.parse_api, headers={'Content-Type':'application/json'}) def parse_news(self, response: scrapy.Selector): title = response.css('h1 span::text').extract_first() @@ -55,7 +56,7 @@ def parse_news(self, response: scrapy.Selector): for p in response.css('div.centralContent div.paragraph p'): p_text = p.css('::text') if p_text: - content += ''.join(p_text.extract()) + content += ' '.join(p_text.extract()) category = response.css('article.article::attr(data-origin-type-name)').extract_first() From f1b566a86ad383ea76f05d2a7bec3a9010f926d8 Mon Sep 17 00:00:00 2001 From: cool9203 Date: Tue, 25 Oct 2022 16:47:41 +0800 Subject: [PATCH 12/39] edit: now can crawl cts json with cts.api --- TaiwanNewsCrawler/spiders/cts_spider.py | 83 ++++++++++++++----------- 1 file changed, 46 insertions(+), 37 deletions(-) diff --git a/TaiwanNewsCrawler/spiders/cts_spider.py b/TaiwanNewsCrawler/spiders/cts_spider.py index c8b54ac..14b16b0 100644 --- a/TaiwanNewsCrawler/spiders/cts_spider.py +++ b/TaiwanNewsCrawler/spiders/cts_spider.py @@ -5,55 +5,64 @@ """ #!/usr/bin/env python # -*- coding: utf-8 -*- -from datetime import date -from datetime import timedelta import scrapy +import scrapy.http +import datetime as dt +import json +import TaiwanNewsCrawler.utils as utils -YESTERDAY = (date.today() - timedelta(1)).strftime('%Y/%m/%d') - +ROOT_URL = "https://news.cts.com.tw" +API_URL = "https://news.cts.com.tw/api/news/{}/daylist-news.json" class CtsSpider(scrapy.Spider): name = "cts" - start_urls = [ - 'http://news.cts.com.tw/daylist/{}/index.html'.format(YESTERDAY) - ] - def parse(self, response): - for news in response.css('.news_right'): - url = news.css('a::attr(href)').extract_first() + def __init__(self, start_date: str=None, end_date: str=None): + super().__init__(start_date=start_date, end_date=end_date) + + def start_requests(self): + start_date, end_date = utils.parse_start_date_and_end_date(self.start_date, self.end_date, utils.YESTERDAY, utils.YESTERDAY) + date = start_date + + while (date < end_date): + url = API_URL.format(date.strftime("%Y/%m/%d")) + yield scrapy.http.Request(url, method='GET', callback=self.parse) + date += dt.timedelta(days=1) + + def parse(self, response: scrapy.Request): + response = json.loads(response.text) + for news in response: + url = news["news_url"] + if (not ROOT_URL in url): + url = ROOT_URL + url yield scrapy.Request(url, callback=self.parse_news) - page_desc = response.css('.page-desc::text').extract_first() - total_pages = page_desc.split('/')[1] - total_pages = int(total_pages[2:-2]) - url_arr = response.url.split('/') - url_suffix = url_arr[-1] - current_page_index = url_suffix[5:-5] - if current_page_index is '': - current_page_index = 1 - else: - current_page_index = int(current_page_index) - - if current_page_index < total_pages: - next_page = '/'.join(url_arr[:-1]) + '/index' + str( - current_page_index + 1) + '.html' - yield scrapy.Request(next_page, callback=self.parse) - - def parse_news(self, response): - title = 
response.css('.newsbigtitle::text').extract_first().strip(
-            ' \t\n\r')
-        date_of_news = response.css('.timebar::text').extract_first().strip(
-            ' \t\n\r')
-        date_of_news = date_of_news[:10]
-        category = response.css('.active a::text').extract()[-1]
-        content = response.css('.newscontents p::text').extract()
-        content = ' '.join(content)
+    def parse_news(self, response: scrapy.Selector):
+        title = response.css('div.artical-titlebar h1.artical-title::text').extract_first()
+        date_str = response.css('div.news-artical div.titlebar-top time.artical-time::text').extract_first()
+        date = utils.parse_date(date_str, "%Y/%m/%d %H:%M")
+        content = ""
+        for p in response.css('artical.news-artical div.artical-content p'):
+            if (len(p.css("::attr(href)")) == 0 or len(p.css("::attr(class)"))):
+                p_text = p.css('::text')
+                content += ' '.join(p_text.extract())
+
+        category = response.css('meta[name=section]::attr(content)').extract_first()
+
+        # description
+        try:
+            description = response.css('meta[name=description]::attr(content)').extract_first()
+        except:
+            description = ""
 
         yield {
             'website': "華視",
             'url': response.url,
             'title': title,
-            'date': date_of_news,
+            'date': date,
             'content': content,
-            'category': category
+            'category': category,
+            "description": description
         }
+
+    
\ No newline at end of file

From 8a02e4080752647e06bd0dea6033b2b0202b0881 Mon Sep 17 00:00:00 2001
From: cool9203
Date: Tue, 25 Oct 2022 17:54:18 +0800
Subject: [PATCH 13/39] edit: description use og

---
 TaiwanNewsCrawler/spiders/china_spider.py | 2 +-
 TaiwanNewsCrawler/spiders/cna_spider.py   | 2 +-
 TaiwanNewsCrawler/spiders/cts_spider.py   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/TaiwanNewsCrawler/spiders/china_spider.py b/TaiwanNewsCrawler/spiders/china_spider.py
index b5a0670..24d48b3 100644
--- a/TaiwanNewsCrawler/spiders/china_spider.py
+++ b/TaiwanNewsCrawler/spiders/china_spider.py
@@ -63,7 +63,7 @@ def parse_news(self, response: scrapy.Selector):
 
         # description
         try:
-            description = response.css('meta[property=og:description]::attr(content)').extract_first()
+            description = response.css("meta[property='og:description']::attr(content)").extract_first()
         except:
             description = ""
 
diff --git a/TaiwanNewsCrawler/spiders/cna_spider.py b/TaiwanNewsCrawler/spiders/cna_spider.py
index fe27c0f..c9dd7df 100644
--- a/TaiwanNewsCrawler/spiders/cna_spider.py
+++ b/TaiwanNewsCrawler/spiders/cna_spider.py
@@ -62,7 +62,7 @@ def parse_news(self, response: scrapy.Selector):
 
         # description
         try:
-            description = response.css('meta.description::attr(content)').extract_first()
+            description = response.css("meta[property='og:description']::attr(content)").extract_first()
         except:
             description = ""
 
diff --git a/TaiwanNewsCrawler/spiders/cts_spider.py b/TaiwanNewsCrawler/spiders/cts_spider.py
index 14b16b0..41225f8 100644
--- a/TaiwanNewsCrawler/spiders/cts_spider.py
+++ b/TaiwanNewsCrawler/spiders/cts_spider.py
@@ -51,7 +51,7 @@ def parse_news(self, response: scrapy.Selector):
 
         # description
         try:
-            description = response.css('meta[name=description]::attr(content)').extract_first()
+            description = response.css("meta[property='og:description']::attr(content)").extract_first()
         except:
             description = ""
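The quoting in `meta[property='og:description']` is load-bearing: cssselect treats a bare `:` in an unquoted attribute value as a pseudo-class and rejects the selector, which is likely why the unquoted `og:` form in the earlier patch never matched and the `try`/`except` silently returned an empty description. A quick check:

```python
from scrapy.selector import Selector

sel = Selector(text='<meta property="og:description" content="demo">')

sel.css("meta[property='og:description']::attr(content)").extract_first()
# -> "demo"

# The unquoted form raises a selector parse error instead of returning None:
# sel.css("meta[property=og:description]")
```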
From 979ca93dd78142bbc66e1d8e0b7d63ab53fab4e9 Mon Sep 17 00:00:00 2001
From: cool9203
Date: Tue, 25 Oct 2022 17:54:44 +0800
Subject: [PATCH 14/39] add: YESTERDAY and parse_start_date_and_end_date add
 default

---
 TaiwanNewsCrawler/utils.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/TaiwanNewsCrawler/utils.py b/TaiwanNewsCrawler/utils.py
index 3dad65c..a1ad2b3 100644
--- a/TaiwanNewsCrawler/utils.py
+++ b/TaiwanNewsCrawler/utils.py
@@ -2,20 +2,21 @@
 import datetime as dt
 
 TODAY = dt.datetime.strptime(dt.datetime.now().strftime("%Y-%m-%d"), '%Y-%m-%d')
+YESTERDAY = TODAY - dt.timedelta(days=1)
 PARSE_DATE_FORMAT_LIST = ["%Y-%m-%d", "%Y/%m/%d", "%Y %m %d"]
 PARSE_TIME_FORMAT_LIST = ["%H %M", "%H:%M", "%H %M %S", "%H:%M:%S"]
 
 
-def parse_start_date_and_end_date(start_date: Union[str, None], end_date: Union[str, None]) -> Tuple[dt.datetime, dt.datetime]:
+def parse_start_date_and_end_date(start_date: Union[str, None], end_date: Union[str, None], start_date_default: dt.datetime=TODAY, end_date_default: dt.datetime=TODAY) -> Tuple[dt.datetime, dt.datetime]:
     if (not start_date is None):
         start_date = dt.datetime.strptime(start_date, '%Y-%m-%d')
     else:
-        start_date = TODAY
+        start_date = start_date_default
 
     if (not end_date is None):
         end_date = dt.datetime.strptime(end_date, '%Y-%m-%d')
     else:
-        end_date = TODAY
+        end_date = end_date_default
     end_date += dt.timedelta(days=1)
     return (start_date, end_date)
From 867ceb9d0ba2d0da341aa7baf1e3b92e55b3f521 Mon Sep 17 00:00:00 2001
From: cool9203
Date: Tue, 25 Oct 2022 17:55:15 +0800
Subject: [PATCH 15/39] edit: now can crawl ettoday

---
 TaiwanNewsCrawler/spiders/ettoday_spider.py | 75 +++++++++++++--------
 1 file changed, 48 insertions(+), 27 deletions(-)

diff --git a/TaiwanNewsCrawler/spiders/ettoday_spider.py b/TaiwanNewsCrawler/spiders/ettoday_spider.py
index 66bcf75..c961b49 100644
--- a/TaiwanNewsCrawler/spiders/ettoday_spider.py
+++ b/TaiwanNewsCrawler/spiders/ettoday_spider.py
@@ -4,47 +4,57 @@
 """
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-import time
 import scrapy
+import scrapy.http
+import datetime as dt
+import TaiwanNewsCrawler.utils as utils
+
 
-TODAY = time.strftime('%Y/%m/%d')
-TODAY_URL = time.strftime('%Y-%m-%d')
 ROOT_URL = 'https://www.ettoday.net'
+PAGE_URL = "https://www.ettoday.net/news/news-list-{}-0.htm"
+API_URL = "https://www.ettoday.net/show_roll.php"
 
 
 class EttodaySpider(scrapy.Spider):
     name = "ettoday"
 
+    def __init__(self, start_date: str=None, end_date: str=None):
+        super().__init__(start_date=start_date, end_date=end_date)
+
     def start_requests(self):
-        urls = [
-            'https://www.ettoday.net/news/news-list-' + TODAY_URL + '-0.htm'
-        ]
-        for url in urls:
-            meta = {'iter_time': 0}
+        start_date, end_date = utils.parse_start_date_and_end_date(self.start_date, self.end_date)
+        date = start_date
+
+        while (date < end_date):
+            meta = {'iter_time': 0, "date": date, "start_date": date, "end_date": date+dt.timedelta(days=1)}
+            url = PAGE_URL.format(date.strftime("%Y-%m-%d"))
             yield scrapy.Request(url, callback=self.parse_news_list, meta=meta)
+            date += dt.timedelta(days=1)
 
     def parse_news_list(self, response):
-        has_next_page = True
+        start_date, end_date = response.meta["start_date"], response.meta["end_date"]
+        crawl_next = False
         response.meta['iter_time'] += 1
         is_first_iter = response.meta['iter_time'] == 1
         prefix = '.part_list_2' if is_first_iter else ''
+        date_str = response.meta["date"].strftime("%Y/%m/%d")
 
-        for news_item in response.css(prefix + ' h3'):
-            url = news_item.css('a::attr(href)').extract_first()
+        for news in response.css(prefix + ' h3'):
+            url = news.css('a::attr(href)').extract_first()
             url = ROOT_URL + url
-            category = news_item.css('em::text').extract_first()
-            date_time = news_item.css('span::text').extract_first()
-
-            if TODAY not in date_time:
-                has_next_page = False
-                continue
-
-            response.meta['category'] = category
-            yield scrapy.Request(
-                url, callback=self.parse_news, meta=response.meta)
-        if has_next_page:
-            tFile = time.strftime('%Y%m%d') + '.xml'
+            category = news.css('em::text').extract_first()
+            news_date = utils.parse_date(news.css('span::text').extract_first())
+            crawl_next = utils.can_crawl(news_date, start_date, end_date)
+
+            if (crawl_next):
+                response.meta['category'] = category
+                yield scrapy.Request(
+                    url, callback=self.parse_news, meta=response.meta)
+
+        if (crawl_next):
+            date_str = response.meta["date"].strftime("%Y%m%d")
+            tFile = f"{date_str}-1.xml"
             yield scrapy.FormRequest(
-                url="https://www.ettoday.net/show_roll.php",
+                url=API_URL,
                 callback=self.parse_news_list,
                 meta=response.meta,
                 formdata={
@@ -52,12 +62,14 @@ def parse_news_list(self, response):
                     'tPage': '3',
                     'tFile': tFile,
                     'tOt': '0',
-                    'tSi': '100'
+                    'tSi': '100',
+                    "tAr": "0"
                 })
 
     def parse_news(self, response):
         title = response.css('h1.title::text').extract_first()
+        date = response.meta["date"].strftime("%Y-%m-%d")
         if not title:
             title = response.css('h2.title::text').extract_first()
         if not title:
@@ -69,11 +81,20 @@ def parse_news(self, response):
         for p in p_list:
             content += p
 
+        category = response.meta['category']
+
+        # description
+        try:
+            description = response.css("meta[property='og:description']::attr(content)").extract_first()
+        except:
+            description = ""
+
         yield {
             'website': "東森新聞雲",
             'url': response.url,
             'title': title,
-            'date': time.strftime('%Y-%m-%d'),
+            'date': date,
             'content': content,
-            'category': response.meta['category']
+            'category': category,
+            "description": description
         }

From b41b974e9bccc90d8e988774ea36ad227d7bb59e Mon Sep 17 00:00:00 2001
From: cool9203
Date: Tue, 25 Oct 2022 18:02:01 +0800
Subject: [PATCH 16/39] fixed: check box error

---
 README.md | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index bb6f8be..ec26aa0 100644
--- a/README.md
+++ b/README.md
@@ -25,24 +25,25 @@ $ scrapy crawl apple -o apple_news.json
 
 ## Prerequisites
 - Python3
-- Scrapy >= 1.3.0
-- Twisted >= 16.6.0
+- Scrapy >= 1.3.0 ~ 2.7.0
+- Twisted >= 16.6.0 ~ 22.8.0
 
 ## Usage
 ```scrapy crawl <spider_name> -o <output_file_name>```
+
-### Available spiders (all 12)
-[ ] apple (not updated since 2022/09/01)
-[ ] appleRealtime (not updated since 2022/09/01)
-[X] china
-[X] cna
-[ ] cts
-[ ] ettoday
-[ ] liberty
-[ ] libertyRealtime
-[ ] pts
-[ ] setn
-[ ] tvbs
-[ ] udn
+### Available spiders (all 12)
+- [ ] apple (not updated since 2022/09/01)
+- [ ] appleRealtime (not updated since 2022/09/01)
+- [X] china
+- [X] cna
+- [X] cts(can select date)
+- [X] ettoday(can select date)
+- [ ] liberty
+- [ ] libertyRealtime
+- [ ] pts
+- [ ] setn
+- [ ] tvbs
+- [ ] udn
 
 ## Output
 | Key | Value |

From a63942f89abc656dd5e80d18f9c5dddb46ac2d56 Mon Sep 17 00:00:00 2001
From: cool9203
Date: Wed, 26 Oct 2022 11:45:20 +0800
Subject: [PATCH 17/39] edit: add more parse rule

---
 TaiwanNewsCrawler/utils.py | 33 +++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/TaiwanNewsCrawler/utils.py b/TaiwanNewsCrawler/utils.py
index a1ad2b3..cdf7533 100644
--- a/TaiwanNewsCrawler/utils.py
+++ b/TaiwanNewsCrawler/utils.py
@@ -3,8 +3,10 @@
 
 TODAY = dt.datetime.strptime(dt.datetime.now().strftime("%Y-%m-%d"), '%Y-%m-%d')
 YESTERDAY = TODAY - dt.timedelta(days=1)
-PARSE_DATE_FORMAT_LIST = ["%Y-%m-%d", "%Y/%m/%d", "%Y %m %d"]
-PARSE_TIME_FORMAT_LIST = ["%H %M", "%H:%M", "%H %M %S", "%H:%M:%S"]
+PARSE_DATE_FORMAT_LIST = ["", "%Y-%m-%d", "%Y/%m/%d", "%Y %m %d"]
+PARSE_INTERVAL_FORMAT_LIST = ["", " ", "T"]
+PARSE_TIME_FORMAT_LIST = ["", "%H %M", "%H:%M", "%H %M %S", "%H:%M:%S"]
+PARSE_TIMEZONE_FORMAT_LIST = ["", "%z"]
 
 
 def parse_start_date_and_end_date(start_date: Union[str, None], end_date: Union[str, None], start_date_default: dt.datetime=TODAY, end_date_default: dt.datetime=TODAY) -> Tuple[dt.datetime, dt.datetime]:
@@ -23,17 +25,28 @@ def parse_start_date_and_end_date(start_date: Union[str, None], end_date: Union[
 
 def parse_date(date_str: str, parse_format: str=None) -> dt.datetime:
     if (not parse_format is None):
-        return dt.datetime.strptime(date_str, parse_format)
-
+        try:
+            date = dt.datetime.strptime(date_str, parse_format)
+        except:
+            date = None
+        return date
+
+    date = None
     for date_format in PARSE_DATE_FORMAT_LIST:
-        for time_format in PARSE_TIME_FORMAT_LIST:
-            try:
-                date = dt.datetime.strptime(date_str, f"{date_format} {time_format}")
-                break
-            except:
-                date = None
         if (not date is None):
             break
+        for interval_format in PARSE_INTERVAL_FORMAT_LIST:
+            if (not date is None):
+                break
+            for time_format in PARSE_TIME_FORMAT_LIST:
+                if (not date is None):
+                    break
+                for timezone_format in PARSE_TIMEZONE_FORMAT_LIST:
+                    try:
+                        date = dt.datetime.strptime(date_str, f"{date_format}{interval_format}{time_format}{timezone_format}")
+                        break
+                    except:
+                        date = None
     return date
From aa220357e9a557e83ff102bd45d8a62ff1f47d85 Mon Sep 17 00:00:00 2001
From: cool9203
Date: Wed, 26 Oct 2022 11:46:10 +0800
Subject: [PATCH 18/39] edit: use urllib to combine url

---
 TaiwanNewsCrawler/spiders/china_spider.py   |  3 ++-
 TaiwanNewsCrawler/spiders/cna_spider.py     |  5 ++---
 TaiwanNewsCrawler/spiders/cts_spider.py     |  5 +++--
 TaiwanNewsCrawler/spiders/ettoday_spider.py | 11 ++++++---
 4 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/TaiwanNewsCrawler/spiders/china_spider.py b/TaiwanNewsCrawler/spiders/china_spider.py
index 24d48b3..e9b6670 100644
--- a/TaiwanNewsCrawler/spiders/china_spider.py
+++ b/TaiwanNewsCrawler/spiders/china_spider.py
@@ -6,6 +6,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 import scrapy
+from urllib.parse import urljoin
 import TaiwanNewsCrawler.utils as utils
 
 
@@ -36,7 +37,7 @@ def parse(self, response: scrapy.Selector):
             if (crawl_next):
                 url = news.css('a::attr(href)').extract_first()
                 if (not ROOT_URL in url):
-                    url = ROOT_URL + url
+                    url = urljoin(ROOT_URL, url)
                 url = response.urljoin(url)
                 yield scrapy.Request(url, callback=self.parse_news)
 
diff --git a/TaiwanNewsCrawler/spiders/cna_spider.py b/TaiwanNewsCrawler/spiders/cna_spider.py
index c9dd7df..f94eb83 100644
--- a/TaiwanNewsCrawler/spiders/cna_spider.py
+++ b/TaiwanNewsCrawler/spiders/cna_spider.py
@@ -7,7 +7,7 @@
 # -*- coding: utf-8 -*-
 import scrapy
 import scrapy.http
-import json
+from urllib.parse import urljoin
 import TaiwanNewsCrawler.utils as utils
 
 
@@ -39,8 +39,7 @@ def parse(self, response: scrapy.Selector):
             if (crawl_next):
                 url = news.css('a::attr(href)').extract_first()
                 if (not ROOT_URL in url):
-                    url = ROOT_URL + url
-                url = response.urljoin(url)
+                    url = urljoin(ROOT_URL, url)
                 yield scrapy.Request(url, callback=self.parse_news)
 
diff --git a/TaiwanNewsCrawler/spiders/cts_spider.py b/TaiwanNewsCrawler/spiders/cts_spider.py
index 41225f8..d657668 100644
--- a/TaiwanNewsCrawler/spiders/cts_spider.py
+++ b/TaiwanNewsCrawler/spiders/cts_spider.py
@@ -7,6 +7,7 @@
 # -*- coding: utf-8 -*-
 import scrapy
 import scrapy.http
+from urllib.parse import urljoin
 import datetime as dt
 import json
 import TaiwanNewsCrawler.utils as utils
@@ -34,7 +35,7 @@ def parse(self, response: scrapy.Request):
         for news in response:
             url = news["news_url"]
             if (not ROOT_URL in url):
-                url = ROOT_URL + url
+                url = urljoin(ROOT_URL, url)
             yield scrapy.Request(url, callback=self.parse_news)
 
     def parse_news(self, response: scrapy.Selector):
@@ -43,7 +44,7 @@ def parse_news(self, response: scrapy.Selector):
         date = utils.parse_date(date_str, "%Y/%m/%d %H:%M")
         content = ""
         for p in response.css('artical.news-artical div.artical-content p'):
-            if (len(p.css("::attr(href)")) == 0 or len(p.css("::attr(class)"))):
+            if (len(p.css("::attr(href)")) == 0 or len(p.css("::attr(class)")) == 0):
                 p_text = p.css('::text')
                 content += ' '.join(p_text.extract())
 
diff --git a/TaiwanNewsCrawler/spiders/ettoday_spider.py b/TaiwanNewsCrawler/spiders/ettoday_spider.py
index c961b49..13b7c5e 100644
--- a/TaiwanNewsCrawler/spiders/ettoday_spider.py
+++ b/TaiwanNewsCrawler/spiders/ettoday_spider.py
@@ -7,6 +7,7 @@
 import scrapy
 import scrapy.http
 import datetime as dt
+from urllib.parse import urljoin
 import TaiwanNewsCrawler.utils as utils
 
 
@@ -38,13 +39,17 @@ def parse_news_list(self, response):
         is_first_iter = response.meta['iter_time'] == 1
         prefix = '.part_list_2' if is_first_iter else ''
         date_str = response.meta["date"].strftime("%Y/%m/%d")
+
         for news in response.css(prefix + ' h3'):
-            url = news.css('a::attr(href)').extract_first()
-            url = ROOT_URL + url
-            category = news.css('em::text').extract_first()
             news_date = utils.parse_date(news.css('span::text').extract_first())
             crawl_next = utils.can_crawl(news_date, start_date, end_date)
 
+            url = news.css('a::attr(href)').extract_first()
+            if (not ROOT_URL in url):
+                url = urljoin(ROOT_URL, url)
+            category = news.css('em::text').extract_first()
+
+
             if (crawl_next):
                 response.meta['category'] = category
                 yield scrapy.Request(
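`urljoin` is not plain string concatenation, which is the point of the switch: a relative path replaces the last path segment of the base, while a leading `/` keeps only the scheme and host. For example:

```python
from urllib.parse import urljoin

urljoin("http://www.chinatimes.com", "/newspapers/2601")
# -> "http://www.chinatimes.com/newspapers/2601"

urljoin("http://news.ltn.com.tw/", "list/breakingnews/all")
# -> "http://news.ltn.com.tw/list/breakingnews/all"

urljoin("https://news.cts.com.tw/real/index.html", "/api/news.json")
# -> "https://news.cts.com.tw/api/news.json"  (base path is dropped)
```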
From 259e8cec6d1b1b1a1d9a9c8f0d5a3c4e5b6a7f8 Mon Sep 17 00:00:00 2001
From: cool9203
Date: Wed, 26 Oct 2022 11:46:38 +0800
Subject: [PATCH 19/39] edit: liberty now can crawl

---
 README.md                                   |   6 +-
 TaiwanNewsCrawler/spiders/liberty_spider.py | 186 ++++++++------------
 2 files changed, 79 insertions(+), 113 deletions(-)

diff --git a/README.md b/README.md
index ec26aa0..6b0cbf4 100644
--- a/README.md
+++ b/README.md
@@ -31,14 +31,14 @@ $ scrapy crawl apple -o apple_news.json
 ## Usage
 ```scrapy crawl <spider_name> -o <output_file_name>```
 
-### Available spiders (all 12)
+### Available spiders (all 11)
 - [ ] apple (not updated since 2022/09/01)
 - [ ] appleRealtime (not updated since 2022/09/01)
 - [X] china
 - [X] cna
 - [X] cts(can select date)
-- [X] ettoday
-- [ ] liberty
+- [X] ettoday(can select date)
+- [X] liberty
 - [ ] libertyRealtime
 - [ ] pts
 - [ ] setn
 - [ ] tvbs
 - [ ] udn

diff --git a/TaiwanNewsCrawler/spiders/liberty_spider.py b/TaiwanNewsCrawler/spiders/liberty_spider.py
index a2d4323..02f7020 100644
--- a/TaiwanNewsCrawler/spiders/liberty_spider.py
+++ b/TaiwanNewsCrawler/spiders/liberty_spider.py
@@ -5,127 +5,93 @@
 """
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-import time
-import re
+from requests import Response
 import scrapy
+import scrapy.http
+from urllib.parse import urljoin
+import datetime as dt
+import json
+import TaiwanNewsCrawler.utils as utils
+
 
-ROOT_URL = 'http://news.ltn.com.tw'
-CATEGORY_DIC = {
-    'focus': '焦點',
-    'politics': '政治',
-    'society': '社會',
-    'local': '地方',
-    'life': '生活',
-    'opinion': '言論',
-    'world': '國際',
-    'business': '財經',
-    'entertainment': '娛樂',
-    'consumer': '消費',
-    'supplement': '副刊',
-    'sports': '體育'
-}
-
+ROOT_URL = 'http://news.ltn.com.tw/'
+PAGE_URL = 'http://news.ltn.com.tw/list/breakingnews/all/'
+API_URL = "https://news.ltn.com.tw/ajax/breakingnews/all/{}"
 
 class LibertySpider(scrapy.Spider):
     name = "liberty"
 
-    def start_requests(self):
-        urls = [
-            'http://news.ltn.com.tw/list/newspaper/focus/',
-            'http://news.ltn.com.tw/list/newspaper/politics/',
-            'http://news.ltn.com.tw/list/newspaper/society/',
-            'http://news.ltn.com.tw/list/newspaper/local/',
-            'http://news.ltn.com.tw/list/newspaper/life/',
-            'http://news.ltn.com.tw/list/newspaper/opinion/',
-            'http://news.ltn.com.tw/list/newspaper/world/',
-            'http://news.ltn.com.tw/list/newspaper/business/',
-            'http://news.ltn.com.tw/list/newspaper/sports/',
-            'http://news.ltn.com.tw/list/newspaper/entertainment/',
-            'http://news.ltn.com.tw/list/newspaper/consumer/',
-            'http://news.ltn.com.tw/list/newspaper/supplement/'
-        ]
-
-        date = time.strftime('%Y%m%d')
-        for url in urls:
-            target = url + date
-            yield scrapy.Request(target, callback=self.parse_news_list)
-
-    def parse_news_list(self, response):
-        for news_item in response.css('.list li'):
-            relative_url = news_item.css('a.tit::attr(href)').extract_first()
-            abs_url = response.urljoin(relative_url)
-            yield scrapy.Request(abs_url, callback=self.parse_news)
-
-        page_list = [
-            int(p) for p in response.css('.pagination a::text').extract()
-            if p.isdigit()
-        ]
-        current_page_extract = response.css(
-            '.pagination a.active::text').extract_first()
-        current_page = int(
-            current_page_extract) if current_page_extract is True else 1
-        if (not page_list) or (current_page >= max(page_list)):
-            return
-
-        next_page = current_page + 1
-
-        if next_page in page_list:
-            prefix = re.search(r'.*\/', response.url).group(0)
-            relative_url = prefix + '/' + str(next_page)
-            abs_url = response.urljoin(relative_url)
-            yield scrapy.Request(abs_url, callback=self.parse_news_list)
-
-    def parse_news(self, response):
-        category = get_news_category(response)
+    def __init__(self, start_date: str=None, end_date: str=None):
+        super().__init__(start_date=start_date, end_date=end_date)
 
-        if category == 'opinion':
-            title = response.css('h2::text').extract_first()
-        else:
-            title = response.css('h1::text').extract_first()
-
-        if category == 'opinion':
-            content = get_news_content(response, '.cont h4::text', '.cont p')
-        elif category == 'sports':
-            content = get_news_content(response, '.news_p h4::text',
-                                       '.news_p p')
-        elif category == 'entertainment':
-            content = get_news_content(response, '.news_content h4::text',
-                                       '.news_content p')
-        else:
-            content = get_news_content(response, '.text h4::text', '.text p')
+    def start_requests(self):
+        meta = {"iter_time": 1}
+        url = API_URL.format(meta["iter_time"])
+        yield scrapy.http.Request(url, method='GET', callback=self.parse_news_list, meta=meta)
+
+    def parse_news_list(self, response: scrapy.Request):
+        start_date, end_date = utils.parse_start_date_and_end_date(self.start_date, self.end_date)
+        crawl_next = False
+        response.meta['iter_time'] += 1
+
+        response_data = json.loads(response.text)
+        if (int(response_data["code"]) == 200):
+            for news in response_data["data"]:
+                if (type(news) == str):
+                    news = response_data["data"][news]
+                news_time = utils.parse_date(news["time"], "%H:%M")
+                if (news_time is None):
+                    news_date = utils.parse_date(news["time"], "%Y/%m/%d %H:%M")
+                else:
+                    news_date = utils.TODAY
+                crawl_next = utils.can_crawl(news_date, start_date, end_date)
+
+                if (crawl_next):
+                    url = news["url"]
+                    if (not ROOT_URL in url):
+                        url = urljoin(ROOT_URL, url)
+                    yield scrapy.Request(url, callback=self.parse_news)
+
+        if (crawl_next):
+            url = API_URL.format(response.meta["iter_time"])
+            yield scrapy.http.Request(url, method='GET', callback=self.parse_news_list, meta=response.meta)
+
+
+    
def parse_news(self, response: scrapy.Selector): + title = response.css('h1::text').extract_first() + date_str = response.css('meta[property=pubdate]::attr(content)').extract_first() + if (date_str is None): + date_str = response.css('span.time::text').extract_first() + date = utils.parse_date(date_str).replace(tzinfo=None) + + parse_text_list = ["div.text p", # normal + "div.text p span", # other + ] + + for parse_text in parse_text_list: + article = response.css(parse_text) + if (not article is None): + break + + content = "" + for p in article: + if (len(p.css("::attr(href)")) == 0 or len(p.css("::attr(class)")) == 0 or p.css("::attr(lang)") == "zh-TW"): + p_text = p.css('::text') + content += ' '.join(p_text.extract()) + + category = response.css('div.breadcrumbs a::text').extract()[-1] + + # description + try: + description = response.css("meta[property='og:description']::attr(content)").extract_first() + except: + description = "" yield { 'website': "自由時報", 'url': response.url, 'title': title, - 'date': time.strftime('%Y-%m-%d'), + 'date': date, 'content': content, - 'category': CATEGORY_DIC[category] + 'category': category, + "description": description } - - -def get_news_category(response): - searched_category = re.search(r'\/news\/([a-z]*)\/', response.url) - - if searched_category and searched_category.group(1) != 'paper': - return searched_category.group(1) - elif 'talk' in response.url: - return 'opinion' - elif 'sports' in response.url: - return 'sports' - elif 'ent' in response.url: - return 'entertainment' - - -def get_news_content(response, h4_query, p_query): - h4 = response.css(h4_query).extract() - h4_num = len(h4) - counter = 0 - content = "" - for p in response.css(p_query): - if counter < h4_num: - content += " " + h4[counter] - counter += 1 - if p.css("p::text"): - content += ' '.join(p.css("p::text").extract()) - return content From ebf5bb76bdd1d213d9764412f5718409fc833e0c Mon Sep 17 00:00:00 2001 From: cool9203 Date: Wed, 26 Oct 2022 11:55:05 +0800 Subject: [PATCH 20/39] rm: now use lib --- TaiwanNewsCrawler/spiders/liberty_spider.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/TaiwanNewsCrawler/spiders/liberty_spider.py b/TaiwanNewsCrawler/spiders/liberty_spider.py index 02f7020..cd51971 100644 --- a/TaiwanNewsCrawler/spiders/liberty_spider.py +++ b/TaiwanNewsCrawler/spiders/liberty_spider.py @@ -5,11 +5,9 @@ """ #!/usr/bin/env python # -*- coding: utf-8 -*- -from requests import Response import scrapy import scrapy.http from urllib.parse import urljoin -import datetime as dt import json import TaiwanNewsCrawler.utils as utils From 45fd254aeec34662b6ed3a2e3cadb101f22b4e59 Mon Sep 17 00:00:00 2001 From: cool9203 Date: Wed, 26 Oct 2022 13:57:37 +0800 Subject: [PATCH 21/39] rm: liberty realtime spider --- .../spiders/liberty_realtimenews_spider.py | 127 ------------------ 1 file changed, 127 deletions(-) delete mode 100644 TaiwanNewsCrawler/spiders/liberty_realtimenews_spider.py diff --git a/TaiwanNewsCrawler/spiders/liberty_realtimenews_spider.py b/TaiwanNewsCrawler/spiders/liberty_realtimenews_spider.py deleted file mode 100644 index 633a22b..0000000 --- a/TaiwanNewsCrawler/spiders/liberty_realtimenews_spider.py +++ /dev/null @@ -1,127 +0,0 @@ -""" -自由時報即時新聞 -the crawl deal with liberty's realtime news -Usage: scrapy crawl libertyRealtime -o -""" -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import re -from datetime import datetime, date -import scrapy - -ROOT_URL = 'http://news.ltn.com.tw/' -Realtime_NEWS_URL = 
'http://news.ltn.com.tw/list/breakingnews/all/' -today = date.today() - -CATEGORY_DIC = { - 'focus': '焦點', - 'politics': '政治', - 'society': '社會', - 'local': '地方', - 'life': '生活', - 'opinion': '言論', - 'world': '國際', - 'business': '財經', - 'entertainment': '娛樂', - 'consumer': '消費', - 'supplement': '副刊', - 'sports': '體育', - 'car': '汽車', - '3c': '3c', - 'istyle': 'istyle' -} - - -class LibertySpider(scrapy.Spider): - name = "libertyRealtime" - start_urls = ['http://news.ltn.com.tw/list/breakingnews/all'] - - def parse(self, response): - regex = r'\/all\/(\d+)' - current_index = re.search(regex, response.url) - if current_index: - next_index = int(current_index.group(1)) + 1 - else: - next_index = 2 - date_of_news = response.css('a.tit span::text').extract() - last_page = False - for d in date_of_news: - if '-' in d: - last_page = True - break - - for news_url in response.css('a.tit::attr(href)').extract(): - yield scrapy.Request(news_url, callback=self.parse_news) - - if not last_page: - next_target = Realtime_NEWS_URL + str(next_index) - yield scrapy.Request(next_target, callback=self.parse) - - def parse_news(self, response): - category = get_news_category(response) - - if category == 'opinion': - title = response.css('h2::text').extract_first() - else: - title = response.css('h1::text').extract_first() - - if category == 'opinion': - content = get_news_content(response, '.cont h4::text', '.cont p') - elif category == 'sports': - content = get_news_content(response, '.news_p h4::text', - '.news_p p') - elif category == 'entertainment': - content = get_news_content(response, '.news_content h4::text', - '.news_content p') - elif category == 'car': - content = get_news_content(response, '.con h4::text', '.con p') - elif category == '3c': - content = get_news_content(response, '.cont h4::text', '.cont p') - elif category == 'istyle': - content = get_news_content(response, '.boxTitle h4::text', - '.boxTitle p') - else: - content = get_news_content(response, '#newstext h4::text', - '.text p') - yield { - 'website': "自由時報", - 'url': response.url, - 'title': title, - 'date': datetime.now().strftime('%Y-%m-%d'), - 'content': content, - 'category': CATEGORY_DIC[category] - } - - -def get_news_category(response): - searched_category = re.search(r'\/news\/([a-z]*)\/breakingnews\/', - response.url) - - if searched_category and searched_category.group(1) != 'paper': - return searched_category.group(1) - elif 'talk' in response.url: - return 'opinion' - elif 'sports' in response.url: - return 'sports' - elif 'ent' in response.url: - return 'entertainment' - elif 'auto' in response.url: - return 'car' - elif '3c' in response.url: - return '3c' - elif 'istyle' in response.url: - return 'istyle' - - -def get_news_content(response, h4_query, p_query): - h4 = response.css(h4_query).extract() - h4_num = len(h4) - counter = 0 - content = "" - for p in response.css(p_query): - if counter < h4_num: - content += " " + h4[counter] - counter += 1 - if p.css("p::text"): - content += ' '.join(p.css("p::text").extract()) - return content From ec52af7827eedf48fb96c8e96e7b1e4dea7e529d Mon Sep 17 00:00:00 2001 From: cool9203 Date: Wed, 26 Oct 2022 13:57:50 +0800 Subject: [PATCH 22/39] edit: pts now can crawl --- README.md | 2 +- TaiwanNewsCrawler/spiders/pts_spider.py | 123 +++++++++++++----------- 2 files changed, 67 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index 6b0cbf4..7eb7689 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ $ scrapy crawl apple -o apple_news.json - [X] ettoday(can select 
date) - [X] liberty - [ ] libertyRealtime -- [ ] pts +- [X] pts - [ ] setn - [ ] tvbs - [ ] udn diff --git a/TaiwanNewsCrawler/spiders/pts_spider.py b/TaiwanNewsCrawler/spiders/pts_spider.py index 30245e3..b28ea5a 100644 --- a/TaiwanNewsCrawler/spiders/pts_spider.py +++ b/TaiwanNewsCrawler/spiders/pts_spider.py @@ -5,71 +5,80 @@ """ #!/usr/bin/env python # -*- coding: utf-8 -*- -import json -import time - import scrapy +from urllib.parse import urljoin +import json +import TaiwanNewsCrawler.utils as utils -TODAY = time.strftime('%Y-%m-%d') -ROOT_URL = 'https://news.pts.org.tw/list/' -ARTICLE_PREFIX = 'http://news.pts.org.tw/article/' - +ROOT_URL = 'https://news.pts.org.tw/' +PAGE_URL = "https://news.pts.org.tw/dailynews?page={}" -class EttodaySpider(scrapy.Spider): +class PtsSpider(scrapy.Spider): name = "pts" + def __init__(self, start_date: str=None, end_date: str=None): + super().__init__(start_date=start_date, end_date=end_date) + def start_requests(self): - url = 'https://news.pts.org.tw/list/0' - meta = {'iter_time': 0} - yield scrapy.Request(url, callback=self.parse_news_list, meta=meta) + meta = {"iter_time": 1} + url = PAGE_URL.format(meta["iter_time"]) + yield scrapy.Request(url, callback=self.parse, meta=meta) - def parse_news_list(self, response): - response.meta['iter_time'] = 1 - for news_item in response.css('ul.list-news li'): - url = news_item.css('h2 a::attr(href)').extract_first() - date_time = news_item.css('.list-news-time::text').extract_first() - title = news_item.css('h2 a::text').extract_first() - content = news_item.css( - '.list-news-description::text').extract_first() - category = news_item.css( - '.list-news-program::text').extract_first() + def parse(self, response: scrapy.Selector): + start_date, end_date = utils.parse_start_date_and_end_date(self.start_date, self.end_date) + crawl_next = False + response.meta['iter_time'] += 1 - if TODAY in date_time: - yield { - 'website': '公視', - 'url': url, - 'title': title, - 'date': date_time, - 'content': content, - 'category': category - } + parse_text_list = ["div.break-news-container div.breakingnews", "div.break-news-container ul.news-list li.d-flex"] + for parse_text in parse_text_list: + for news in response.css(parse_text): + news_date = utils.parse_date(news.css("time::attr(datetime)").extract_first()) + crawl_next = utils.can_crawl(news_date, start_date, end_date) - yield scrapy.FormRequest( - url='https://news.pts.org.tw/list/getmore.php', - callback=self.get_news, - meta=response.meta, - formdata={ - 'page': '1' - }) + if (crawl_next): + url = news.css("h2 a::attr(href)").extract_first() + yield scrapy.Request(url, callback=self.parse_news) + + if (crawl_next): + url = PAGE_URL.format(response.meta['iter_time']) + yield scrapy.Request(url, callback=self.parse, meta=response.meta) - def get_news(self, response): - response.meta['iter_time'] += 1 - news_items = json.loads(response.text) - if news_items: - for n in news_items: - yield { - 'website': '公視', - 'url': ARTICLE_PREFIX + n['news_id'], - 'title': n['subject'], - 'date': n['news_date'], - 'content': n['content'], - 'category': n['program_name'] - } - yield scrapy.FormRequest( - url="https://news.pts.org.tw/list/getmore.php", - callback=self.get_news, - meta=response.meta, - formdata={ - 'page': str(response.meta['iter_time']) - }) + def parse_news(self, response: scrapy.Selector): + title = response.css('h1::text').extract_first() + date_str = response.css('meta[property=pubdate]::attr(content)').extract_first() + if (date_str is None): + date_str = 
response.css('time::text').extract_first() + date = utils.parse_date(date_str).replace(tzinfo=None) + + parse_text_list = ["article.post-article p", + ] + + for parse_text in parse_text_list: + article = response.css(parse_text) + if (not article is None): + break + + content = "" + for p in article: + if (len(p.css("::attr(href)")) == 0 or len(p.css("::attr(class)")) == 0 or p.css("::attr(lang)") == "zh-TW"): + p_text = p.css('::text') + content += ' '.join(p_text.extract()) + + category = response.css('ol.breadcrumb li.breadcrumb-item')[-1].css("a::text").extract()[-1] + + # description + try: + description = response.css("meta[property='og:description']::attr(content)").extract_first() + except: + description = "" + + yield { + 'website': "公視", + 'url': response.url, + 'title': title, + 'date': date, + 'content': content, + 'category': category, + "description": description + } From 81c7fcc658cba728ceb28b2a8622fe21b8a17b84 Mon Sep 17 00:00:00 2001 From: cool9203 Date: Wed, 26 Oct 2022 14:46:39 +0800 Subject: [PATCH 23/39] edit: now can crawl setn --- README.md | 3 +- TaiwanNewsCrawler/spiders/setn_spider.py | 106 ++++++++++++++--------- 2 files changed, 65 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index 7eb7689..d7a0b5e 100644 --- a/README.md +++ b/README.md @@ -39,9 +39,8 @@ $ scrapy crawl apple -o apple_news.json - [X] cts(can select date) - [X] ettoday(can select date) - [X] liberty -- [ ] libertyRealtime - [X] pts -- [ ] setn +- [X] setn - [ ] tvbs - [ ] udn diff --git a/TaiwanNewsCrawler/spiders/setn_spider.py b/TaiwanNewsCrawler/spiders/setn_spider.py index e1fa539..cb9a86d 100644 --- a/TaiwanNewsCrawler/spiders/setn_spider.py +++ b/TaiwanNewsCrawler/spiders/setn_spider.py @@ -5,65 +5,87 @@ """ #!/usr/bin/env python # -*- coding: utf-8 -*- -import re -from datetime import date -from datetime import timedelta import scrapy +from urllib.parse import urljoin +import TaiwanNewsCrawler.utils as utils -YESTERDAY = (date.today() - timedelta(1)).strftime('%m/%d/%Y') - +ROOT_URL = "http://www.setn.com" +PAGE_URL = "http://www.setn.com/ViewAll.aspx?p={}" class SetnSpider(scrapy.Spider): name = "setn" - def __init__(self, category=None, *args, **kwargs): - super(SetnSpider, self).__init__(*args, **kwargs) - self.start_urls = [ - 'http://www.setn.com/ViewAll.aspx?date={}&p=1'.format(YESTERDAY) - ] - self.last_page_flag = 0 + def __init__(self, start_date: str=None, end_date: str=None): + super().__init__(start_date=start_date, end_date=end_date) + + def start_requests(self): + meta = {"iter_time": 1} + url = PAGE_URL.format(meta["iter_time"]) + yield scrapy.Request(url, callback=self.parse, meta=meta) + + def parse(self, response: scrapy.Selector): + crawl_next = False + response.meta['iter_time'] += 1 + + parse_text_list = ["#NewsList div.newsItems"] + for parse_text in parse_text_list: + for news in response.css(parse_text): + crawl_next = True - def parse(self, response): + url = news.css("h3 a::attr(href)").extract_first() + if (not ROOT_URL in url): + url = urljoin(ROOT_URL, url) + yield scrapy.Request(url, callback=self.parse_news) + + if (crawl_next): + url = PAGE_URL.format(response.meta['iter_time']) + yield scrapy.Request(url, callback=self.parse, meta=response.meta) - for news in response.css('.box ul li'): - category = news.css('.tab_list_type span::text').extract_first() - meta = {'category': category} - url = news.css('a::attr(href)').extract_first() - url = response.urljoin(url) - yield scrapy.Request(url, callback=self.parse_news, meta=meta) - 
last_two_pages = response.css('.pager a::attr(href)').extract()[-2:] - page1 = last_two_pages[0].split('&p=')[1] - page2 = last_two_pages[1].split('&p=')[1] + def parse_news(self, response: scrapy.Selector): + start_date, end_date = utils.parse_start_date_and_end_date(self.start_date, self.end_date) + title = response.css('h1::text').extract_first() + date_str = response.css('meta[name=pubdate]::attr(content)').extract_first() + if (date_str is None): + date_str = response.css('time::text').extract_first() + if (date_str is None): + date_str = response.css("meta[property='article:published_time']::attr(content)").extract_first() + date = utils.parse_date(date_str).replace(tzinfo=None) - if page1 == page2: - self.last_page_flag = self.last_page_flag + 1 + crawl = utils.can_crawl(date, start_date, end_date) + if (not crawl): + return + + parse_text_list = ["article p", + ] + + for parse_text in parse_text_list: + article = response.css(parse_text) + if (not article is None): + break - if self.last_page_flag < 2: - url_arr = response.url.split('&p=') - current_page = int(url_arr[1]) - next_page_url = '&p='.join( - url_arr[:-1]) + '&p=' + str(current_page + 1) - yield scrapy.Request(next_page_url, callback=self.parse) + content = "" + for p in article: + if ((len(p.css("::attr(href)")) == 0 and len(p.css("::attr(class)")) == 0 and len(p.css("::attr(style)")) == 0) or p.css("::attr(lang)") == "zh-TW"): + p_text = p.css('::text') + content += ' '.join(p_text.extract()) - def parse_news(self, response): - title = response.css('.title h1::text').extract_first() - content = '' - date_of_news = '' - if response.url.split('/')[3] == 'E': - date_of_news = response.css('.time::text').extract_first()[:10] - content = response.css('.Content2 p::text').extract() - else: - date_of_news = response.css('.date::text').extract_first()[:10] - content = response.css('#Content1 p::text').extract() + category = response.css("meta[name=section]::attr(content)").extract_first() + if (category is None): + category = response.css("meta[property='article:section']::attr(content)").extract_first() - content = ''.join(content) + # description + try: + description = response.css("meta[property='og:description']::attr(content)").extract_first() + except: + description = "" yield { 'website': "三立新聞", 'url': response.url, 'title': title, - 'date': date_of_news, + 'date': date, 'content': content, - 'category': response.meta['category'] + 'category': category, + "description": description } From 02cf1bf57904983f97b32686d08fe34b90cb4c6d Mon Sep 17 00:00:00 2001 From: cool9203 Date: Wed, 26 Oct 2022 14:47:40 +0800 Subject: [PATCH 24/39] edit: change
From 02cf1bf57904983f97b32686d08fe34b90cb4c6d Mon Sep 17 00:00:00 2001
From: cool9203
Date: Wed, 26 Oct 2022 14:47:40 +0800
Subject: [PATCH 24/39] edit: change content-extraction condition when
 crawling articles

---
 TaiwanNewsCrawler/spiders/cts_spider.py | 2 +-
 TaiwanNewsCrawler/spiders/pts_spider.py | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/TaiwanNewsCrawler/spiders/cts_spider.py b/TaiwanNewsCrawler/spiders/cts_spider.py
index d657668..9a94d5d 100644
--- a/TaiwanNewsCrawler/spiders/cts_spider.py
+++ b/TaiwanNewsCrawler/spiders/cts_spider.py
@@ -44,7 +44,7 @@ def parse_news(self, response: scrapy.Selector):
         date = utils.parse_date(date_str, "%Y/%m/%d %H:%M")
         content = ""
         for p in response.css('artical.news-artical div.artical-content p'):
-            if (len(p.css("::attr(href)")) == 0 or len(p.css("::attr(class)")) == 0):
+            if (len(p.css("::attr(href)")) == 0 and len(p.css("::attr(class)")) == 0):
                 p_text = p.css('::text')
                 content += ' '.join(p_text.extract())
 
diff --git a/TaiwanNewsCrawler/spiders/pts_spider.py b/TaiwanNewsCrawler/spiders/pts_spider.py
index b28ea5a..1f036f4 100644
--- a/TaiwanNewsCrawler/spiders/pts_spider.py
+++ b/TaiwanNewsCrawler/spiders/pts_spider.py
@@ -7,7 +7,6 @@
 # -*- coding: utf-8 -*-
 import scrapy
 from urllib.parse import urljoin
-import json
 import TaiwanNewsCrawler.utils as utils
 
 ROOT_URL = 'https://news.pts.org.tw/'
@@ -37,6 +36,8 @@ def parse(self, response: scrapy.Selector):
 
         if (crawl_next):
             url = news.css("h2 a::attr(href)").extract_first()
+            if (ROOT_URL not in url):
+                url = urljoin(ROOT_URL, url)
             yield scrapy.Request(url, callback=self.parse_news)
 
         if (crawl_next):
@@ -61,7 +62,7 @@ def parse_news(self, response: scrapy.Selector):
 
         content = ""
         for p in article:
-            if (len(p.css("::attr(href)")) == 0 or len(p.css("::attr(class)")) == 0 or p.css("::attr(lang)").extract_first() == "zh-TW"):
+            if ((len(p.css("::attr(href)")) == 0 and len(p.css("::attr(class)")) == 0) or p.css("::attr(lang)").extract_first() == "zh-TW"):
                 p_text = p.css('::text')
                 content += ' '.join(p_text.extract())

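A note on why patch 24 flips `or` to `and`: the filter is meant to keep only plain text paragraphs, but `len(p.css("::attr(href)")) == 0 or len(p.css("::attr(class)")) == 0` is true whenever either attribute is missing, so a paragraph that merely wraps an inline link (href present, class absent) still leaked into `content`. Requiring both attributes to be absent tightens that. A self-contained snippet, with made-up HTML, showing exactly where the two conditions disagree:

```python
# Demonstrates patch 24's "or" -> "and" fix in the paragraph filter.
from scrapy import Selector  # same parsel-backed Selector the spiders use

html = ('<p>plain paragraph</p>'
        '<p>read more <a href="/promo">here</a></p>')  # hypothetical markup
for p in Selector(text=html).css('p'):
    no_link = len(p.css('::attr(href)')) == 0    # no href on <p> or its children
    no_class = len(p.css('::attr(class)')) == 0  # no class attribute either
    # old condition: no_link or no_class  -> also keeps the link paragraph
    # new condition: no_link and no_class -> keeps only the plain paragraph
    print(p.css('::text').get(), no_link or no_class, no_link and no_class)
```
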
From e22b6ec76203d67884213c97c97e62ef3c8e6d45 Mon Sep 17 00:00:00 2001
From: cool9203
Date: Wed, 26 Oct 2022 17:48:58 +0800
Subject: [PATCH 25/39] edit: can now crawl TVBS

---
 README.md                                |   2 +-
 TaiwanNewsCrawler/spiders/tvbs_spider.py | 105 ++++++++++++-----------
 2 files changed, 57 insertions(+), 50 deletions(-)

diff --git a/README.md b/README.md
index d7a0b5e..a5fd9d5 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ $ scrapy crawl apple -o apple_news.json
 - [X] liberty
 - [X] pts
 - [X] setn
-- [ ] tvbs
+- [X] tvbs(can select date)
 - [ ] udn
 
 ## Output

diff --git a/TaiwanNewsCrawler/spiders/tvbs_spider.py b/TaiwanNewsCrawler/spiders/tvbs_spider.py
index 81cf8d0..cbc8319 100644
--- a/TaiwanNewsCrawler/spiders/tvbs_spider.py
+++ b/TaiwanNewsCrawler/spiders/tvbs_spider.py
@@ -5,71 +5,78 @@
 """
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-import re
-from datetime import date
-from datetime import timedelta
-
 import scrapy
+from urllib.parse import urljoin
+import datetime as dt
+import TaiwanNewsCrawler.utils as utils
 
-YESTERDAY = (date.today() - timedelta(1)).strftime('%Y/%m/%d')
-YESTERDAY = YESTERDAY.replace('/', '-')
-
+ROOT_URL = "https://news.tvbs.com.tw"
+PAGE_URL = "https://news.tvbs.com.tw/realtime/news/{}"
 
 class TvbsSpider(scrapy.Spider):
     name = "tvbs"
-    start_urls = [
-        'http://news.tvbs.com.tw/news/realtime/all/{}/1'.format(YESTERDAY)
-    ]
+
+    def __init__(self, start_date: str = None, end_date: str = None):
+        super().__init__(start_date=start_date, end_date=end_date)
 
-    def parse(self, response):
-        for news in response.css('.realtime_news_content_titel'):
-            category = news.css('p::text').extract_first()
-            meta = {'category': category}
-            url = news.css('div a::attr(href)').extract_first()
-            url = response.urljoin(url)
-            yield scrapy.Request(url, callback=self.parse_news, meta=meta)
+    def start_requests(self):
+        start_date, end_date = utils.parse_start_date_and_end_date(self.start_date, self.end_date)
+        date = start_date
 
-        total_pages = response.css(
-            '.realtime_news_underbtn li:last-child::text').extract_first()
-        total_pages_num = int(total_pages[1:-1])
-        url_arr = response.url.split('/')
-        current_page_index = int(url_arr[-1])
+        while (date < end_date):
+            url = PAGE_URL.format(date.strftime("%Y-%m-%d"))
+            yield scrapy.Request(url, method='GET', callback=self.parse)
+            date += dt.timedelta(days=1)
 
-        if current_page_index < total_pages_num:
-            next_page_url = '/'.join(url_arr[:-1]) + \
-                '/' + str(current_page_index + 1)
-            yield scrapy.Request(next_page_url, callback=self.parse)
-
-    def parse_news(self, response):
-        title = response.css('.newsdetail-h2 p strong::text').extract_first()
-        date_of_news = response.css(
-            '.newsdetail-time1 p::text').extract_first()[:10]
-        raw_content = response.css('.newsdetail-content').extract_first()
+    def parse(self, response):
+        parse_text_list = ["main article div.list li"]
+        for parse_text in parse_text_list:
+            for news in response.css(parse_text):
+                url = news.css("a::attr(href)").extract_first()
+                if (url is None):
+                    continue
+                if (ROOT_URL not in url):
+                    url = urljoin(ROOT_URL, url)
+                yield scrapy.Request(url, callback=self.parse_news)
 
-        TAG_RE = re.compile(r'<[^>]+>([^<]*]+>)?')
+    def parse_news(self, response: scrapy.Selector):
+        title = response.css('h1::text').extract_first()
+        date_str = response.css('meta[name=pubdate]::attr(content)').extract_first()
+        if (date_str is None):
+            date_str = response.css("meta[property='article:published_time']::attr(content)").extract_first()
+        date = utils.parse_date(date_str).replace(tzinfo=None)
+
+        parse_text_list = ["div[itemprop=articleBody] div.article_content",
+                           ]
+
+        for parse_text in parse_text_list:
+            article = response.css(parse_text)
+            if (len(article) > 0):
+                break
 
-        content_prefix = ''
-        content_suffix1 = ''
-        content_suffix2 = '