-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathscrapy_2.py
30 lines (27 loc) · 1.36 KB
/
scrapy_2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# -*- coding: utf-8 -*-
import scrapy
from scrapy_news.items import ScrapyNewsItem
class SunSpiderSpider(scrapy.Spider):
    """Spider for the question listing on wz.sun0769.com.

    Listing pages are fetched one after another by appending a page number
    to ``url``; every detail link found on a listing page is handed to
    ``parse_item``, which yields one ``ScrapyNewsItem`` per detail page.
    """

    name = 'sun_spider'
    allowed_domains = ['wz.sun0769.com']
    # Listing URL without the page number; the number is appended as text.
    url = 'http://wz.sun0769.com/index.php/question/questionType?id=1&page='
    # Page number of the listing page requested most recently.
    offset = 1
    start_urls = [url + str(offset)]

    def parse(self, response):
        """Queue every detail page on a listing page, then the next listing.

        Yields one request per detail link (handled by ``parse_item``),
        followed by at most one request for the next listing page (handled
        by ``parse`` again).
        """
        hrefs = response.xpath('/html/body/div[2]/div[3]/ul[2]/li/span[3]/a/@href').extract()
        for href in hrefs:
            # Links may be relative, so resolve them against the page URL.
            detail_url = response.urljoin(href)
            yield scrapy.Request(detail_url, callback=self.parse_item)

        # Stop paginating once the counter has passed 10; because the counter
        # is bumped before the request is built, the last page asked for is 11.
        if self.offset > 10:
            return
        self.offset += 1
        next_page = self.url + str(self.offset)
        yield scrapy.Request(next_page, callback=self.parse)

    def parse_item(self, response):
        """Build a ``ScrapyNewsItem`` from a single detail page.

        Every XPath-backed field is stored as the list returned by
        ``extract()``; ``url`` is the URL of the response itself.
        """
        record = ScrapyNewsItem()
        record['title'] = response.xpath('/html/body/div[3]/div[2]/div[2]/p/text()').extract()
        record['date'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[2]/text()').extract()
        # normalize-space() trims and collapses whitespace inside the XPath query.
        record['state'] = response.xpath('normalize-space(/html/body/div[3]/div[2]/div[2]/div[1]/span[3]/text())').extract()
        record['number'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[4]/text()').extract()
        record['content'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre/text()').extract()
        record['url'] = response.url
        yield record