From 56eeaed4154a6fd0756f96ea8b6f3772c749241e Mon Sep 17 00:00:00 2001
From: bjtulynn
Date: Fri, 13 Nov 2015 16:19:55 +0800
Subject: [PATCH] To run the project with recent stable dependencies, we need
 to modify the following points:

1. update the spider base class (BaseSpider -> Spider)
2. update the pipeline base class (MediaPipeline -> ImagesPipeline)
3. update the MongoClient import (pymongo.connection -> pymongo)
4. fix the SpiderInfo lookup (self.spiderinfo[spider] -> self.SpiderInfo(spider))
5. fix FilePipeline.__init__ (pass the missing store_uri argument to the superclass)
---
 .../woaidu_crawler/commands/init_single_mongodb.py  | 2 +-
 woaidu_crawler/woaidu_crawler/pipelines/bookfile.py | 2 +-
 woaidu_crawler/woaidu_crawler/pipelines/file.py     | 6 +++---
 woaidu_crawler/woaidu_crawler/pipelines/mongodb.py  | 2 +-
 .../woaidu_crawler/spiders/woaidu_detail_spider.py  | 4 ++--
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/woaidu_crawler/woaidu_crawler/commands/init_single_mongodb.py b/woaidu_crawler/woaidu_crawler/commands/init_single_mongodb.py
index 560aa9a..9782fe6 100644
--- a/woaidu_crawler/woaidu_crawler/commands/init_single_mongodb.py
+++ b/woaidu_crawler/woaidu_crawler/commands/init_single_mongodb.py
@@ -29,7 +29,7 @@
 """
 
 import types
-from pymongo.connection import MongoClient
+from pymongo import MongoClient
 from pymongo import ASCENDING, DESCENDING
 
 DATABASE_NAME = "books_fs"
diff --git a/woaidu_crawler/woaidu_crawler/pipelines/bookfile.py b/woaidu_crawler/woaidu_crawler/pipelines/bookfile.py
index 4d5caa7..9736cf6 100644
--- a/woaidu_crawler/woaidu_crawler/pipelines/bookfile.py
+++ b/woaidu_crawler/woaidu_crawler/pipelines/bookfile.py
@@ -73,7 +73,7 @@ def process_item(self, item, spider):
 
         custom process_item func,so it will manage the Request result.
         """
-        info = self.spiderinfo[spider]
+        info = self.SpiderInfo(spider)
         requests = arg_to_iter(self.get_media_requests(item, info))
         dlist = [self._process_request(r, info) for r in requests]
         dfd = DeferredList(dlist, consumeErrors=1)
diff --git a/woaidu_crawler/woaidu_crawler/pipelines/file.py b/woaidu_crawler/woaidu_crawler/pipelines/file.py
index a69a97f..21806d1 100644
--- a/woaidu_crawler/woaidu_crawler/pipelines/file.py
+++ b/woaidu_crawler/woaidu_crawler/pipelines/file.py
@@ -15,7 +15,7 @@
 from scrapy.utils.misc import md5sum
 from collections import defaultdict
 from scrapy.utils.misc import arg_to_iter
-from scrapy.contrib.pipeline.images import MediaPipeline
+from scrapy.contrib.pipeline.images import ImagesPipeline
 from woaidu_crawler.utils.select_result import list_first_item
 from scrapy.exceptions import NotConfigured, IgnoreRequest
 
@@ -96,7 +96,7 @@ def _mkdir(self, dirname, domain=None):
         if dirname not in seen:
             seen.add(dirname)
 
-class FilePipeline(MediaPipeline):
+class FilePipeline(ImagesPipeline):
     """
         download file pipeline.
""" @@ -117,7 +117,7 @@ def __init__(self,store_uri,download_func=None): raise NotConfigured self.store = self._get_store(store_uri) self.style = color.color_style() - super(FilePipeline, self).__init__(download_func=download_func) + super(FilePipeline, self).__init__(store_uri, download_func=download_func) @classmethod def from_settings(cls, settings): diff --git a/woaidu_crawler/woaidu_crawler/pipelines/mongodb.py b/woaidu_crawler/woaidu_crawler/pipelines/mongodb.py index 5253ccf..bf5b6e6 100644 --- a/woaidu_crawler/woaidu_crawler/pipelines/mongodb.py +++ b/woaidu_crawler/woaidu_crawler/pipelines/mongodb.py @@ -7,7 +7,7 @@ from woaidu_crawler.utils import color from scrapy import log from woaidu_crawler.utils import color -from pymongo.connection import MongoClient +from pymongo import MongoClient class SingleMongodbPipeline(object): """ diff --git a/woaidu_crawler/woaidu_crawler/spiders/woaidu_detail_spider.py b/woaidu_crawler/woaidu_crawler/spiders/woaidu_detail_spider.py index b230cf9..7a9a368 100644 --- a/woaidu_crawler/woaidu_crawler/spiders/woaidu_detail_spider.py +++ b/woaidu_crawler/woaidu_crawler/spiders/woaidu_detail_spider.py @@ -3,13 +3,13 @@ import time from pprint import pprint -from scrapy.spider import BaseSpider +from scrapy.spider import Spider from scrapy.selector import HtmlXPathSelector from scrapy.http import Request from woaidu_crawler.items import WoaiduCrawlerItem from woaidu_crawler.utils.select_result import list_first_item,strip_null,deduplication,clean_url -class WoaiduSpider(BaseSpider): +class WoaiduSpider(Spider): name = "woaidu" start_urls = ( 'http://www.woaidu.org/sitemap_1.html',