crawl_thread.py
# encoding: UTF-8
"""
实现抓取线程
"""
import threading
import time
import urlparse
import Queue
import urllib2

import log
import config_load
import webpage_saves
import webpage_parse
# Subclass threading.Thread and override run() to implement the crawl loop.
class MyThread(threading.Thread):
"""
spider thread
Attributes:
logger: log information
queue: the url queue waiting to be cralwed
conf_dict["max_depth"]: spider's max_depth
conf_dict["target_url"]: The target url patter
conf_dit["crawl_interval"]: the parse interval
conf_dict["crawl_timeout"]: urlopen time out
conf_dict["output_directory"]: the directory to save url content
"""
    def __init__(self, queue, conf_dict):
        threading.Thread.__init__(self)
        self.queue = queue
        self.output_directory = conf_dict['output_directory']
        self.max_depth = conf_dict['max_depth']
        self.crawl_interval = conf_dict['crawl_interval']
        self.crawl_timeout = conf_dict['crawl_timeout']
        self.target_url = conf_dict['target_url']
        self.thread_stop = False
        self.logger = log.init_log("./log/mini_spider_log")
    def run(self):
        while not self.thread_stop:
            try:
                # fetch a url from the queue, waiting at most crawl_timeout seconds
                url = self.queue.get(timeout=self.crawl_timeout)
                self.logger.info("got url %s from queue" % url)
            except Queue.Empty:
                # nothing arrived within the timeout: this thread is done
                self.logger.info("%s: queue is empty" % self.getName())
                self.thread_stop = True
                continue
            # skip invalid urls
            if url is None or url == "":
                self.queue.task_done()
                continue
            # fetch the content of the url
            content = webpage_saves.WebpageSaves().GetUrl(url)
            if content is None or content == "":
                self.queue.task_done()
                continue
            target_urls = webpage_parse.WebpageParse().get_target_urls(content)
            # save the content of each extracted url
            try:
                for target_url in target_urls:
                    # strip stray quotes, then resolve the link against the
                    # page url (handles both relative and absolute hrefs)
                    target_url = target_url.replace('\'', '').replace('\"', '')
                    target_url = urlparse.urljoin(url + "/", target_url)
                    self.logger.info("saving %s" % target_url)
                    webpage_saves.WebpageSaves().Save(target_url, self.output_directory)
            except urllib2.HTTPError, e:
                self.logger.error("%s: webpage error, can't open" % e.reason)
                self.stop()
            self.queue.task_done()
            # be polite: pause between crawls
            time.sleep(self.crawl_interval)
    def stop(self):
        """Ask the thread to stop after the current iteration."""
        self.thread_stop = True
"""
url='http://pycm.baidu.com:8081'
queue=Queue.Queue(10)
queue.put(url)
conf = config_load.LoadConfig()
conf_dict = {}
conf_dict = conf.loads('spider.conf')
thread=MyThread(queue,conf_dict)
thread.start()
"""