Skip to content

Commit

Permalink
完成csdn刷分爬虫
Browse files Browse the repository at this point in the history
  • Loading branch information
luyishisi committed Feb 28, 2017
1 parent 5caecf1 commit 99707cf
Show file tree
Hide file tree
Showing 13 changed files with 14,721 additions and 0 deletions.
Binary file modified .DS_Store
Binary file not shown.
Binary file modified 6.爬虫项目源码/.DS_Store
Binary file not shown.
25 changes: 25 additions & 0 deletions 6.爬虫项目源码/16.csdn/Coroutine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import time

def consumer():
    """Generator-based consumer: yields a status reply for every item sent.

    Primed with next()/send(None), it yields '' first; thereafter each
    truthy value sent in is "consumed" (printed, 1s delay) and answered
    with '200 OK'. Sending a falsy value ends the generator.
    """
    reply = ''
    while True:
        item = yield reply
        if not item:
            return
        print('[CONSUMER] Consuming %s...' % item)
        time.sleep(1)
        reply = '200 OK'

def produce(c):
    """Drive consumer generator *c*: prime it, send the values 1..5
    (printing each produced value and the consumer's reply), then close it.

    BUG FIX: the original called ``c.next()``, which exists only on
    Python 2 generators; the built-in ``next(c)`` is equivalent and works
    on both Python 2.6+ and Python 3, matching the version-agnostic
    ``print(...)`` style the rest of this file already uses.
    """
    next(c)  # prime the generator so it is paused at its first yield
    n = 0
    while n < 5:
        n = n + 1
        print('[PRODUCER] Producing %s...' % n)
        r = c.send(n)
        print('[PRODUCER] Consumer return: %s' % r)
    c.close()

if __name__ == '__main__':
    # Demo: wire the consumer generator to the producer driver.
    produce(consumer())
211 changes: 211 additions & 0 deletions 6.爬虫项目源码/16.csdn/UrlSpider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#-------------------------------------------------------------------------
# 程序:UrlSpider.py
# 版本:1
# 作者:ly
# 日期:编写日期2016/12/25
# 语言:Python 2.7.x
# 操作:python UrlSpider.py
# 功能:指定任务表,读取url,多线程采集
# 表结构(id, ip, lon_gd, lat_gd, datetime, flag)
# 采用数据库批量插入优化等表结构优化
#-------------------------------------------------------------------------
import json
import os
import random
import re
import sys
import threading
import time

import MySQLdb
import requesocks
import requests

#--------------------------------------------------
#中文编码设置
reload(sys)
sys.setdefaultencoding('utf-8')
Type = sys.getfilesystemencoding()

#------------------------------------------------
# 代理以及tor设置。
session = requesocks.session()
# session.proxies = {'http':'socks5://127.0.0.1:9050','https':'socks5://127.0.0.1:9050'}

#------------------------------------------------
# 可修改的全局变量参数
Table = "table" # 表名称需修改
HOST, USER, PASSWD, DB, PORT = 'host', 'user', 'pass', 'dbname', 3306 # 数据库连接参数
select_sql = "SELECT id,url FROM %s where flag = 3 limit 30000;" # 在数据库中i已经打乱了.
Update_sql = "UPDATE "+Table+" SET date=%s, flag=%s WHERE id =%s;" #数据存储

THREAD_COUNT = 50 #开启线程数
sql_num_base = 200 #自定义的执行批量插入的随机值基数,当此值为1时则每次获取数据均直接插入。
sql_num_add = 100 #自定义的随机值加数,平均而言,当单独一个线程执行sql_num_base+1/3*sql_num_add次数时执行插入
# 不可修改全局变量参数
#------------------------------------------------
schedule = 0 # 当前线程标志
ErrorList = []
WarnList = []

class Handle_HTML(threading.Thread):
"""docstring for Handle_HTML"""
def __init__(self, lock, ThreadID, tasklist, Total_TaskNum):
super(Handle_HTML, self).__init__()
self.lock = lock
self.ThreadID = ThreadID
self.tasklist = tasklist
self.Total_TaskNum = Total_TaskNum

def run(self):

global schedule, ErrorList
connect, cursor = ConnectDB()
self.lock.acquire()
print "The Thread tasklist number :", len(self.tasklist)
self.lock.release()
total = len(self.tasklist)
user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
date_list = []
now_requests_num = 0
for (id, url) in self.tasklist:
# -------------------------
# 每个请求开始前进行进度说明,对线程上锁
self.lock.acquire()
time_Now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
print "Tread-%s:" % self.ThreadID, time_Now, "Already Completed:[%s] ,Also remaining:[%s]" % (schedule, self.Total_TaskNum - schedule)
self.lock.release()

# ------------------------
# 可伪造的头部信息
headers = {
'User-Agent': user_agent,
'Referer':'',
'X-Forwarded-For': ip,
'Accept':'*/*',
'Accept-Encoding':'gzip, deflate, sdch',
'Accept-Language':'zh-CN,zh;q=0.8',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Host':'ditu.amap.com',
'Pragma':'no-cache',
'Referer':''
#User-Agent:Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/53.0.2785.143 Chrome/53.0.2785.143 Safari/537.36
}
URL = url
date = ''
now_requests_num += 1
#print '*************************************',ip,i#,date_list
# -------------------------
# 请求的具体请求部分
try:
# -- 发起
time.sleep(random.uniform(0, 1))
response = session.get(URL, headers=headers)
result = response.text.encode('utf-8')

# --- 请求解析--- 自定义使用正则还是xpath或etree,接口类数据可使用json
if result:
date = result
date_list.append([date,1,id])# 用于批量插入,需要构建为一个列表,1作为flag存入
else:
date_list.append([date,0,id])# 用于批量插入,需要构建为一个列表,0作为flag存入

except Exception as e:
print e
time.sleep(random.uniform(0, 3))
ErrorList.append("The ip is :[%s] Error:%s\n result:%s" %(ip, e, result))

# ------------------------
# 数据插入部分
try:
global sql_num_base
sql_num = int(random.uniform(sql_num_base, sql_num_base + 100)) #随机一个限制数,200-300 到则进行插入
if(now_requests_num >= sql_num):
now_requests_num = 0
cursor.executemany(Update_sql , date_list)
connect.commit()
date_list = []
print 'up',time.ctime(),'&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&',sql_num
except Exception ,e:
print e
time.sleep(random.uniform(0, 3))
ErrorList.append("The ip is :[%s] Error:%s\n result:%s" %(ip, e, result))
# 切换线程
self.lock.acquire()
schedule += 1
self.lock.release()
cursor.executemany(Update_sql , date_list)#大爷的注释,,这里要保存一次
connect.commit()
connect.close()


def ConnectDB():
"Connect MySQLdb "
connect, cursor = None, None
while True:
try:
connect = MySQLdb.connect(
host=HOST, user=USER, passwd=PASSWD, db=DB, port=PORT, charset='utf8')
cursor = connect.cursor()
break
except MySQLdb.Error, e:
print "Error %d: %s" % (e.args[0], e.args[1])
return connect, cursor


def Thread_Handle(taskList, Total_TaskNum):
    '''Split taskList evenly across THREAD_COUNT workers, start them all,
    and block until every worker has finished. The last worker absorbs
    the remainder when the list does not divide evenly.'''
    global THREAD_COUNT
    lock = threading.Lock()
    workers = []
    chunk = len(taskList) / THREAD_COUNT
    if chunk == 0:
        # Fewer tasks than threads: one task per thread, shrink the pool.
        THREAD_COUNT = len(taskList)
        chunk = 1

    for idx in range(THREAD_COUNT):
        if idx == THREAD_COUNT - 1:
            part = taskList[idx * chunk:]  # last worker takes the remainder
        else:
            part = taskList[idx * chunk: (idx + 1) * chunk]
        worker = Handle_HTML(lock, idx, part, Total_TaskNum)
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()


def main():
    """Drive the crawl: count the pending tasks (flag = 3), then repeatedly
    fetch batches of task URLs and hand them to the worker threads until
    no pending rows remain; finally dump any accumulated errors."""
    global ErrorList, WarnList
    connect, cursor = ConnectDB()

    # Count rows still pending (flag = 3) to size the progress display.
    try:
        cursor.execute("SELECT COUNT(*) FROM %s WHERE flag = 3 ;" % Table)
    except Exception,e:
        print e
    # NOTE(review): if the COUNT query failed above, this fetchall() runs on a
    # cursor with no pending result set -- confirm the intended failure mode.
    TaskNum = cursor.fetchall()
    connect.close()

    if TaskNum[0][0] == 0:
        print "Warning:There is no need to do the task!!!"
    else:
        Total_TaskNum = int(TaskNum[0][0])
        while True:
            connect, cursor = ConnectDB()# open a fresh DB connection for each batch
            try:
                if cursor.execute(select_sql % Table):# fetch the next batch of task urls
                    rows = cursor.fetchall()
                    Thread_Handle(rows, Total_TaskNum)# fan the batch out to worker threads
                else:
                    break# no rows matched -> all tasks processed
            except Exception, e:
                # NOTE(review): on a persistent SELECT error this loops forever
                # (the break is only reached on a successful empty result).
                print e
            connect.close()
        print "_____************_____"
        if ErrorList :
            for error in ErrorList:
                print error
        print "Error:", len(ErrorList), "Warning:",len(WarnList)

if __name__ == '__main__':
print "The Program start time:", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
start = time.time()
main()
print "The Program end time:", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), "[%s]" % (time.time() - start)
# raw_input("Please enter any key to exit!")
29 changes: 29 additions & 0 deletions 6.爬虫项目源码/16.csdn/csdn_Commentary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#coding:utf-8
"""Post one canned comment to a fixed CSDN blog article.

The multipart body, cookie, and headers below were captured from a live
browser session and are reproduced verbatim.
"""
import requests

url = "http://blog.csdn.net/xunalove/comment/submit"

querystring = {"id":"54948790"}

payload = "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"commentid\"\r\n\r\n\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"content\"\r\n\r\n写的不错,我来看看\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"replyId\"\r\n\r\n\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW--"
# BUG FIX: the captured 'content-length': "109" header was dropped -- the
# payload is far longer than 109 bytes, and overriding Content-Length with a
# wrong value truncates the body server-side; requests computes it correctly.
headers = {
    'accept': "*/*",
    'accept-encoding': "gzip, deflate",
    'accept-language': "zh-CN,zh;q=0.8",
    'connection': "keep-alive",
    'content-type': "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
    'host': "blog.csdn.net",
    'origin': "http://blog.csdn.net",
    'referer': "http://blog.csdn.net/xunalove/article/details/54948790",
    'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
    'x-forwarded-for': "175.51.151.188",
    'x-real-ip': "175.51.151.188",
    'x-requested-with': "XMLHttpRequest",
    'cookie': "bdshare_firstime=1482074386697; uuid_tt_dd=2876793974890581118_20161218; _JQCMT_ifcookie=1; _JQCMT_browser=f6435c23260ef40cd7f7e91eb576bb77; OUTFOX_SEARCH_USER_ID_NCOO=137446819.53926876; uuid=c5874c71-9d0f-4604-a074-d5ae17bae98a; _ga=GA1.2.1503044291.1482220609; UserName=a83533774; UserInfo=8BqULP2%2BFlHA%2BWQ49z9UMSUt1IKLSdAZprXd7ViHIQtBFKSYvNDJ1G4gBYKbI6lZveNzLiHt%2Bh%2BBDCGQ5TkuICkR65ji2LMUyvERYDotyZmKk6cuvToAgVYLFvDmsALA; UserNick=a83533774; AU=F64; UN=a83533774; UE=\"[email protected]\"; BT=1488174612516; access-token=ba5921ec-6063-43fa-bf22-31a6fec43a33; avh=49408103%2c17427655%2c57470073%2c54948790%2c54948790; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1487846336,1488173816,1488173847,1488174727; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1488176398; dc_tos=om0s9a; dc_session_id=1488176253802",
    'cache-control': "no-cache",
    'postman-token': "a6948124-e7b7-d13b-7b81-5b735509b765"
}

response = requests.request("POST", url, data=payload, headers=headers, params=querystring)
print(response.text)
60 changes: 60 additions & 0 deletions 6.爬虫项目源码/16.csdn/get_page_id.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#-------------------------------------------------------------------------
# 程序:get_page_id.py
# 版本:0.1
# 作者:ly
# 日期:编写日期2017/2/27
# 语言:Python 2.7.x
# 操作:python get_page_id.py
# 功能:给定用户名name,获取该用户所有已经发表的文章id
#-------------------------------------------------------------------------
import requests
import re,sys
from lxml import etree
import random,time



def forge_headers(url, user_agent):
    """GET *url* with a browser-like header set and return the page text.

    Falls back to a default Firefox UA when *user_agent* looks invalid
    (shorter than 10 characters).
    """
    if len(user_agent) < 10:
        user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0'
    headers = {
        'cache-control': "no-cache",
        'Host': "blog.csdn.net",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-US,en;q=0.5",
        "Connection": "keep-alive",
        "User-Agent": user_agent,
    }
    return requests.request("GET", url, headers=headers).text

def main():

name = 'xunalove'
url = "http://blog.csdn.net/"+name+"/article/list/20"

user_agent_list = []
f = open('user_agent.txt','r')
for date_line in f:
user_agent_list.append(date_line.replace('\r\n',''))
now_ua = random.choice(user_agent_list)
html_code = forge_headers(url,now_ua)

page_id_set = set()
page_id = re.findall('/xunalove/article/details/[0-9]{8}',html_code)
for now_url in page_id:
page_id_set.add(now_url[-8:])
#print len(page_id_set)
for now_set in page_id_set:
print now_set


if __name__ == '__main__':

print 'begin',time.ctime()
main()
print 'end',time.ctime()
7 changes: 7 additions & 0 deletions 6.爬虫项目源码/16.csdn/ghostdriver.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[INFO - 2017-02-27T06:09:01.865Z] GhostDriver - Main - running on port 62412
[INFO - 2017-02-27T06:09:02.734Z] Session [3c976880-fcb3-11e6-a231-e5478504608d] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/538.1 (KHTML, like Gecko) PhantomJS/2.1.1 Safari/538.1","webSecurityEnabled":true}
[INFO - 2017-02-27T06:09:02.734Z] Session [3c976880-fcb3-11e6-a231-e5478504608d] - page.customHeaders: - {}
[INFO - 2017-02-27T06:09:02.735Z] Session [3c976880-fcb3-11e6-a231-e5478504608d] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.1.1","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"mac-unknown-64bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"}}
[INFO - 2017-02-27T06:09:02.735Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: 3c976880-fcb3-11e6-a231-e5478504608d
[INFO - 2017-02-27T06:14:01.873Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW
[INFO - 2017-02-27T06:14:01.874Z] SessionManagerReqHand - _cleanupWindowlessSessions - Deleted Session '3c976880-fcb3-11e6-a231-e5478504608d', because windowless
Loading

0 comments on commit 99707cf

Please sign in to comment.