forked from luyishisi/Anti-Anti-Spider
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
13 changed files
with
14,721 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
import time | ||
|
||
def consumer(): | ||
r = '' | ||
while True: | ||
n = yield r | ||
if not n: | ||
return | ||
print('[CONSUMER] Consuming %s...' % n) | ||
time.sleep(1) | ||
r = '200 OK' | ||
|
||
def produce(c): | ||
c.next() | ||
n = 0 | ||
while n < 5: | ||
n = n + 1 | ||
print('[PRODUCER] Producing %s...' % n) | ||
r = c.send(n) | ||
print('[PRODUCER] Consumer return: %s' % r) | ||
c.close() | ||
|
||
if __name__=='__main__': | ||
c = consumer() | ||
produce(c) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,211 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: UTF-8 -*- | ||
#------------------------------------------------------------------------- | ||
# 程序:UrlSpider.py | ||
# 版本:1 | ||
# 作者:ly | ||
# 日期:编写日期2016/12/25 | ||
# 语言:Python 2.7.x | ||
# 操作:python UrlSpider.py | ||
# 功能:指定任务表,读取url,多线程采集 | ||
# 表结构(id, ip, lon_gd, lat_gd, datetime, flag) | ||
# 采用数据库批量插入优化等表结构优化 | ||
#------------------------------------------------------------------------- | ||
import re ,os ,sys ,time ,json ,random ,MySQLdb ,requesocks ,threading,requests | ||
|
||
#-------------------------------------------------- | ||
#中文编码设置 | ||
reload(sys) | ||
sys.setdefaultencoding('utf-8') | ||
Type = sys.getfilesystemencoding() | ||
|
||
#------------------------------------------------ | ||
# 代理以及tor设置。 | ||
session = requesocks.session() | ||
# session.proxies = {'http':'socks5://127.0.0.1:9050','https':'socks5://127.0.0.1:9050'} | ||
|
||
#------------------------------------------------ | ||
# 可修改的全局变量参数 | ||
Table = "table" # 表名称需修改 | ||
HOST, USER, PASSWD, DB, PORT = 'host', 'user', 'pass', 'dbname', 3306 # 数据库连接参数 | ||
select_sql = "SELECT id,url FROM %s where flag = 3 limit 30000;" # 在数据库中i已经打乱了. | ||
Update_sql = "UPDATE "+Table+" SET date=%s, flag=%s WHERE id =%s;" #数据存储 | ||
|
||
THREAD_COUNT = 50 #开启线程数 | ||
sql_num_base = 200 #自定义的执行批量插入的随机值基数,当此值为1时则每次获取数据均直接插入。 | ||
sql_num_add = 100 #自定义的随机值加数,平均而言,当单独一个线程执行sql_num_base+1/3*sql_num_add次数时执行插入 | ||
# 不可修改全局变量参数 | ||
#------------------------------------------------ | ||
schedule = 0 # 当前线程标志 | ||
ErrorList = [] | ||
WarnList = [] | ||
|
||
class Handle_HTML(threading.Thread): | ||
"""docstring for Handle_HTML""" | ||
def __init__(self, lock, ThreadID, tasklist, Total_TaskNum): | ||
super(Handle_HTML, self).__init__() | ||
self.lock = lock | ||
self.ThreadID = ThreadID | ||
self.tasklist = tasklist | ||
self.Total_TaskNum = Total_TaskNum | ||
|
||
def run(self): | ||
|
||
global schedule, ErrorList | ||
connect, cursor = ConnectDB() | ||
self.lock.acquire() | ||
print "The Thread tasklist number :", len(self.tasklist) | ||
self.lock.release() | ||
total = len(self.tasklist) | ||
user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36' | ||
date_list = [] | ||
now_requests_num = 0 | ||
for (id, url) in self.tasklist: | ||
# ------------------------- | ||
# 每个请求开始前进行进度说明,对线程上锁 | ||
self.lock.acquire() | ||
time_Now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) | ||
print "Tread-%s:" % self.ThreadID, time_Now, "Already Completed:[%s] ,Also remaining:[%s]" % (schedule, self.Total_TaskNum - schedule) | ||
self.lock.release() | ||
|
||
# ------------------------ | ||
# 可伪造的头部信息 | ||
headers = { | ||
'User-Agent': user_agent, | ||
'Referer':'', | ||
'X-Forwarded-For': ip, | ||
'Accept':'*/*', | ||
'Accept-Encoding':'gzip, deflate, sdch', | ||
'Accept-Language':'zh-CN,zh;q=0.8', | ||
'Cache-Control':'no-cache', | ||
'Connection':'keep-alive', | ||
'Host':'ditu.amap.com', | ||
'Pragma':'no-cache', | ||
'Referer':'' | ||
#User-Agent:Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/53.0.2785.143 Chrome/53.0.2785.143 Safari/537.36 | ||
} | ||
URL = url | ||
date = '' | ||
now_requests_num += 1 | ||
#print '*************************************',ip,i#,date_list | ||
# ------------------------- | ||
# 请求的具体请求部分 | ||
try: | ||
# -- 发起 | ||
time.sleep(random.uniform(0, 1)) | ||
response = session.get(URL, headers=headers) | ||
result = response.text.encode('utf-8') | ||
|
||
# --- 请求解析--- 自定义使用正则还是xpath或etree,接口类数据可使用json | ||
if result: | ||
date = result | ||
date_list.append([date,1,id])# 用于批量插入,需要构建为一个列表,1作为flag存入 | ||
else: | ||
date_list.append([date,0,id])# 用于批量插入,需要构建为一个列表,0作为flag存入 | ||
|
||
except Exception as e: | ||
print e | ||
time.sleep(random.uniform(0, 3)) | ||
ErrorList.append("The ip is :[%s] Error:%s\n result:%s" %(ip, e, result)) | ||
|
||
# ------------------------ | ||
# 数据插入部分 | ||
try: | ||
global sql_num_base | ||
sql_num = int(random.uniform(sql_num_base, sql_num_base + 100)) #随机一个限制数,200-300 到则进行插入 | ||
if(now_requests_num >= sql_num): | ||
now_requests_num = 0 | ||
cursor.executemany(Update_sql , date_list) | ||
connect.commit() | ||
date_list = [] | ||
print 'up',time.ctime(),'&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&',sql_num | ||
except Exception ,e: | ||
print e | ||
time.sleep(random.uniform(0, 3)) | ||
ErrorList.append("The ip is :[%s] Error:%s\n result:%s" %(ip, e, result)) | ||
# 切换线程 | ||
self.lock.acquire() | ||
schedule += 1 | ||
self.lock.release() | ||
cursor.executemany(Update_sql , date_list)#大爷的注释,,这里要保存一次 | ||
connect.commit() | ||
connect.close() | ||
|
||
|
||
def ConnectDB(): | ||
"Connect MySQLdb " | ||
connect, cursor = None, None | ||
while True: | ||
try: | ||
connect = MySQLdb.connect( | ||
host=HOST, user=USER, passwd=PASSWD, db=DB, port=PORT, charset='utf8') | ||
cursor = connect.cursor() | ||
break | ||
except MySQLdb.Error, e: | ||
print "Error %d: %s" % (e.args[0], e.args[1]) | ||
return connect, cursor | ||
|
||
|
||
def Thread_Handle(taskList, Total_TaskNum): | ||
'''多线程启动区域--无需修改''' | ||
global THREAD_COUNT | ||
lock = threading.Lock() | ||
WorksThread = [] | ||
every_thread_number = len(taskList) / THREAD_COUNT | ||
if every_thread_number == 0: | ||
THREAD_COUNT = len(taskList) | ||
every_thread_number = 1 | ||
|
||
for i in range(THREAD_COUNT): | ||
if i != THREAD_COUNT - 1: | ||
source_list = taskList[ | ||
i * every_thread_number: (i + 1) * every_thread_number] | ||
Work = Handle_HTML(lock, i, source_list, Total_TaskNum) | ||
else: | ||
source_list = taskList[i * every_thread_number:] | ||
Work = Handle_HTML(lock, i, source_list, Total_TaskNum) | ||
Work.start() | ||
WorksThread.append(Work) | ||
for Work in WorksThread: | ||
Work.join() | ||
|
||
|
||
def main(): | ||
global ErrorList, WarnList | ||
connect, cursor = ConnectDB() | ||
|
||
# 统计表总行数,依据flag = 3 | ||
try: | ||
cursor.execute("SELECT COUNT(*) FROM %s WHERE flag = 3 ;" % Table) | ||
except Exception,e: | ||
print e | ||
TaskNum = cursor.fetchall() | ||
connect.close() | ||
|
||
if TaskNum[0][0] == 0: | ||
print "Warning:There is no need to do the task!!!" | ||
else: | ||
Total_TaskNum = int(TaskNum[0][0]) | ||
while True: | ||
connect, cursor = ConnectDB()# 建立数据库连接 | ||
try: | ||
if cursor.execute(select_sql % Table):# 取任务url | ||
rows = cursor.fetchall() | ||
Thread_Handle(rows, Total_TaskNum)# 线程启动 | ||
else: | ||
break | ||
except Exception, e: | ||
print e | ||
connect.close() | ||
print "_____************_____" | ||
if ErrorList : | ||
for error in ErrorList: | ||
print error | ||
print "Error:", len(ErrorList), "Warning:",len(WarnList) | ||
|
||
if __name__ == '__main__': | ||
print "The Program start time:", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) | ||
start = time.time() | ||
main() | ||
print "The Program end time:", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), "[%s]" % (time.time() - start) | ||
# raw_input("Please enter any key to exit!") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
#coding:utf-8 | ||
import requests | ||
|
||
url = "http://blog.csdn.net/xunalove/comment/submit" | ||
|
||
querystring = {"id":"54948790"} | ||
|
||
payload = "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"commentid\"\r\n\r\n\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"content\"\r\n\r\n写的不错,我来看看\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"replyId\"\r\n\r\n\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW--" | ||
headers = { | ||
'accept': "*/*", | ||
'accept-encoding': "gzip, deflate", | ||
'accept-language': "zh-CN,zh;q=0.8", | ||
'connection': "keep-alive", | ||
'content-length': "109", | ||
'content-type': "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW", | ||
'host': "blog.csdn.net", | ||
'origin': "http://blog.csdn.net", | ||
'referer': "http://blog.csdn.net/xunalove/article/details/54948790", | ||
'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36", | ||
'x-forwarded-for': "175.51.151.188", | ||
'x-real-ip': "175.51.151.188", | ||
'x-requested-with': "XMLHttpRequest", | ||
'cookie': "bdshare_firstime=1482074386697; uuid_tt_dd=2876793974890581118_20161218; _JQCMT_ifcookie=1; _JQCMT_browser=f6435c23260ef40cd7f7e91eb576bb77; OUTFOX_SEARCH_USER_ID_NCOO=137446819.53926876; uuid=c5874c71-9d0f-4604-a074-d5ae17bae98a; _ga=GA1.2.1503044291.1482220609; UserName=a83533774; UserInfo=8BqULP2%2BFlHA%2BWQ49z9UMSUt1IKLSdAZprXd7ViHIQtBFKSYvNDJ1G4gBYKbI6lZveNzLiHt%2Bh%2BBDCGQ5TkuICkR65ji2LMUyvERYDotyZmKk6cuvToAgVYLFvDmsALA; UserNick=a83533774; AU=F64; UN=a83533774; UE=\"[email protected]\"; BT=1488174612516; access-token=ba5921ec-6063-43fa-bf22-31a6fec43a33; avh=49408103%2c17427655%2c57470073%2c54948790%2c54948790; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1487846336,1488173816,1488173847,1488174727; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1488176398; dc_tos=om0s9a; dc_session_id=1488176253802", | ||
'cache-control': "no-cache", | ||
'postman-token': "a6948124-e7b7-d13b-7b81-5b735509b765" | ||
} | ||
|
||
response = requests.request("POST", url, data=payload, headers=headers, params=querystring) | ||
print(response.text) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: UTF-8 -*- | ||
#------------------------------------------------------------------------- | ||
# 程序:get_page_id.py | ||
# 版本:0.1 | ||
# 作者:ly | ||
# 日期:编写日期2017/2/27 | ||
# 语言:Python 2.7.x | ||
# 操作:python get_page_id.py | ||
# 功能:给定用户名name,获取该用户所有已经发表的文章id | ||
#------------------------------------------------------------------------- | ||
import requests | ||
import re,sys | ||
from lxml import etree | ||
import random,time | ||
|
||
|
||
|
||
def forge_headers(url,user_agent): | ||
if len(user_agent ) < 10: | ||
user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0' | ||
headers = { | ||
'cache-control': "no-cache", | ||
'Host':"blog.csdn.net", | ||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", | ||
"Accept-Encoding": "gzip, deflate", | ||
"Accept-Language": "en-US,en;q=0.5", | ||
"Connection": "keep-alive", | ||
"User-Agent": user_agent | ||
} | ||
response = requests.request("GET", url, headers=headers) | ||
#print(response.text) | ||
return response.text | ||
|
||
def main(): | ||
|
||
name = 'xunalove' | ||
url = "http://blog.csdn.net/"+name+"/article/list/20" | ||
|
||
user_agent_list = [] | ||
f = open('user_agent.txt','r') | ||
for date_line in f: | ||
user_agent_list.append(date_line.replace('\r\n','')) | ||
now_ua = random.choice(user_agent_list) | ||
html_code = forge_headers(url,now_ua) | ||
|
||
page_id_set = set() | ||
page_id = re.findall('/xunalove/article/details/[0-9]{8}',html_code) | ||
for now_url in page_id: | ||
page_id_set.add(now_url[-8:]) | ||
#print len(page_id_set) | ||
for now_set in page_id_set: | ||
print now_set | ||
|
||
|
||
if __name__ == '__main__': | ||
|
||
print 'begin',time.ctime() | ||
main() | ||
print 'end',time.ctime() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
[INFO - 2017-02-27T06:09:01.865Z] GhostDriver - Main - running on port 62412 | ||
[INFO - 2017-02-27T06:09:02.734Z] Session [3c976880-fcb3-11e6-a231-e5478504608d] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/538.1 (KHTML, like Gecko) PhantomJS/2.1.1 Safari/538.1","webSecurityEnabled":true} | ||
[INFO - 2017-02-27T06:09:02.734Z] Session [3c976880-fcb3-11e6-a231-e5478504608d] - page.customHeaders: - {} | ||
[INFO - 2017-02-27T06:09:02.735Z] Session [3c976880-fcb3-11e6-a231-e5478504608d] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.1.1","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"mac-unknown-64bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"}} | ||
[INFO - 2017-02-27T06:09:02.735Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: 3c976880-fcb3-11e6-a231-e5478504608d | ||
[INFO - 2017-02-27T06:14:01.873Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW | ||
[INFO - 2017-02-27T06:14:01.874Z] SessionManagerReqHand - _cleanupWindowlessSessions - Deleted Session '3c976880-fcb3-11e6-a231-e5478504608d', because windowless |
Oops, something went wrong.