forked from luyishisi/Anti-Anti-Spider
Showing 20 changed files with 66,460 additions and 0 deletions.
@@ -0,0 +1,101 @@
# -*- coding: UTF-8 -*-
# Fetch the baijiahao IDs we want through Baidu search results.
# Drawback: each Baidu result page could actually be fetched with a plain request (POST/GET) instead of a browser.
# Usage: python get_id.py  (the code below uses Python 2 syntax)
import requests
import bs4
import re
from selenium import webdriver
import time
import MySQLdb
import sys

reload(sys)
sys.setdefaultencoding('utf-8')
Type = sys.getfilesystemencoding()

Table = "cn_proj_landmark_hebei_baidu_copy"  # sys.argv[1]
THREAD_COUNT = 50  # change as needed
schedule = 0
HOST, USER, PASSWD, DB, PORT = '', '', '', '', 23306  # change as needed

select_sql = "SELECT * FROM %s "
into_sql = ""

def ConnectDB():
    "Connect to MySQL, retrying on failure."
    connect, cursor = None, None
    count = 0
    while True:
        try:
            connect = MySQLdb.connect(host=HOST, user=USER, passwd=PASSWD, db=DB, port=PORT, charset='utf8')
            cursor = connect.cursor()
            break
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
            count += 1
            time.sleep(10)
            if count > 100:
                print 'error > 100 end'
                sys.exit(1)
    return connect, cursor

def changeurl(url):
    # Resolve a Baidu redirect link (https://www.baidu.com/link?url=...) to the real baijiahao URL.
    req = requests.get(url + '&wd=')
    regx = r'http://baijiahao.baidu.com/u[\S]*"'
    pattern = re.compile(regx)
    match = re.findall(pattern, req.text)
    print(match)
    return match[0]

def getbaiduurl():
    urllist = set()
    browser = webdriver.Chrome()
    browser.get('https://www.baidu.com/s?wd=site:(baijiahao.baidu.com) inurl: ( "http://baijiahao.baidu.com/u?app_id=" )')
    # Pager links: //*[@id="page"]/a[11] is "next page"; a[10] is the fallback on the first page.
    try:
        browser.find_element_by_xpath('//*[@id="page"]/a[11]').click()
    except Exception, e:
        print e
        browser.find_element_by_xpath('//*[@id="page"]/a[10]').click()
    time.sleep(2)
    while True:
        source = browser.page_source
        try:
            browser.find_element_by_xpath('//*[@id="page"]/a[11]').click()
            time.sleep(2)
        except:
            print('next button not found, probably the last page')
            break
        soup = bs4.BeautifulSoup(source, 'lxml')
        for i in soup.findAll(class_='result c-container '):
            url = i.find(class_='t').find('a').get('href')
            if len(url) == 116:
                try:
                    url = changeurl(str(url))
                    print(url[36:-1])
                    print(len(urllist))
                    urllist.add(url)
                except:
                    print('error')
            time.sleep(0.5)
    print 'begin_save'
    with open('urllist_2_6_1.txt', 'w') as f:
        for i in urllist:
            f.write(i[36:-1])
            f.write('\n')

getbaiduurl()

if __name__ == '__main__':
    connect, cursor = ConnectDB()

    cursor.execute(select_sql % Table)

    TaskNum = cursor.fetchall()
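
The header comment above notes that each Baidu result page could be fetched with a plain request instead of a browser. A minimal sketch of that idea, assuming Baidu's `pn` offset parameter and the `result c-container` markup used above still behave the same for plain GET requests (anti-bot measures may require extra cookies or headers):

import requests
import bs4

HEADERS = {'User-Agent': 'Mozilla/5.0'}

def fetch_result_page(query, page):
    # pn is the zero-based result offset; Baidu serves 10 results per page.
    params = {'wd': query, 'pn': page * 10}
    resp = requests.get('https://www.baidu.com/s', params=params,
                        headers=HEADERS, timeout=5)
    resp.raise_for_status()
    soup = bs4.BeautifulSoup(resp.text, 'lxml')
    links = []
    for item in soup.find_all(class_='result c-container '):
        title = item.find(class_='t')
        if title is not None and title.find('a') is not None:
            links.append(title.find('a').get('href'))
    return links

# Example: the first three result pages for the query used above.
# for p in range(3):
#     print(fetch_result_page('site:(baijiahao.baidu.com)', p))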
@@ -0,0 +1,96 @@
# -*- coding: UTF-8 -*-
# Fetch the baijiahao IDs we want through Baidu search results.
# Drawback: each Baidu result page could actually be fetched with a plain request (POST/GET) instead of a browser.
# Usage: python get_id.py <start_index>  (the code below uses Python 2 syntax)
import requests
import bs4
import re
from selenium import webdriver
import time
import random
from sys import argv

def changeurl(url):
    # Resolve a Baidu redirect link (https://www.baidu.com/link?url=...) to the real baijiahao URL.
    try:
        req = requests.get(url + '&wd=')
        time.sleep(1)
        regx = r'http://baijiahao.baidu.com/u[\S]*"'
        pattern = re.compile(regx)
        match = re.findall(pattern, req.text)
        print '#4', match
        return match[0]
    except Exception, e:
        print '#5', e
        return '0'

def getbaiduurl(key_list):
    # browser = webdriver.Chrome()
    browser = webdriver.PhantomJS()

    num = int(argv[1])  # index of the keyword to start from
    for now_num_id in range(num, len(key_list)):
        key = key_list[now_num_id]
        print num, key, time.ctime()
        num += 1
        urllist = set()
        now_num = 0
        browser.implicitly_wait(10)
        browser.get('https://www.baidu.com/s?wd=site:(baijiahao.baidu.com) ' + key)  # + 'inurl:( "http://baijiahao.baidu.com/u?app_id=" )'
        if now_num == 1:
            try:
                browser.find_element_by_xpath('//*[@id="page"]/a[10]').click()
                time.sleep(2)
            except Exception, e:
                print '#0', e
                continue

        while True:
            now_num += 1
            source = browser.page_source
            soup = bs4.BeautifulSoup(source, 'lxml')
            print 'next_page'
            for i in soup.findAll(class_='result c-container '):
                url = i.find(class_='t').find('a').get('href')
                print '#1', url
                try:
                    # url = changeurl(str(url))  # resolve later; store the raw redirect link for now
                    urllist.add(url)
                except Exception, e:
                    print '#3 error', e
            time.sleep(1)
            if now_num > 1:
                try:
                    browser.find_element_by_xpath('//*[@id="page"]/a[11]').click()
                    time.sleep(1)
                except:
                    print('next button not found, probably the last page')
                    break
        # save results
        print 'begin save ', len(urllist)
        with open('urllist4.txt', 'a') as f:
            for i in urllist:
                f.write(i)
                f.write('\n')
        print 'end save '

if __name__ == '__main__':
    count = 0
    idlist = []
    file = open('fingerDic.txt')  # fingerDic.txt (the keyword list) should be in the working directory
    for i in file.readlines():
        idlist.append(i.replace('\n', '').replace('\r', ''))
        count += 1
    file.close()
    getbaiduurl(idlist)
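
This script appends the raw https://www.baidu.com/link?url=... redirect hrefs to urllist4.txt without resolving them (the changeurl call is commented out). A hedged sketch of a later resolving pass, assuming the /link endpoint answers with an HTTP redirect; if it returns an HTML page instead, the regex approach from changeurl above is the fallback:

import re
import requests

def resolve_redirect(link):
    # requests follows HTTP redirects by default, so resp.url is the final URL.
    resp = requests.get(link, timeout=5)
    if 'baijiahao.baidu.com' in resp.url:
        return resp.url
    # Fallback: scrape the target URL out of the response body, as changeurl() does.
    match = re.search(r'https?://baijiahao\.baidu\.com/u\?app_id=\d+', resp.text)
    return match.group(0) if match else None

def resolve_file(path='urllist4.txt'):
    with open(path) as fh:
        for line in fh:
            link = line.strip()
            if link:
                print(resolve_redirect(link))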
@@ -0,0 +1,72 @@
# -*- coding: UTF-8 -*-
# Fetch the baijiahao IDs we want through Baidu search results.
# Drawback: each Baidu result page could actually be fetched with a plain request (POST/GET) instead of a browser.
# Usage: python get_id.py  (the code below uses Python 2 syntax)
# This particular script deduplicates the IDs collected into text files and writes them to temp.txt.
import requests
import bs4
import re
from selenium import webdriver
import time
import MySQLdb
import sys

reload(sys)
sys.setdefaultencoding('utf-8')
Type = sys.getfilesystemencoding()

Table = ""  # sys.argv[1]
THREAD_COUNT = 50  # change as needed
schedule = 0
HOST, USER, PASSWD, DB, PORT = '', '', '', 'IP_CN', 23306  # change as needed

select_sql = "SELECT * FROM %s "
into_sql = ""

def ConnectDB():
    "Connect to MySQL, retrying on failure."
    connect, cursor = None, None
    count = 0
    while True:
        try:
            connect = MySQLdb.connect(host=HOST, user=USER, passwd=PASSWD, db=DB, port=PORT, charset='utf8')
            cursor = connect.cursor()
            break
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
            count += 1
            time.sleep(10)
            if count > 100:
                print 'error > 100 end'
                sys.exit(1)
    return connect, cursor

if __name__ == '__main__':
    idlist = []
    count = 0
    id_set = set()

    id_text_name_list = ['url_to_id.txt']
    # id_text_name_list = os.listdir()

    for name in id_text_name_list:
        if name.find('txt') != -1:
            file = open(name)  # the id text files should be in the working directory
            for i in file.readlines():
                idlist.append(i)
                id_set.add(i.replace('\n', ''))
                count += 1
            file.close()

    print len(id_set), count
    print id_set
    f = open('temp.txt', 'a')
    for i in id_set:
        print i
        f.writelines(i + '\n')
    f.close()

    # connect, cursor = ConnectDB()
    #
    # cursor.execute(select_sql % Table)
    #
    # TaskNum = cursor.fetchall()
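
The commented-out block at the end suggests the deduplicated IDs were eventually meant to go into MySQL. A minimal sketch of that step using the ConnectDB helper defined above; the table name baijiahao_ids and column app_id are hypothetical placeholders, not names taken from this repository:

def save_ids_to_db(id_set):
    # Reuses the ConnectDB() helper above and commits one batched insert.
    connect, cursor = ConnectDB()
    insert_sql = "INSERT INTO baijiahao_ids (app_id) VALUES (%s)"  # hypothetical table/column
    cursor.executemany(insert_sql, [(i,) for i in id_set])
    connect.commit()
    cursor.close()
    connect.close()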
@@ -0,0 +1,85 @@
# -*- coding: UTF-8 -*-
# Fetch the baijiahao IDs we want through Baidu search results.
# Drawback: each Baidu result page could actually be fetched with a plain request (POST/GET) instead of a browser.
# Usage: python get_id.py <begin_num> <end_num>  (the code below uses Python 2 syntax)
import requests
import bs4
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time, sys, os
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import re
from PIL import Image
import fileinput
from lxml import etree
from sys import argv
import random

# Chinese encoding setup
reload(sys)
sys.setdefaultencoding('utf-8')
Type = sys.getfilesystemencoding()

def changeurl(url):
    # Fallback: fetch the page with requests and pull the app_id URL out of the HTML.
    try:
        req = requests.get(url, timeout=5)
        time.sleep(1)
        regx = r'https?://baijiahao.baidu.com/u[\S]app_id=[0-9]*'
        pattern = re.compile(regx)
        match = re.findall(pattern, req.text)
        print '#4', match
        return match[0]
    except Exception, e:
        print '#5', e
        return '0'

def url_get_id_usephantomjs(url, driver_pc):
    # Open the article page in PhantomJS and read the author link that carries the app_id.
    driver_pc.implicitly_wait(10)
    driver_pc.get(url)
    # driver_pc.save_screenshot('1.png')
    try:
        app_id = driver_pc.find_element_by_xpath("//div[@class='detail']/a[@class ='mth-pblog']").get_attribute("href")
        return app_id
    except Exception, e:
        print e
        # changeurl(url) could be tried here as a fallback
        return '0'

if __name__ == '__main__':
    driver_pc = webdriver.PhantomJS()
    file = open(r'./text/urllist5.txt', 'r')
    num = 0
    begin_num = int(argv[1])
    end_num = int(argv[2])
    for url in file.readlines():
        if begin_num >= num or num >= end_num:
            num += 1
            continue
        else:
            num += 1
            print num, url,
            app_id = url_get_id_usephantomjs(url, driver_pc)
            time.sleep(0.5)
            print app_id
            print '********************'
            with open('./id_text/url_to_id' + str(begin_num) + '-' + str(end_num) + '.txt', 'a') as file_id:
                try:
                    file_id.write(app_id[37:53])  # slice the 16-digit app_id out of the href
                    file_id.write('\n')
                except Exception, e:
                    print e

    file.close()
    driver_pc.quit()
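
The fixed slice app_id[37:53] above assumes the href has one exact length and scheme. A small sketch of a more tolerant extraction that pulls the numeric id out with a regex; the helper name is illustrative only:

import re

def extract_app_id(href):
    # Works regardless of http/https or extra query parameters in the href.
    match = re.search(r'app_id=(\d+)', href)
    return match.group(1) if match else None

# extract_app_id('https://baijiahao.baidu.com/u?app_id=1234567890123456') -> '1234567890123456'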