Start the Baijiahao (百家号) scraping project
luyishisi committed Feb 21, 2017
1 parent a59d7ea commit fb50cbf
Showing 20 changed files with 66,460 additions and 0 deletions.
57,478 changes: 57,478 additions & 0 deletions 6.爬虫项目源码/11.百家号/baijiahao/fingerDic.txt

Large diffs are not rendered by default.

101 changes: 101 additions & 0 deletions 6.爬虫项目源码/11.百家号/baijiahao/get_id_mysql.py
@@ -0,0 +1,101 @@
# -*- coding: UTF-8 -*-
# Fetch the Baijiahao author IDs we want via Baidu search.
# Drawback: each Baidu result page could in fact be fetched with a plain POST/GET request, no browser needed.
# How to run: python get_id_mysql.py (Python 2 -- the code uses print statements and MySQLdb)
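# A requests-only variant of the note above (a sketch; treating `pn` as Baidu's
# page-offset parameter is an assumption of this example, not verified here):
#
#   resp = requests.get('https://www.baidu.com/s',
#                       params={'wd': 'site:(baijiahao.baidu.com)', 'pn': 10})
#   # ...then parse resp.text with bs4 exactly as getbaiduurl() does below.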
import requests
import bs4
import re
from selenium import webdriver
import time
import MySQLdb
import sys

reload(sys)
sys.setdefaultencoding('utf-8')
Type = sys.getfilesystemencoding()

Table = "cn_proj_landmark_hebei_baidu_copy"#sys.argv[1]
THREAD_COUNT = 50 #需要修改
schedule = 0
HOST, USER, PASSWD, DB, PORT = '','','', '', 23306#需要修改

select_sql = "SELECT * FROM %s "
into_sql = ""

def ConnectDB():
    "Connect to MySQL, retrying every 10 s; give up after 100 failures."
    connect, cursor = None, None
    count = 0
    while True:
        try:
            connect = MySQLdb.connect(host=HOST, user=USER, passwd=PASSWD, db=DB, port=PORT, charset='utf8')
            cursor = connect.cursor()
            break
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
            count += 1
            time.sleep(10)
            if count > 100:
                print 'error > 100 end'
                sys.exit(1)
    return connect, cursor

def changeurl(url):
    # e.g. req = requests.get('https://www.baidu.com/link?url=j7vtTkcM6jvJZ0RvWMtjslhKOY9lcrdlq8ruIP473AXuyfWomhFIgd0103xefJiWsR5n68jOkg1PjsLwV13d9a&wd=&eqid=cf53142600017e65000000045882cdfd')
    # https://www.baidu.com/link?url=w8wWEQMyVf0cD3TsKcn_pTQZ92cIqLqxVZKWFtT4rYJcESE_qfhKlPJg5B7OM2mXhZoSM1H0ogmCIgi4G2EkP_&wd=&eqid=aa2c3db90000bf4c0000000458831761
    req = requests.get(url + '&wd=')
    # time.sleep(1)
    regx = r'http://baijiahao.baidu.com/u[\S]*"'
    pattern = re.compile(regx)
    match = re.findall(pattern, req.text)
    print(match)
    return match[0]
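
# Example with hypothetical values: given a Baidu redirect link such as
#   https://www.baidu.com/link?url=...&eqid=...
# changeurl() follows it and returns the first regex match, e.g.
#   http://baijiahao.baidu.com/u?app_id=1234567890123456"
# The trailing quote comes from the regex itself; the prefix is exactly 36
# characters, which is why callers slice the id out with url[36:-1].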

def getbaiduurl():
    urllist = set()
    browser = webdriver.Chrome()
    browser.get('https://www.baidu.com/s?wd=site:(baijiahao.baidu.com) inurl: ( "http://baijiahao.baidu.com/u?app_id=" )')
    # next-page button: //*[@id="page"]/a[11]; on the first results page it is a[10]
    try:
        browser.find_element_by_xpath('//*[@id="page"]/a[11]').click()
    except Exception, e:
        print e
        browser.find_element_by_xpath('//*[@id="page"]/a[10]').click()
    time.sleep(2)
    while True:
        source = browser.page_source
        try:
            browser.find_element_by_xpath('//*[@id="page"]/a[11]').click()
            time.sleep(2)
        except:
            print('next button not found; probably the last page')
            break

        soup = bs4.BeautifulSoup(source, 'lxml')
        for i in soup.findAll(class_='result c-container '):
            url = i.find(class_='t').find('a').get('href')

            if len(url) == 116:  # was `is 116`: identity comparison on an int is a bug
                try:
                    url = changeurl(str(url))
                    print(url[36:-1])
                    print(len(urllist))
                    urllist.add(url)
                except:
                    print('error')
                    # urllist.clear()
                time.sleep(0.5)
    print 'begin_save'
    with open('urllist_2_6_1.txt', 'w') as file:
        for i in urllist:
            file.write(i[36:-1])
            file.write('\n')


getbaiduurl()
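# NOTE: the call above runs unconditionally at import time, before the
# __main__ guard below.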

if __name__ == '__main__':
    connect, cursor = ConnectDB()

    cursor.execute(select_sql % Table)  # was `sel_count`, which is never defined

    TaskNum = cursor.fetchall()
96 changes: 96 additions & 0 deletions 6.爬虫项目源码/11.百家号/baijiahao/get_id_txt.py
@@ -0,0 +1,96 @@
# -*- coding: UTF-8 -*-
# Fetch the Baijiahao author IDs we want via Baidu search.
# Drawback: each Baidu result page could in fact be fetched with a plain POST/GET request, no browser needed.
# How to run: python get_id_txt.py <start_index> (Python 2)
import requests
import bs4
import re
from selenium import webdriver
import time
import random
from sys import argv

def changeurl(url):
    # e.g. req = requests.get('https://www.baidu.com/link?url=j7vtTkcM6jvJZ0RvWMtjslhKOY9lcrdlq8ruIP473AXuyfWomhFIgd0103xefJiWsR5n68jOkg1PjsLwV13d9a&wd=&eqid=cf53142600017e65000000045882cdfd')
    # https://www.baidu.com/link?url=w8wWEQMyVf0cD3TsKcn_pTQZ92cIqLqxVZKWFtT4rYJcESE_qfhKlPJg5B7OM2mXhZoSM1H0ogmCIgi4G2EkP_&wd=&eqid=aa2c3db90000bf4c0000000458831761
    try:
        req = requests.get(url + '&wd=')
        time.sleep(1)
        regx = r'http://baijiahao.baidu.com/u[\S]*"'
        pattern = re.compile(regx)
        match = re.findall(pattern, req.text)
        print '#4', match
        return match[0]
    except Exception, e:
        print '#5', e
        return '0'

def getbaiduurl(key_list):
    # browser = webdriver.Chrome()
    browser = webdriver.PhantomJS()

    # key = '军事'  # example keyword
    num = int(argv[1])  # resume index; was `argv[1]` (a str), which breaks range()
    for now_num_id in range(num, len(key_list)):
        key = key_list[now_num_id]
        print num, key, time.ctime()
        num += 1
        urllist = set()
        now_num = 0
        browser.implicitly_wait(10)
        browser.get('https://www.baidu.com/s?wd=site:(baijiahao.baidu.com) ' + key)  # +'inurl:( "http://baijiahao.baidu.com/u?app_id=" )'
        if now_num == 1:  # note: now_num is always 0 here, so this branch never runs
            try:
                browser.find_element_by_xpath('//*[@id="page"]/a[10]').click()
                time.sleep(2)
            except Exception, e:
                print '#0', e
                continue

        while True:
            now_num += 1
            source = browser.page_source
            soup = bs4.BeautifulSoup(source, 'lxml')
            print 'next_page'
            for i in soup.findAll(class_='result c-container '):
                url = i.find(class_='t').find('a').get('href')
                print '#1', url
                try:
                    urllist.add(url)
                except Exception, e:
                    print '#3 error', e
            time.sleep(1)
            if now_num > 1:
                try:
                    browser.find_element_by_xpath('//*[@id="page"]/a[11]').click()
                    time.sleep(1)
                except:
                    print('next button not found; probably the last page')
                    break
        # save this keyword's results
        print 'begin save ', len(urllist)
        with open('urllist4.txt', 'a') as file:
            for i in urllist:
                file.write(i)  # [36:-1]
                file.write('\n')
        print 'end save '

if __name__ == '__main__':
    count = 0
    idlist = []
    file = open('fingerDic.txt')  # fingerDic.txt must sit in the working directory
    for i in file.readlines():
        idlist.append(i.replace('\n', '').replace('\r', ''))
        count += 1
    file.close()
    getbaiduurl(idlist)
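
# Usage sketch (Python 2 assumed): resume from the Nth keyword of fingerDic.txt, e.g.
#   python get_id_txt.py 0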
72 changes: 72 additions & 0 deletions 6.爬虫项目源码/11.百家号/baijiahao/id_hebin.py
@@ -0,0 +1,72 @@
# -*- coding: UTF-8 -*-
# Merge the collected ID text files and deduplicate them ("hebin" = 合并, merge).
# How to run: python id_hebin.py (Python 2)
import requests
import bs4
import re
from selenium import webdriver
import time
import MySQLdb
import sys

reload(sys)
sys.setdefaultencoding('utf-8')
Type = sys.getfilesystemencoding()

Table = ""#sys.argv[1]
THREAD_COUNT = 50 #需要修改
schedule = 0
HOST, USER, PASSWD, DB, PORT = '','','', 'IP_CN', 23306#需要修改

select_sql = "SELECT * FROM %s "
into_sql = ""

def ConnectDB():
    "Connect to MySQL, retrying every 10 s; give up after 100 failures."
    connect, cursor = None, None
    count = 0
    while True:
        try:
            connect = MySQLdb.connect(host=HOST, user=USER, passwd=PASSWD, db=DB, port=PORT, charset='utf8')
            cursor = connect.cursor()
            break
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
            count += 1
            time.sleep(10)
            if count > 100:
                print 'error > 100 end'
                sys.exit(1)
    return connect, cursor

if __name__ == '__main__':
    idlist = []
    count = 0
    id_set = set()  # was set(''), which is just an empty set written confusingly

    id_text_name_list = ['url_to_id.txt']
    # id_text_name_list = os.listdir()

    for name in id_text_name_list:
        if name.find('txt') != -1:
            file = open(name)  # the txt files must sit in the working directory
            for i in file.readlines():
                idlist.append(i)
                id_set.add(i.replace('\n', ''))
                # print count, i
                count += 1
            file.close()

    print len(id_set), count
    print id_set
    f = open('temp.txt', 'a')
    for i in id_set:
        print i
        f.writelines(i + '\n')
    f.close()
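
# temp.txt now holds the deduplicated union of every ID read above (appended,
# since the file is opened in 'a' mode).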

# connect, cursor = ConnectDB()
#
# cursor.execute(sel_count %Table)
#
# TaskNum = cursor.fetchall()
85 changes: 85 additions & 0 deletions 6.爬虫项目源码/11.百家号/baijiahao/linkurl_to_id.py
@@ -0,0 +1,85 @@
# -*- coding: UTF-8 -*-
# Resolve collected Baidu link URLs into Baijiahao app_ids.
# How to run: python linkurl_to_id.py <begin> <end> (Python 2)
import requests
import bs4
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time, sys, os
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import re
from PIL import Image
import fileinput
from lxml import etree
from sys import argv
import random

# Chinese encoding setup
reload(sys)
sys.setdefaultencoding('utf-8')
Type = sys.getfilesystemencoding()

def changeurl(url):
    try:
        req = requests.get(url, timeout=5)
        time.sleep(1)
        # was r'http[s]+://baijiahao.baidu.com/u[\S]app_id=[0-9]*': `http[s]+`
        # never matches plain http, and `[\S]` matches any single non-space char
        regx = r'https?://baijiahao.baidu.com/u\?app_id=[0-9]*'
        pattern = re.compile(regx)
        match = re.findall(pattern, req.text)
        print '#4', match
        return match[0]
    except Exception, e:
        print '#5', e
        return '0'

def url_get_id_usephantomjs(url, driver_pc):
    driver_pc.implicitly_wait(10)
    driver_pc.get(url)
    # driver_pc.save_screenshot('1.png')
    try:
        # the author's blog link on the profile page carries the app_id in its href
        app_id = driver_pc.find_element_by_xpath("//div[@class='detail']/a[@class ='mth-pblog']").get_attribute("href")
        return app_id
    except Exception, e:
        print e
        # changeurl(url)  # possible fallback, left disabled in the original
        return '0'
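
# Usage sketch: given a profile URL, url_get_id_usephantomjs(url, driver_pc)
# returns the href of the author's blog link (which embeds the app_id), or '0'.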

if __name__ == '__main__':

    driver_pc = webdriver.PhantomJS()
    file = open(r'./text/urllist5.txt', 'r')
    num = 0
    begin_num = int(argv[1])
    end_num = int(argv[2])
    for url in file.readlines():
        if begin_num >= num or num >= end_num:
            num += 1
            continue
        else:
            num += 1
            print num, url,
            app_id = url_get_id_usephantomjs(url, driver_pc)
            time.sleep(0.5)
            print app_id
            print '********************'
            with open('./id_text/url_to_id' + str(begin_num) + '-' + str(end_num) + '.txt', 'a') as file_id:
                try:
                    file_id.write(app_id[37:53])  # chars 37..52: the numeric app_id in an https profile URL
                    file_id.write('\n')
                except Exception, e:
                    print e

    file.close()
    driver_pc.quit()
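
# Usage sketch: process the lines strictly between <begin> and <end> of
# ./text/urllist5.txt, e.g.
#   python linkurl_to_id.py 0 1000
# Extracted ids are appended to ./id_text/url_to_id0-1000.txt, one per line.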