
Commit

Improve the documentation for bulk collection of Baijiahao (百家号) articles
luyishisi committed Mar 6, 2017
1 parent c511c72 commit 087e123
Showing 32 changed files with 1,459 additions and 416 deletions.
51 changes: 27 additions & 24 deletions 10.selenium/拍拍贷/selenium_so_phamtomjs.py
# Purpose: take screenshots of PPDai (拍拍贷) loan pages
#-------------------------------------------------------------------------

import sys
import time

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys

# Chinese encoding setup (Python 2)
reload(sys)
sys.setdefaultencoding('utf-8')
Type = sys.getfilesystemencoding()

# Choose the browser engine
#driver = webdriver.PhantomJS()
#driver = webdriver.Chrome()
print 'begin', time.ctime()
dcap = dict(DesiredCapabilities.PHANTOMJS)

# Spoof a mobile user agent so the site serves its mobile layout
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36"
)

driver = webdriver.PhantomJS(desired_capabilities=dcap)

# Send the requests
for i in range(20):
    #id = 31780000-i
    id = 3000029 + i
    now_url = 'http://invest.ppdai.com/loan/info?id=' + str(id)
    driver.get(now_url)

    js1 = 'return document.body.scrollHeight'
    num = 300
    max_num = driver.execute_script(js1)
    add_num = max_num / 20
    # Scroll down step by step until the page height stops growing,
    # so lazy-loaded content is rendered before the screenshot is taken
    while(max_num > num):
        num += add_num
        js2 = 'window.scrollTo(0, ' + str(num) + ')'
        driver.execute_script(js2)
        time.sleep(0.2)
        max_num = driver.execute_script(js1)
        #add_num = max_num / 20
        print num, '/', max_num
    time.sleep(4)  # main wait; adjust this delay as needed

    name = str(id) + '.png'
    driver.save_screenshot(name)
    print name

# Locate the username field and type into it
# elem = driver.find_element_by_xpath('//*[@id="email"]')
# elem.send_keys("****")

# Locate the password field and type into it
# elem = driver.find_element_by_xpath('//*[@id="password"]')
# elem.send_keys("****")

# Log in by pressing Enter
# elem.send_keys(Keys.RETURN)

# Select the login button by id
# driver.find_element_by_id('submit-button').click()

# time.sleep(2)

# Save a page screenshot and the page source
#name = '~/so_img/'+time.ctime().replace(' ','-')+'.png'
# name = time.ctime().replace(' ','-')+'.png'
#name_html = "~/so_img/"+time.ctime().replace(' ','-')+'.html'

# driver.save_screenshot(name)
#f = open(name_html.encode('utf-8'),'w')
# f.write(driver.page_source)
# f.close()

# print driver.page_source

# time.sleep(5)

# print 'end',time.ctime()
driver.quit()
# elem.clear()
# time.sleep(10)
# driver.close()  # the session is already closed by quit()
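The load-bearing part of the script above is the incremental-scroll loop: scroll one step, wait briefly, re-read document.body.scrollHeight, and stop once the scroll position catches up with the height, so lazy-loaded content is fully rendered before save_screenshot runs. As a standalone helper this might look like the following minimal sketch (the function name and parameters are illustrative, not part of the original script):

```python
import time

def scroll_until_stable(driver, steps=20, pause=0.2):
    # Measure the initial page height via JavaScript.
    height = driver.execute_script('return document.body.scrollHeight')
    pos = 300
    step = height / steps
    while height > pos:
        pos += step
        driver.execute_script('window.scrollTo(0, %d)' % pos)
        time.sleep(pause)  # give lazy-loaded content time to render
        # Re-measure: if new content was appended below, the height
        # grows and the loop keeps scrolling.
        height = driver.execute_script('return document.body.scrollHeight')
```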
28 changes: 28 additions & 0 deletions 6.爬虫项目源码/11.百家号/README.md

This project does two main things:

1. Collects a large pool of Baijiahao (百家号) author IDs by simulating user searches on Baidu for site-targeted keywords.

2. Uses those author IDs to fetch millions of Baijiahao article URLs, together with their titles, view counts, tags, and so on.


Step 1:

Run this in the get_id directory.
Make sure keylist.txt is in the same directory, then run:

python baijiahao.py

This produces a urllist.txt file in the same directory containing the author IDs harvested from Baidu search; the longer it runs, the more IDs it collects. A rough sketch of this step is shown below.
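baijiahao.py itself is not part of this commit, so the following is only a minimal sketch of the idea under stated assumptions: query Baidu for each keyword in keylist.txt and pull Baijiahao author IDs out of the result HTML. The app_id link pattern and the regex are assumptions for illustration; the real script's extraction logic may differ.

```python
# -*- coding: utf-8 -*-
# Sketch of the get_id step: search Baidu for each keyword and
# harvest Baijiahao author ids from the result pages.
import re
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

def search_ids(keyword, pages=3):
    ids = set()
    for page in range(pages):
        resp = requests.get('https://www.baidu.com/s',
                            params={'wd': keyword, 'pn': page * 10},
                            headers=HEADERS, timeout=10)
        # Assumed pattern: author pages linked as
        # baijiahao.baidu.com/u?app_id=<digits>
        ids.update(re.findall(r'baijiahao\.baidu\.com/u\?app_id=(\d+)',
                              resp.text))
    return ids

if __name__ == '__main__':
    found = set()
    with open('keylist.txt') as f:
        for line in f:
            keyword = line.strip()
            if keyword:
                found |= search_ids(keyword)
    with open('urllist.txt', 'w') as out:
        for app_id in sorted(found):
            out.write(app_id + '\n')
```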



Step 2:

Run this in the id_to_excel directory.
Make sure urllist.txt is in the same directory (if urllist.txt came from Step 1, you must first extract the bare IDs from it; mind the format), then run:

python spider_list_to_excel.py

Processing takes half an hour or more, depending on how many IDs there are.
The collected URLs are written to an Excel file named by date, e.g. 2017_2_6.xlsx; a sketch of the export step follows.
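The crawling half of spider_list_to_excel is likewise not shown in this commit, so the sketch below covers only the export shape: read IDs from urllist.txt, fetch each author's article metadata (fetch_articles is a hypothetical placeholder for the real crawler), and write one row per article into a date-named .xlsx with openpyxl.

```python
# -*- coding: utf-8 -*-
# Sketch of the id_to_excel step. fetch_articles() is a placeholder:
# the real spider would request each author's article list and parse
# out (url, title, views, tags) tuples.
import time
from openpyxl import Workbook

def fetch_articles(app_id):
    # Hypothetical: return a list of (url, title, views, tags) rows
    # for one author id.
    return []

def export(id_file='urllist.txt'):
    wb = Workbook()
    ws = wb.active
    ws.append(['url', 'title', 'views', 'tags'])  # header row
    with open(id_file) as f:
        for line in f:
            app_id = line.strip()
            if app_id:
                for row in fetch_articles(app_id):
                    ws.append(list(row))
    name = time.strftime('%Y_%m_%d') + '.xlsx'  # e.g. 2017_02_06.xlsx
    wb.save(name)
    return name

if __name__ == '__main__':
    print(export())
```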
