
Commit

Improve the documentation for bulk collection of Baijiahao (百家号) articles
luyishisi committed Mar 6, 2017
1 parent c511c72 commit 087e123
Showing 32 changed files with 1,459 additions and 416 deletions.
51 changes: 27 additions & 24 deletions 10.selenium/拍拍贷/selenium_so_phamtomjs.py
# Purpose: take screenshots of PPDai (拍拍贷) loan pages
#-------------------------------------------------------------------------

import sys
import time

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys

# Chinese encoding setup (Python 2)
reload(sys)
sys.setdefaultencoding('utf-8')
Type = sys.getfilesystemencoding()

# Choose the browser engine
#driver = webdriver.PhantomJS()
#driver = webdriver.Chrome()
print 'begin', time.ctime()
dcap = dict(DesiredCapabilities.PHANTOMJS)

# Spoof a mobile user agent so the site serves its mobile layout
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36"
)

driver = webdriver.PhantomJS(desired_capabilities=dcap)

# Send the requests
for i in range(20):
    #id = 31780000-i
    id = 3000029 + i
    now_url = 'http://invest.ppdai.com/loan/info?id=' + str(id)
    driver.get(now_url)

    js1 = 'return document.body.scrollHeight'
    num = 300
    max_num = driver.execute_script(js1)
    add_num = max_num / 20
    # Scroll down step by step until the page height stops growing,
    # so lazy-loaded content is rendered before the screenshot is taken
    while(max_num > num):
        num += add_num
        js2 = 'window.scrollTo(0, ' + str(num) + ')'
        driver.execute_script(js2)
        time.sleep(0.2)
        max_num = driver.execute_script(js1)
        #add_num = max_num / 20
        print num, '/', max_num
    time.sleep(4)  # main wait; adjust this delay as needed

    name = str(id) + '.png'
    driver.save_screenshot(name)
    print name

# Locate the username field and type into it
# elem = driver.find_element_by_xpath('//*[@id="email"]')
# elem.send_keys("****")

# Locate the password field and type into it
# elem = driver.find_element_by_xpath('//*[@id="password"]')
# elem.send_keys("****")

# Log in by pressing Enter
# elem.send_keys(Keys.RETURN)

# Select the login button by id
# driver.find_element_by_id('submit-button').click()

# time.sleep(2)

# Save a page screenshot and the page source
#name = '~/so_img/'+time.ctime().replace(' ','-')+'.png'
# name = time.ctime().replace(' ','-')+'.png'
#name_html = "~/so_img/"+time.ctime().replace(' ','-')+'.html'

# driver.save_screenshot(name)
#f = open(name_html.encode('utf-8'),'w')
# f.write(driver.page_source)
# f.close()

# print driver.page_source

# time.sleep(5)

# print 'end',time.ctime()
driver.quit()
# elem.clear()
# time.sleep(10)
# driver.close()  # the session is already closed by quit()
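The load-bearing part of the script above is the incremental-scroll loop: scroll one step, wait briefly, re-read document.body.scrollHeight, and stop once the scroll position catches up with the height, so lazy-loaded content is fully rendered before save_screenshot runs. As a standalone helper this might look like the following minimal sketch (the function name and parameters are illustrative, not part of the original script):

```python
import time

def scroll_until_stable(driver, steps=20, pause=0.2):
    # Measure the initial page height via JavaScript.
    height = driver.execute_script('return document.body.scrollHeight')
    pos = 300
    step = height / steps
    while height > pos:
        pos += step
        driver.execute_script('window.scrollTo(0, %d)' % pos)
        time.sleep(pause)  # give lazy-loaded content time to render
        # Re-measure: if new content was appended below, the height
        # grows and the loop keeps scrolling.
        height = driver.execute_script('return document.body.scrollHeight')
```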
28 changes: 28 additions & 0 deletions 6.爬虫项目源码/11.百家号/README.md

This project does two main things:

1. Collects a large pool of Baijiahao (百家号) author IDs by simulating user searches on Baidu for site-targeted keywords.

2. Uses those author IDs to fetch millions of Baijiahao article URLs, together with their titles, view counts, tags, and so on.


Step 1:

Run this in the get_id directory.
Make sure keylist.txt is in the same directory, then run:

python baijiahao.py

This produces a urllist.txt file in the same directory containing the author IDs harvested from Baidu search; the longer it runs, the more IDs it collects. A rough sketch of this step is shown below.
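baijiahao.py itself is not part of this commit, so the following is only a minimal sketch of the idea under stated assumptions: query Baidu for each keyword in keylist.txt and pull Baijiahao author IDs out of the result HTML. The app_id link pattern and the regex are assumptions for illustration; the real script's extraction logic may differ.

```python
# -*- coding: utf-8 -*-
# Sketch of the get_id step: search Baidu for each keyword and
# harvest Baijiahao author ids from the result pages.
import re
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

def search_ids(keyword, pages=3):
    ids = set()
    for page in range(pages):
        resp = requests.get('https://www.baidu.com/s',
                            params={'wd': keyword, 'pn': page * 10},
                            headers=HEADERS, timeout=10)
        # Assumed pattern: author pages linked as
        # baijiahao.baidu.com/u?app_id=<digits>
        ids.update(re.findall(r'baijiahao\.baidu\.com/u\?app_id=(\d+)',
                              resp.text))
    return ids

if __name__ == '__main__':
    found = set()
    with open('keylist.txt') as f:
        for line in f:
            keyword = line.strip()
            if keyword:
                found |= search_ids(keyword)
    with open('urllist.txt', 'w') as out:
        for app_id in sorted(found):
            out.write(app_id + '\n')
```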



Step 2:

Run this in the id_to_excel directory.
Make sure urllist.txt is in the same directory (if urllist.txt came from Step 1, you must first extract the bare IDs from it; mind the format), then run:

python spider_list_to_excel.py

Processing takes half an hour or more, depending on how many IDs there are.
The collected URLs are written to an Excel file named by date, e.g. 2017_2_6.xlsx; a sketch of the export step follows.
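The crawling half of spider_list_to_excel is likewise not shown in this commit, so the sketch below covers only the export shape: read IDs from urllist.txt, fetch each author's article metadata (fetch_articles is a hypothetical placeholder for the real crawler), and write one row per article into a date-named .xlsx with openpyxl.

```python
# -*- coding: utf-8 -*-
# Sketch of the id_to_excel step. fetch_articles() is a placeholder:
# the real spider would request each author's article list and parse
# out (url, title, views, tags) tuples.
import time
from openpyxl import Workbook

def fetch_articles(app_id):
    # Hypothetical: return a list of (url, title, views, tags) rows
    # for one author id.
    return []

def export(id_file='urllist.txt'):
    wb = Workbook()
    ws = wb.active
    ws.append(['url', 'title', 'views', 'tags'])  # header row
    with open(id_file) as f:
        for line in f:
            app_id = line.strip()
            if app_id:
                for row in fetch_articles(app_id):
                    ws.append(list(row))
    name = time.strftime('%Y_%m_%d') + '.xlsx'  # e.g. 2017_02_06.xlsx
    wb.save(name)
    return name

if __name__ == '__main__':
    print(export())
```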
