diff --git "a/10.selenium/\346\213\215\346\213\215\350\264\267/selenium_so_phamtomjs.py" "b/10.selenium/\346\213\215\346\213\215\350\264\267/selenium_so_phamtomjs.py" index 72473039..c564e31b 100644 --- "a/10.selenium/\346\213\215\346\213\215\350\264\267/selenium_so_phamtomjs.py" +++ "b/10.selenium/\346\213\215\346\213\215\350\264\267/selenium_so_phamtomjs.py" @@ -10,32 +10,35 @@ # 功能:拍拍贷页面截图 #------------------------------------------------------------------------- +import sys +import time + from selenium import webdriver -from selenium.webdriver.common.keys import Keys -import time,sys from selenium.webdriver.common.desired_capabilities import DesiredCapabilities +from selenium.webdriver.common.keys import Keys + # 中文编码设置 reload(sys) sys.setdefaultencoding('utf-8') Type = sys.getfilesystemencoding() -#加载内核 +# 加载内核 #driver = webdriver.PhantomJS() #driver = webdriver.Chrome() -print 'begin',time.ctime() +print 'begin', time.ctime() dcap = dict(DesiredCapabilities.PHANTOMJS) -dcap["phantomjs.page.settings.userAgent"]=( -"Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36" +dcap["phantomjs.page.settings.userAgent"] = ( + "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36" ) driver = webdriver.PhantomJS(desired_capabilities=dcap) -#发起请求≈ +# 发起请求≈ for i in range(20): #id = 31780000-i - id = 3000029+i - now_url = 'http://invest.ppdai.com/loan/info?id='+str(id) + id = 3000029 + i + now_url = 'http://invest.ppdai.com/loan/info?id=' + str(id) driver.get(now_url) js1 = 'return document.body.scrollHeight' @@ -44,30 +47,30 @@ num = 300 max_num = driver.execute_script(js1) add_num = max_num / 20 - while(max_num > num ): + while(max_num > num): num += add_num - js2 = 'window.scrollTo(0, '+str(num)+')' + js2 = 'window.scrollTo(0, ' + str(num) + ')' driver.execute_script(js2) time.sleep(0.2) max_num = 
driver.execute_script(js1) #add_num = max_num / 20 - print num,'/',max_num - time.sleep(4)#主要等待时间延迟可设置 + print num, '/', max_num + time.sleep(4) # 主要等待时间延迟可设置 - name = str(id)+'.png' + name = str(id) + '.png' driver.save_screenshot(name) print name -#获取用户名框并输入 +# 获取用户名框并输入 # elem = driver.find_element_by_xpath('//*[@id="email"]') # elem.send_keys("****") -#获取密码框并输入 +# 获取密码框并输入 # elem = driver.find_element_by_xpath('//*[@id="password"]') # elem.send_keys("****") -#通过回车键进行登录 -#elem.send_keys(Keys.RETURN) +# 通过回车键进行登录 +# elem.send_keys(Keys.RETURN) # 通过id选择到登录键 # driver.find_element_by_id('submit-button').click() @@ -75,22 +78,22 @@ # time.sleep(2) -#保存页面截图和源码 +# 保存页面截图和源码 #name = '~/so_img/'+time.ctime().replace(' ','-')+'.png' # name = time.ctime().replace(' ','-')+'.png' #name_html = "~/so_img/"+time.ctime().replace(' ','-')+'.html' # driver.save_screenshot(name) #f = open(name_html.encode('utf-8'),'w') -#f.write(driver.page_source) -#f.close() +# f.write(driver.page_source) +# f.close() -#print driver.page_source +# print driver.page_source # time.sleep(5) # print 'end',time.ctime() driver.quit() -#elem.clear() -#time.sleep(10) +# elem.clear() +# time.sleep(10) driver.close() diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/README.md" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/README.md" new file mode 100644 index 00000000..f08873ac --- /dev/null +++ "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/README.md" @@ -0,0 +1,28 @@ + +本项目主要内容: + +1:通过模拟用户在百度搜索定站关键词来搜集足够多的百家作者id + +2:通过百度作者id获取百家号数以百万的文章URL及其标题阅读量标签等 + + +操作1: + +get_id 该目录下运行 +请确保keylist.txt在同一个目录下 + +python baijiahao.py + +便在同目录下产生urllist.txt文件,,便是通过百度搜索得出的作者id +运行时间越久数量越多。 + + + +操作2: +在id_to_excel目录下运行 +请确保urllist.txt在同一个目录下(如果是从上部代码获取的urllist需要单独抽取出其id,请看好格式) + +python 
spider_list_to_excel +之后运算以及耗时半小时以上,根据其id的数量, + +将获得到的url会例如2017_2_6.xslx diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/text/urllist4.txt" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/get_id/appid.txt" similarity index 100% rename from "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/text/urllist4.txt" rename to "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/get_id/appid.txt" diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao_cw/baijiahao.py" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/get_id/baijiahao.py" similarity index 88% rename from "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao_cw/baijiahao.py" rename to "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/get_id/baijiahao.py" index ca33b7cf..8890b82f 100644 --- "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao_cw/baijiahao.py" +++ "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/get_id/baijiahao.py" @@ -1,142 +1,159 @@ -#coding:utf-8 -'''本模块是为了解决获取百家号url并且从这个url里面获取我们想要的新闻''' -import re -import time - -import bs4 -import requests -from selenium import webdriver - - -class sobaidu(): - '''sobaidu类实现通过百度搜索获取真实的url并且把url写入数据库''' - - def __init__(self): - self.KEYFILENAME = "keylist.txt" - self.URLFILENAME = "urllist.txt" - self.KEYLIST = set() - self.URLLIST = 
set() - self.URLFILE = open(self.URLFILENAME, 'w') - - def _readkey(self): - '''读取百度搜索所需要的所有关键词''' - with open(self.KEYFILENAME) as keyklistfile: - for i in keyklistfile.readlines(): - self.KEYLIST.add(i) - def _changeurl(self, url): - '''百度搜索结果url转换为真实的url''' - try: - req = requests.get(url+'&wd=') - # time.sleep(1) - # print(req.text) - regx = r'http[s]*://baijiahao.baidu.com/[\S]*id=[0-9]*' - pattern = re.compile(regx) - match = re.findall(pattern, req.text) - return match[0] - except Exception as e: - print(e) - - def _writetomysql(self): - '''将真实url写入数据库''' - pass - - def _writetofile(self,url): - self.URLFILE.write(url) - self.URLFILE.write('\n') - - def sobaidu(self): - '''调用以上函数解决我们的问题''' - # browser = webdriver.Chrome() - browser = webdriver.PhantomJS() - num = 0 - for key in self.KEYLIST: - ''''doc''' - num += 1 - now_num = 0 - browser.implicitly_wait(30) - browser.get('https://www.baidu.com/s?wd=site:(baijiahao.baidu.com)' + key) - while True: - if now_num == 1: - try: - browser.find_element_by_xpath('//*[@id="page"]/a[10]').click() - time.sleep(2) - except Exception as e: - print(e) - print("有问题") - break - now_num += 1 - print(now_num) - source = browser.page_source - soup = bs4.BeautifulSoup(source, 'lxml') - print('next_page') - for i in soup.findAll(class_='result c-container '): - url = i.find(class_='t').find('a').get('href') - # print(url) - # self.URLLIST.add(self._changeurl(url)) - self._writetofile(self._changeurl(url)) - time.sleep(1) - if now_num > 1: - try: - browser.find_element_by_xpath('//*[@id="page"]/a[11]').click() - time.sleep(1) - except: - print('not find next_button may be for the page end!!!') - break - -class getappid: - def __init__(self): - self.URLFILENAME = "urllist.txt" - self.APPIDLIST = "appid.txt" - self.URLLIST = set() - self.APPIDFILE = open(self.APPIDLIST, 'w') - - def _readurl(self): - '''读取新闻页的url''' - with open(self.URLFILENAME) as urllistfile: - for i in urllistfile.readlines(): - self.URLLIST.add(i) - def 
_writeappid(self, appid): - self.APPIDFILE.write(appid) - self.APPIDFILE.write('\n') - print("写入成功") - def getid(self): - # browser = webdriver.PhantomJS() - browser = webdriver.Chrome() - browser.implicitly_wait(10) - # browser.set_script_timeout(10) - # browser.set_page_load_timeout(10) - for url in self.URLLIST: - browser.get(url) - regx = r'http[s]*://baijiahao.baidu.com/u[\S]*id=[0-9]*' - pattern = re.compile(regx) - match = re.findall(pattern, browser.page_source) - time.sleep(1) - try: - print(match[0]) - self._writeappid(match[0]) - except Exception as e: - print('匹配失败') -def main(): - dsfsd = sobaidu() - # strings = dsfsd._changeurl('https://www.baidu.com/link?url=w8wWEQMyVf0cD3TsKcn_pTQZ92cIqLqxVZKWFtT4rYJcESE_qfhKlPJg5B7OM2mXhZoSM1H0ogmCIgi4G2EkP_&wd=&eqid=aa2c3db90000bf4c0000000458831761') - # print(strings) - # # ###################### 奇怪的分割线 ############### - dsfsd._readkey() - print(len(dsfsd.KEYLIST)) - ######################### 有点傻的分割线 ############# - dsfsd.sobaidu() - # print(len(dsfsd.URLLIST)) - # for i in dsfsd.URLLIST: - # print(i) - # ####################### 电脑有点卡的分割线 ########### - dsfsd.URLFILE.close() -def getid(): - dsfsd = getappid() - dsfsd._readurl() - print(len(dsfsd.URLLIST)) - dsfsd.getid() - dsfsd.APPIDFILE.close() - -if __name__ == '__main__': - # main() - # getid() +# coding:utf-8 +'''本模块是为了解决获取百家号url并且从这个url里面获取我们想要的新闻''' +import re +import time + +import bs4 +import requests +from selenium import webdriver + + +class sobaidu(): + '''sobaidu类实现通过百度搜索获取真实的url并且把url写入数据库''' + + def __init__(self): + self.KEYFILENAME = "keylist.txt" + self.URLFILENAME = "urllist.txt" + self.KEYLIST = set() + self.URLLIST = set() + self.URLFILE = open(self.URLFILENAME, 'w') + + def _readkey(self): + '''读取百度搜索所需要的所有关键词''' + with open(self.KEYFILENAME) as keyklistfile: + for i in keyklistfile.readlines(): + self.KEYLIST.add(i) + + def _changeurl(self, url): + '''百度搜索结果url转换为真实的url''' + try: + req = requests.get(url + '&wd=') + # time.sleep(1) + # 
print(req.text) + regx = r'http[s]*://baijiahao.baidu.com/[\S]*id=[0-9]*' + pattern = re.compile(regx) + match = re.findall(pattern, req.text) + return match[0] + except Exception as e: + print(e) + + def _writetomysql(self): + '''将真实url写入数据库''' + pass + + def _writetofile(self, url): + self.URLFILE.write(url) + self.URLFILE.write('\n') + + def sobaidu(self): + '''调用以上函数解决我们的问题''' + # browser = webdriver.Chrome() + browser = webdriver.PhantomJS() + num = 0 + for key in self.KEYLIST: + ''''doc''' + num += 1 + now_num = 0 + browser.implicitly_wait(30) + browser.get( + 'https://www.baidu.com/s?wd=site:(baijiahao.baidu.com)' + key) + while True: + if now_num == 1: + try: + browser.find_element_by_xpath( + '//*[@id="page"]/a[10]').click() + time.sleep(2) + except Exception as e: + print(e) + print("有问题") + break + now_num += 1 + print(now_num) + source = browser.page_source + soup = bs4.BeautifulSoup(source, 'lxml') + print('next_page') + for i in soup.findAll(class_='result c-container '): + url = i.find(class_='t').find('a').get('href') + # print(url) + # self.URLLIST.add(self._changeurl(url)) + self._writetofile(self._changeurl(url)) + time.sleep(1) + if now_num > 1: + try: + browser.find_element_by_xpath( + '//*[@id="page"]/a[11]').click() + time.sleep(1) + except: + print('not find next_button may be for the page end!!!') + break + + +class getappid: + + def __init__(self): + self.URLFILENAME = "urllist.txt" + self.APPIDLIST = "appid.txt" + self.URLLIST = set() + self.APPIDFILE = open(self.APPIDLIST, 'w') + + def _readurl(self): + '''读取新闻页的url''' + with open(self.URLFILENAME) as urllistfile: + for i in urllistfile.readlines(): + self.URLLIST.add(i) + + def _writeappid(self, appid): + self.APPIDFILE.write(appid) + self.APPIDFILE.write('\n') + print("写入成功") + + def getid(self): + # browser = webdriver.PhantomJS() + browser = webdriver.Chrome() + browser.implicitly_wait(10) + # browser.set_script_timeout(10) + # browser.set_page_load_timeout(10) + for url in 
self.URLLIST: + browser.get(url) + regx = r'http[s]*://baijiahao.baidu.com/u[\S]*id=[0-9]*' + pattern = re.compile(regx) + match = re.findall(pattern, browser.page_source) + time.sleep(1) + try: + print(match[0]) + self._writeappid(match[0]) + except Exception as e: + print('匹配失败') + + +def main(): + dsfsd = sobaidu() + # strings = dsfsd._changeurl('https://www.baidu.com/link?url=w8wWEQMyVf0cD3TsKcn_pTQZ92cIqLqxVZKWFtT4rYJcESE_qfhKlPJg5B7OM2mXhZoSM1H0ogmCIgi4G2EkP_&wd=&eqid=aa2c3db90000bf4c0000000458831761') + # print(strings) + # # ###################### 奇怪的分割线 ############### + dsfsd._readkey() + print(len(dsfsd.KEYLIST)) + ######################### 有点傻的分割线 ############# + dsfsd.sobaidu() + # print(len(dsfsd.URLLIST)) + # for i in dsfsd.URLLIST: + # print(i) + # ####################### 电脑有点卡的分割线 ########### + dsfsd.URLFILE.close() + + +def getid(): + dsfsd = getappid() + dsfsd._readurl() + print(len(dsfsd.URLLIST)) + dsfsd.getid() + dsfsd.APPIDFILE.close() + +if __name__ == '__main__': + #取消注释来执行,使用getid获取作者id 使用main 来模拟访问,将数据存入urllist中 + + + #main() + + + getid() diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao_cw/get_url.py" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/get_id/get_url.py" similarity index 99% rename from "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao_cw/get_url.py" rename to "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/get_id/get_url.py" index 2aff846e..4dd1ca51 100644 --- "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao_cw/get_url.py" +++ 
"b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/get_id/get_url.py" @@ -1,3 +1,4 @@ +#coding:utf-8 import re import time @@ -79,7 +80,7 @@ def sobaidu(self): time.sleep(1) def main(): dsfsd = sobaidu() - # strings = dsfsd._changeurl('https://www.baidu.com/link?url=w8wWEQMyVf0cD3TsKcn_pTQZ92cIqLqxVZKWFtT4rYJcESE_qfhKlPJg5B7OM2mXhZoSM1H0ogmCIgi4G2EkP_&wd=&eqid=aa2c3db90000bf4c0000000458831761') + # strings = dsfsd._changeurl('https://www.baidu.com/link?url=w8wWEQMyVf0cD3TsKcn_pTQZ92cIqLqxVZKWFtT4rYJcESE_qfhKlPJg5B7OM2mXhZoSM1H0ogmCIgi4G2EkP_&wd=&eqid=aa2c3db90000bf4c0000000458831761') # print(strings) # # ###################### 奇怪的分割线 ############### dsfsd._readkey() @@ -92,4 +93,4 @@ def main(): dsfsd.URLFILE.close() if __name__ == '__main__': main() - # getid() \ No newline at end of file + # getid() diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/get_id/ghostdriver.log" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/get_id/ghostdriver.log" new file mode 100644 index 00000000..b0667504 --- /dev/null +++ "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/get_id/ghostdriver.log" @@ -0,0 +1,23 @@ +[INFO - 2017-03-06T07:47:16.702Z] GhostDriver - Main - running on port 62963 +[INFO - 2017-03-06T07:47:17.337Z] Session [1ef08b20-0241-11e7-a6b1-ab15bc28129e] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/538.1 (KHTML, like Gecko) PhantomJS/2.1.1 Safari/538.1","webSecurityEnabled":true} +[INFO - 2017-03-06T07:47:17.340Z] Session [1ef08b20-0241-11e7-a6b1-ab15bc28129e] - 
page.customHeaders: - {} +[INFO - 2017-03-06T07:47:17.340Z] Session [1ef08b20-0241-11e7-a6b1-ab15bc28129e] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.1.1","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"mac-unknown-64bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"}} +[INFO - 2017-03-06T07:47:17.340Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: 1ef08b20-0241-11e7-a6b1-ab15bc28129e +[ERROR - 2017-03-06T07:47:17.912Z] Session [1ef08b20-0241-11e7-a6b1-ab15bc28129e] - page.onError - msg: ReferenceError: Can't find variable: swfobject + + phantomjs://platform/console++.js:263 in error +[ERROR - 2017-03-06T07:47:17.912Z] Session [1ef08b20-0241-11e7-a6b1-ab15bc28129e] - page.onError - stack: + support (https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/static/protocol/https/voice/js/voice_1e62c0f.js:23) + (anonymous function) (https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/static/protocol/https/global/js/all_async_search_ad79fcd.js:386) + i (https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/static/protocol/https/global/js/all_async_search_ad79fcd.js:110) + b (https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/static/protocol/https/global/js/all_async_search_ad79fcd.js:110) + (anonymous function) (https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/static/protocol/https/global/js/all_async_search_ad79fcd.js:114) + i (https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/static/protocol/https/global/js/all_async_search_ad79fcd.js:110) + g (https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/static/protocol/https/global/js/all_async_search_ad79fcd.js:109) + n 
(https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/static/protocol/https/global/js/all_async_search_ad79fcd.js:108) + p (https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/static/protocol/https/global/js/all_async_search_ad79fcd.js:109) + d (https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/static/protocol/https/global/js/all_async_search_ad79fcd.js:107) + o (https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/static/protocol/https/global/js/all_async_search_ad79fcd.js:110) + n (https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/static/protocol/https/global/js/all_async_search_ad79fcd.js:117) + + phantomjs://platform/console++.js:263 in error diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao_cw/keylist.txt" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/get_id/keylist.txt" similarity index 100% rename from "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao_cw/keylist.txt" rename to "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/get_id/keylist.txt" diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/get_id/urllist.txt" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/get_id/urllist.txt" new file mode 100644 index 00000000..59322642 --- /dev/null +++ "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/get_id/urllist.txt" @@ -0,0 +1,247 @@ +https://baijiahao.baidu.com/s?id=1553948881259842 +https://baijiahao.baidu.com/s?id=1551889785289213 +https://baijiahao.baidu.com/s?id=1554302586761947 
+https://baijiahao.baidu.com/s?id=1551405511794484 +https://baijiahao.baidu.com/s?id=1548133560956542 +https://baijiahao.baidu.com/s?id=1550960209792859 +https://baijiahao.baidu.com/s?id=1559293598258649 +https://baijiahao.baidu.com/s?id=1549489020618000 +https://baijiahao.baidu.com/s?id=1559636467989300 +https://baijiahao.baidu.com/s?id=1553692119731645 +https://baijiahao.baidu.com/s?id=1552299569225368 +https://baijiahao.baidu.com/s?id=1552035921550434 +https://baijiahao.baidu.com/s?id=1549448598122488 +https://baijiahao.baidu.com/s?id=1551778742308601 +https://baijiahao.baidu.com/s?id=1550022235918081 +https://baijiahao.baidu.com/s?id=1551025218996433 +https://baijiahao.baidu.com/s?id=1554399858624556 +https://baijiahao.baidu.com/s?id=1553582925209189 +http://baijiahao.baidu.com/s?id=1556115293931135 +https://baijiahao.baidu.com/s?id=1559726755889568 +http://baijiahao.baidu.com/s?id=1553138279712096 +https://baijiahao.baidu.com/s?id=1554476899861959 +https://baijiahao.baidu.com/s?id=1554377024115324 +https://baijiahao.baidu.com/s?id=1552675738850209 +http://baijiahao.baidu.com/s?id=1554581149767353 +https://baijiahao.baidu.com/s?id=1559905732783043 +https://baijiahao.baidu.com/s?id=1549296295514136 +https://baijiahao.baidu.com/s?id=1551687915129683 +https://baijiahao.baidu.com/s?id=1551631696502053 +https://baijiahao.baidu.com/s?id=1553788119398905 +https://baijiahao.baidu.com/s?id=1554481414859790 +https://baijiahao.baidu.com/s?id=1543804324418243 +https://baijiahao.baidu.com/s?id=1550956312758747 +https://baijiahao.baidu.com/s?id=1552950903243344 +https://baijiahao.baidu.com/s?id=1553295798255160 +https://baijiahao.baidu.com/s?id=1552292596126106 +https://baijiahao.baidu.com/s?id=1554113590635015 +https://baijiahao.baidu.com/s?id=1558383580216728 +https://baijiahao.baidu.com/s?id=1552482271877607 +https://baijiahao.baidu.com/s?id=1560373974747510 +https://baijiahao.baidu.com/s?id=1550301256745167 +https://baijiahao.baidu.com/s?id=1557846498221776 
+https://baijiahao.baidu.com/s?id=1550392115650114 +https://baijiahao.baidu.com/s?id=1554294662325986 +https://baijiahao.baidu.com/s?id=1552686557875185 +https://baijiahao.baidu.com/s?id=1553677736863522 +https://baijiahao.baidu.com/s?id=1554220373406171 +https://baijiahao.baidu.com/s?id=1547799065116942 +https://baijiahao.baidu.com/s?id=1552130525812327 +https://baijiahao.baidu.com/s?id=1552435531359021 +https://baijiahao.baidu.com/s?id=1554016995768231 +https://baijiahao.baidu.com/s?id=1553458880737743 +https://baijiahao.baidu.com/s?id=1551154752639943 +https://baijiahao.baidu.com/s?id=1554112619931042 +https://baijiahao.baidu.com/s?id=1550781072950488 +https://baijiahao.baidu.com/s?id=1547914641433119 +http://baijiahao.baidu.com/s?id=1552853299126870 +https://baijiahao.baidu.com/s?id=1551249689407182 +https://baijiahao.baidu.com/s?id=1556685276751215 +https://baijiahao.baidu.com/s?id=1557406109122709 +https://baijiahao.baidu.com/s?id=1547592474916296 +http://baijiahao.baidu.com/s?id=1555283000901677 +https://baijiahao.baidu.com/s?id=1557048762522637 +https://baijiahao.baidu.com/s?id=1559557445363712 +https://baijiahao.baidu.com/s?id=1555668101245107 +https://baijiahao.baidu.com/s?id=1551961276761870 +https://baijiahao.baidu.com/s?id=1554480546291584 +https://baijiahao.baidu.com/s?id=1554243275760501 +https://baijiahao.baidu.com/s?id=1554462874848744 +https://baijiahao.baidu.com/s?id=1551793725444463 +https://baijiahao.baidu.com/s?id=1552944173283285 +https://baijiahao.baidu.com/s?id=1553321085205696 +https://baijiahao.baidu.com/s?id=1559837939915476 +https://baijiahao.baidu.com/s?id=1558486150761293 +https://baijiahao.baidu.com/s?id=1553042193684822 +http://baijiahao.baidu.com/s?id=1555225618710730 +http://baijiahao.baidu.com/s?id=1554391149101170 +https://baijiahao.baidu.com/s?id=1554375709608627 +http://baijiahao.baidu.com/s?id=1554502109010674 +https://baijiahao.baidu.com/s?id=1549959061424064 +http://baijiahao.baidu.com/s?id=1555653481286149 
+https://baijiahao.baidu.com/s?id=1556852312073631 +https://baijiahao.baidu.com/s?id=1558315992874241 +http://baijiahao.baidu.com/s?id=1554590362427118 +https://baijiahao.baidu.com/s?id=1553159522709133 +http://baijiahao.baidu.com/s?id=1555061695366431 +https://baijiahao.baidu.com/s?id=1551310937911678 +https://baijiahao.baidu.com/s?id=1555416987110372 +https://baijiahao.baidu.com/s?id=1557409554825459 +https://baijiahao.baidu.com/s?id=1552505612531510 +https://baijiahao.baidu.com/s?id=1553590670559445 +https://baijiahao.baidu.com/s?id=1552408952025195 +https://baijiahao.baidu.com/s?id=1559838189686814 +https://baijiahao.baidu.com/s?id=1553200547112615 +https://baijiahao.baidu.com/s?id=1551176167606010 +https://baijiahao.baidu.com/s?id=1547336359187143 +https://baijiahao.baidu.com/s?id=1547926784276788 +https://baijiahao.baidu.com/s?id=1551751732361650 +https://baijiahao.baidu.com/s?id=1554027525123182 +https://baijiahao.baidu.com/s?id=1551406160713932 +https://baijiahao.baidu.com/s?id=1551679877930159 +https://baijiahao.baidu.com/s?id=1556338606467770 +https://baijiahao.baidu.com/s?id=1552971658214571 +https://baijiahao.baidu.com/s?id=1554390573818151 +https://baijiahao.baidu.com/s?id=1558856458695214 +https://baijiahao.baidu.com/s?id=1550845239965894 +https://baijiahao.baidu.com/s?id=1557239622823895 +https://baijiahao.baidu.com/s?id=1560976334032375 +http://baijiahao.baidu.com/s?id=1555946147958586 +http://baijiahao.baidu.com/s?id=1553297801817562 +https://baijiahao.baidu.com/s?id=1553073519059198 +https://baijiahao.baidu.com/s?id=1560191755449322 +https://baijiahao.baidu.com/s?id=1551958243470725 +https://baijiahao.baidu.com/s?id=1560304053284574 +http://baijiahao.baidu.com/s?id=1554747441876419 +http://baijiahao.baidu.com/s?id=1555681320298831 +http://baijiahao.baidu.com/s?id=1556005965945927 +https://baijiahao.baidu.com/s?id=1553406643546315 +https://baijiahao.baidu.com/s?id=1558474679384768 +https://baijiahao.baidu.com/s?id=1560009522640251 
+https://baijiahao.baidu.com/s?id=1553295718918451 +https://baijiahao.baidu.com/s?id=1559564972303422 +https://baijiahao.baidu.com/s?id=1560221418139407 +https://baijiahao.baidu.com/s?id=1551658439358606 +https://baijiahao.baidu.com/s?id=1552524555630105 +https://baijiahao.baidu.com/s?id=1557219571037910 +https://baijiahao.baidu.com/s?id=1556754990774189 +https://baijiahao.baidu.com/s?id=1557189759152996 +http://baijiahao.baidu.com/s?id=1555781477215911 +http://baijiahao.baidu.com/s?id=1552322433666869 +https://baijiahao.baidu.com/s?id=1559653971491914 +https://baijiahao.baidu.com/s?id=1553941394130673 +https://baijiahao.baidu.com/s?id=1550963211410876 +https://baijiahao.baidu.com/s?id=1553245577573193 +https://baijiahao.baidu.com/s?id=1560113027889704 +http://baijiahao.baidu.com/s?id=1555304232515699 +https://baijiahao.baidu.com/s?id=1560221857005084 +https://baijiahao.baidu.com/s?id=1556662373178223 +https://baijiahao.baidu.com/s?id=1558926394840072 +https://baijiahao.baidu.com/s?id=1552940591965066 +https://baijiahao.baidu.com/s?id=1560024413182745 +https://baijiahao.baidu.com/s?id=1553931829012362 +http://baijiahao.baidu.com/s?id=1553288927113126 +https://baijiahao.baidu.com/s?id=1552116548015775 +https://baijiahao.baidu.com/s?id=1555853765872965 +https://baijiahao.baidu.com/s?id=1551761513096597 +https://baijiahao.baidu.com/s?id=1550846150407953 +https://baijiahao.baidu.com/s?id=1553761615576454 +https://baijiahao.baidu.com/s?id=1559978872487863 +https://baijiahao.baidu.com/s?id=1560213599943085 +http://baijiahao.baidu.com/s?id=1555124791409902 +http://baijiahao.baidu.com/s?id=1555216168389151 +https://baijiahao.baidu.com/s?id=1553790422980163 +http://baijiahao.baidu.com/s?id=1554935312291132 +http://baijiahao.baidu.com/s?id=1545869116055635 +http://baijiahao.baidu.com/s?id=1554476162887758 +https://baijiahao.baidu.com/s?id=1554308880640788 +https://baijiahao.baidu.com/s?id=1556300720491189 +http://baijiahao.baidu.com/s?id=1555240182945408 
+https://baijiahao.baidu.com/s?id=1551958548508981 +https://baijiahao.baidu.com/s?id=1559748811631621 +https://baijiahao.baidu.com/s?id=1559547309629671 +http://baijiahao.baidu.com/s?id=1555417249283856 +https://baijiahao.baidu.com/s?id=1553114739408262 +https://baijiahao.baidu.com/s?id=1558582912082768 +https://baijiahao.baidu.com/s?id=1553842127701175 +https://baijiahao.baidu.com/s?id=1557665947248505 +http://baijiahao.baidu.com/s?id=1554829343956227 +https://baijiahao.baidu.com/s?id=1560266372181655 +http://baijiahao.baidu.com/s?id=1555505956261930 +https://baijiahao.baidu.com/s?id=1556235046647602 +https://baijiahao.baidu.com/s?id=1556480508656644 +http://baijiahao.baidu.com/s?id=1555826195690741 +https://baijiahao.baidu.com/s?id=1559736178016272 +https://baijiahao.baidu.com/s?id=1560111750721851 +https://baijiahao.baidu.com/s?id=1552932335100885 +https://baijiahao.baidu.com/s?id=1557048893107757 +https://baijiahao.baidu.com/s?id=1552665675185651 +https://baijiahao.baidu.com/s?id=1557498871320153 +https://baijiahao.baidu.com/s?id=1559463187948841 +http://baijiahao.baidu.com/s?id=1551296694610256 +https://baijiahao.baidu.com/s?id=1559927494292796 +https://baijiahao.baidu.com/s?id=1558542779852851 +http://baijiahao.baidu.com/s?id=1555207085231012 +http://baijiahao.baidu.com/s?id=1555952498375414 +https://baijiahao.baidu.com/s?id=1553066911085745 +http://baijiahao.baidu.com/s?id=1555771870526478 +https://baijiahao.baidu.com/s?id=1558653191979339 +https://baijiahao.baidu.com/s?id=1559038180625726 +https://baijiahao.baidu.com/s?id=1558728936343437 +https://baijiahao.baidu.com/s?id=1559414862847909 +https://baijiahao.baidu.com/s?id=1552852778174880 +https://baijiahao.baidu.com/s?id=1551125007923084 +http://baijiahao.baidu.com/s?id=1555678247645064 +https://baijiahao.baidu.com/s?id=1551853119842961 +https://baijiahao.baidu.com/s?id=1554472503149061 +https://baijiahao.baidu.com/s?id=1559295191263367 +http://baijiahao.baidu.com/s?id=1555743092426512 
+https://baijiahao.baidu.com/s?id=1560014112547253 +https://baijiahao.baidu.com/s?id=1560544262302830 +https://baijiahao.baidu.com/s?id=1560109929266492 +https://baijiahao.baidu.com/s?id=1559924058707695 +https://baijiahao.baidu.com/s?id=1552840891086083 +https://baijiahao.baidu.com/s?id=1559410280363220 +https://baijiahao.baidu.com/s?id=1556340139619230 +https://baijiahao.baidu.com/s?id=1551403054048211 +https://baijiahao.baidu.com/s?id=1552862643592602 +http://baijiahao.baidu.com/s?id=1553228417454087 +https://baijiahao.baidu.com/s?id=1550336326354309 +https://baijiahao.baidu.com/s?id=1559648521196403 +https://baijiahao.baidu.com/s?id=1557400538972077 +http://baijiahao.baidu.com/s?id=1554563531422363 +https://baijiahao.baidu.com/s?id=1552865025775336 +https://baijiahao.baidu.com/s?id=1559909723926540 +http://baijiahao.baidu.com/s?id=1556024430826899 +https://baijiahao.baidu.com/s?id=1559656744763096 +https://baijiahao.baidu.com/s?id=1556224297125304 +http://baijiahao.baidu.com/s?id=1555302369466322 +https://baijiahao.baidu.com/s?id=1552581453634019 +https://baijiahao.baidu.com/s?id=1560682850545016 +http://baijiahao.baidu.com/s?id=1555586159691483 +https://baijiahao.baidu.com/s?id=1560292541310740 +http://baijiahao.baidu.com/s?id=1548004540581820 +https://baijiahao.baidu.com/s?id=1559639566503825 +https://baijiahao.baidu.com/s?id=1558573432550108 +https://baijiahao.baidu.com/s?id=1559041141660053 +https://baijiahao.baidu.com/s?id=1552222353266229 +http://baijiahao.baidu.com/s?id=1555034910453673 +https://baijiahao.baidu.com/s?id=1561087550456305 +https://baijiahao.baidu.com/s?id=1556391450339176 +https://baijiahao.baidu.com/s?id=1559382920429574 +https://baijiahao.baidu.com/s?id=1558657280273430 +https://baijiahao.baidu.com/s?id=1559269479144875 +https://baijiahao.baidu.com/s?id=1560012366183213 +https://baijiahao.baidu.com/s?id=1554589870907628 +https://baijiahao.baidu.com/s?id=1559774839735725 +http://baijiahao.baidu.com/s?id=1555201523097413 
+http://baijiahao.baidu.com/s?id=1555378088818083 +https://baijiahao.baidu.com/s?id=1560173192520181 +https://baijiahao.baidu.com/s?id=1561016151005631 +https://baijiahao.baidu.com/s?id=1559987317034015 +https://baijiahao.baidu.com/s?id=1553019545852277 +http://baijiahao.baidu.com/s?id=1555557350520694 +https://baijiahao.baidu.com/s?id=1557510332939130 +https://baijiahao.baidu.com/s?id=1556328769485992 +https://baijiahao.baidu.com/s?id=1559535782586097 +http://baijiahao.baidu.com/s?id=15 \ No newline at end of file diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/2017_2_6.xlsx" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/2017_2_6.xlsx" new file mode 100644 index 00000000..6eb268ee Binary files /dev/null and "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/2017_2_6.xlsx" differ diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/fingerDic.txt" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/fingerDic.txt" similarity index 100% rename from "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/fingerDic.txt" rename to "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/fingerDic.txt" diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/get_id_mysql.py" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/get_id_mysql.py" similarity index 98% 
rename from "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/get_id_mysql.py" rename to "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/get_id_mysql.py" index c4943464..f975191b 100644 --- "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/get_id_mysql.py" +++ "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/get_id_mysql.py" @@ -13,7 +13,7 @@ sys.setdefaultencoding('utf-8') Type = sys.getfilesystemencoding() -Table = "cn_proj_landmark_hebei_baidu_copy"#sys.argv[1] +Table = " "#sys.argv[1] THREAD_COUNT = 50 #需要修改 schedule = 0 HOST, USER, PASSWD, DB, PORT = '','','', '', 23306#需要修改 diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/get_id_txt.py" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/get_id_txt.py" similarity index 100% rename from "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/get_id_txt.py" rename to "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/get_id_txt.py" diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/id_hebin.py" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/id_hebin.py" similarity index 100% rename from "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/id_hebin.py" 
rename to "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/id_hebin.py" diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/linkurl_to_id.py" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/linkurl_to_id.py" similarity index 100% rename from "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/linkurl_to_id.py" rename to "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/linkurl_to_id.py" diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/spider_list_to_excel.py" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/spider_list_to_excel.py" similarity index 100% rename from "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/spider_list_to_excel.py" rename to "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/spider_list_to_excel.py" diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/spider_list_to_mysql.py" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/spider_list_to_mysql.py" similarity index 100% rename from "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/spider_list_to_mysql.py" rename to 
"6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/spider_list_to_mysql.py" diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/temp.txt" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/temp.txt" similarity index 100% rename from "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/temp.txt" rename to "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/temp.txt" diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/text/urllist.txt" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/text/urllist.txt" similarity index 100% rename from "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/text/urllist.txt" rename to "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/text/urllist.txt" diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/text/urllist0.txt" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/text/urllist0.txt" similarity index 100% rename from "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/text/urllist0.txt" rename to 
"6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/text/urllist0.txt" diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/text/urllist1.txt" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/text/urllist1.txt" similarity index 100% rename from "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/text/urllist1.txt" rename to "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/text/urllist1.txt" diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/text/urllist2.txt" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/text/urllist2.txt" similarity index 100% rename from "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/text/urllist2.txt" rename to "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/text/urllist2.txt" diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/text/urllist3.txt" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/text/urllist3.txt" similarity index 100% rename from "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/text/urllist3.txt" rename to 
"6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/text/urllist3.txt" diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/text/urllist4.txt" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/text/urllist4.txt" new file mode 100644 index 00000000..e69de29b diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/text/urllist5.txt" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/text/urllist5.txt" similarity index 100% rename from "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/text/urllist5.txt" rename to "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/text/urllist5.txt" diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/text/urllist_2_6.txt" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/text/urllist_2_6.txt" similarity index 100% rename from "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/text/urllist_2_6.txt" rename to "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/text/urllist_2_6.txt" diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/time_out_fuc.py" 
"b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/time_out_fuc.py" similarity index 100% rename from "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/time_out_fuc.py" rename to "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/time_out_fuc.py" diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/url_to_id.txt" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/url_to_id.txt" similarity index 100% rename from "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/url_to_id.txt" rename to "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/url_to_id.txt" diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/urllist.txt" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/urllist.txt" new file mode 100644 index 00000000..f572648f --- /dev/null +++ "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/urllist.txt" @@ -0,0 +1,740 @@ +1545088776537192 +1552207487001854 +1547807874609109 +1549495346986617 +1550966674330496 +1552314191804283 +1554887487606255 +1547702403216877 +1539018034261023 +1543148776726403 +1546688762066080 +1543059162087803 +1553587864315739 +1537307971556811 +1537281685766988 +1549147001778778 +1550666241900566 +1550781807142832 +1550935648676002 +1545409628358782 
+1549414470885495 +1546551181745930 +1549982161060389 +1544912354826155 +1553535710373130 +1536928819374018 +1550502864751528 +1547724561467757 +1553570263351311 +1547498864321584 +1546797884739913 +1537893768517506 +1550633884518654 +1550423455574081 +1536854034253373 +1553513073675533 +1552326841535443 +1549227296692637 +1550621250133484 +1550124300139642 +1549400766715529 +1546714003335513 +1550136591966375 +1553664442841385 +1556388838442697 +1553679185083922 +1547626811526514 +1540185833306023 +1553143222515571 +1551029010800996 +1536766846283363 +1550064004809403 +1550982448280366 +1548724680172079 +1544327775176001 +1546875343001090 +1536767660929123 +1538353244545035 +1548678348705829 +1555135120155061 +1539729329615969 +1550893357010643 +1550422552228852 +1550685152515729 +1547539088341458 +1550322351319126 +1553791646260205 +1550422888401952 +1537019693714134 +1549127659920852 +1550954216899132 +1551254115694461 +1545793063806275 +1536767188728184 +1550308732417528 +1551696576379662 +1552596452362052 +1550502634468884 +1551849040511827 +1551968512149452 +1547718919302474 +1551241552058873 +1549608413453462 +1536767244136893 +1550349243966239 +1549248700821221 +1551348436246323 +1551756368849009 +1539729320951885 +1547445453090174 +1549974399926031 +1547533811471853 +1546692950004835 +1553138625979055 +1553704006753326 +1550954879928096 +1538380620640678 +1550491970845566 +1550785432004569 +1551049363505759 +1550953538819079 +1544961716909221 +1540262717593457 +1537186580809239 +1551600255248403 +1538346899224532 +1553488668035049 +1549425642642050 +1550692462386597 +1544424409925706 +1549716515256427 +1550119293694164 +1550966974445682 +1545056089352011 +1552037537111450 +1553586721663261 +1552567904173236 +1550421064156078 +1546704141555131 +1547689623327564 +1548059563216619 +1547863457808675 +1537730547256076 +1547318682468045 +1536766602271830 +1550717762350676 +1548357541036306 +1549855889996565 +1550678812602312 +1543053315812213 +1550039822973338 
+1555649228974353 +1537271617024896 +1550037923678430 +1543055152175959 +1546617041317980 +1553323140534938 +1550331715612554 +1552858791367618 +1537100260684241 +1545165476102108 +1536667817802713 +1549438205889783 +1553674483411789 +1547512493766131 +1536557752918599 +1550580823875064 +1546079337994304 +1550947367462524 +1549777648093793 +1548770610033715 +1549246947235798 +1551861230904499 +1549330394915941 +1553332501952183 +1554196501090422 +1547441410239990 +1553579767501768 +1549255309314335 +1550888272064721 +1538537099131548 +1536769308908672 +1552422950033071 +1549770794136661 +1546865348615564 +1550952332912302 +1551050900410134 +1537091904970662 +1551952822522972 +1547796365734712 +1551793447737795 +1537810316529191 +1548054536106339 +1552070058168126 +1546722082019925 +1550670197189762 +1544802185833247 +1550424455494254 +1552485437217243 +1547769449460204 +1550864297174424 +1536767984069926 +1549125549253842 +1550963995344563 +1549251810468652 +1550580023648296 +1550948034216643 +1549304116771266 +1552412770892024 +1553775563606346 +1552573616394441 +1551852011698018 +1544998992170121 +1546807523837924 +1553677429848495 +1548035824954240 +1538432277235708 +1551597834741699 +1550589562340323 +1551676154986174 +1549875511850741 +1556101497854275 +1537280409466611 +1548604025917379 +1536552663110584 +1546734096835527 +1544810581427418 +1553122471913839 +1536585761489190 +1543162377088773 +1550669664097714 +1550578976365369 +1551689474571936 +1548531701162108 +1546734396379776 +1550979241153631 +1550480785381490 +1536766975198570 +1544738006928691 +1549044760638107 +1548775210444368 +1547686849710159 +1550675287358946 +1549396159936687 +1545093763836095 +1547725484510628 +1546142825547138 +1546950678292863 +1550536971639634 +1550323138177162 +1548404584674859 +1536930113265732 +1536740360109621 +1536576618598106 +1549678081103394 +1553701951475514 +1550512586814929 +1547605598643460 +1550640101657414 +1550230893427616 +1553575504475243 +1538998139071721 
+1544319068027300 +1546685057503986 +1536771423150175 +1550895712157453 +1552228719382070 +1536768279494925 +1550681391078635 +1547147912267972 +1546511083195022 +1540184233737707 +1550521737415308 +1550596863382018 +1550637374548470 +1536795372324334 +1536766603275304 +1551896381450799 +1552042588064296 +1550635176683039 +1550422837992481 +1544331365809334 +1546689452046484 +1552149871140274 +1546713357062187 +1550644243266247 +1551581032549254 +1550501055943223 +1548606198313772 +1550593170932211 +1550874579350664 +1551119592572255 +1552066602027428 +1548154709576340 +1547520595309969 +1549408895157679 +1550390664973783 +1536767647193373 +1545621813156625 +1548073237764225 +1544350405015429 +1550667477268467 +1542169698369484 +1554217848823392 +1550424812016041 +1549779218371166 +1545073488399019 +1547690479477514 +1536767567191845 +1550486247625762 +1537723364395786 +1548723005260846 +1547603302563617 +1550055402649700 +1547178023011074 +1536948720527761 +1551772058528153 +1550325656410411 +1549045725353674 +1552146496695013 +1537023462787936 +1553396370365287 +1550645407351648 +1553050012289054 +1554144315166703 +1551022534421574 +1536771001787027 +1550500233547387 +1551751177327802 +1549761240784194 +1553394385880697 +1539375480640508 +1542535554468118 +1553710796562298 +1550760397738232 +1550254767471836 +1551249985726042 +1552127592091436 +1547055536029179 +1550310947509031 +1537026551788417 +1549323702645285 +1550671674213969 +1551029223052837 +1536767152331793 +1551126324211098 +1546710086948591 +1553242448987999 +1553746602516714 +1550631711571194 +1548778315208141 +1544458606893611 +1553395764486063 +1553049691333770 +1550970614692191 +1550855051045705 +1550395684274259 +1545245450209738 +1537208731185307 +1544445429751588 +1552884252803622 +1553378497384409 +1551124700394431 +1550955474212192 +1539729317928956 +1544250135708156 +1541448117513265 +1550495389666856 +1550954820618985 +1545016020995930 +1538446543142556 +1550942922847592 +1549042240308033 
+1536767157006156 +1550964632287470 +1550506446359368 +1549334108131826 +1550486004191941 +1546857451587647 +1546540081148624 +1549176835396727 +1539102086425324 +1546708545236514 +1547505222521272 +1536771731538301 +1552025480396031 +1549539240204322 +1545380555321595 +1551942929448898 +1546228801560048 +1541451276288784 +1551501358420819 +1550341498451424 +1550231071362628 +1550943122152325 +1545350160450430 +1553749324759854 +1538351705252123 +1541082124120163 +1544796071181239 +1556212170910923 +1550422595081832 +1551145381698447 +1550777462278071 +1547723001533951 +1545450825745800 +1555152250332123 +1536770562158568 +1551512979512869 +1540475648211424 +1554509146360990 +1547793975835024 +1549306394771579 +1549811648906066 +1548050031476304 +1550634341158035 +1536536533032970 +1550957120930743 +1550893611305402 +1537288374333270 +1549714986458975 +1545072045796663 +1553140323530842 +1552155058983674 +1550595406032412 +1550407405868503 +1547589001090978 +1552341312400388 +1542337023282710 +1550421300646443 +1551115880093549 +1550609485500569 +1537989322415140 +1550307449286579 +1546971775416005 +1548874700508553 +1538550172005052 +1553239667767416 +1550779585030377 +1550942358780881 +1552029210316378 +1538983206001493 +1549700161573073 +1544342024043734 +1553699694049366 +1541786967333460 +1536766803360169 +1546533147819606 +1550637501007442 +1537005858812215 +1546694632725333 +1537304734347380 +1552590684446082 +1546145190714318 +1544864356802663 +1550598085425227 +1553257764451183 +1551030492391995 +1551138979126226 +1549870904964009 +1551031350338370 +1538615177036546 +1550506773228798 +1537275495115373 +1545077765003863 +1547231323939419 +1552182684850225 +1551212081297901 +1550689818954724 +1537377175251057 +1545454802402801 +1546701020265506 +1553679153216539 +1548516965742847 +1550412054484818 +1536656843514037 +1550572259465255 +1553575885606757 +1549865144416857 +1553225732202273 +1550637197489238 +1550665739071273 +1537087328041146 +1546086375660508 
+1550057532710727 +1549772693989002 +1538293370120695 +1541822142547668 +1536766242105522 +1550885629035933 +1545710050674727 +1537259011661566 +1550581339016698 +1552415265469637 +1551677666699820 +1548243251467169 +1551503354368168 +1547784176085901 +1553109287051114 +1551071549221913 +1551939813917617 +1551418292329487 +1536562551955247 +1549667253269360 +1549064112412433 +1551846407697212 +1552970837346034 +1550428070402051 +1546681062252268 +1553298323900473 +1551144107434005 +1545232593843272 +1553087990310006 +1547240415299308 +1536766873015493 +1536769342093375 +1550417612014784 +1536773988134914 +1548701289207930 +1547675723415812 +1541384236317174 +1537466691329544 +1545591502265705 +1546617062013314 +1553607191526199 +1547783049464436 +1537349250860428 +1538643152010580 +1537121236932264 +1550878060604075 +1549868594173246 +1552798635098096 +1549711246041599 +1549403621509065 +1550644252408036 +1550602720385000 +1550951436678856 +1552700086504321 +1547794009130186 +1551029782401264 +1550947200485949 +1548065443336729 +1549518252440403 +1550632864502783 +1548133463750508 +1546317177546494 +1550634740899787 +1536766209631977 +1547516271457512 +1549413724815412 +1550962485549659 +1550511842732348 +1551774437698130 +1550938040116451 +1550773447990123 +1555327183556881 +1550430421558555 +1550706725327540 +1551144916723333 +1550321597922849 +1550963114901404 +1543161879248222 +1549785048531744 +1553024180167123 +1551866532811403 +1537817657390820 +1549687110798506 +1538180289459482 +1537378971213583 +1552367997316993 +1552944880922061 +1545867173392074 +1547785068859022 +1550666329166288 +1546789173647460 +1549313401288437 +1553517989546175 +1553786164912012 +1537319367728133 +1551116160916429 +1550878361055480 +1550640578570420 +1546210117359409 +1553032330997310 +1551496453278628 +1547542881235626 +1550030087703209 +1548488859893201 +1550978279271053 +1550415872752591 +1550323306355033 +1538726572707350 +1550775530944959 +1536766847459976 +1552886346105693 
+1539279561576223 +1549885546919931 +1537367493700127 +1538364906745832 +1552513127221237 +1537372890773644 +1537376350860543 +1548061013074466 +1550578173023964 +1536978012589383 +1546700235224481 +1551956055445961 +1553480125050562 +1546698460600876 +1550670997168398 +1551850365360012 +1550643349493190 +1550422863773122 +1551029933449702 +1550960316079203 +1536772508583909 +1549665674046834 +1549786432140924 +1546689042514144 +1550641996125939 +1553298638898719 +1551146279423258 +1549779689970162 +1550510429711544 +1551882213078736 +1550569846790213 +1550975524555938 +1554603948442014 +1549502285914133 +1551053962006827 +1547422650887056 +1547509376450027 +1550937110880026 +1550851564518578 +1544342461392342 +1549223972694235 +1537195104256385 +1550510249088811 +1546617045524999 +1550637068304550 +1537302552778525 +1552048651214717 +1546231681924136 +1552597421633629 +1550579276244473 +1548069049217031 +1553210022113839 +1547977256612088 +1545457286817746 +1545899496566915 +1537355863806024 +1544908573338153 +1550633117626208 +1550691099215852 +1550045317895817 +1546689204561135 +1538613774030698 +1550682206159528 +1536769475189274 +1544697654981540 +1537106060240336 +1550682036838410 +1550506488075349 +1545413692030428 +1549176495734103 +1550700622558944 +1549862073219213 +1550422142235394 +1549787463190809 +1547407994282397 +1548494212845979 +1550946650743262 +1550881791893440 +1547516142586163 +1552430228379695 +1545551619513931 +1550424982083360 +1550600327001776 +1550679625926205 +1552317627039727 +1550854210955924 +1555042844573030 +1549227798668549 +1551257821470814 +1547691119707110 +1553857820814299 +1552061908696311 +1549853120818622 +1551771698645218 +1538625514192764 +1546675598121908 +1550637612637913 +1553126355144963 +1546705257735003 +1553585862253392 +1549982742345648 +1538012284032881 +1545352134008433 +1550412557115092 +1548160137440627 +1549975819681066 +1552498431581185 +1551143892234963 +1552606646270396 +1547781697338545 +1547536292810982 
+1550945101375991 +1549602250108326 +1548590640544157 +1546061370604325 +1553663250287830 +1549783581509202 +1552504743247263 +1550443183968612 +1548049910846390 +1542263996102273 +1550485295386144 +1550755050707400 +1551146802326855 +1546675232597284 +1550896417059848 +1552860333550510 +1545415193890348 +1552144154855100 +1551168224184735 +1550440071426566 +1550593242528366 +1550978475615582 +1545089081167520 +1549495322589356 +1538906652068150 +1550163917424863 +1549869968065677 +1553046744735542 +1545914173169196 +1553333226017018 +1537190941905145 +1540254607011475 +1550865261763338 +1536911341618848 +1550847300884894 +1551771058815753 +1551881439908725 +1553575023623724 +1548150792095953 +1550260734489689 +1536766470571656 +1537467124971398 +1546882925540851 +1549039003378304 +1551107502701322 +1549866693167425 +1538737027048535 +1550326860153057 +1545897159267492 +1537088754947125 +1552521924257248 +1551940668302550 +1552404213063884 +1552076182615631 diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/word_get_url.py" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/word_get_url.py" similarity index 100% rename from "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/word_get_url.py" rename to "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/word_get_url.py" diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/word_get_url_phantomjs.py" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/word_get_url_phantomjs.py" similarity index 100% rename from 
"6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao/word_get_url_phantomjs.py" rename to "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/id_to_excel/word_get_url_phantomjs.py" diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/main.py" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/main.py" deleted file mode 100644 index c27d1185..00000000 --- "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/main.py" +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -#---------------------------------------------------------------------------------------- -# 程序:baijiahao.py -# 版本:1 -# 作者:ly -# 日期:编写日期2017/1/20 -# 语言:Python 2.7.x -# 操作:python GDToOSM.py Table -# 功能:从内部接口爬去百家号url -#------------------------------------------------------------------------------------------ - -import requests -import json,time -import sys -reload(sys) -sys.setdefaultencoding('utf-8') -Type = sys.getfilesystemencoding() -url = "http://baijiahao.baidu.com/api/content/article/listall?sk=super&ak=super&\ - app_id=1541190710072607&_skip=0&_limit=295&status=in:publish,published&\ - _preload_statistic=1&_timg_cover=50,172,1000&_cache=1" - -url = "https://club.jd.com/comment/productPageComments.action?callback=\ -fetchJSON_comment98vv31400&productId=3133813&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0" - -body = requests.get(url).text.encode('utf8').decode('utf8') - -print body[26:] -time.sleep(10) - -body_json = type(eval(body[26:])) - -print body_json -#for i in range(len(body_json['items'])): - #print body_json['items'][i]['id'] diff --git a/UrlSpider/model/UrlSpider.py b/UrlSpider/model/UrlSpider.py index 
663e25c2..56ded5a4 100644 --- a/UrlSpider/model/UrlSpider.py +++ b/UrlSpider/model/UrlSpider.py @@ -1,211 +1,231 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- -#------------------------------------------------------------------------- -# 程序:UrlSpider.py -# 版本:1 -# 作者:ly -# 日期:编写日期2016/12/25 -# 语言:Python 2.7.x -# 操作:python UrlSpider.py -# 功能:指定任务表,读取url,多线程采集 -# 表结构(id, ip, lon_gd, lat_gd, datetime, flag) -# 采用数据库批量插入优化等表结构优化 -#------------------------------------------------------------------------- -import re ,os ,sys ,time ,json ,random ,MySQLdb ,requesocks ,threading,requests - -#-------------------------------------------------- -#中文编码设置 -reload(sys) -sys.setdefaultencoding('utf-8') -Type = sys.getfilesystemencoding() - -#------------------------------------------------ -# 代理以及tor设置。 -session = requesocks.session() -# session.proxies = {'http':'socks5://127.0.0.1:9050','https':'socks5://127.0.0.1:9050'} - -#------------------------------------------------ -# 可修改的全局变量参数 -Table = "table" # 表名称需修改 -HOST, USER, PASSWD, DB, PORT = 'host', 'user', 'pass', 'dbname', 3306 # 数据库连接参数 -select_sql = "SELECT id,url FROM %s where flag = 3 limit 30000;" # 在数据库中i已经打乱了. 
-Update_sql = "UPDATE "+Table+" SET date=%s, flag=%s WHERE id =%s;" #数据存储 - -THREAD_COUNT = 50 #开启线程数 -sql_num_base = 200 #自定义的执行批量插入的随机值基数,当此值为1时则每次获取数据均直接插入。 -sql_num_add = 100 #自定义的随机值加数,平均而言,当单独一个线程执行sql_num_base+1/3*sql_num_add次数时执行插入 -# 不可修改全局变量参数 -#------------------------------------------------ -schedule = 0 # 当前线程标志 -ErrorList = [] -WarnList = [] - -class Handle_HTML(threading.Thread): - """docstring for Handle_HTML""" - def __init__(self, lock, ThreadID, tasklist, Total_TaskNum): - super(Handle_HTML, self).__init__() - self.lock = lock - self.ThreadID = ThreadID - self.tasklist = tasklist - self.Total_TaskNum = Total_TaskNum - - def run(self): - - global schedule, ErrorList - connect, cursor = ConnectDB() - self.lock.acquire() - print "The Thread tasklist number :", len(self.tasklist) - self.lock.release() - total = len(self.tasklist) - user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36' - date_list = [] - now_requests_num = 0 - for (id, url) in self.tasklist: - # ------------------------- - # 每个请求开始前进行进度说明,对线程上锁 - self.lock.acquire() - time_Now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) - print "Tread-%s:" % self.ThreadID, time_Now, "Already Completed:[%s] ,Also remaining:[%s]" % (schedule, self.Total_TaskNum - schedule) - self.lock.release() - - # ------------------------ - # 可伪造的头部信息 - headers = { - 'User-Agent': user_agent, - 'Referer':'', - 'X-Forwarded-For': ip, - 'Accept':'*/*', - 'Accept-Encoding':'gzip, deflate, sdch', - 'Accept-Language':'zh-CN,zh;q=0.8', - 'Cache-Control':'no-cache', - 'Connection':'keep-alive', - 'Host':'ditu.amap.com', - 'Pragma':'no-cache', - 'Referer':'' - #User-Agent:Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/53.0.2785.143 Chrome/53.0.2785.143 Safari/537.36 - } - URL = url - date = '' - now_requests_num += 1 - #print '*************************************',ip,i#,date_list - # 
------------------------- - # 请求的具体请求部分 - try: - # -- 发起 - time.sleep(random.uniform(0, 1)) - response = session.get(URL, headers=headers) - result = response.text.encode('utf-8') - - # --- 请求解析--- 自定义使用正则还是xpath或etree,接口类数据可使用json - if result: - date = result - date_list.append([date,1,id])# 用于批量插入,需要构建为一个列表,1作为flag存入 - else: - date_list.append([date,0,id])# 用于批量插入,需要构建为一个列表,0作为flag存入 - - except Exception as e: - print e - time.sleep(random.uniform(0, 3)) - ErrorList.append("The ip is :[%s] Error:%s\n result:%s" %(ip, e, result)) - - # ------------------------ - # 数据插入部分 - try: - global sql_num_base - sql_num = int(random.uniform(sql_num_base, sql_num_base + 100)) #随机一个限制数,200-300 到则进行插入 - if(now_requests_num >= sql_num): - now_requests_num = 0 - cursor.executemany(Update_sql , date_list) - connect.commit() - date_list = [] - print 'up',time.ctime(),'&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&',sql_num - except Exception ,e: - print e - time.sleep(random.uniform(0, 3)) - ErrorList.append("The ip is :[%s] Error:%s\n result:%s" %(ip, e, result)) - # 切换线程 - self.lock.acquire() - schedule += 1 - self.lock.release() - cursor.executemany(Update_sql , date_list)#大爷的注释,,这里要保存一次 - connect.commit() - connect.close() - - -def ConnectDB(): - "Connect MySQLdb " - connect, cursor = None, None - while True: - try: - connect = MySQLdb.connect( - host=HOST, user=USER, passwd=PASSWD, db=DB, port=PORT, charset='utf8') - cursor = connect.cursor() - break - except MySQLdb.Error, e: - print "Error %d: %s" % (e.args[0], e.args[1]) - return connect, cursor - - -def Thread_Handle(taskList, Total_TaskNum): - '''多线程启动区域--无需修改''' - global THREAD_COUNT - lock = threading.Lock() - WorksThread = [] - every_thread_number = len(taskList) / THREAD_COUNT - if every_thread_number == 0: - THREAD_COUNT = len(taskList) - every_thread_number = 1 - - for i in range(THREAD_COUNT): - if i != THREAD_COUNT - 1: - source_list = taskList[ - i * every_thread_number: (i + 1) * every_thread_number] - Work = Handle_HTML(lock, 
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#-------------------------------------------------------------------------
#   Program : UrlSpider.py
#   Version : 1
#   Author  : ly
#   Date    : written 2016/12/25
#   Language: Python 2.7.x
#   Usage   : python UrlSpider.py
#   Purpose : read URLs from a task table and fetch them with many threads
#             table layout (id, ip, lon_gd, lat_gd, datetime, flag)
#             results are written back to the database in batches
#-------------------------------------------------------------------------
import json
import os
import random
import re
import sys
import threading
import time

import MySQLdb
import requesocks
import requests

#--------------------------------------------------
# Encoding setup (Python 2 only): make utf-8 the implicit str/unicode codec.
reload(sys)
sys.setdefaultencoding('utf-8')
Type = sys.getfilesystemencoding()

#------------------------------------------------
# Proxy / Tor setup. Uncomment the proxies line to route through a local
# SOCKS5 proxy.
session = requesocks.session()
# session.proxies = {'http':'socks5://127.0.0.1:9050','https':'socks5://127.0.0.1:9050'}

#------------------------------------------------
# Tunable settings
Table = "table"  # task table name, must be changed
# Database connection parameters
HOST, USER, PASSWD, DB, PORT = 'host', 'user', 'pass', 'dbname', 3306
# The rows are already shuffled inside the database.
select_sql = "SELECT id,url FROM %s where flag = 3 limit 30000;"
# Stores the fetched body together with the new flag for one task id.
Update_sql = "UPDATE " + Table + " SET date=%s, flag=%s WHERE id =%s;"

THREAD_COUNT = 50  # upper bound on worker threads per batch
# A worker writes its buffered rows once the number of requests since its
# last write reaches a threshold drawn uniformly from
# [sql_num_base, sql_num_base + sql_num_add).
sql_num_base = 200
sql_num_add = 100

#------------------------------------------------
# Shared state, not meant to be edited
schedule = 0  # number of tasks processed so far, guarded by the worker lock
ErrorList = []
WarnList = []


def _record_error(task_id, error, result):
    """Print *error*, back off for up to 3 seconds and log it in ErrorList."""
    print(error)
    time.sleep(random.uniform(0, 3))
    ErrorList.append(
        "The task id is :[%s] Error:%s\n result:%s" % (task_id, error, result))


class Handle_HTML(threading.Thread):
    """Worker thread that fetches a slice of (id, url) tasks.

    Each fetched body is buffered as ``[body, flag, id]`` (flag 1 for a
    non-empty body, 0 for an empty one) and written with ``Update_sql`` in
    batches.

    Args:
        lock: lock shared by all workers; guards printing and ``schedule``.
        ThreadID: index of this worker, only used in progress output.
        tasklist: sequence of ``(id, url)`` rows this worker processes.
        Total_TaskNum: total number of tasks, only used in progress output.
    """

    def __init__(self, lock, ThreadID, tasklist, Total_TaskNum):
        super(Handle_HTML, self).__init__()
        self.lock = lock
        self.ThreadID = ThreadID
        self.tasklist = tasklist
        self.Total_TaskNum = Total_TaskNum

    @staticmethod
    def _flush(connect, cursor, rows):
        """Write *rows* with one executemany and commit; no-op when empty."""
        if rows:
            cursor.executemany(Update_sql, rows)
            connect.commit()

    def run(self):
        """Fetch every task of this worker and store the results in batches."""
        global schedule
        connect, cursor = ConnectDB()
        with self.lock:
            print("The Thread tasklist number : %s" % len(self.tasklist))

        # Forgeable request headers. No X-Forwarded-For is sent: the task
        # rows only carry (id, url), so there is no ip value to forge.
        # NOTE(review): 'Host' is pinned to ditu.amap.com for every URL --
        # confirm that all task URLs really live on that host.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
            'Referer': '',
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Host': 'ditu.amap.com',
            'Pragma': 'no-cache',
        }

        pending = []              # rows waiting for the next batched write
        requests_since_flush = 0
        try:
            for task_id, url in self.tasklist:
                # Progress report before each request, serialized by the lock.
                with self.lock:
                    time_now = time.strftime(
                        "%Y-%m-%d %H:%M:%S", time.localtime())
                    print("Tread-%s: %s Already Completed:[%s] ,Also remaining:[%s]" % (
                        self.ThreadID, time_now, schedule,
                        self.Total_TaskNum - schedule))

                requests_since_flush += 1
                # Reset per task so the error handlers never see an unbound
                # name or the body of the previous task.
                result = ''

                # Request part. Parsing is left to the user (regex, xpath,
                # etree, or json for API responses); the raw body is stored.
                try:
                    time.sleep(random.uniform(0, 1))
                    response = session.get(url, headers=headers)
                    result = response.text.encode('utf-8')
                    pending.append([result, 1 if result else 0, task_id])
                except Exception as e:
                    # A failed task is not buffered, so its flag is left
                    # unchanged in the table.
                    _record_error(task_id, e, result)

                # Storage part: write once a random threshold is reached.
                threshold = int(random.uniform(
                    sql_num_base, sql_num_base + sql_num_add))
                if requests_since_flush >= threshold:
                    requests_since_flush = 0
                    try:
                        self._flush(connect, cursor, pending)
                        pending = []
                        print('up %s &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& %s' % (
                            time.ctime(), threshold))
                    except Exception as e:
                        # Rows stay buffered and are retried on the next write.
                        _record_error(task_id, e, result)

                with self.lock:
                    schedule += 1
        finally:
            # Whatever is still buffered must be saved before the thread ends,
            # and the connection is closed even if that last write fails.
            try:
                self._flush(connect, cursor, pending)
            except Exception as e:
                _record_error('final flush', e, '')
            finally:
                connect.close()


def ConnectDB():
    """Connect to MySQL, retrying until it succeeds.

    Returns:
        A ``(connection, cursor)`` tuple.
    """
    while True:
        try:
            connect = MySQLdb.connect(
                host=HOST, user=USER, passwd=PASSWD, db=DB, port=PORT,
                charset='utf8')
            return connect, connect.cursor()
        except MySQLdb.Error as e:
            print("Error %d: %s" % (e.args[0], e.args[1]))
            # Pause before retrying instead of hammering the server.
            time.sleep(random.uniform(1, 3))


def Thread_Handle(taskList, Total_TaskNum):
    """Split *taskList* across worker threads and wait for all of them.

    At most ``THREAD_COUNT`` workers are started (fewer when there are fewer
    tasks). Every worker gets an equal slice; the last one also takes the
    remainder. The global ``THREAD_COUNT`` is never modified.

    Args:
        taskList: sequence of ``(id, url)`` rows.
        Total_TaskNum: total number of tasks, forwarded to the workers.
    """
    if not taskList:
        return
    lock = threading.Lock()
    thread_count = max(1, min(THREAD_COUNT, len(taskList)))
    per_thread = len(taskList) // thread_count

    workers = []
    for i in range(thread_count):
        begin = i * per_thread
        # The last worker takes everything that is left.
        end = None if i == thread_count - 1 else begin + per_thread
        worker = Handle_HTML(lock, i, taskList[begin:end], Total_TaskNum)
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()


def main():
    """Process all rows with flag = 3, one select batch at a time."""
    # Count the pending tasks (flag = 3).
    connect, cursor = ConnectDB()
    try:
        cursor.execute("SELECT COUNT(*) FROM %s WHERE flag = 3 ;" % Table)
        TaskNum = cursor.fetchall()
    except Exception as e:
        # Without a count there is nothing sensible to continue with.
        print(e)
        return
    finally:
        connect.close()

    if not TaskNum or TaskNum[0][0] == 0:
        print("Warning:There is no need to do the task!!!")
        return

    Total_TaskNum = int(TaskNum[0][0])
    # NOTE(review): a task whose request raised keeps flag = 3 and is selected
    # again in the next round, so a URL that always fails keeps this loop alive.
    while True:
        connect, cursor = ConnectDB()
        try:
            if not cursor.execute(select_sql % Table):  # no tasks left
                break
            rows = cursor.fetchall()
            Thread_Handle(rows, Total_TaskNum)
        except Exception as e:
            print(e)
            time.sleep(random.uniform(0, 3))
        finally:
            connect.close()

    print("_____************_____")
    for error in ErrorList:
        print(error)
    print("Error: %s Warning: %s" % (len(ErrorList), len(WarnList)))


if __name__ == '__main__':
    print("The Program start time: %s" %
          time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    start = time.time()
    main()
    print("The Program end time: %s [%s]" % (
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        time.time() - start))
    # raw_input("Please enter any key to exit!")