Skip to content

Commit

Permalink
pep8优化代码风格
Browse files Browse the repository at this point in the history
  • Loading branch information
luyishisi committed Mar 1, 2017
1 parent 99707cf commit a41a8a3
Show file tree
Hide file tree
Showing 10 changed files with 57,866 additions and 126 deletions.
51 changes: 27 additions & 24 deletions 10.selenium/BAIDU/selenium_so_chrome.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,40 +10,43 @@
# 功能:结合crontab定时启动每天自动登录so网站,刷银牌用
#-------------------------------------------------------------------------

import sys
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time,sys

# Chinese (UTF-8) encoding setup.
# NOTE(review): reload(sys) + sys.setdefaultencoding is a Python 2-only hack —
# site.py deletes setdefaultencoding at startup and it only reappears after
# reload(sys); neither exists on Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')
# Filesystem encoding, presumably kept for later encode/decode calls —
# not used in the lines visible here; TODO confirm against the rest of the file.
Type = sys.getfilesystemencoding()

#加载内核
# 加载内核
#driver = webdriver.PhantomJS()
driver = webdriver.Chrome()
#发起请求
# 发起请求
print 'beging_0'
#driver.get("http://lbsyun.baidu.com/skins/MySkin/resources/iframs/heightAccApi.html")
#driver.get("https://s.m.taobao.com/h5?event_submit_do_new_search_auction=1&_input_charset=utf-8&topSearch=1&atype=b&searchfrom=1&action=home%3Aredirect_app_action&from=1&sst=1&n=20&buying=buyitnow&q=%E7%9A%AE%E8%A3%A4%E5%A5%B3")
# driver.get("http://lbsyun.baidu.com/skins/MySkin/resources/iframs/heightAccApi.html")
# driver.get("https://s.m.taobao.com/h5?event_submit_do_new_search_auction=1&_input_charset=utf-8&topSearch=1&atype=b&searchfrom=1&action=home%3Aredirect_app_action&from=1&sst=1&n=20&buying=buyitnow&q=%E7%9A%AE%E8%A3%A4%E5%A5%B3")
driver.get("https://h5.m.taobao.com/awp/core/detail.htm?id=538287375253&abtest=25&rn=51380cce73c6e338c4f512e1d592ddb7&sid=a706d7e5bb79cfe64053bad190a02f4c")
# http://lbsyun.baidu.com/index.php?title=webapi/high-acc-ip

#获取用户名框并输入
# 获取用户名框并输入
print 'beging_1'
#elem = driver.find_element_by_xpath('/html/body/table/tbody/tr[1]/td[2]/input')

#elem.send_keys("171.15.132.58")
# elem.send_keys("171.15.132.58")

#获取密码框并输入
#print 'beging_2'
elem = driver.find_element_by_xpath('//*[@class="desc_page_box normal"]').click()
#elem.send_keys("**")desc_page_box normal
#通过回车键进行登录
#print 'beging_3'
#elem.send_keys(Keys.RETURN)
# 获取密码框并输入
# print 'beging_2'
elem = driver.find_element_by_xpath(
'//*[@class="desc_page_box normal"]').click()
# elem.send_keys("**")desc_page_box normal
# 通过回车键进行登录
# print 'beging_3'
# elem.send_keys(Keys.RETURN)

#time.sleep(10)
# time.sleep(10)
js1 = 'return document.body.scrollHeight'
js2 = 'window.scrollTo(0, document.body.scrollHeight)'

Expand All @@ -52,20 +55,20 @@
old_scroll_height = driver.execute_script(js1)
driver.execute_script(js2)
time.sleep(3)
name = ''+time.ctime().replace(' ','-')+'.png'
name = '' + time.ctime().replace(' ', '-') + '.png'
driver.save_screenshot(name)
#保存页面截图和源码
name = ''+time.ctime().replace(' ','-')+'.png'
# 保存页面截图和源码
name = '' + time.ctime().replace(' ', '-') + '.png'
driver.save_screenshot(name)
#f = open(name_html.encode('utf-8'),'w')
#f.write(driver.page_source)
#f.close()
# f.write(driver.page_source)
# f.close()

#print driver.page_source.encode('utf8')
# print driver.page_source.encode('utf8')

time.sleep(5)

#driver.quit()
#elem.clear()
#time.sleep(10)
# driver.quit()
# elem.clear()
# time.sleep(10)
driver.close()
29 changes: 16 additions & 13 deletions 10.selenium/as-rank/selenium_so_phamtomjs.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,37 +10,40 @@
# 功能:结合crontab定时启动每天自动登录so网站,刷银牌用
#-------------------------------------------------------------------------

import sys
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time,sys

# 中文编码设置
reload(sys)
sys.setdefaultencoding('utf-8')
Type = sys.getfilesystemencoding()

#加载内核
# 加载内核
driver = webdriver.PhantomJS()

#发起请求≈
for i in range(10,20):
# 发起请求≈
for i in range(10, 20):
try:
print i,'begin',time.ctime()
print i, 'begin', time.ctime()
num = i
url = 'http://as-rank.caida.org/?mode0=as-info&mode1=as-table&as='+str(num)+'&data-selected-id=39'
url = 'http://as-rank.caida.org/?mode0=as-info&mode1=as-table&as=' + \
str(num) + '&data-selected-id=39'
driver.implicitly_wait(10)
driver.get(url)
#保存页面截图和源码
name = './png/'+str(num)+'.png'
name_html = "./code/"+str(num)+'.html'
# 保存页面截图和源码
name = './png/' + str(num) + '.png'
name_html = "./code/" + str(num) + '.html'

driver.save_screenshot(name)
f = open(name_html,'w')
f = open(name_html, 'w')
f.write(driver.page_source)
f.close()

#time.sleep(5)
print i,'end ',time.ctime()
except Exception,e:
# time.sleep(5)
print i, 'end ', time.ctime()
except Exception, e:
print e
driver.close()
78 changes: 41 additions & 37 deletions 6.爬虫项目源码/1.优酷网/read_useragent_txt_forge.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,53 +11,57 @@
# 读取一个随机的头部User-Agent 信息 添加到请求中此作为基础的伪造,
#
#-------------------------------------------------------------------------
import requests
import random
import re

#发起请求,
def get_request(url,user_agent):
    '''Send a GET request for *url* with forged browser headers.

    url        -- page URL to fetch.
    user_agent -- User-Agent header value; replaced by a default Firefox
                  UA when it looks invalid (fewer than 10 characters).
    Returns the response body text on success, or -1 on any failure.
    '''
    if len(user_agent ) < 10:
        user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0'
    # Forged request headers; Host/Referer pin the request to youku.com.
    headers = {
        'Host':"v.youku.com",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        'Cache-Control':'no-cache',
        "Connection": "keep-alive",
        "User-Agent": user_agent,
        'Referer':'http://www.youku.com/'
    }
    try:
        html=requests.get(url,headers=headers, timeout=20).text
        #print html
        return html
    except Exception,e:
        # NOTE(review): broad except + -1 sentinel hides the failure cause;
        # callers must compare the result against -1 explicitly.
        print Exception,e
        return -1
import requests


# 发起请求,


def get_request(url, user_agent):
'''参数引入及头信息'''
if len(user_agent) < 10:
user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0'
# 此处修改头字段,
headers = {
'Host': "v.youku.com",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate, sdch",
"Accept-Language": "zh-CN,zh;q=0.8",
'Cache-Control': 'no-cache',
"Connection": "keep-alive",
"User-Agent": user_agent,
'Referer': 'http://www.youku.com/'
}
try:
html = requests.get(url, headers=headers, timeout=20).text
# print html
return html
except Exception, e:
print Exception, e
return -1

if __name__ == '__main__':
#此url为任意一个具有某视频播放窗口的页面
url = "http://v.youku.com/v_show/id_XMTgzNDI0MjkzNg==.html?from=y1.3-movie-grid-1095-9921.86985-107667.1-1&spm=a2hmv.20009921.yk-slide-107667.5~5~5~5!2~A#paction"
# 此url为任意一个具有某视频播放窗口的页面
url = "http://v.youku.com/v_show/id_XMTgzNDI0MjkzNg==.html?from=y1.3-movie-grid-1095-9921.86985-107667.1-1&spm=a2hmv.20009921.yk-slide-107667.5~5~5~5!2~A#paction"

#导入数据集并随机获取一个User-Agent
user_agent_list = []
f = open('user_agent.txt','r')
for date_line in f:
user_agent_list.append(date_line.replace('\r\n',''))
user_agent = random.choice(user_agent_list)
# 导入数据集并随机获取一个User-Agent
user_agent_list = []
f = open('user_agent.txt', 'r')
for date_line in f:
user_agent_list.append(date_line.replace('\r\n', ''))
user_agent = random.choice(user_agent_list)

#发起请求
html_body = get_request(url,user_agent)
print re.findall('http://player.youku.com/player.php/sid/[A-Za-z0-9=]*/v.swf',html_body)
# 发起请求
html_body = get_request(url, user_agent)
print re.findall('http://player.youku.com/player.php/sid/[A-Za-z0-9=]*/v.swf', html_body)


#-------------------测试结果-------------------------------
# 将此链接放在浏览器中可以直接播放,虽然有广告....至于别的数据太简单那就不抓了.
#[
#u'http://player.youku.com/player.php/sid/XMTgzNDI0MjkzNg==/v.swf',
#u'http://player.youku.com/player.php/sid/XMTgzNDI0MjkzNg==/v.swf'
# u'http://player.youku.com/player.php/sid/XMTgzNDI0MjkzNg==/v.swf',
# u'http://player.youku.com/player.php/sid/XMTgzNDI0MjkzNg==/v.swf'
#]
13 changes: 7 additions & 6 deletions 6.爬虫项目源码/10.百度百科大学/get_url.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#coding:utf-8
import requests
# coding:utf-8
import sys

#中文编码设置
import requests

# 中文编码设置
reload(sys)
sys.setdefaultencoding('utf-8')
Type = sys.getfilesystemencoding()
Expand All @@ -25,14 +26,14 @@
'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
'x-requested-with': "XMLHttpRequest",
'postman-token': "36f249ee-4f92-4669-7491-274692665478"
}
}
print 'a = ['
for i in range(84):
payload_now = payload % i
response = requests.request("POST", url, data=payload_now, headers=headers)
dic = eval(response.content.decode('utf8').encode('utf8'))
dic1 = dic['lemmaList']
for i in dic1:
#print i['lemmaTitle'].decode('utf8').encode('utf8')
print '"'+i['lemmaUrl']+'",'
# print i['lemmaTitle'].decode('utf8').encode('utf8')
print '"' + i['lemmaUrl'] + '",'
print ']'
Loading

0 comments on commit a41a8a3

Please sign in to comment.