Skip to content

Commit

Permalink
淘宝的因存在bug。待修复
Browse files Browse the repository at this point in the history
  • Loading branch information
luyishisi committed Jan 2, 2017
1 parent 57825c0 commit a66c114
Show file tree
Hide file tree
Showing 20 changed files with 3,195 additions and 166 deletions.
Binary file modified .DS_Store
Binary file not shown.
Binary file modified 6.爬虫项目源码/.DS_Store
Binary file not shown.
Binary file removed 6.爬虫项目源码/11.淘宝/.xlsx
Binary file not shown.
205 changes: 138 additions & 67 deletions 6.爬虫项目源码/11.淘宝/Search.py
Original file line number Diff line number Diff line change
@@ -1,66 +1,67 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#-------------------------------------------------------------------------
# 程序:selenium_so.py
# 版本:0.1
# 程序:search.py
# 版本:1
# 作者:ly
# 日期:编写日期2016/11/23
# 语言:Python 2.7.x
# 操作:python selenuium.py
# 功能:发出请求并且解析
# 操作:python search.py 关键词 存储文件名 (排序方式)1或者2
# 功能:
#
#-------------------------------------------------------------------------
import requests
import time,sys,json
import time,sys,json,os
import xlsxwriter
from sys import argv
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from lxml import etree#
import re

# 中文编码设置
reload(sys)
sys.setdefaultencoding('utf-8')
Type = sys.getfilesystemencoding()

def requests_post(key):
url = "https://s.m.taobao.com/search"
payload = "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"event_submit_do_new_search_auction\"\r\n\r\n1\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"_input_charset\"\r\n\r\nutf-8\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"topSearch\"\r\n\r\n1\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"atype\"\r\n\r\nb\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"searchfrom\"\r\n\r\n1\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"action\"\r\n\r\nhome:redirect_app_action\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"from\"\r\n\r\n1\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"q\"\r\n\r\n%s\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"sst\"\r\n\r\n1\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"n\"\r\n\r\n20\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"buying\"\r\n\r\nbuyitnow\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"m\"\r\n\r\napi4h5\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"abtest\"\r\n\r\n27\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"wlsort\"\r\n\r\n27\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"style\"\r\n\r\nlist\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"closeModues\"\r\n\r\nnav,selecthot,onesearch\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"page\"\r\n\r\n1\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW--"
headers = {
'content-type': "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
'accept': "application/json",
'accept-encoding': "gzip, deflate, sdch, br",
'accept-language': "zh-CN,zh;q=0.8",
'cache-control': "no-cache",
'cookie': "thw=ca; cna=mx3eEN4jylICAS9aUXroe7QT; uss=U7BF7ECyL2ASwrEOYQ44DNcPMBUz3pSobKdyq%2Byc6n8FghhgUKX%2BdwO8; _cc_=V32FPkk%2Fhw%3D%3D; tg=0; ali_ab=171.15.132.56.1482321078252.3; uc2=wuf=http%3A%2F%2Fwww.umeng.com%2F%3Fspm%3D2013.1.1997523009.17.QKcgNw; cookie2=1c4616113d31ccff2a11ac55392b45c1; t=5e8eb89f88eaa0e6cc5ac88135841f4d; OUTFOX_SEARCH_USER_ID_NCOO=731539750.9226516; miid=423405037610384452; tkmb=e=E1DoNQOKQ8Bw4vFB6t2Z2ueEDrYVVa64gze6kOnl9rUYX8TY-NEwd6Vld1zxDwD4IfNVgZHErC7Qw2z4l0QSO7miPnyQLVs2v9hsW5EhbenF_tD4Y0_kjSP88i591npA_Qthylo3HFfXZs7UX6XuEPAd4QYtbexQJSdb3DTn-MUbi6LHOW70tnKirEH51Gw9pWV3XPEPAPgAvPM5QRELPMu0DvJ4SySS90WRxBXkWV7iC0Er_lIGgiIDeMwxoAFc-D2lLpi0SOqtD0dafC_ZEg&iv=0&et=1482556362; linezing_session=HWVtZf1l8XH0dACdMJQQDOt4_1482556828494E2ui_9; uc3=sg2=BYiIfEpsMbxtm040yzQn62r4dy8462CfLR73vjezc00%3D&nk2=&id2=&lg2=; hng=CN%7Czh-cn%7CCNY; tracknick=; mt=ci=0_0&cyk=2_1; v=0; _m_h5_tk=1d52cfb51a019fc6b3f3bcafcb88fc5a_1482910547070; _m_h5_tk_enc=37149113fb0caf06a0f086a605721964; _tb_token_=YfuOSRriSmBkwyRo5m4h; uc1=cookie14=UoW%2FX9Q1zQXCpw%3D%3D; JSESSIONID=299315B3B9143C57D3CEFD7E71B56B99; ___rl__test__cookies=1482995634939; l=AoWF82noU/G0VcAybR9piIJLFdq/Hzjl; isg=AhsbL2-dx8agsTsMpa_FVkRjqnlPnL4I5G5pIw1YqpsA7DDOkMBuQAkq9PYd",
'pragma': "no-cache",
'referer': "https://s.m.taobao.com/h5?event_submit_do_new_search_auction=1&_input_charset=utf-8&topSearch=1&atype=b&searchfrom=1&action=home%3Aredirect_app_action&from=1&q=%E7%9A%AE%E8%A3%A4%E7%94%B7&sst=1&n=20&buying=buyitnow",
'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
'x-requested-with': "XMLHttpRequest",
'postman-token': "761c6c35-7ef9-7fbe-d235-859dd1ea4551"
}
#商品url汇总表
url_list = []

response = requests.request("POST", url, data=payload, headers=headers)
print(response.text)

def main(work,key,num):
#def main(work,key,num,mkname):
def main(work,key,num,mkname,start_price,end_price):
#print type(num),num
if num == '1':
url = '''https://s.m.taobao.com/search?event_submit_do_new_search_auction=1\
&_input_charset=utf-8&topSearch=1&atype=b&searchfrom=1&action=home%3Aredirect_app_action&\
from=1&q='''+key+'''&sst=1&n=40&buying=buyitnow&m=api4h5&abtest=10&wlsort=10&page=1'''
else:
elif num == '3':
url = '''https://s.m.taobao.com/search?event_submit_do_new_search_auction=1\
&_input_charset=utf-8&topSearch=1&atype=b&searchfrom=1&action=home%3Aredirect_app_action&\
from=1&q=''' + key + '''&sst=1&n=40&buying=buyitnow&m=api4h5&abtest=3\
&wlsort=3&style=list&closeModues=nav%2Cselecthot%2Conesearch&\
start_price='''+str(start_price)+'''&end_price='''+str(end_price)+'''&page=1\
'''
else :#num == '2':
url = '''https://s.m.taobao.com/search?event_submit_do_new_search_auction=1\
&_input_charset=utf-8&topSearch=1&atype=b&searchfrom=1&action=home%3Aredirect_app_action&\
from=1&q=''' + key + '''&sst=1&n=40&buying=buyitnow&m=api4h5&abtest=14&\
wlsort=14&style=list&closeModues=nav%2Cselecthot%2Conesearch&sort=_sale&page=1
'''
#%E7%9A%AE%E8%A3%A4%E7%94%B7
#print url
try:
body = requests.get(url)
body = body.text.encode('utf8')
dic_body = eval(body)
except Exception,e:
print "请求出错,请将下列url放于浏览器中看是否可以打开"
print "请求出错,请将下列url放于浏览器中看是否可以打开"
print url
print e
for i in range(40):
print "当前正在采集第 ",i+1," 个"
try:
num_id = dic_body["listItem"][i]['item_id']
except:
num_id = ''
try:
act = dic_body["listItem"][i]['act'] # 付款数
except:
Expand Down Expand Up @@ -102,26 +103,68 @@ def main(work,key,num):
price =''
try:
pic_path = dic_body["listItem"][i]['pic_path'] # 当前价格
img_download(str(i+1),pic_path)
#print pic_path
pic_path = pic_path.replace('60x60','210x210')
pic_name = str(i+1)+'-'+nick
img_download(pic_name,pic_path,mkname+'/pic')
except Exception,e:
print e
pic_path = ''
try:
zkType = dic_body["listItem"][i]['zkType'] # 当前价格
except:
zkType = ''


date = [ name, nick,act,price ,originalPrice,zkType,area,auctionURL_1 , auctionURL_2 ,pic_path]
try:
html_date = download_date(auctionURL_1)
except:
html_date = ''
print html_date
date = [ name, nick,act,price ,originalPrice,zkType,area,auctionURL_1 , auctionURL_2 ,pic_path,html_date,num_id]
#print len(date)
num = i+2
install_table(date,work,num)
# 商品名 店铺 付款人数 当前价格 原始价格 优惠类型 地区 商品url 图片url #
# name nick act price originalPrice zkType area auctionURL pic_path
# 商品名 店铺 付款人数 当前价格 原始价格 优惠类型 地区 商品url 图片url 详情数据#
# name nick act price originalPrice zkType area auctionURL pic_path html_date

def download_date(url):
    """Fetch an item detail page and return its attribute list as text.

    The page is loaded with PhantomJS, then every ``<li>`` directly under
    ``<ul class="attributes-list">`` contributes its leading text followed by
    ``;`` to the result.

    Args:
        url: Item detail URL.

    Returns:
        A unicode string of ``;``-terminated entries on success, ``0`` when
        the page cannot be loaded or parsed, and ``None`` when ``url`` is
        empty or does not contain ``"taobao"`` (such pages are not handled).
    """
    if not url or "taobao" not in url:
        # TODO: Tmall detail pages are not handled yet.
        return None

    print("检测为淘宝的页面")
    driver = None
    try:
        driver = webdriver.PhantomJS()
        print("正在获取详情页面,url为")
        print(url)
        driver.get(url)
        driver.implicitly_wait(40)  # set the implicit wait timeout
        html = driver.page_source.encode('utf-8')
    except Exception as e:
        print("%s %s" % ("页面加载失败", e))
        return 0
    finally:
        # Always shut the browser down: previously the PhantomJS process was
        # left running whenever loading the page raised.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                print(e)

    try:
        print('正在解析页面')
        selector = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
        items = selector.xpath('//ul[@class="attributes-list"]/li')
        # ``li.text`` is the text before the first child element -- the same
        # span the old code cut out of the serialized markup, but unescaped
        # and without relying on the process-wide default encoding.
        return u''.join((li.text or u'') + u';' for li in items)
    except Exception:
        print('页面解析失败')
        return 0

def install_table(date,work,i):
#i = 2
str_list = ['B','C','D','E','F','G','H','I','J','K','L']
'''导入数据列表存入表格中 '''
str_list = ['B','C','D','E','F','G','H','I','J','K','L','M',"O"]
#global worksheet1
try:
work.write('A'+str(i),int(i)-1)
Expand All @@ -136,37 +179,54 @@ def install_table(date,work,i):
print "无法写入"
print e

def img_download(id, url, mkname='pic'):
    """Download an image and save it as ``./<mkname>/<id>.jpg``.

    This is a best-effort helper: any failure is reported on stdout and
    swallowed so that one bad image does not abort the caller.

    Args:
        id: Value used (via ``str``) as the file name without extension.
        url: Image URL.
        mkname: Target directory relative to the current directory. The
            default ``'pic'`` keeps the older two-argument call form working.
    """
    try:
        print("主图下载中")
        r = requests.get(url, timeout=50)
        # Do not store an HTTP error page under a .jpg name.
        r.raise_for_status()
        # ``with`` closes the file even if the write fails.
        with open('./' + mkname + '/' + str(id) + '.jpg', 'wb') as f:
            f.write(r.content)
    except Exception:
        # Narrower than a bare ``except:`` so Ctrl-C still interrupts the run.
        print("主图下载失败")

def create_mkdir(name):
    """Create ``./<name>`` and ``./<name>/pic`` if they do not exist yet.

    Each directory is handled independently, so an already existing
    ``./<name>`` no longer prevents ``./<name>/pic`` from being created.
    Errors are printed and not raised.

    Args:
        name: Directory name relative to the current directory.
    """
    print("%s %s" % ("开始创建文件夹 ", name))
    base = os.path.join('.', name)
    for path in (base, os.path.join(base, 'pic')):
        try:
            if not os.path.isdir(path):
                os.mkdir(path)
        except Exception as e:
            print(e)

def create_table(name):
    """Create the workbook ``./<name>/<name>.xlsx`` and write its header row.

    Args:
        name: Base name, used both as the directory and as the file name.

    Returns:
        ``(worksheet, workbook)``. The workbook is returned open; this
        function does not close it.

    Raises:
        Exception: Any error from xlsxwriter is printed and then re-raised.
            Returning ``None`` here would only make the caller's tuple
            unpacking fail with an unrelated ``TypeError``.
    """
    path = './' + name + '/' + name + '.xlsx'
    headers = (
        ('A1', 'ID'),
        ('B1', u"商品名"),
        ('C1', u'店铺'),
        ('D1', u'付款人数'),
        ('E1', u'当前价格'),
        ('F1', u'原始价格'),
        ('G1', u'优惠类型'),
        ('H1', u'地区'),
        ('I1', u'商品url_1'),
        ('J1', u'商品url_2'),
        ('K1', u'图片url'),
        ('L1', u'date'),
        ('M1', u'宝贝id'),
    )
    try:
        workbook = xlsxwriter.Workbook(path)
        worksheet1 = workbook.add_worksheet()
        for cell, title in headers:
            worksheet1.write(cell, title)
    except Exception as e:
        print(e)
        raise
    print("%s %s" % ('表格构建完成,name', path))
    return worksheet1, workbook

if __name__ == '__main__':
#print argv
Expand All @@ -182,18 +242,29 @@ def create_table(name):
name = ''
try:
num = argv[3]
#print num ,star_price , end_price
except:
print "请指定排序方式 1 为综合排序 2 为销量排序, 当前默认为综合排序"
num = 1
#key = u'皮裤男'
try:
star_price = argv[4]
end_price = argv[5]
except:
star_price = ''
end_price = ''

print '启动采集,关键词为:',key," 存入: ", name
#key = u'皮裤男'
print '启动采集,关键词为:',key," 存入: ", name,"排序为 ",num,star_price,end_price
if ( key == '' or name == '' or num == ''):
print '参数不正确'
print "请按顺序输入参数 关键词 输出文件名 排序方式(1或者2)"
print "例如:python Search.py 皮裤男 皮裤男1 2"
else:
create_mkdir(name)
work,workbook = create_table(name)
main(work,key,num)
#time.sleep(100)
print '开始采集请等待'
#main(work,key,num,name)
main(work,key,num,name,star_price,end_price)
workbook.close()
print '采集完成'
Loading

0 comments on commit a66c114

Please sign in to comment.