Skip to content

Commit

Permalink
每日备份
Browse files Browse the repository at this point in the history
  • Loading branch information
lidachuan211 committed Jan 23, 2018
1 parent 6c45707 commit 4c7b50a
Show file tree
Hide file tree
Showing 26 changed files with 1,216 additions and 49 deletions.
117 changes: 117 additions & 0 deletions 51job/51job.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
# encoding: utf-8

from selenium import webdriver
import time
import sys
import shop
import tools
from selenium.webdriver.common.action_chains import ActionChains
import traceback

'''设置编码'''
reload(sys)
sys.setdefaultencoding('utf-8')


# Working directory and crawl-state files for the 51job scraper.
main_dir='D:\\wjm\\51job'
PAGE=main_dir+'\\page'   # holds the last search-result URL (resume point)
SKIP=main_dir+'\\skip'   # holds industry words already crawled, one per line
word=''                  # current industry word being processed
city=''                  # city label scraped from the 51job search page
user=''
password=''
current_page=''          # last saved result-page URL; '' means start fresh
skip_words=set();

# Resume point: the LAST line of PAGE becomes the page URL to continue from.
file_obj=open(PAGE)
for line in file_obj.readlines():
    current_page=line
file_obj.close()

# Load the set of industry words that have already been crawled.
file_obj=open(SKIP)
for line in file_obj.readlines():
    skip_words.add(unicode(line.strip(), 'utf-8'))
file_obj.close()

'''begin'''
chrome_options = webdriver.ChromeOptions()
dr = webdriver.Chrome(executable_path='D:\wjm\drivers\chromedriver.exe',chrome_options=chrome_options)
dr.get('http://search.51job.com')
# City label 51job shows for this session; stored with every saved shop.
city=dr.find_element_by_xpath('//span[@id="work_position_span"]').get_attribute('innerHTML')

def read(url):
    """Crawl one 51job search-result URL.

    For every company link on each result page: open its detail page in a
    new tab, extract name/address/description, persist it via
    shop.saveShop, then close the tab.  Follows the next-page link until
    none remains, saving the current URL to PAGE after each page turn so
    the crawl can resume.
    """
    dr.get(url)
    tools.scrollToEnd(dr)
    # Industry/class label of the current search (from the filter box);
    # stored with every shop record from this search.
    classx=dr.find_element_by_xpath('//div[@class="dw_choice"]/div[@class="in"]/p').text
    while True:
        for company in dr.find_elements_by_xpath('//div[@id="resultList"]/div[@class="el"]/span/a'):
            company_a=company.get_attribute('title')
            if shop.isExsits(company_a):
                continue  # already stored in the DB / in-process cache
            print company.get_attribute('outerHTML')
            # Open the company detail page in a new tab and switch to it.
            dr.execute_script('window.open("'+company.get_attribute('href')+'");')
            dr.switch_to.window(dr.window_handles[-1])
            # NOTE(review): `company` is rebound here from the link element
            # to the record dict; remaining iterations still work because
            # the element list was fetched up front, but the shadowing is
            # easy to misread.
            company={'company_name':'','shop_name':'','class':classx,'address':'','city':city,'ext':''}

            # Company description paragraph, if present.
            eles=dr.find_elements_by_xpath('//div[@class="con_msg"]/div[@class="in"]/p')
            if len(eles)>0:
                company['ext']=eles[0].text.strip()

            # Page heading doubles as both company and shop name.
            eles = dr.find_elements_by_xpath('//h1')
            if len(eles) > 0:
                company['company_name'] = eles[0].text.strip()
                company['shop_name'] = company['company_name']

            # Address text next to the map icon, if present.
            eles = dr.find_elements_by_xpath('//a[@class="icon_b i_map"]/../p')
            if len(eles) > 0:
                company['address'] = eles[0].text.strip()

            shop.saveShop(company)
            dr.close()
            # Return to the (now last) search-results window.
            dr.switch_to.window(dr.window_handles[-1])
            time.sleep(3)  # throttle between companies
        # Pagination: click the "next page" link if it exists.
        eles=dr.find_elements_by_xpath('//a[text()="下一页"]')
        if len(eles)>0:
            eles[0].click()
            # NOTE(review): this assignment creates a function-local
            # current_page (no `global`), so only the PAGE file -- not the
            # module-level variable -- carries the resume state.
            current_page = dr.current_url
            tools.append(PAGE, 'w', current_page)
        else:
            break



# Main driver loop: when there is no saved resume URL, pick the next
# unprocessed industry word from the search filter UI, record it in SKIP,
# and search; then crawl the resulting pages with read().  Terminates when
# every industry word has been consumed.
while True:
    if current_page=='':
        tools.exe_wait(dr, '//*[@id="kwdselectid"]', '//div')
        dr.find_element_by_xpath('//*[@id="kwdselectid"]').clear()
        dr.find_element_by_xpath('//*[@id="kwdselectid"]').send_keys('招商'.decode('utf-8'))
        # Open the industry-type picker.
        dr.find_element_by_xpath('//*[@id="select_expect_indtype"]').click()
        for li in dr.find_elements_by_xpath('//ul[@id="indtype_click_center_left"]/.//li'):
            flg=False  # set True once a fresh industry word was selected
            li.click()
            # Wait for the right-hand sub-category panel to be displayed.
            tools.exe_wait(dr,'//div[@class="indtype_click_center_right_list de d3" and (@style="display: block;" or not(@style))]','//div')
            for em in dr.find_elements_by_xpath('//div[@class="indtype_click_center_right_list de d3" and (@style="display: block;" or not(@style))]/.//em'):
                word=em.get_attribute('innerHTML').strip()
                if word in skip_words:
                    continue  # this industry word was already crawled
                else:
                    # Clear any previously selected industry chips first.
                    for selected in dr.find_elements_by_xpath('//div[@id="indtype_click_multiple_selected"]/span'):
                        selected.click()
                    em.click()
                    dr.find_element_by_xpath('//span[@id="indtype_click_bottom_save"]').click()
                    dr.find_element_by_xpath('//button[@class="p_but" and @type="submit"]').click()
                    # Persist resume state: result URL and the consumed word.
                    current_page = dr.current_url
                    tools.append(PAGE, 'w', current_page)
                    tools.append(SKIP, 'a', word+"\n")
                    skip_words.add(word)
                    flg=True
                    break
            if flg:
                break;
        # No new word found in any category: everything is crawled.
        if current_page=='':
            break
    read(current_page)
    current_page=''  # force picking a new industry word next iteration


dr.quit()
32 changes: 32 additions & 0 deletions 51job/shop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# encoding: utf-8

from selenium import webdriver
import time
import sys
import tools
from selenium.webdriver.common.action_chains import ActionChains
import traceback
import json

'''设置编码'''
reload(sys)
sys.setdefaultencoding('utf-8')
company_set = set()
fuck_index=1
def isExsits(shop_name):
    """Return True when *shop_name* is already known.

    Checks the in-process cache first; on a cache miss, queries the
    test.51job_shop table and caches a positive result so the DB is hit
    at most once per name.
    """
    if shop_name in company_set:
        return True
    rows = tools.readFromDB('121.43.168.132','maxwell','ED81A84EC3B290A4EFA26122test','test',3307,
                            'select * from test.51job_shop where shop_name=%s',(shop_name,))
    if len(rows) == 0:
        return False
    company_set.add(shop_name)
    return True

def saveShop(shop):
    """Insert one shop record (a dict) into test.51job_shop.

    Duplicates are silently skipped via INSERT IGNORE; the record is
    echoed as JSON for logging before the write.
    """
    print(json.dumps(shop))
    row = (0, shop['company_name'], shop['shop_name'], shop['class'], '',
           shop['address'], shop['city'], shop['ext'])
    tools.executeDB('121.43.168.132','maxwell','ED81A84EC3B290A4EFA26122test','test',3307,
                    'INSERT IGNORE INTO test.51job_shop (`index`,`company_name`,`shop_name`,`class`,`legal_person`,`address`,`city`,`ext`) '
                    'VALUES (%s,%s,%s,%s,%s,%s,%s,%s)',
                    row)
112 changes: 112 additions & 0 deletions 51job/tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# encoding: utf-8

from PIL import Image
import MySQLdb
import sys
import time
import httplib
import base64
import urllib
from selenium.webdriver.common.action_chains import ActionChains
import json

'''设置编码'''
reload(sys)
sys.setdefaultencoding('utf-8')


'''write file'''
def append(def_file_name,def_flg,def_line):
def_file_obj=open(def_file_name, def_flg)
def_file_obj.write(def_line)
def_file_obj.close()

def scrollToEnd(dr):
    """Scroll the page down in 10px increments (offsets 20..10000) so
    lazily loaded content is rendered before scraping."""
    for offset in range(20, 10001, 10):
        dr.execute_script("document.documentElement.scrollTop=" + str(offset))


def readFromDB(host, user, passwd, db, port, sql, values):
    """Run a parameterized SELECT and return all result rows as a tuple
    of row tuples.

    Fixes: the original leaked the connection and cursor if the query
    raised, and used the fragile fetchmany(execute-return) idiom --
    execute()'s return value is not a portable row count under PEP 249,
    so fetchall() is used instead.
    """
    conn = MySQLdb.connect(host=host, user=user, passwd=passwd, db=db, charset='utf8', port=port)
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql, values)
            return cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

def executeDB(host, user, passwd, db, port, sql, values):
    """Run a parameterized write statement and commit it.

    Fixes: the original leaked the connection (and never committed) if
    execute() raised; try/finally guarantees cleanup, and the commit now
    happens before the cursor is closed, making the transaction boundary
    explicit.
    """
    conn = MySQLdb.connect(host=host, user=user, passwd=passwd, db=db, charset='utf8', port=port)
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql, values)
            conn.commit()
        finally:
            cur.close()
    finally:
        conn.close()


def getImgBase64(file):
    """Read an image file in binary mode and return its Base64 encoding.

    Fix: the original leaked the file handle if read() raised; the
    context manager closes it on every path.  The parameter name `file`
    shadows a builtin but is kept for caller compatibility.
    """
    with open(file, 'rb') as f:
        return base64.b64encode(f.read())



def wait(dr, xpath, attempts=30, delay=0.1):
    """Poll until an element matching *xpath* exists on the page.

    Returns True when the element is found, False after *attempts* polls
    spaced *delay* seconds apart (the original returned None either way,
    so the new return value is backward-compatible and mirrors
    exe_wait's True/False contract).  attempts/delay generalize the
    original hard-coded 30 x 0.1s.
    """
    for _ in range(attempts):
        if len(dr.find_elements_by_xpath(xpath)) > 0:
            return True
        time.sleep(delay)
    return False

def exe_wait(dr, xpath, xpath2):
    """Hover over the element at *xpath2* (to trigger hover-driven UI)
    and poll until an element matching *xpath* appears.

    Returns True once the element shows up, False after 30 polls of
    0.1s (~3 seconds) without success.
    """
    attempts = 0
    while attempts < 30:
        ActionChains(dr).move_to_element(dr.find_element_by_xpath(xpath2)).perform()
        if len(dr.find_elements_by_xpath(xpath)) > 0:
            return True
        time.sleep(0.1)
        attempts += 1
    return False

def floatOrDefault(x, default):
    """Convert *x* to float, returning *default* when conversion fails.

    Fix: the original bare `except:` swallowed everything, including
    KeyboardInterrupt and SystemExit; only the exceptions float() raises
    for bad input (TypeError, ValueError) are caught now.
    """
    try:
        return float(x)
    except (TypeError, ValueError):
        return default




def login(dr, user, password):
    """Log into tmall.com with username/password using the given driver.

    Clears cookies, opens the login page in a new tab, switches to the
    password-login form inside the first iframe, performs small mouse
    movements (presumably to evade bot detection -- TODO confirm), then
    submits the credentials and closes the extra tab.
    """
    dr.delete_all_cookies()
    dr.execute_script('window.open("https://login.tmall.com");')
    dr.switch_to.window(dr.window_handles[-1])
    '''登陆'''
    # The login form lives in the first iframe on the page.
    dr.switch_to.frame(0)
    # Switch from the QR-code tab to the username/password form.
    dr.find_element_by_xpath('//div[@class="login-switch"]').click()

    # Jiggle the mouse in 1px steps for ~1 second near the username field.
    ActionChains(dr).move_to_element(dr.find_element_by_xpath('//input[@id="TPL_username_1"]')).perform()
    i = 0
    while i < 20:
        ActionChains(dr).move_by_offset(1, 1).perform()
        time.sleep(0.05)
        i += 1

    dr.find_element_by_xpath('//input[@id="TPL_username_1"]').send_keys(user)
    dr.find_element_by_xpath('//input[@id="TPL_password_1"]').send_keys(password)
    dr.find_element_by_xpath('//button[@id="J_SubmitStatic"]').click()
    # NOTE(review): `len(dr.window_handles)>0` is always true, so the
    # current tab is closed unconditionally -- was `> 1` intended?
    if len(dr.window_handles)>0:
        dr.close()
        dr.switch_to.window(dr.window_handles[-1])
10 changes: 8 additions & 2 deletions alibaba/flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
class flow:

def __init__(self, request,region,all_classi,url_region):
self.max_count = 25
reload(sys)
sys.setdefaultencoding('utf-8')
self.request = request
Expand Down Expand Up @@ -74,11 +73,17 @@ def get_page_size(self, html):
print('page size is none')
return 1

    def process_classi(self):
        # Retry search_classi while it keeps failing: search_classi
        # returns 0 on exception, so the loop continues only on failure
        # (bounded at 1000 attempts) and stops on any other return value.
        # NOTE(review): search_classi appears to return 1 when no class
        # remains and to fall through (None) on success -- confirm whether
        # stopping after the first successful page is really intended.
        flag = 0
        index_count = 0
        while index_count<1000 and flag == self.search_classi():
            index_count +=1

def search_classi(self):
try:
classi_cnt = self.get_classi_count()
if classi_cnt is None or len(classi_cnt) == 0:
return
return 1
classi = classi_cnt[1]
index = classi_cnt[2]
search_url = self.get_search_url(classi) + str(self.get_page_index(index))
Expand All @@ -87,6 +92,7 @@ def search_classi(self):
self.search_classi_page(html, classi, page_size, index, classi_cnt[0])
except Exception as e:
print 'traceback.format_exc():\n%s' % traceback.format_exc()
return 0

def search_classi_page(self, html, classi, page_size, index, class_id):
if page_size <= 0 or index > page_size:
Expand Down
Binary file added alibaba/image/3911516092648.59.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added alibaba/image/591516092131.4.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added alibaba/image/671516256478.99.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added alibaba/image/7871516092101.26.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 2 additions & 2 deletions alibaba/login_chrome.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def __init__(self,user_name,password,i):
# executable_path='/Users/zmj/Documents/chromedriver',
# chrome_options=chrome_options)
self.dr = webdriver.Chrome(
executable_path='/Users/qizhouli/Documents/chromedriver',
executable_path='/root/Downloads/chromedriver',
chrome_options=chrome_options)
#self.dr = webdriver.Chrome("/Users/qizhouli/Documents/chromedriver")
# self.dr.set_page_load_timeout(10)
Expand Down Expand Up @@ -292,7 +292,7 @@ def get_captch_img(self):
f = open(img_path, 'rb') # 二进制方式打开图文件
ls_f = base64.b64encode(f.read()) # 读取文件内容,转换为base64编码
f.close()
text = urllib.urlopen('http://121.43.173.22:8070/taobao/captch?img='+str(quote(ls_f))).read()
text = urllib.urlopen('http://39.107.66.223:8070/taobao/captch?img='+str(quote(ls_f))).read()
if text is None or len(text)==0:
return 'fdss'
else:
Expand Down
20 changes: 10 additions & 10 deletions alibaba/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,9 @@
from login_chrome import login_chrome_1688


IP = '19b'
region = '上海'
all_classi = '医药保健'
url_region = 'province=%C9%CF%BA%A3'
IP = '17'
regions = ['深证','上海','杭州','广州','南京','苏州']
url_regions= ['city=%C9%EE%DB%DA&province=%B9%E3%B6%AB','province=%C9%CF%BA%A3','city=%BA%BC%D6%DD&province=%D5%E3%BD%AD','city=%B9%E3%D6%DD&province=%B9%E3%B6%AB','city=%C4%CF%BE%A9&province=%BD%AD%CB%D5','city=%CB%D5%D6%DD&province=%BD%AD%CB%D5']
user_password = None

def get_user_name():
Expand All @@ -36,13 +35,15 @@ def processing():
return
request = login_chrome_1688(user_password[1], user_password[2], int(user_password[0]))
time.sleep(3)
while True:
for i in range(0,len(regions)):
region = regions[i]
url_region = url_regions[i]
for all_classi in get_all_classi():
print 'all class = ' + str(all_classi[0])
print 'region = %s, all class = %s' % (region,all_classi[0])
try:
f = flow(request,region,all_classi[0],url_region)
f.search_classi()
time.sleep(60)
f.process_classi()
time.sleep(1)
except Exception as e:
print 'traceback.format_exc():\n%s' % traceback.format_exc()
finally:
Expand All @@ -66,9 +67,8 @@ def onsignal_term(a, b):
finally:
sys.exit()



if __name__ == '__main__':
    '''注册信号'''
    # Register the SIGTERM handler so the crawler can shut down cleanly,
    # print the pid (for `kill`), then start the main processing loop.
    print 'pid = '+ str(os.getpid())
    signal.signal(signal.SIGTERM, onsignal_term)
    processing()
2 changes: 0 additions & 2 deletions allocation/shanghai_allocation.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,6 @@ def process_clue():
time.sleep(random.uniform(1,10))
except Exception as e:
print clue[0],clue[1].replace('\n','').replace('\r',''),clue[2]
#index += 1
#print index
print traceback.format_exc()


Expand Down
Loading

0 comments on commit 4c7b50a

Please sign in to comment.