-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
6c45707
commit 4c7b50a
Showing
26 changed files
with
1,216 additions
and
49 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
# encoding: utf-8 | ||
|
||
from selenium import webdriver | ||
import time | ||
import sys | ||
import shop | ||
import tools | ||
from selenium.webdriver.common.action_chains import ActionChains | ||
import traceback | ||
|
||
# Force the process-wide default string encoding to UTF-8 (Python 2 only).
reload(sys)
sys.setdefaultencoding('utf-8')

# Directory holding the crawler's resume state.
main_dir='D:\\wjm\\51job'
# PAGE holds the URL of the result page being crawled; SKIP holds one
# already-processed category name per line.
PAGE=main_dir+'\\page'
SKIP=main_dir+'\\skip'
word=''
city=''
user=''
password=''
current_page=''
skip_words=set()

# `with` closes the handle even if reading fails part-way.
with open(PAGE) as file_obj:
    for line in file_obj:
        # Keep the last non-blank line, without its newline, so that an empty
        # or newline-only file still compares equal to '' further down.
        if line.strip():
            current_page=line.strip()

with open(SKIP) as file_obj:
    for line in file_obj:
        skip_words.add(unicode(line.strip(), 'utf-8'))
|
||
# Start a Chrome session and open the 51job search page.
chrome_options = webdriver.ChromeOptions()
# Raw string: the path contains '\w', '\d' and '\c', which only survived in a
# normal literal because they are not recognised escape sequences.
dr = webdriver.Chrome(executable_path=r'D:\wjm\drivers\chromedriver.exe',chrome_options=chrome_options)
dr.get('http://search.51job.com')
# innerHTML of the #work_position_span element; stored as 'city' on every record.
city=dr.find_element_by_xpath('//span[@id="work_position_span"]').get_attribute('innerHTML')
|
||
def read(url):
    """Crawl the result list starting at *url* and store every new company.

    For each company link on the current result page whose ``title`` is not
    already known to ``shop.isExsits``, the company page is opened in a new
    window, scraped, handed to ``shop.saveShop`` and closed again.  The
    "下一页" (next page) link is then clicked and the resulting URL written to
    ``PAGE``; the function returns when no such link is found.
    """

    def first_text(xpath):
        """Return the stripped text of the first match for *xpath*, or ''."""
        matches = dr.find_elements_by_xpath(xpath)
        if len(matches) > 0:
            return matches[0].text.strip()
        return ''

    dr.get(url)
    tools.scrollToEnd(dr)
    # Text of the selected search filter; stored as 'class' on every record.
    classx = dr.find_element_by_xpath('//div[@class="dw_choice"]/div[@class="in"]/p').text

    while True:
        for link in dr.find_elements_by_xpath('//div[@id="resultList"]/div[@class="el"]/span/a'):
            if shop.isExsits(link.get_attribute('title')):
                continue
            print(link.get_attribute('outerHTML'))

            # Open the company page in its own window so the result list
            # stays loaded in the original one.
            dr.execute_script('window.open("' + link.get_attribute('href') + '");')
            dr.switch_to.window(dr.window_handles[-1])

            ext = first_text('//div[@class="con_msg"]/div[@class="in"]/p')
            name = first_text('//h1')
            address = first_text('//a[@class="icon_b i_map"]/../p')
            record = {'company_name': name, 'shop_name': name, 'class': classx,
                      'address': address, 'city': city, 'ext': ext}

            shop.saveShop(record)
            dr.close()
            dr.switch_to.window(dr.window_handles[-1])
            time.sleep(3)

        next_links = dr.find_elements_by_xpath('//a[text()="下一页"]')
        if len(next_links) == 0:
            break
        next_links[0].click()
        # Persist the position so a restart can resume from this page.
        tools.append(PAGE, 'w', dr.current_url)
|
||
|
||
|
||
# Driver loop: resume from the saved page when there is one; otherwise pick the
# first industry category not yet listed in skip_words, run the search for it
# and crawl the results.  Stops once no unprocessed category is left.

# XPath of the industry sub-list currently shown in the picker dialog.
visible_list = '//div[@class="indtype_click_center_right_list de d3" and (@style="display: block;" or not(@style))]'

while True:
    if current_page == '':
        tools.exe_wait(dr, '//*[@id="kwdselectid"]', '//div')
        dr.find_element_by_xpath('//*[@id="kwdselectid"]').clear()
        dr.find_element_by_xpath('//*[@id="kwdselectid"]').send_keys('招商'.decode('utf-8'))
        dr.find_element_by_xpath('//*[@id="select_expect_indtype"]').click()

        for li in dr.find_elements_by_xpath('//ul[@id="indtype_click_center_left"]/.//li'):
            flg = False
            li.click()
            tools.exe_wait(dr, visible_list, '//div')

            for em in dr.find_elements_by_xpath(visible_list + '/.//em'):
                word = em.get_attribute('innerHTML').strip()
                if word in skip_words:
                    continue

                # Clear any previously selected categories, then select this one.
                for selected in dr.find_elements_by_xpath('//div[@id="indtype_click_multiple_selected"]/span'):
                    selected.click()
                em.click()
                dr.find_element_by_xpath('//span[@id="indtype_click_bottom_save"]').click()
                dr.find_element_by_xpath('//button[@class="p_but" and @type="submit"]').click()

                # Record the result page and mark the category as processed.
                current_page = dr.current_url
                tools.append(PAGE, 'w', current_page)
                tools.append(SKIP, 'a', word + "\n")
                skip_words.add(word)
                flg = True
                break

            if flg:
                break

        # Nothing was selected: every category is already in skip_words.
        if current_page == '':
            break

    read(current_page)
    current_page = ''

dr.quit()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# encoding: utf-8 | ||
|
||
from selenium import webdriver | ||
import time | ||
import sys | ||
import tools | ||
from selenium.webdriver.common.action_chains import ActionChains | ||
import traceback | ||
import json | ||
|
||
# Force the process-wide default string encoding to UTF-8 (Python 2 only).
reload(sys)
sys.setdefaultencoding('utf-8')

# Shop names already confirmed to exist in the database (filled by isExsits).
company_set = set()
# NOTE(review): not referenced anywhere in the visible code.
fuck_index = 1
def isExsits(shop_name):
    """Return True if *shop_name* is already stored in ``test.51job_shop``.

    Names found in the database are remembered in ``company_set`` so that a
    repeated lookup of the same name is answered without another query.
    """
    # NOTE(review): connection credentials are hard-coded below; consider
    # moving them to configuration.
    if shop_name in company_set:
        return True

    rows = tools.readFromDB('121.43.168.132','maxwell','ED81A84EC3B290A4EFA26122test','test',3307,
                            'select * from test.51job_shop where shop_name=%s',(shop_name,))
    if len(rows) == 0:
        return False

    company_set.add(shop_name)
    return True
|
||
def saveShop(shop):
    """Print *shop* as JSON and insert it into ``test.51job_shop``.

    *shop* must provide the keys ``company_name``, ``shop_name``, ``class``,
    ``address``, ``city`` and ``ext``.  ``index`` is always written as 0 and
    ``legal_person`` as an empty string; ``INSERT IGNORE`` is used for the
    insert.
    """
    # NOTE(review): connection credentials are hard-coded below; consider
    # moving them to configuration.
    print(json.dumps(shop))

    sql = ('INSERT IGNORE INTO test.51job_shop (`index`,`company_name`,`shop_name`,`class`,`legal_person`,`address`,`city`,`ext`) '
           'VALUES (%s,%s,%s,%s,%s,%s,%s,%s)')
    row = (0,shop['company_name'],shop['shop_name'],shop['class'],'',shop['address'],shop['city'],shop['ext'])
    tools.executeDB('121.43.168.132','maxwell','ED81A84EC3B290A4EFA26122test','test',3307,
                    sql, row)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
# encoding: utf-8 | ||
|
||
from PIL import Image | ||
import MySQLdb | ||
import sys | ||
import time | ||
import httplib | ||
import base64 | ||
import urllib | ||
from selenium.webdriver.common.action_chains import ActionChains | ||
import json | ||
|
||
# Force the process-wide default string encoding to UTF-8 (Python 2 only).
reload(sys)
sys.setdefaultencoding('utf-8')
|
||
|
||
def append(def_file_name,def_flg,def_line):
    """Write *def_line* to the file *def_file_name* opened with mode *def_flg*.

    Despite the name, the mode decides whether the text is appended ('a') or
    replaces the file's content ('w').  No newline is added.
    """
    # `with` closes the file even if the write raises.
    with open(def_file_name, def_flg) as def_file_obj:
        def_file_obj.write(def_line)
|
||
def scrollToEnd(dr):
    """Scroll the page down in 10-pixel steps, from offset 20 up to 10000.

    Each step sets ``document.documentElement.scrollTop`` through
    ``dr.execute_script``.
    """
    for offset in range(20, 10001, 10):
        dr.execute_script("document.documentElement.scrollTop=" + str(offset))
|
||
|
||
def readFromDB(host,user,passwd,db,port,sql,values):
    """Run a parameterised query on a fresh MySQL connection and return its rows.

    Args:
        host, user, passwd, db, port: passed to ``MySQLdb.connect`` (the
            connection uses the ``utf8`` charset).
        sql: statement text with ``%s`` placeholders.
        values: parameters bound to the placeholders.

    Returns:
        All rows produced by the statement, as returned by the cursor.
    """
    conn = MySQLdb.connect(host=host, user=user, passwd=passwd, db=db, charset='utf8', port=port)
    # try/finally so the cursor and connection are released even if the
    # statement fails.
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql, values)
            return cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()
|
||
def executeDB(host,user,passwd,db,port,sql,values):
    """Execute one parameterised statement on a fresh MySQL connection and commit.

    Args:
        host, user, passwd, db, port: passed to ``MySQLdb.connect`` (the
            connection uses the ``utf8`` charset).
        sql: statement text with ``%s`` placeholders.
        values: parameters bound to the placeholders.
    """
    conn = MySQLdb.connect(host=host, user=user, passwd=passwd, db=db, charset='utf8', port=port)
    # try/finally so the cursor and connection are released even if the
    # statement or the commit fails.
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql, values)
        finally:
            cur.close()
        conn.commit()
    finally:
        conn.close()
|
||
|
||
def getImgBase64(file):
    """Return the content of the file at path *file*, base64-encoded.

    The file is read in binary mode; the result is whatever
    ``base64.b64encode`` returns for those bytes.
    """
    # `with` closes the file even if reading fails.
    with open(file, 'rb') as f:
        return base64.b64encode(f.read())
|
||
|
||
|
||
def wait(dr,xpath):
    """Poll until *xpath* matches at least one element, or give up.

    Checks up to 30 times, sleeping 0.1 s after each miss.  Always returns
    None, so the caller cannot tell a match from a timeout.
    """
    for _ in range(30):
        if len(dr.find_elements_by_xpath(xpath)) > 0:
            return
        time.sleep(0.1)
|
||
def exe_wait(dr,xpath,xpath2):
    """Poll for *xpath* while moving the mouse onto the *xpath2* element.

    Before each of up to 30 checks the pointer is moved to the first element
    matching *xpath2*; after each miss the function sleeps 0.1 s.

    Returns:
        True as soon as *xpath* matches at least one element, False once all
        attempts are used up.
    """
    for _ in range(30):
        target = dr.find_element_by_xpath(xpath2)
        ActionChains(dr).move_to_element(target).perform()
        if len(dr.find_elements_by_xpath(xpath)) > 0:
            return True
        time.sleep(0.1)
    return False
|
||
def floatOrDefault(x,default):
    """Return ``float(x)``, or *default* when *x* cannot be converted.

    Only conversion failures are handled (wrong type, unparsable text, integer
    too large for a float); KeyboardInterrupt and SystemExit are no longer
    swallowed.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return default
|
||
|
||
|
||
|
||
def login(dr,user,password):
    """Log in at https://login.tmall.com with *user* / *password*.

    Clears all cookies, opens the login page in a new window, fills in the
    form inside the first frame and submits it, then closes the current
    window and switches to the last remaining window handle.
    """
    dr.delete_all_cookies()
    dr.execute_script('window.open("https://login.tmall.com");')
    dr.switch_to.window(dr.window_handles[-1])

    # Log in: the form lives in the first frame of the page.
    dr.switch_to.frame(0)
    dr.find_element_by_xpath('//div[@class="login-switch"]').click()

    # Move the pointer onto the username field, then nudge it 20 times by
    # (1, 1) with short pauses — presumably to resemble a human user.
    username_box = dr.find_element_by_xpath('//input[@id="TPL_username_1"]')
    ActionChains(dr).move_to_element(username_box).perform()
    for _ in range(20):
        ActionChains(dr).move_by_offset(1, 1).perform()
        time.sleep(0.05)

    dr.find_element_by_xpath('//input[@id="TPL_username_1"]').send_keys(user)
    dr.find_element_by_xpath('//input[@id="TPL_password_1"]').send_keys(password)
    dr.find_element_by_xpath('//button[@id="J_SubmitStatic"]').click()

    if len(dr.window_handles) > 0:
        dr.close()
        dr.switch_to.window(dr.window_handles[-1])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.