from time import sleep
import csv

from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common import exceptions

from pypinyin import pinyin, Style

from sql import sql


def get_dbname(city, job):
    """
    Convert the Chinese city and job strings to pinyin (full syllables,
    no tone marks) and join them as '<city>_<job>' for use as a table name.
    """
    pinyin_list = pinyin(city, style=Style.NORMAL)
    city = ''.join([p[0] for p in pinyin_list])
    pinyin_list = pinyin(job, style=Style.NORMAL)
    job = ''.join([p[0] for p in pinyin_list])
    return city + '_' + job
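
# A hypothetical example of the resulting name (assuming pypinyin's default
# Style.NORMAL output):
#   get_dbname('北京', '数据分析')  ->  'beijing_shujufenxi'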


def responsedata(job, city, pages):
    # Browser options; headless mode is left commented out here
    options = FirefoxOptions()
    # options.headless = True
    # Anti-detection: route traffic through a local proxy
    options.add_argument("--proxy-server=127.0.0.1:7890")
    driver = Firefox(options=options)
    driver.maximize_window()
    driver.get('https://www.zhipin.com/web/geek/job?query=' + job + "&city=" + str(city) + '&page=1')
    # Wait implicitly for up to 10 s for elements to appear
    driver.implicitly_wait(10)
    sleep(2)
    dics = []
    for page in range(1, pages + 1):  # Walk through the requested number of result pages
        # k counts the listings scraped on this page; each page holds 30 listings,
        # but scraping them all with Selenium is slow, so only a few are taken
        # per page while testing
        k = 0
        sleep(2)
        # Scroll to the bottom of the page
        driver.execute_script('document.documentElement.scrollTop = document.documentElement.scrollHeight')
        li_lists = driver.find_elements(By.CSS_SELECTOR, '.job-card-wrapper')
        print(len(li_lists))

        for li in li_lists:
            job_name = li.find_element(By.CLASS_NAME, 'job-name').text
            job_area = li.find_element(By.CLASS_NAME, 'job-area').text
            salary = li.find_element(By.CLASS_NAME, 'salary').text
            job_tag = li.find_element(By.CSS_SELECTOR, '.job-card-wrapper .job-card-left .tag-list').text.replace('\n', ',')
            job_ability = li.find_element(By.XPATH, './div[2]/ul').text
            company_name = li.find_element(By.CLASS_NAME, 'company-name').text
            welfare = li.find_element(By.CLASS_NAME, 'info-desc').text
            link = li.find_element(By.CLASS_NAME, 'job-card-left').get_attribute('href')

            # WebDriverWait(driver, 100).until(
            #     EC.presence_of_element_located((By.XPATH, '//*[@id="main"]/div[3]/div/div[2]/div[1]/div[2]'))
            # )
            try:
                # Open the detail page
                clic = li.find_element(By.CSS_SELECTOR, '.job-card-left')
                driver.execute_script('arguments[0].click()', clic)
                # Switch to the newly opened window
                driver.switch_to.window(driver.window_handles[-1])
                driver.implicitly_wait(5)
                driver.execute_script('document.documentElement.scrollTop = document.documentElement.scrollHeight')
                job_des = driver.find_element(By.XPATH, '//*[@id="main"]/div[3]/div/div[2]/div[1]/div[2]').text.replace('\n', ' ')
            except Exception as e:
                print("Error:", e)
                print('A detail page failed to load because of a network problem; skipping it')
                company_info = ''
                company_type = ''
                address = ''
                job_des = ''
                dic = {
                    "职位名称": job_name,
                    "地区": job_area,
                    "薪水": salary,
                    "标签": job_tag,
                    "能力要求": job_ability,
                    "公司名字": company_name,
                    "公司介绍": company_info,
                    "福利待遇": welfare,
                    "职位描述": job_des,
                    "企业类型": company_type,
                    "工作地址": address,
                    "详情链接": link
                }
                # Record the row
                k += 1
                print(dic)
                dics.append(dic)
                driver.close()
                driver.switch_to.window(driver.window_handles[0])
                continue

            try:  # Some companies have no introduction text
                company_info = driver.find_element(By.CSS_SELECTOR, '.job-body-wrapper .company-info-box .fold-text').text.replace('\n', ' ')
            except exceptions.NoSuchElementException:
                company_info = ''
            try:
                company_type = driver.find_element(By.CLASS_NAME, 'company-type').text.replace('企业类型\n', '')
            except exceptions.NoSuchElementException:
                company_type = ''
            address = driver.find_element(By.CLASS_NAME, 'location-address').text
            dic = {
                "职位名称": job_name,
                "地区": job_area,
                "薪水": salary,
                "标签": job_tag,
                "能力要求": job_ability,
                "公司名字": company_name,
                "公司介绍": company_info,
                "福利待遇": welfare,
                "职位描述": job_des,
                "企业类型": company_type,
                "工作地址": address,
                "详情链接": link
            }
            # Record the row
            k += 1
            print(dic)
            dics.append(dic)
            driver.close()
            # Switch back to the first window
            driver.switch_to.window(driver.window_handles[0])
            if k == 2:  # Stop after two listings per page
                break
            sleep(2)

        try:
            # Click the "next page" button. It is awkward to locate: the XPath
            # a[10] only works for pages 1-4; from page 5 onward the button
            # shifts to a[11]
            if page <= 4:
                c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[10]')
                driver.execute_script('arguments[0].click()', c)
            elif page < 7:
                c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[11]')
                driver.execute_script('arguments[0].click()', c)
            else:
                c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[10]/i')
                driver.execute_script('arguments[0].click()', c)
            WebDriverWait(driver, 100).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'job-card-wrapper'))
            )
        except Exception:
            # If paging fails, restart the browser with a fresh session,
            # reload the first result page and try to advance again
            driver.quit()
            driver = Firefox(options=options)
            driver.get('https://www.zhipin.com/web/geek/job?query=' + job + "&city=" + str(city) + '&page=1')
            driver.implicitly_wait(10)
            if page <= 4:
                c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[10]')
                driver.execute_script('arguments[0].click()', c)
            elif page < 7:
                c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[11]')
                driver.execute_script('arguments[0].click()', c)
            else:
                c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[10]/i')
                driver.execute_script('arguments[0].click()', c)
            WebDriverWait(driver, 100).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'job-card-wrapper'))
            )

    # Shut down the browser once all pages have been processed
    driver.quit()
    return dics


def createfile(dicts):
    # Append rows to the CSV file; utf-8-sig keeps the Chinese text readable in Excel
    with open('BOSS直聘1.csv', mode='a', encoding='utf-8-sig', newline='') as f:
        csv_writer = csv.DictWriter(f, fieldnames=["职位名称", "地区", "薪水", "标签", "能力要求", "公司名字", "公司介绍",
                                                   "福利待遇", "职位描述", "企业类型", "工作地址", "详情链接"])
        csv_writer.writeheader()  # Write the header row
        for dic in dicts:
            csv_writer.writerow(dic)
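
# A minimal, hypothetical usage sketch: each dict passed in must carry the
# fieldnames listed above. Note that the file is opened in append mode and the
# header is rewritten on every call, so repeated runs add duplicate header rows.
#
#   createfile([{"职位名称": "数据分析师", "地区": "北京", "薪水": "15-25K",
#                "标签": "", "能力要求": "", "公司名字": "", "公司介绍": "",
#                "福利待遇": "", "职位描述": "", "企业类型": "", "工作地址": "",
#                "详情链接": ""}])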


def createsql(city, job, dicts):
    table_name = get_dbname(city, job)
    sql().create_table(table_name)
    sql().Insert_datas(table_name, dicts)
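
# For example, createsql('北京', '数据分析', dicts) would create a table named
# 'beijing_shujufenxi' and insert the scraped rows into it, assuming the local
# sql helper module (not shown here) behaves as its method names suggest.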


def crawl(city, job, pages):
    # Map the supported city names to their BOSS直聘 city codes
    city_form = {'全国': 100010000, '北京': 101010100, '上海': 101020100, '广州': 101280100, '深圳': 101280600,
                 '杭州': 101210100, '天津': 101030100, '西安': 101110100, '苏州': 101190400, '武汉': 101200100,
                 '厦门': 101230200, '长沙': 101250100, '成都': 101270100, '郑州': 101180100}
    for _ in city_form:
        if city == _:
            city_code = city_form[_]
            break
    dicts = responsedata(job, city_code, pages)
    createfile(dicts)
    createsql(city, job, dicts)
    return 1
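

# A minimal, assumed entry point for running the scraper directly (not part of
# the module's public interface): the city must be one of the keys in city_form
# above, job is the search keyword, and pages is the number of result pages to walk.
if __name__ == '__main__':
    crawl('北京', '数据分析', 3)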