from selenium.webdriver import Firefox
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common import exceptions
from time import sleep
from sql import sql
from pypinyin import pinyin, Style
import csv
import traceback


def get_dbname(city, job):
    """
    Convert the Chinese city and job strings to pinyin (one syllable per character)
    and join them with an underscore, giving a name usable as a database table.
    """
    pinyin_list = pinyin(city, style=Style.NORMAL)
    city = ''.join([p[0] for p in pinyin_list])
    pinyin_list = pinyin(job, style=Style.NORMAL)
    job = ''.join([p[0] for p in pinyin_list])
    return city + '_' + job
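
# Illustrative example (assuming pypinyin's default readings):
#   get_dbname('北京', '数据分析')  ->  'beijing_shujufenxi'
# The result is used as the database table name in createsql().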


def responsedata(job, city, pages):
    # Browser options (headless mode is available but disabled here)
    options = FirefoxOptions()
    # options.headless = True
    # Route requests through a local proxy to reduce the chance of being blocked
    options.add_argument("--proxy-server=127.0.0.1:7890")
    driver = Firefox(options=options)
    driver.maximize_window()
    driver.get('https://www.zhipin.com/web/geek/job?query=' + job + "&city=" + str(city) + '&page=1')
    # Implicit wait of 10 seconds for elements to appear
    driver.implicitly_wait(10)
    sleep(2)
    dics = []
    for page in range(1, pages + 1):  # iterate over the requested number of pages
        # k counts listings scraped on this page; each page has about 30 cards, but scraping
        # every detail page with Selenium is slow, so only a few are taken per page for testing
        k = 0
        sleep(2)
        # Scroll to the bottom so all job cards are loaded
        driver.execute_script('document.documentElement.scrollTop = document.documentElement.scrollHeight')
        li_lists = driver.find_elements(By.CSS_SELECTOR, '.job-card-wrapper')
        print(len(li_lists))
        for li in li_lists:
            job_name = li.find_element(By.CLASS_NAME, 'job-name').text
            job_area = li.find_element(By.CLASS_NAME, 'job-area').text
            salary = li.find_element(By.CLASS_NAME, 'salary').text
            job_tag = li.find_element(By.CSS_SELECTOR, '.job-card-wrapper .job-card-left .tag-list').text.replace('\n', ',')
            job_ability = li.find_element(By.XPATH, './div[2]/ul').text
            company_name = li.find_element(By.CLASS_NAME, 'company-name').text
            welfare = li.find_element(By.CLASS_NAME, 'info-desc').text
            link = li.find_element(By.CLASS_NAME, 'job-card-left').get_attribute('href')
            # WebDriverWait(driver, 100).until(
            #     EC.presence_of_element_located((By.XPATH, '//*[@id="main"]/div[3]/div/div[2]/div[1]/div[2]'))
            # )
            try:
                # Open the job detail page in a new tab
                clic = li.find_element(By.CSS_SELECTOR, '.job-card-left')
                driver.execute_script('arguments[0].click()', clic)
                # Switch to the newly opened window
                driver.switch_to.window(driver.window_handles[-1])
                driver.implicitly_wait(5)
                driver.execute_script('document.documentElement.scrollTop = document.documentElement.scrollHeight')
                job_des = driver.find_element(By.XPATH, '//*[@id="main"]/div[3]/div/div[2]/div[1]/div[2]').text.replace('\n', ' ')
            except Exception as e:
                print("Error:", e)
                print('A detail page failed to load (likely a network issue); skipping it')
                company_info = ''
                company_type = ''
                address = ''
                job_des = ''
                dic = {
                    "职位名称": job_name,
                    "地区": job_area,
                    "薪水": salary,
                    "标签": job_tag,
                    "能力要求": job_ability,
                    "公司名字": company_name,
                    "公司介绍": company_info,
                    "福利待遇": welfare,
                    "职位描述": job_des,
                    "企业类型": company_type,
                    "工作地址": address,
                    "详情链接": link
                }
                # Record this listing even though the detail page is missing
                k += 1
                print(dic)
                dics.append(dic)
                # Close any detail tab that was actually opened, then return to the results window
                while len(driver.window_handles) > 1:
                    driver.switch_to.window(driver.window_handles[-1])
                    driver.close()
                driver.switch_to.window(driver.window_handles[0])
                continue
            try:  # some companies have no introduction section
                company_info = driver.find_element(By.CSS_SELECTOR, '.job-body-wrapper .company-info-box .fold-text').text.replace('\n', ' ')
            except exceptions.NoSuchElementException:
                company_info = ''
            try:
                company_type = driver.find_element(By.CLASS_NAME, 'company-type').text.replace('企业类型\n', '')
            except exceptions.NoSuchElementException:
                company_type = ''
            address = driver.find_element(By.CLASS_NAME, 'location-address').text
            dic = {
                "职位名称": job_name,
                "地区": job_area,
                "薪水": salary,
                "标签": job_tag,
                "能力要求": job_ability,
                "公司名字": company_name,
                "公司介绍": company_info,
                "福利待遇": welfare,
                "职位描述": job_des,
                "企业类型": company_type,
                "工作地址": address,
                "详情链接": link
            }
            # Record this listing
            k += 1
            print(dic)
            dics.append(dic)
            driver.close()
            # Switch back to the results window
            driver.switch_to.window(driver.window_handles[0])
            if k == 2:  # only scrape a couple of listings per page while testing
                break
        sleep(2)
        try:
            # The next-page button is hard to locate reliably: the XPath index of the
            # ">" arrow shifts as the pagination widget grows, so it differs by page number
            if page <= 4:
                c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[10]')
                driver.execute_script('arguments[0].click()', c)
            elif page < 7:
                c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[11]')
                driver.execute_script('arguments[0].click()', c)
            else:
                c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[10]/i')
                driver.execute_script('arguments[0].click()', c)
            WebDriverWait(driver, 100).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'job-card-wrapper'))
            )
        except Exception:
            # If paging fails, reload the search results and try the next-page button again
            # (note: this reloads from page 1, so some pages may be revisited)
            driver.get('https://www.zhipin.com/web/geek/job?query=' + job + "&city=" + str(city) + '&page=1')
            driver.implicitly_wait(10)
            if page <= 4:
                c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[10]')
                driver.execute_script('arguments[0].click()', c)
            elif page < 7:
                c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[11]')
                driver.execute_script('arguments[0].click()', c)
            else:
                c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[10]/i')
                driver.execute_script('arguments[0].click()', c)
            WebDriverWait(driver, 100).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'job-card-wrapper'))
            )
    # Shut down the browser session and return the collected listings
    driver.quit()
    return dics


def createfile(dicts):
    # Append the scraped listings to a CSV file; utf-8-sig adds a BOM so the
    # Chinese headers display correctly when the file is opened in Excel
    with open('BOSS直聘1.csv', mode='a', encoding='utf-8-sig', newline='') as f:
        csv_writer = csv.DictWriter(f, fieldnames=["职位名称", "地区", "薪水", "标签", "能力要求", "公司名字", "公司介绍",
                                                   "福利待遇", "职位描述", "企业类型", "工作地址", "详情链接"])
        csv_writer.writeheader()  # write the header row
        for dic in dicts:
            csv_writer.writerow(dic)


def createsql(city, job, dicts):
    # Store the listings in the database table named after the city/job pinyin
    table_name = get_dbname(city, job)
    sql().create_table(table_name)
    sql().Insert_datas(table_name, dicts)


def crawl(city, job, pages):
    # Mapping from city name to the city code used in BOSS直聘 search URLs
    city_form = {'全国': 100010000, '北京': 101010100, '上海': 101020100, '广州': 101280100, '深圳': 101280600,
                 '杭州': 101210100, '天津': 101030100, '西安': 101110100, '苏州': 101190400, '武汉': 101200100,
                 '厦门': 101230200, '长沙': 101250100, '成都': 101270100, '郑州': 101180100}
    city_code = city_form[city]  # raises KeyError for unsupported cities
    dicts = responsedata(job, city_code, pages)
    createfile(dicts)
    createsql(city, job, dicts)
    return 1
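

# Example usage (illustrative): crawl a single city/keyword combination. Assumes
# Firefox + geckodriver are installed, a proxy is listening on 127.0.0.1:7890,
# and the local `sql` helper module is configured.
if __name__ == '__main__':
    crawl('北京', 'python', 3)  # scrape 3 result pages, then write the CSV and database table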