parent
9fe8ed3b4c
commit
b5088a556d
@ -0,0 +1,70 @@
|
|||||||
|
from selenium import webdriver
|
||||||
|
import time
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from lxml.html import etree
|
||||||
|
import csv
|
||||||
|
|
||||||
|
# Search-results page queried with keyword=python.
url = "https://we.51job.com/pc/search?keyword=python&searchType=2&sortType=0&metro="

# Module-level accumulators.
all_data = []  # every posting collected across all pages
error_page = []  # not referenced by the visible code; kept as a module-level name

# Start Chrome with default options and open the search page.
option = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=option)
driver.get(url)
time.sleep(1)  # give the page a moment to load
|
||||||
|
|
||||||
|
def _parse_job(job):
    """Build the record for one job-card element.

    Raises:
        IndexError: if any expected field is absent from the card.
    """
    # "nature | headcount" line of the company panel; split once, use both ends.
    company_meta = job.xpath('./div[@class="er"]/p[@class="dc at"]/text()')[0].split('|')
    # Location, experience and education all come from the same span group,
    # so query it a single time.
    detail = job.xpath('./a/p/span[@class="d at"]//span/text()')
    return {
        '招聘时间': job.xpath('./a/div/span[@class="time"]/text()')[0],
        '职位名称': job.xpath('./a/div/span[@class="jname at"]/text()')[0],
        '详情链接': job.xpath('./a/@href')[0],
        '公司名称': job.xpath('./div[@class="er"]/a/text()')[0],
        '所属行业': job.xpath('./div[@class="er"]/p[@class="int at"]/text()')[0],
        '企业性质': company_meta[0].replace(' ', ''),
        '公司人数': company_meta[-1].replace(' ', ''),
        '职位关键词': ' '.join(job.xpath('./a/p[@class="tags"]//text()')),
        '工资': job.xpath('./a/p/span[@class="sal"]/text()')[0],
        '公司地址': detail[0],
        '工作经验': detail[2],
        '学历': detail[-1],
    }


def get_info(text):
    """Parse one search-result page and return its job postings.

    Args:
        text: HTML source of a search-result page.

    Returns:
        A list of dicts, one per job card that could be parsed completely.
        Cards missing any expected field are skipped.
    """
    doc = etree.HTML(text)
    if doc is None:
        # Parser produced no document (no parseable markup): nothing to extract.
        return []

    data = []
    # Each child <div> of the job-list container is treated as one job card.
    for job in doc.xpath('//div[@class="j_joblist"]/div'):
        try:
            job_dict = _parse_job(job)
        except IndexError:
            # A field was missing from this card; skip it, best-effort.
            continue
        data.append(job_dict)
    return data
|
||||||
|
|
||||||
|
|
||||||
|
for i in range(50):  # scrape up to 50 result pages
    print(f"正在爬取第{i + 1}页的数据")
    try:
        html = driver.page_source  # HTML of the page currently displayed
        all_data.extend(get_info(html))  # parse it and collect the postings

        # "Next page" button
        next_btn = driver.find_element(By.XPATH, '//button[@class="btn-next"]')
        next_btn.click()
    except Exception as exc:
        # Stop paginating: if we kept looping without having advanced, the same
        # page would be parsed again and its rows appended as duplicates.
        print(f"第{i + 1}页爬取失败,停止翻页: {exc!r}")
        break
    time.sleep(1)  # wait for the next page to load
|
||||||
|
|
||||||
|
def job_save(data, filename='python.csv'):
    """Write the scraped postings to a CSV file and print a confirmation.

    Args:
        data: Iterable of dicts keyed by the column names listed in
            ``headers``. Keys missing from a row are written as empty cells.
        filename: Destination path. Defaults to ``python.csv`` in the current
            working directory. An existing file is overwritten.

    Raises:
        ValueError: if a row contains a key that is not in ``headers``
            (``csv.DictWriter`` default behaviour).
    """
    headers = ['招聘时间', '职位名称', '详情链接', '公司名称', '所属行业', '企业性质', "职位关键词", '工资', '公司地址', '学历', '工作经验', '公司人数']
    # utf-8-sig writes a BOM at the start of the file; newline='' lets the
    # csv module control line endings itself.
    with open(filename, 'w', encoding='utf-8-sig', newline='') as fp:
        writer = csv.DictWriter(fp, headers)
        writer.writeheader()
        writer.writerows(data)
    print('数据保存成功!!!')
    print('-' * 50)
|
||||||
|
|
||||||
|
try:
    # quit() ends the whole WebDriver session; close() would only close the
    # current browser window.
    driver.quit()
finally:
    # Save whatever was collected even if shutting the browser down fails.
    job_save(all_data)
|
||||||
Loading…
Reference in new issue