parent
9fe8ed3b4c
commit
b5088a556d
@ -0,0 +1,70 @@
|
||||
from selenium import webdriver
|
||||
import time
|
||||
from selenium.webdriver.common.by import By
|
||||
from lxml.html import etree
|
||||
import csv
|
||||
|
||||
# Accumulators filled while crawling.
all_data = []  # every parsed job posting, across all pages
error_page = []  # presumably meant for pages that failed to scrape -- TODO confirm

# 51job search results for the keyword "python".
url = "https://we.51job.com/pc/search?keyword=python&searchType=2&sortType=0&metro="

# Start Chrome with default options and open the search page.
option = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=option)
driver.get(url)
time.sleep(1)  # give the page time to load
|
||||
|
||||
def get_info(text):
    """Parse one search-result page and return its job postings.

    Args:
        text: HTML source of a result page (the caller passes
            ``driver.page_source``).

    Returns:
        A list with one dict per posting that could be parsed completely.
        Each dict has the keys 招聘时间, 职位名称, 详情链接, 公司名称, 所属行业,
        企业性质, 公司人数, 职位关键词, 工资, 公司地址, 工作经验 and 学历.
        A posting that lacks any of the required nodes is skipped.
    """
    doc = etree.HTML(text)
    if doc is None:
        # Nothing was parsed, so there are no postings to extract.
        return []

    data = []
    # Each direct child <div> of the job list container is one posting.
    for job in doc.xpath('//div[@class="j_joblist"]/div'):
        try:
            # "<company type> | <head count>" in a single text node.
            company_type = job.xpath('./div[@class="er"]/p[@class="dc at"]/text()')[0]
            # Text of every <span> under the "d at" span; index 0 is used as
            # the address, index 2 as the experience and the last one as the
            # education requirement.
            details = job.xpath('./a/p/span[@class="d at"]//span/text()')

            job_dict = {
                '招聘时间': job.xpath('./a/div/span[@class="time"]/text()')[0],
                '职位名称': job.xpath('./a/div/span[@class="jname at"]/text()')[0],
                '详情链接': job.xpath('./a/@href')[0],
                '公司名称': job.xpath('./div[@class="er"]/a/text()')[0],
                '所属行业': job.xpath('./div[@class="er"]/p[@class="int at"]/text()')[0],
                '企业性质': company_type.split('|')[0].replace(' ', ''),
                '公司人数': company_type.split('|')[-1].replace(' ', ''),
                '职位关键词': ' '.join(job.xpath('./a/p[@class="tags"]//text()')),
                '工资': job.xpath('./a/p/span[@class="sal"]/text()')[0],
                '公司地址': details[0],
                '工作经验': details[2],
                '学历': details[-1],
            }
        except IndexError:
            # A required node is missing from this posting; skip only this
            # one instead of silently hiding every kind of error.
            continue
        data.append(job_dict)
    return data
|
||||
|
||||
|
||||
# Crawl up to 50 result pages, moving forward with the "next page" button.
for i in range(50):
    print(f"正在爬取第{i + 1}页的数据")
    try:
        html = driver.page_source  # HTML of the page currently displayed
        all_data.extend(get_info(html))  # parse it and keep the postings

        # Advance to the next result page.
        next_btn = driver.find_element(By.XPATH, '//button[@class="btn-next"]')
        next_btn.click()
    except Exception as exc:
        # If the page turn fails the browser stays on the same page, so
        # carrying on would re-scrape it on every remaining iteration and
        # append duplicate rows. Record the page and stop instead.
        error_page.append(i + 1)
        print(f"第{i + 1}页爬取失败,停止翻页: {exc!r}")
        break
    time.sleep(1)  # give the next page time to load
|
||||
|
||||
def job_save(data, filename='python.csv'):
    """Write the scraped job postings to a CSV file.

    The file is encoded as UTF-8 with a BOM (``utf-8-sig``) and always
    starts with a header row, even when ``data`` is empty.

    Args:
        data: Iterable of dicts keyed by the column names in ``headers``.
            Keys missing from a row are written as empty cells; a key that
            is not a column makes ``csv.DictWriter`` raise ``ValueError``.
        filename: Path of the CSV file to create or overwrite. Defaults to
            ``'python.csv'`` in the current working directory.
    """
    headers = ['招聘时间', '职位名称', '详情链接', '公司名称', '所属行业', '企业性质',
               "职位关键词", '工资', '公司地址', '学历', '工作经验', '公司人数']
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', encoding='utf-8-sig', newline='') as fp:
        dict_writer = csv.DictWriter(fp, headers)
        dict_writer.writeheader()
        dict_writer.writerows(data)
    print('数据保存成功!!!')
    print('-' * 50)
|
||||
|
||||
# End the browser session. quit() shuts down every window and the driver
# process, whereas close() only closes the current window.
driver.quit()
job_save(all_data)  # write everything that was collected to the CSV file
|
||||
Loading…
Reference in new issue