"""Scrape Python job listings from 51job.com with Selenium and save them to CSV."""

from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from lxml.html import etree
import csv

all_data = []  # accumulates every parsed job dict across all pages
url = "https://we.51job.com/pc/search?keyword=python&searchType=2&sortType=0&metro="

option = webdriver.ChromeOptions()

driver = webdriver.Chrome(options=option)

driver.get(url)
time.sleep(1)  # 等待网页加载 (wait for the page to render)
error_page = []  # page numbers that failed to scrape, kept for post-run inspection


def get_info(text):
    """Parse one search-result page and return a list of job dicts.

    Args:
        text: full HTML source of a 51job search-result page.

    Returns:
        List of dicts keyed by the Chinese column names used in job_save().
        Job cards that do not match the expected layout are skipped.
    """
    doc = etree.HTML(text)
    job_list = doc.xpath('//div[@class="j_joblist"]/div')  # one <div> per job card
    data = []
    for job in job_list:
        try:
            job_dict = {}
            job_dict['招聘时间'] = job.xpath('./a/div/span[@class="time"]/text()')[0]
            job_dict['职位名称'] = job.xpath('./a/div/span[@class="jname at"]/text()')[0]
            job_dict['详情链接'] = job.xpath('./a/@href')[0]
            job_dict['公司名称'] = job.xpath('./div[@class="er"]/a/text()')[0]
            job_dict['所属行业'] = job.xpath('./div[@class="er"]/p[@class="int at"]/text()')[0]
            type_num = job.xpath('./div[@class="er"]/p[@class="dc at"]/text()')[0]
            # "企业性质 | 公司人数" share one pipe-separated field.
            job_dict['企业性质'] = type_num.split('|')[0].replace(' ', '')
            job_dict['公司人数'] = type_num.split('|')[-1].replace(' ', '')
            job_dict['职位关键词'] = ' '.join(job.xpath('./a/p[@class="tags"]//text()'))
            job_dict['工资'] = job.xpath('./a/p/span[@class="sal"]/text()')[0]
            # Address / experience / degree live in one span group; index into it
            # once instead of the redundant per-field xpaths the old code overwrote.
            detail = job.xpath('./a/p/span[@class="d at"]//span/text()')
            job_dict['公司地址'] = detail[0]
            job_dict['工作经验'] = detail[2]
            job_dict['学历'] = detail[-1]
            data.append(job_dict)
        except IndexError:
            # An xpath returned fewer items than expected (ad card / layout
            # change): skip this card rather than aborting the whole page.
            continue
    return data


for i in range(50):  # 爬取50页 (loop count and comment now agree)
    try:
        print(f"正在爬取第{i + 1}页的数据")

        html = driver.page_source  # 获取网页源码
        all_data.extend(get_info(html))  # 解析网页源码
        # 下一页按钮
        next_btn = driver.find_element(By.XPATH, '//button[@class="btn-next"]')
        next_btn.click()  # 点击下一页
        time.sleep(1)  # 等待网页加载
    except Exception:
        # Record which page failed (button missing, navigation timeout, ...)
        # instead of silently dropping it; error_page was never filled before.
        error_page.append(i + 1)


def job_save(data):
    """Save the scraped job dicts to python.csv (UTF-8 with BOM for Excel)."""
    headers = ['招聘时间', '职位名称', '详情链接', '公司名称', '所属行业', '企业性质',
               '职位关键词', '工资', '公司地址', '学历', '工作经验', '公司人数']
    with open('python.csv', 'w', encoding='utf-8-sig', newline='') as fp:
        dict_writer = csv.DictWriter(fp, headers)
        dict_writer.writeheader()
        dict_writer.writerows(data)
    print('数据保存成功!!!')
    print('-' * 50)


driver.quit()  # quit() also terminates the chromedriver process; close() does not
job_save(all_data)  # 保存数据