"""Scrape job-fair details from the Nanchang talent recruitment site.

The script downloads the job-fair index page, collects the link of every
fair listed inside the ``j_list`` container, then fetches each fair's detail
page and prints its name, organizer, time, contact information, booth
setup/fees and the names of the participating companies.
"""
import sys
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests

indexUrl = "https://www.ncrczpw.com/index.php?m=jobfair&c=index&a=index"
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/125.0.0.0 Safari/537.36'
    ),
}

# Seconds to wait for a server response; without a timeout ``requests``
# blocks indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

# Labels of the detail-page fields to print, in output order. Each label is
# the exact text of a ``<div class="stit">`` on the detail page.
DETAIL_LABELS = ('主办单位', '招聘会时间', '联系方式', '摊位设置及费用')


def fetch_soup(url):
    """Download *url* and return its HTML parsed with ``html.parser``.

    Raises:
        requests.RequestException: on connection errors, timeouts, or an
            HTTP error status (via ``raise_for_status``).
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def collect_jobfair_urls(index_soup):
    """Return the detail-page URLs listed on the parsed index page.

    Looks inside ``<div id="j_list">`` for every
    ``<div class="tit link_gray6">`` and takes the ``href`` of its first
    ``<a>``. Returns an empty list when the container is missing.
    """
    container = index_soup.find('div', id='j_list')
    if container is None:
        return []

    urls = []
    for title_div in container.find_all('div', class_='tit link_gray6'):
        link = title_div.find('a')
        if link is not None and 'href' in link.attrs:
            # urljoin leaves absolute URLs untouched and resolves relative
            # ones against the index page.
            urls.append(urljoin(indexUrl, link['href']))
    return urls


def print_labelled_field(soup, label):
    """Print the value that follows the ``stit`` div whose text is *label*.

    The value is read from the next sibling ``<div class="txt">``. A
    diagnostic line is printed instead when either div is missing.
    """
    label_div = soup.find('div', class_='stit', string=label)
    if label_div is None:
        print("未找到'" + label + "'的div")
        return

    value_div = label_div.find_next_sibling('div', class_='txt')
    if value_div is None:
        print("未找到包含文本内容的div")
        return

    print(label + ":" + value_div.get_text(strip=True))


def print_jobfair(url, soup):
    """Print every extracted field of one parsed job-fair detail page."""
    print("-------------" + url + "-------------")

    # The fair's name is the text of the <a> inside the title div; the
    # original author notes that the link's href is "javascript:;".
    title_div = soup.find('div', class_='tit substring link_gray6')
    if title_div is None:
        print("未找到匹配的div元素")
    else:
        link = title_div.find('a')
        if link is not None:
            print("招聘会名称:" + link.get_text(strip=True))

    for label in DETAIL_LABELS:
        print_labelled_field(soup, label)

    print("参会的企业名称:")
    for company_div in soup.find_all('div', class_='comtit link_gray6'):
        link = company_div.find('a')
        if link is None:
            continue
        # The name is expected inside <strong>; fall back to the link's own
        # text instead of raising AttributeError when <strong> is absent.
        strong = link.find('strong')
        name_tag = link if strong is None else strong
        print(name_tag.get_text(strip=True))


def main():
    """Crawl the index page and print the details of every listed fair.

    A failure to fetch the index page propagates to the caller. A failure
    on a single detail page is reported on stderr and that page is skipped,
    so one bad page does not abort the whole run.
    """
    index_soup = fetch_soup(indexUrl)
    for url in collect_jobfair_urls(index_soup):
        try:
            detail_soup = fetch_soup(url)
        except requests.RequestException as exc:
            print("请求失败:" + url + " (" + str(exc) + ")", file=sys.stderr)
            continue
        print_jobfair(url, detail_soup)


if __name__ == "__main__":
    main()