From 7d3ab8c8084827a325ca8b59ddb9be6666e0519f Mon Sep 17 00:00:00 2001
From: p94xcago5 <2609106649@qq.com>
Date: Wed, 29 May 2024 22:04:42 +0800
Subject: [PATCH] ADD file via upload
---
...tifulSoup库解析南昌人才招聘网.py | 105 ++++++++++++++++++
1 file changed, 105 insertions(+)
create mode 100644 BeautifulSoup库解析南昌人才招聘网.py
diff --git a/BeautifulSoup库解析南昌人才招聘网.py b/BeautifulSoup库解析南昌人才招聘网.py
new file mode 100644
index 0000000..5c344d2
--- /dev/null
+++ b/BeautifulSoup库解析南昌人才招聘网.py
@@ -0,0 +1,105 @@
+from bs4 import BeautifulSoup
+import requests
+
+# Index page of the job-fair module; detail-page links are scraped from it below.
+indexUrl = "https://www.ncrczpw.com/index.php?m=jobfair&c=index&a=index"
+# Browser-like User-Agent sent with every request this script makes.
+headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'}
+
+# requests never times out on its own, so bound the wait instead of hanging forever.
+response = requests.get(indexUrl, headers=headers, timeout=10)
+response.raise_for_status()  # abort early if the index page could not be fetched
+htmlTxt = response.text
+soup = BeautifulSoup(htmlTxt, 'html.parser')
+
+# Container div (id="j_list") that the link-collection step below searches.
+j_list_div = soup.find('div', id='j_list')
+urls = []  # detail-page links gathered from the index page
+# If that div was found, look inside it for every <div> with class 'tit link_gray6' and extract the link from its <a> tag
+if j_list_div:
+    # Each matching title div is expected to wrap an <a> pointing at a detail page.
+    for title_div in j_list_div.find_all('div', class_='tit link_gray6'):
+        link = title_div.find('a')
+        # Skip title divs without an anchor or whose anchor has no href.
+        if link and 'href' in link.attrs:
+            urls.append(link['href'])
+
+# Headings of the detail-page sections to print, in output order.
+_FIELD_LABELS = ('主办单位', '招聘会时间', '联系方式', '摊位设置及费用')
+
+
+def _fetch_page(url):
+    """Download *url* and return it parsed, or ``None`` if the request failed.
+
+    Args:
+        url: Address of a job-fair detail page.
+
+    Returns:
+        A BeautifulSoup tree of the response body, or ``None`` (after printing
+        the reason) when the request timed out, errored or had a bad status.
+    """
+    try:
+        # requests has no default timeout; a stalled server would hang the run.
+        page_response = requests.get(url, headers=headers, timeout=10)
+        page_response.raise_for_status()
+    except requests.RequestException as exc:
+        print("请求失败,已跳过:" + url + " (" + str(exc) + ")")
+        return None
+    return BeautifulSoup(page_response.text, 'html.parser')
+
+
+def _print_labeled_field(page, label):
+    """Print the text of the section titled *label* on a job-fair detail page.
+
+    Args:
+        page: BeautifulSoup tree of the detail page.
+        label: Exact text of the ``<div class="stit">`` heading to look up.
+
+    Prints ``label:value`` when both the heading and its following
+    ``<div class="txt">`` sibling exist, otherwise a message saying which of
+    the two was missing.
+    """
+    heading = page.find('div', class_='stit', string=label)
+    if heading is None:
+        print("未找到'" + label + "'的div")
+        return
+    value_div = heading.find_next_sibling('div', class_='txt')
+    if value_div is None:
+        print("未找到包含文本内容的div")
+        return
+    print(label + ":" + value_div.get_text(strip=True))
+
+
+for url in urls:
+    # A failed request costs one page, not the whole run.
+    soup = _fetch_page(url)
+    if soup is None:
+        continue
+
+    print("-------------" + url + "-------------")
+    # Job-fair name: the original author notes the title anchor's href is
+    # "javascript:;", so the name is taken from the anchor text instead.
+    jobfair_div = soup.find('div', class_='tit substring link_gray6')
+    if jobfair_div:
+        a_tag = jobfair_div.find('a')
+        if a_tag:
+            jobfair_name = a_tag.get_text(strip=True)
+            print("招聘会名称:" + jobfair_name)
+    else:
+        print("未找到匹配的div元素")
+
+    # Organizer, time, contact and booth/fee sections share one layout:
+    # a 'stit' heading div followed by a 'txt' sibling div with the value.
+    for label in _FIELD_LABELS:
+        _print_labeled_field(soup, label)
+
+    # Participating companies: each 'comtit link_gray6' div wraps an <a>
+    # that is expected to hold the company name in a <strong>.
+    print("参会的企业名称:")
+    for company_div in soup.find_all('div', class_='comtit link_gray6'):
+        a_tag = company_div.find('a')
+        name_tag = a_tag.find('strong') if a_tag else None
+        # A missing <strong> used to raise AttributeError and end the run;
+        # such entries are skipped now.
+        if name_tag:
+            print(name_tag.get_text(strip=True))