ADD file via upload

1 year ago · 7d3ab8c808
parent 1b4548b355
commit 7d3ab8c808
1 changed files with 105 additions and 0 deletions
--- a/BeautifulSoup库解析南昌人才招聘网.py
+++ b/BeautifulSoup库解析南昌人才招聘网.py
@ -0,0 +1,105 @@
+from bs4 import BeautifulSoup
+import requests
+
+indexUrl = "https://www.ncrczpw.com/index.php?m=jobfair&c=index&a=index"
+headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'}
+
+
+response = requests.get(indexUrl, headers=headers)
+response.raise_for_status()  # 检查请求是否成功
+htmlTxt = response.text
+# print(response.text[:800])
+soup = BeautifulSoup(htmlTxt, 'html.parser')
+
+# 查找ID为"j_list"的div元素
+j_list_div = soup.find('div', id='j_list')
+
+urls = []
+# 如果找到了这个div，就查找它内部的所有class为'tit link_gray6'的<div>标签，并提取其中的<a>标签的链接
+if j_list_div:
+    for div in j_list_div.find_all('div', class_='tit link_gray6'):  # 注意这里直接遍历find_all的结果
+        a_tag = div.find('a')
+        if a_tag and 'href' in a_tag.attrs:  # 检查<a>标签是否存在且包含href属性
+            url = a_tag['href']
+            urls.append(url)
+            # print(url)
+
+for url in urls:
+    response = requests.get(url, headers=headers)
+    htmlTxt = response.text
+    soup = BeautifulSoup(htmlTxt, 'html.parser')
+
+    print("-------------" + url + "-------------")
+    # 获取招聘会名称
+    # 查找class为"tit substring link_gray6"的div元素中的a标签
+    jobfair_div = soup.find('div', class_='tit substring link_gray6')
+    if jobfair_div:
+        a_tag = jobfair_div.find('a')
+        if a_tag:
+            # 注意：由于href为"javascript:;"，实际文本可能不是通过href获取，而是直接通过a标签的文本内容
+            jobfair_name = a_tag.get_text(strip=True)  # 提取a标签的文本内容并去除前后空白
+            print("招聘会名称：" + jobfair_name)
+    else:
+        print("未找到匹配的div元素")
+
+    # 获取主办单位
+    # 查找class为"stit"且文本内容为"主办单位"的div
+    organize_unit_div = soup.find('div', class_='stit', string='主办单位')
+    # 如果找到了，就查找它后面的第一个class为"txt"的div
+    if organize_unit_div:
+        next_sibling = organize_unit_div.find_next_sibling('div', class_='txt')
+        if next_sibling:
+            organize_unit_text = next_sibling.get_text(strip=True)  # 获取文本并去除前后空白
+            print("主办单位：" + organize_unit_text)
+        else:
+            print("未找到包含文本内容的div")
+    else:
+        print("未找到'主办单位'的div")
+
+    # 获取举办时间
+    time_div = soup.find('div', class_='stit', string='招聘会时间')
+    if time_div:
+        next_sibling = time_div.find_next_sibling('div', class_='txt')
+        if next_sibling:
+            organize_unit_text = next_sibling.get_text(strip=True)  # 获取文本并去除前后空白
+            print("招聘会时间：" + organize_unit_text)
+        else:
+            print("未找到包含文本内容的div")
+    else:
+        print("未找到'招聘会时间'的div")
+
+    # 获取联系方式
+    phone_div = soup.find('div', class_='stit', string='联系方式')
+    if phone_div:
+        next_sibling = phone_div.find_next_sibling('div', class_='txt')
+        if next_sibling:
+            organize_unit_text = next_sibling.get_text(strip=True)  # 获取文本并去除前后空白
+            print("联系方式：" + organize_unit_text)
+        else:
+            print("未找到包含文本内容的div")
+    else:
+        print("未找到'联系方式'的div")
+
+    # 获取摊位设置及费用
+    site_div = soup.find('div', class_='stit', string='摊位设置及费用')
+    if site_div:
+        next_sibling = site_div.find_next_sibling('div', class_='txt')
+        if next_sibling:
+            organize_unit_text = next_sibling.get_text(strip=True)  # 获取文本并去除前后空白
+            print("摊位设置及费用：" + organize_unit_text)
+        else:
+            print("未找到包含文本内容的div")
+    else:
+        print("未找到'摊位设置及费用'的div")
+
+    # 查找所有具有类'comtit'的div元素
+    comtit_divs = soup.find_all('div', class_='comtit link_gray6')
+    # 遍历这些div元素，并查找包含公司名称的<a>标签
+    print("参会的企业名称：")
+    for div in comtit_divs:
+        # 查找<a>标签
+        a_tag = div.find('a')
+        if a_tag:
+            # 提取公司名称（这里假设公司名称位于<strong>标签内）
+            company_name = a_tag.find('strong').get_text(strip=True)
+            print(company_name)