|
|
|
@ -0,0 +1,105 @@
|
|
|
|
|
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
# Index page that lists the job fairs to crawl.
indexUrl = "https://www.ncrczpw.com/index.php?m=jobfair&c=index&a=index"

# Browser-like User-Agent header sent with the HTTP requests in this script.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'}

# An explicit timeout is required: without one, requests can wait forever
# on a server that accepts the connection but never answers.
response = requests.get(indexUrl, headers=headers, timeout=10)
response.raise_for_status()  # stop here if the index page was not fetched successfully

htmlTxt = response.text
soup = BeautifulSoup(htmlTxt, 'html.parser')

# Find the div with id "j_list"; the entry links are searched inside it.
j_list_div = soup.find('div', id='j_list')

# Detail-page URLs collected from the index page, in document order.
urls = []

# If that div exists, walk every <div class="tit link_gray6"> inside it and
# collect the link of the first <a> tag found in each one.
if j_list_div is not None:
    for div in j_list_div.find_all('div', class_='tit link_gray6'):
        a_tag = div.find('a')
        href = a_tag.get('href') if a_tag is not None else None
        if not href:
            # No <a> tag, or an <a> without a usable href: nothing to fetch.
            continue
        # Resolve relative links against the index URL so they can be
        # requested later; absolute links are returned unchanged.
        urls.append(urljoin(indexUrl, href))
|
|
|
|
|
|
|
|
|
|
# Headings of the labelled sections printed for every job fair page. Each
# heading is a <div class="stit"> whose text equals the label; its value is
# taken from the next sibling <div class="txt">.
FIELD_LABELS = ('主办单位', '招聘会时间', '联系方式', '摊位设置及费用')

# Fetch every collected detail page and print the job fair's information.
for url in urls:
    try:
        # Explicit timeout so one unresponsive page cannot hang the whole run.
        response = requests.get(url, headers=headers, timeout=10)
        # Do not parse an HTTP error page as if it were a job fair page.
        response.raise_for_status()
    except requests.RequestException as exc:
        # Report the failure and move on to the remaining pages.
        print("请求失败:" + url + " (" + str(exc) + ")")
        continue

    htmlTxt = response.text
    soup = BeautifulSoup(htmlTxt, 'html.parser')

    print("-------------" + url + "-------------")

    # Job fair name: the <a> inside the div with class
    # "tit substring link_gray6".
    jobfair_div = soup.find('div', class_='tit substring link_gray6')
    if jobfair_div is not None:
        a_tag = jobfair_div.find('a')
        if a_tag is not None:
            # The anchor's href is "javascript:;", so the name is read from
            # the anchor's text content rather than from the link target.
            jobfair_name = a_tag.get_text(strip=True)
            print("招聘会名称:" + jobfair_name)
    else:
        print("未找到匹配的div元素")

    # Organizer, time, contact information, and booth setup / fees.
    for label in FIELD_LABELS:
        heading_div = soup.find('div', class_='stit', string=label)
        if heading_div is None:
            print("未找到'" + label + "'的div")
            continue
        value_div = heading_div.find_next_sibling('div', class_='txt')
        if value_div is None:
            print("未找到包含文本内容的div")
            continue
        # Text of the value div with surrounding whitespace stripped.
        print(label + ":" + value_div.get_text(strip=True))

    # Participating companies: every div with class "comtit link_gray6"
    # is expected to hold an <a> that contains the company name.
    print("参会的企业名称:")
    for div in soup.find_all('div', class_='comtit link_gray6'):
        a_tag = div.find('a')
        if a_tag is None:
            continue
        # The name normally sits in a <strong> inside the anchor; fall back
        # to the anchor's own text instead of raising AttributeError when
        # the <strong> tag is missing.
        strong_tag = a_tag.find('strong')
        name_tag = strong_tag if strong_tag is not None else a_tag
        print(name_tag.get_text(strip=True))