import requests
from bs4 import BeautifulSoup
from lxml import etree  # noqa: F401 -- lxml must be importable: it is the 'lxml' parser backend used by BeautifulSoup below

# Task 1: collect the detail-page URL of every job fair listed on the index page.
LIST_URL = 'https://www.ncrczpw.com/index.php?m=jobfair&c=index&a=index'

HEADERS = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0",
}

# Seconds before a hung request is abandoned (the original had no timeout and
# could block forever on a stalled connection).
REQUEST_TIMEOUT = 10


def _fetch_soup(url):
    """GET *url* and return it parsed as a BeautifulSoup document.

    Raises requests.HTTPError for non-2xx responses instead of silently
    parsing an error page, and requests.Timeout if the server stalls.
    """
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'lxml')


def collect_fair_links():
    """Return the list of detail-page hrefs scraped from the index page.

    Prints each href as it is found (matching the original script's output),
    then the total count.
    """
    soup = _fetch_soup(LIST_URL)
    # Select the <a> tags directly; both 'list' and 'webList' containers occur.
    anchors = soup.select('div.list div.td2 div.tit a, div.webList div.td2 div.tit a')
    links = [a.get('href') for a in anchors if a.get('href')]  # keep only anchors that carry an href
    for href in links:
        print(href)
    print(len(links))
    return links


def print_fair_details(urls, selector, drop_last=False):
    """Fetch each URL in *urls* and print the text of nodes matching *selector*.

    Values are comma-separated on one line per fair, followed by a blank line.
    With drop_last=True the final matched node is omitted (the network-fair
    pages carry a trailing node the original script deliberately skipped).
    """
    for url in urls:
        soup = _fetch_soup(url)
        nodes = soup.select(selector)
        if drop_last:
            nodes = nodes[:-1]
        for node in nodes:
            print(node.get_text(strip=True), end=',')
        print(end='\n\n')


def main():
    """Scrape the index, split fairs into on-site vs. online, print details."""
    recruits = collect_fair_links()

    # Task 2: request each fair's detail page and print its key fields.
    # On-site fairs link to a 'show' page; online fairs live on another host
    # whose URL contains 'com'.  (Substring tests mirror the original logic;
    # NOTE(review): a URL containing both tokens would appear in both groups.)
    recruits_show = [url for url in recruits if 'show' in url]   # on-site fairs
    recruits_last = [url for url in recruits if 'com' in url]    # online fairs

    print_fair_details(
        recruits_show,
        'div.show_left div.txt,div.show_head div.tit a,div.show_right div.txt,div.nc_lf a strong',
    )
    print_fair_details(
        recruits_last,
        'div.titleBox h1,div.dw,li a.companyName',
        drop_last=True,
    )


if __name__ == '__main__':
    main()