From ed2b052496af991f810ea4f0641b3697c8e9d619 Mon Sep 17 00:00:00 2001
From: pjxcmefw3 <2052688587@qq.com>
Date: Thu, 30 May 2024 16:33:11 +0800
Subject: [PATCH] ADD file via upload

---
 ...ifulSoup库解析南昌人才招聘网.txt | 77 +++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 单元实践 使用BeautifulSoup库解析南昌人才招聘网.txt

diff --git a/单元实践 使用BeautifulSoup库解析南昌人才招聘网.txt b/单元实践 使用BeautifulSoup库解析南昌人才招聘网.txt
new file mode 100644
index 0000000..1250717
--- /dev/null
+++ b/单元实践 使用BeautifulSoup库解析南昌人才招聘网.txt
@@ -0,0 +1,77 @@
+from bs4 import BeautifulSoup
+import requests
+
+# Request the job-fair index page of the Nanchang talent recruitment site.
+url = "https://www.ncrczpw.com/index.php?m=jobfair&c=index&a=index"
+headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
+                         'Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0'}
+res = requests.get(url, headers=headers)
+html = res.text
+
+# Collect every detail-page link found under the <div class="tit"> blocks.
+urls = []
+soup = BeautifulSoup(html, "lxml")  # the "lxml" parser needs the lxml package installed
+y = soup.select("div.tit")
+for i in y:
+    z = i.find_all(name="a")
+    for a in z:
+        k = a.get("href")
+        if k is not None:
+            urls.append(k)
+
+# Visit each detail page (the first link is skipped) and build one row per page.
+ll = []
+for g in urls[1:]:
+    row = []
+    res1 = requests.get(g, headers=headers)
+    htmls = res1.text
+    soup1 = BeautifulSoup(htmls, "lxml")
+    y1 = soup1.find_all("div", class_='txt')
+
+    # Page layout with ten <div class="txt"> blocks.
+    if len(y1) == 10:
+        row.append(y1[0].text.strip())
+        row.append(y1[1].text.strip())
+        row.append(y1[4].text.strip())
+        row.append(86700710)
+        row.append(y1[5].text.strip())
+        # Exhibitor names sit inside <strong> tags on this layout.
+        names = []
+        for s in soup1.find_all("strong"):
+            names.append(s.text.strip())
+        row.append('、'.join(names))
+        ll.append(row)
+
+    # Page layout with nine <div class="txt"> blocks.
+    if len(y1) == 9:
+        row.append(y1[0].text.strip())
+        row.append(y1[1].text.strip())
+        row.append(y1[3].text.strip())
+        row.append(86700710)
+        row.append(y1[4].text.strip())
+        names = []
+        for s in soup1.find_all("strong"):
+            names.append(s.text.strip())
+        row.append('、'.join(names))
+        ll.append(row)
+
+    # Page layout with a single <div class="txt">: read the <h1> title and <span> fields.
+    if len(y1) == 1:
+        span = soup1.find_all('span')
+        h1 = soup1.find_all('h1')
+        if h1:
+            row.append(h1[0].text.strip())
+        else:
+            row.append('')
+        row.append(span[5].text.strip())
+        row.append(span[3].text.strip())
+        row.append(86700710)
+        row.append('')
+        # Exhibitor names appear as <a class="companyName"> items on this layout.
+        names = []
+        for a in soup1.select('ul>li>a.companyName'):
+            names.append(a.text.strip())
+        row.append('、'.join(names))
+        ll.append(row)
+
+print(ll)
\ No newline at end of file