"""Scrape the job-fair index page and print one record per linked detail page.

The script downloads the index page at ``url``, collects every ``href`` found
in ``<a>`` tags inside ``div.tit`` elements, fetches each collected page except
the first, and builds a six-item list per page. The resulting list of records
is stored in ``ll`` and printed.
"""
import re  # unused by this script; kept so existing file-level imports are untouched
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from lxml import etree  # unused by this script; kept so existing file-level imports are untouched

url = "https://www.ncrczpw.com/index.php?m=jobfair&c=index&a=index"
headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0'
}

# Seconds to wait for the server before giving up; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 10

# Constant stored as the fourth item of every record.
# NOTE(review): presumably a contact phone number - confirm with the data owner.
CONTACT_NUMBER = 86700710

# Number of ``div.txt`` elements on a detail page -> indexes of the four
# ``div.txt`` elements whose text is copied into the record.
_TXT_FIELD_INDEXES = {
    10: (0, 1, 4, 5),
    9: (0, 1, 3, 4),
}


def _fetch_soup(page_url):
    """Download *page_url* and return it parsed with the lxml parser."""
    response = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.text, "lxml")


def _joined_text(nodes):
    """Return the stripped text of *nodes* joined with the '、' separator."""
    return '、'.join(node.text.strip() for node in nodes)


def _text_at(nodes, index):
    """Return the stripped text of ``nodes[index]``, or '' when it is missing."""
    if index < len(nodes):
        return nodes[index].text.strip()
    return ''


def _collect_detail_urls(index_soup):
    """Return every link found in ``div.tit`` anchors, in document order.

    Links are resolved against the index ``url`` so that relative ``href``
    values can be fetched; absolute links are returned unchanged.
    """
    found = []
    for title_div in index_soup.select("div.tit"):
        for anchor in title_div.find_all(name="a"):
            href = anchor.get("href")
            if href is not None:
                found.append(urljoin(url, href))
    return found


def _parse_detail(detail_soup):
    """Build the six-item record for one detail page.

    The page layout is recognised by the number of ``div.txt`` elements it
    contains (10, 9 or 1). Returns ``None`` for any other count so the caller
    can skip the page.
    """
    txt_divs = detail_soup.find_all("div", class_='txt')

    indexes = _TXT_FIELD_INDEXES.get(len(txt_divs))
    if indexes is not None:
        first, second, third, fourth = (
            txt_divs[i].text.strip() for i in indexes
        )
        return [
            first,
            second,
            third,
            CONTACT_NUMBER,
            fourth,
            _joined_text(detail_soup.find_all("strong")),
        ]

    if len(txt_divs) == 1:
        spans = detail_soup.find_all('span')
        headings = detail_soup.find_all('h1')
        # Store the heading's text (or '' when there is no <h1>) so this field
        # is a plain string like every other field in the record.
        title = headings[0].text.strip() if headings else ''
        return [
            title,
            _text_at(spans, 5),
            _text_at(spans, 3),
            CONTACT_NUMBER,
            '',
            _joined_text(detail_soup.select('ul>li>a.companyName')),
        ]

    return None


urls = _collect_detail_urls(_fetch_soup(url))

ll = []
# The first collected link is deliberately skipped, as in the original script.
# NOTE(review): the reason is not visible here - confirm it is not a detail page.
for detail_url in urls[1:]:
    record = _parse_detail(_fetch_soup(detail_url))
    if record is not None:
        ll.append(record)

print(ll)