import requests
from bs4 import BeautifulSoup

# Listing page for job fairs on ncrczpw.com
url = "https://www.ncrczpw.com/index.php?m=jobfair&c=index&a=index"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0"
}

# Fetch the listing page and parse it
source = requests.get(url, headers=headers).text
# print(source)
soup = BeautifulSoup(source, 'lxml')

# Collect the detail-page links: every <a target="_blank" href=...>
# inside the <div class="tit link_gray6"> title blocks
title_divs = soup.find_all('div', class_="tit link_gray6")
href_list = []
for div in title_divs:
    for a_tag in div.find_all('a'):
        if 'href' in a_tag.attrs and a_tag.get('target') == "_blank":
            href_list.append(a_tag.get('href'))
# print(href_list)
# print(len(href_list))

rows = []  # one record per job-fair detail page
for link in href_list:
    res = requests.get(link, headers=headers)
    res_text = res.content.decode('utf-8')
    # print(res_text)
    soup1 = BeautifulSoup(res_text, 'lxml')

    # Announcement body: grab only the first paragraph (currently unused
    # apart from the commented-out debug print)
    for body in soup1.find_all('div', class_="mce-content-body"):
        first_p = body.find_all('p')[:1]
        # for p in first_p:
        #     print(p.text)

    # The fair's detail fields live in <div class="txt"> blocks; their
    # count (10 or 9) decides which indexes hold the wanted fields
    txt_divs = soup1.find_all('div', class_='txt')
    # print(len(txt_divs))
    row = []
    if len(txt_divs) == 10:
        row.append(txt_divs[0].text.strip())
        row.append(txt_divs[1].text.strip())
        row.append(txt_divs[4].text.strip())
        row.append(txt_divs[5].text.strip())
        row.append(86700710)  # fixed numeric field appended to every record
    elif len(txt_divs) == 9:
        row.append(txt_divs[0].text.strip())
        row.append(txt_divs[1].text.strip())
        row.append(txt_divs[3].text.strip())
        row.append(txt_divs[4].text.strip())
        row.append(86700710)
    # print(row)

    # Join the text of every <strong> tag with the Chinese enumeration comma
    strong_texts = [s.text.strip() for s in soup1.find_all('strong')]
    # print(strong_texts)
    row.append('、'.join(strong_texts))

    rows.append(row)
    print(rows)  # print the records accumulated so far