"""Scrape job-fair detail pages linked from the ncrczpw.com job-fair index.

The script downloads the index page, collects the detail-page links found
inside ``div.tit.link_gray6`` blocks, then builds one row per detail page
from its ``div.txt`` blocks and ``<strong>`` tags.
"""
import re
import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

INDEX_URL = "https://www.ncrczpw.com/index.php?m=jobfair&c=index&a=index"
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0"
    )
}

# Seconds to wait for the server before giving up; without a timeout
# ``requests`` can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

# Constant appended to every recognised row.
# NOTE(review): presumably a contact phone number -- confirm with the author.
CONTACT_NUMBER = 86700710

# Maps "number of div.txt blocks on the page" -> indexes of the blocks to keep.
# NOTE(review): the meaning of each index is not visible from this script;
# confirm against the live page markup.
FIELD_INDEXES = {
    10: (0, 1, 4, 5),
    9: (0, 1, 3, 4),
}


def _fetch(link):
    """Return the HTTP response for *link*, raising on network/HTTP errors."""
    response = requests.get(link, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


def collect_detail_links(index_html, base_url=INDEX_URL):
    """Return absolute detail-page URLs found in the index page markup.

    Only ``<a target="_blank">`` anchors with a non-empty ``href`` located
    inside ``<div class="tit link_gray6">`` are kept.  Relative hrefs are
    resolved against *base_url* so they can be requested directly.
    """
    soup = BeautifulSoup(index_html, "lxml")
    links = []
    for title_div in soup.find_all("div", class_="tit link_gray6"):
        for anchor in title_div.find_all("a"):
            href = anchor.get("href")
            if href and anchor.get("target") == "_blank":
                links.append(urljoin(base_url, href))
    return links


def parse_detail_page(detail_html):
    """Build the row for one detail page.

    When the page has 9 or 10 ``div.txt`` blocks, the row starts with the
    stripped text of the blocks selected by ``FIELD_INDEXES`` followed by
    ``CONTACT_NUMBER``.  In every case the last element is the stripped text
    of all ``<strong>`` tags joined with "、".
    """
    soup = BeautifulSoup(detail_html, "lxml")
    txt_divs = soup.find_all("div", class_="txt")

    row = []
    indexes = FIELD_INDEXES.get(len(txt_divs))
    if indexes is not None:
        row.extend(txt_divs[i].text.strip() for i in indexes)
        row.append(CONTACT_NUMBER)

    strong_texts = [tag.text.strip() for tag in soup.find_all("strong")]
    row.append("、".join(strong_texts))
    return row


def scrape_job_fairs():
    """Fetch the index and every linked detail page; return all rows.

    Rows are accumulated in a single list created once, before the loop, so
    earlier results are not discarded on each iteration.  A detail page that
    cannot be fetched is reported on stderr and skipped.
    """
    index_html = _fetch(INDEX_URL).text

    rows = []
    for link in collect_detail_links(index_html):
        try:
            detail_html = _fetch(link).content.decode("utf-8")
        except requests.RequestException as exc:
            print(f"skipping {link}: {exc}", file=sys.stderr)
            continue

        row = parse_detail_page(detail_html)
        # Keep the script's existing output: only pages with a recognised
        # layout are printed, and without the trailing <strong> summary.
        if len(row) > 1:
            print(row[:-1])
        rows.append(row)
    return rows


if __name__ == "__main__":
    scrape_job_fairs()