"""Scrape the job-fair index on ncrczpw.com and print one record per fair.

The index page is fetched once and every detail link found in it is
collected.  Each detail page is then fetched and reduced to a flat record
(selected ``div.txt`` texts, a fixed number, and the joined ``<strong>``
texts), which is printed to stdout.
"""
import re

import requests
from bs4 import BeautifulSoup

url = "https://www.ncrczpw.com/index.php?m=jobfair&c=index&a=index"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0"
    )
}

# Seconds to wait for the server before giving up.  Without a timeout
# ``requests`` blocks indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

# NOTE(review): appended verbatim to every record built from a 9- or 10-div
# page; it looks like a contact phone number -- confirm with the author.
FIXED_FIELD = 86700710

# Positions of the ``div.txt`` elements to keep, keyed by how many such
# elements the detail page contains.  Pages with any other count contribute
# no ``div.txt`` fields (and no FIXED_FIELD) to the record.
TXT_FIELD_INDEXES = {
    10: (0, 1, 4, 5),
    9: (0, 1, 3, 4),
}


def _detail_links(index_html):
    """Return the href of every ``target="_blank"`` link in the title divs.

    Only anchors nested in ``<div class="tit link_gray6">`` elements are
    considered, and only those that actually carry an ``href`` attribute.
    """
    soup = BeautifulSoup(index_html, 'lxml')
    links = []
    for title_div in soup.find_all('div', class_="tit link_gray6"):
        links.extend(
            anchor.get('href')
            for anchor in title_div.find_all('a')
            if 'href' in anchor.attrs and anchor.get('target') == "_blank"
        )
    return links


def _build_record(detail_html):
    """Return the flat record extracted from one detail page.

    The record holds the stripped text of the ``div.txt`` elements selected
    by TXT_FIELD_INDEXES followed by FIXED_FIELD (both only when the page has
    9 or 10 such divs), and always ends with the stripped texts of all
    ``<strong>`` tags joined with '、'.
    """
    soup = BeautifulSoup(detail_html, 'lxml')
    txt_divs = soup.find_all('div', class_='txt')

    record = []
    indexes = TXT_FIELD_INDEXES.get(len(txt_divs))
    if indexes is not None:
        record.extend(txt_divs[i].text.strip() for i in indexes)
        record.append(FIXED_FIELD)

    strong_texts = [tag.text.strip() for tag in soup.find_all('strong')]
    record.append('、'.join(strong_texts))
    return record


def main():
    """Fetch the index, then fetch and print a record for each detail page."""
    # The index page relies on requests' own charset detection (``.text``).
    index_html = requests.get(
        url, headers=headers, timeout=REQUEST_TIMEOUT
    ).text

    for link in _detail_links(index_html):
        response = requests.get(
            link, headers=headers, timeout=REQUEST_TIMEOUT
        )
        # Detail pages are decoded explicitly as UTF-8 rather than trusting
        # the response's declared or guessed encoding.
        record = _build_record(response.content.decode('utf-8'))
        # Each page is printed as a single-element list wrapping its record,
        # which keeps the script's output format unchanged.
        print([record])


if __name__ == "__main__":
    main()