"""Scrape job-fair detail pages from ncrczpw.com and print a summary.

The script downloads the job-fair index page, follows every ``.tit>a`` link
found there, and on each detail page collects:

* the text of the ``.tit>a`` anchors (fair titles),
* the ``div.txt`` text following a ``.stit`` heading matching "主办单位",
* the ``div.txt`` text following a ``.stit`` heading matching "联系方式",
* the text of the ``.comtit>a`` anchors (company names).

The result is printed as a list of four joined strings, in that order.
"""
import re
import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

url = 'https://www.ncrczpw.com/index.php?m=jobfair&c=index&a=index'
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0"}

# Seconds to wait on a request before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# Heading patterns, compiled once rather than once per detail page.
ORGANIZER_HEADING = re.compile("主办单位")
CONTACT_HEADING = re.compile("联系方式")


def _fetch_soup(session, page_url):
    """Download *page_url* and return it parsed with the lxml parser.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    response = session.get(page_url, headers=header, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Without a declared charset requests falls back to ISO-8859-1 for text
    # responses, which would garble the Chinese text the heading patterns
    # must match; detect the encoding from the body in that case.
    if "charset" not in response.headers.get("Content-Type", "").lower():
        response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, "lxml")


def _detail_links(index_soup):
    """Return absolute URLs for every ``.tit>a`` anchor that has an href."""
    links = []
    for anchor in index_soup.select('.tit>a'):
        href = anchor.get('href')
        if href:
            # Resolve relative links against the index URL.
            links.append(urljoin(url, href))
    return links


def _section_texts(soup, heading):
    """Return the text of each ``div.txt`` sibling after a matching heading.

    *heading* is a compiled pattern matched against the string of elements
    with class ``stit``. Every following ``div`` sibling carrying the ``txt``
    class contributes its stripped text.
    """
    texts = []
    for title in soup.find_all(class_='stit', string=heading):
        for sibling in title.next_siblings:
            # Non-tag siblings have no tag name, so the first test filters
            # them out before ``.get`` is reached.
            if sibling.name == 'div' and 'txt' in sibling.get('class', []):
                texts.append(sibling.get_text(strip=True))
    return texts


def main():
    """Scrape the index and its detail pages, then print the four fields."""
    # One list per field so titles, organizers and contacts stay separate.
    fair_names = []
    organizers = []
    contacts = []
    companies = []

    with requests.Session() as session:
        index_soup = _fetch_soup(session, url)
        for link in _detail_links(index_soup):
            try:
                detail = _fetch_soup(session, link)
            except requests.RequestException as err:
                # One broken detail page should not discard the whole run.
                print(f"skipping {link}: {err}", file=sys.stderr)
                continue
            fair_names.extend(a.get_text() for a in detail.select(".tit>a"))
            organizers.extend(_section_texts(detail, ORGANIZER_HEADING))
            contacts.extend(_section_texts(detail, CONTACT_HEADING))
            companies.extend(a.get_text() for a in detail.select(".comtit>a"))

    # Built unconditionally, so an empty index prints four empty strings
    # instead of failing on an unbound name.
    all_text = [
        ','.join(fair_names),
        ','.join(organizers),
        ','.join(contacts),
        '、'.join(companies),
    ]
    print(all_text)


if __name__ == "__main__":
    main()