"""Scrape job-fair listings from ncrczpw.com.

Fetches the job-fair index page, follows the first ten detail links, and
prints a four-element list: fair titles, host organisations (主办单位),
contact details (联系方式), and exhibiting company names.
"""
import re

import requests
from bs4 import BeautifulSoup

INDEX_URL = 'https://www.ncrczpw.com/index.php?m=jobfair&c=index&a=index'
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0"}
# Network calls get a finite timeout so a stalled server cannot hang the script.
TIMEOUT = 10


def _section_texts(soup, label):
    """Return the stripped text of every ``div.txt`` sibling that follows a
    ``div.stit`` heading whose text matches *label* (a regex pattern).

    The detail pages lay each section out as an ``stit`` heading followed by
    one or more ``txt`` divs at the same level, hence the sibling walk.
    """
    texts = []
    for heading in soup.find_all(class_='stit', string=re.compile(label)):
        for sibling in heading.next_siblings:
            # NavigableString siblings have name=None; only element divs count.
            if sibling.name == 'div' and 'txt' in sibling.get('class', []):
                texts.append(sibling.get_text(strip=True))
    return texts


def main():
    """Fetch the index page, scrape the first ten fairs, and print results."""
    response = requests.get(INDEX_URL, headers=HEADERS, timeout=TIMEOUT)
    index_soup = BeautifulSoup(response.text, "lxml")
    detail_urls = [a.get('href') for a in index_soup.select('.tit>a')]

    # One accumulator per output field.  The original script funneled titles,
    # hosts, and contacts into a single shared list, so every joined field
    # contained duplicated, interleaved data from all three categories.
    titles = []
    hosts = []
    contacts = []
    companies = []

    for link in detail_urls[:10]:
        detail = requests.get(link, headers=HEADERS, timeout=TIMEOUT)
        page = BeautifulSoup(detail.text, "lxml")
        titles.extend(a.get_text() for a in page.select(".tit>a"))
        hosts.extend(_section_texts(page, "主办单位"))
        contacts.extend(_section_texts(page, "联系方式"))
        companies.extend(a.get_text() for a in page.select(".comtit>a"))

    # Joins happen once, after the loop (the original recomputed them every
    # iteration) and are safe even when no detail links were found.
    all_text = [
        ','.join(titles),
        ','.join(hosts),
        ','.join(contacts),
        '、'.join(companies),
    ]
    print(all_text)


if __name__ == "__main__":
    main()