diff --git a/jiandan.py b/jiandan.py
new file mode 100644
index 0000000..1dbc024
--- /dev/null
+++ b/jiandan.py
@@ -0,0 +1,39 @@
+import requests
+from bs4 import BeautifulSoup
+
+url = 'https://www.autohome.com.cn/hangye/news/'
+headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 SLBrowser/9.0.3.5211 SLBChan/128'}
+
+def clean(text):
+    # Strip line breaks plus regular, non-breaking, and full-width spaces.
+    for ch in ("\r", "\n", " ", "\xa0", "\u3000"):
+        text = text.replace(ch, "")
+    return text
+
+response = requests.get(url, headers=headers)
+response.encoding = response.apparent_encoding  # avoid garbled characters
+soup = BeautifulSoup(response.text, "lxml")
+
+# Collect article links from the news list page; the hrefs are
+# protocol-relative ("//..."), so prepend "https:" to complete them.
+links = ["https:" + a.get('href') for a in soup.select('div.u-list-item>a')]
+
+texts = []
+for link in links[:5]:  # scrape only the first five articles
+    resp = requests.get(link, headers=headers)
+    resp.encoding = resp.apparent_encoding  # was response.apparent_encoding (wrong object)
+    detail = BeautifulSoup(resp.text, "lxml")
+    for article in detail.select('div.article-details'):
+        for h1 in article.select('h1'):  # title
+            texts.append(clean(h1.text))
+        for info in article.select('div.article-info>div'):  # author / source
+            texts.append(clean(info.text))
+        for when in article.select('div.article-info>span.time'):  # publication time
+            texts.append(clean(when.text))
+        for para in article.select('p[data-paraid]'):  # body paragraphs
+            texts.append(clean(para.text))
+
+# Open the output file once instead of reopening it for every line.
+with open('cc.txt', 'a', encoding='utf-8') as ff:
+    for line in texts:
+        ff.write(line + '\n')