"""Scrape article titles, authors, timestamps and body text from the
autohome.com.cn industry-news listing page and append them to cc.txt.

Flow: fetch the listing page -> collect article hrefs -> fetch the first
five articles -> extract fields -> write one line per field to cc.txt.
"""

import requests
from bs4 import BeautifulSoup

LISTING_URL = 'https://www.autohome.com.cn/hangye/news/'
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 SLBrowser/9.0.3.5211 SLBChan/128'}
# Never hang forever on a stalled connection (requests has no default timeout).
TIMEOUT = 30


def fetch_article_urls():
    """Return the list of absolute article URLs found on the listing page."""
    response = requests.get(LISTING_URL, headers=HEADERS, timeout=TIMEOUT)
    # Use the detected charset to avoid mojibake (pages may be GBK, not UTF-8).
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, "lxml")
    # Listing links are protocol-relative ("//..."), so prepend the scheme.
    return ["https:" + a.get('href')
            for a in soup.select('div.u-list-item>a')]


def scrape_article(url):
    """Fetch one article page and return its extracted text fields.

    Returns a list of cleaned strings: title, author/source, publish time,
    then each body paragraph, in document order.
    """
    resp = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
    # BUG FIX: was `response.apparent_encoding` (the listing page's charset);
    # each article response must detect its own encoding.
    resp.encoding = resp.apparent_encoding
    soup = BeautifulSoup(resp.text, "lxml")

    texts = []
    for article in soup.select('div.article-details'):
        for node in article.select('h1'):
            texts.append(node.text.replace("\r", "").replace("\n", "").replace(" ", ""))
        for node in article.select('div.article-info>div'):
            texts.append(node.text.replace("\r", "").replace("\n", "").replace(" ", ""))
        for node in article.select('div.article-info>span.time'):
            texts.append(node.text.replace("\r", "").replace(" ", ""))
        for node in article.select('p[data-paraid]'):
            # Strip NBSP (\xa0) and ideographic space (\u3000) padding too.
            texts.append(node.text.replace("\r", "").replace(" ", "")
                         .replace("\xa0", "").replace("\u3000", ""))
    return texts


def main():
    texts = []
    # Only the first five articles, matching the original behaviour.
    for url in fetch_article_urls()[:5]:
        texts.extend(scrape_article(url))

    # BUG FIX: the original re-opened cc.txt once per line and never closed
    # the handles; open once and let the context manager close it.
    with open('cc.txt', 'a', encoding='utf-8') as out:
        out.writelines(line + '\n' for line in texts)


if __name__ == "__main__":
    main()