parent 9a18da742c
commit 21be2ca462
@@ -0,0 +1,59 @@
import requests
from bs4 import BeautifulSoup

hh = []
ln = []
texts = []
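# hh collects the raw hrefs from the news list page, ln the completed
# article URLs, and texts every extracted field (title, info line, time, body).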
url = 'https://www.autohome.com.cn/hangye/news/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 SLBrowser/9.0.3.5211 SLBChan/128'}
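# A desktop browser User-Agent makes the site less likely to reject the
# default python-requests client.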
response = requests.get(url, headers=headers)
response.encoding = response.apparent_encoding  # avoid garbled text (mojibake)
res_text = response.text
# print(res_text)

soup = BeautifulSoup(res_text, "lxml")
sout = soup.select('div.u-list-item>a')
for i in sout:
    href = i.get('href')
    hh.append(href)
    # print(href)
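# The scraped hrefs are scheme-less ("//..."), so prepend "https:" to form
# usable URLs.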
for s in hh:
    html = "https:" + s  # complete the URL
    ln.append(html)
# print(ln)
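# Only the first five article links are fetched.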
for j in ln[:5]:
    # print(j)
    resp = requests.get(j, headers=headers)
    resp.encoding = resp.apparent_encoding  # use this response's own detected encoding
    h = resp.text
    # print(h)
    soup1 = BeautifulSoup(h, "lxml")
    soupt1 = soup1.select('div.article-details')
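    # From each div.article-details block, pull the title (h1), the info line
    # (div.article-info>div), the publish time (span.time), and the body
    # paragraphs (p[data-paraid]).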
    for cc in soupt1:
        # print(cc)
        for cc1 in cc.select('h1'):
            titl = cc1.text.replace("\r", "").replace("\n", "").replace(" ", "")
            texts.append(titl)
            # print(titl)
        for cc5 in cc.select('div.article-info>div'):
            name = cc5.text.replace("\r", "").replace("\n", "").replace(" ", "")
            texts.append(name)
            # print(name)
        for cc2 in cc.select('div.article-info>span.time'):
            time = cc2.text.replace("\r", "").replace("\n", "").replace(" ", "")
            texts.append(time)
            # print(time)
        for cc4 in cc.select('p[data-paraid]'):
            text = cc4.text.replace("\r", "").replace(" ", "").replace("\xa0", "").replace("\u3000", "")
            texts.append(text)
            # print(text)
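# Write every collected field to cc.txt, one per line.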
# print(texts)

with open('cc.txt', 'a', encoding='utf-8') as ff:  # open once; closed automatically
    for x in texts:
        ff.write(x + '\n')