forked from p54670231/Idea
parent
78b2b35fce
commit
cb28d808eb
@ -0,0 +1,51 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
|
||||
import ast

import requests
from bs4 import BeautifulSoup
|
||||
|
||||
def get_html(url):
    """Fetch *url* with the module-level browser headers and return its HTML.

    Returns the decoded response text on success, or ``None`` when the
    request fails for any network-related reason.
    """
    try:
        # timeout prevents the script from hanging forever on a dead host
        response = requests.get(url, headers=headers, timeout=10)
        # Original code assigned 'GBK' and then immediately overwrote it
        # with 'utf-8'; only the utf-8 assignment ever took effect.
        response.encoding = 'utf-8'
        return response.text
    except requests.RequestException:
        # Narrowed from a bare except: only network/HTTP errors are expected.
        print('请求网址出错')
        return None
|
||||
|
||||
def write(txt, txtname):
    """Write ``str(txt)`` plus a trailing newline to ``<txtname>.txt`` (UTF-8).

    The file is created or truncated; the with-statement closes it, so the
    original redundant ``f.close()`` has been removed.
    """
    with open(txtname + '.txt', 'w', encoding='UTF-8') as f:
        f.write(str(txt) + '\n')
|
||||
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',}

# Collected target links extracted from the search result pages.
o_urls = list()

# Search keyword; selects which page_urls file to read and urls file to write.
keyword = input()

# page_urls\<keyword>page_urls.txt contains one line: a Python-literal list
# of search-result-page URLs produced by an earlier step of this pipeline.
with open('page_urls\\' + str(keyword) + 'page_urls.txt', 'r', encoding='UTF-8') as f:
    lines = f.readlines()
    cnt = 0
    # ast.literal_eval replaces eval(): the file is data, and eval() would
    # execute arbitrary code if the file were tampered with.
    for url in ast.literal_eval(lines[0]):
        cnt += 1
        html = get_html(url)
        if html is None:
            # Request failed; original code hit this via BeautifulSoup(None)
            # raising inside the bare except and printing the same marker.
            print('false')
            continue
        try:
            soup = BeautifulSoup(html, 'lxml')
            # Results appear to be numbered sequentially across pages:
            # page cnt holds element ids 10*(cnt-1)+1 .. 10*cnt+1.
            for i in range(10 * (cnt - 1) + 1, 10 * cnt + 2):
                subs = soup.find_all(id=str(i))
                if subs:
                    # result title lives in <h3><a href=...>
                    tmp = subs[0].find('h3')
                    if tmp:
                        tmp = tmp.find('a')
                        if tmp:
                            tmp = tmp.get('href')
                            o_urls.append(tmp)
                            print(tmp)
        except Exception:
            # Narrowed from a bare except: keep the best-effort behavior of
            # skipping unparseable pages without killing the whole run.
            print('false')

# Persist the collected links as a Python-literal list for the next stage.
with open('urls\\' + str(keyword) + 'urls.txt', 'w', encoding='UTF-8') as f:
    f.write(str(o_urls))
|
||||
|
||||
|
||||
Loading…
Reference in new issue