data3

5 years ago · ab0cd734c4
parent cb28d808eb
commit ab0cd734c4
1 changed files with 39 additions and 0 deletions
--- a/src/数据库信息爬取.py
+++ b/src/数据库信息爬取.py
@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+
+from bs4 import BeautifulSoup
+import requests
+
+def get_html(url):
+    try:
+        response = requests.get(url,headers=headers)
+        response.encoding = 'GBK'
+        response.encoding = 'utf-8'
+#        response.encoding = 'gbk'
+        html = response.text
+        return html
+    except:
+        print('请求网址出错')
+        
+def write(txt,txtname):
+    with open(txtname+'.txt', 'w', encoding='UTF-8') as f:
+        f.write(str(txt) + '\n')
+        f.close()      
+        
+headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',}
+keyword = input()
+
+with open('urls\\'+str(keyword)+'urls.txt','r', encoding='UTF-8') as f:
+    urls = f.read()
+    urls = eval(urls)
+    f.close()
+
+for i in range(len(urls)):
+    url = str(urls[i])
+    with open('信息\\'+str(keyword)+'\\'+str(keyword)+str(i+1)+'.txt','w', encoding='UTF-8') as f:
+        try:
+            soup = BeautifulSoup(get_html(url),'lxml')
+            f.write(soup.text)
+            f.close()
+        except:
+            print('false')
+