From ab0cd734c4e4a3dc52d7402f2944aab8044da01b Mon Sep 17 00:00:00 2001
From: p36049127 <jie1255811751@qq.com>
Date: Sat, 10 Jul 2021 21:10:34 +0800
Subject: [PATCH] data3

---
 src/数据库信息爬取.py | 39 ++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 src/数据库信息爬取.py

diff --git a/src/数据库信息爬取.py b/src/数据库信息爬取.py
new file mode 100644
index 0000000..5f86f22
--- /dev/null
+++ b/src/数据库信息爬取.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+
+from bs4 import BeautifulSoup
+import requests
+
+def get_html(url):
+    """Fetch *url* and return the body decoded as UTF-8, or None on failure."""
+    try:
+        response = requests.get(url, headers=headers)
+        # Force UTF-8 decoding instead of trusting the server-declared charset.
+        response.encoding = 'utf-8'
+        return response.text
+    except Exception:  # best-effort fetch; Ctrl-C/SystemExit still propagate
+        print('请求网址出错')
+        return None
+        
+def write(txt, txtname):
+    """Write str(txt) plus a newline to '<txtname>.txt' (UTF-8, overwriting)."""
+    with open(txtname + '.txt', 'w', encoding='UTF-8') as out:
+        out.write(str(txt) + '\n')
+        
+# Browser-like User-Agent sent with every request made by get_html().
+headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',}
+keyword = input()
+
+# NOTE(review): eval() executes whatever the urls file contains; if that file
+# can come from an untrusted source, switch to ast.literal_eval().
+with open('urls\\'+str(keyword)+'urls.txt','r', encoding='UTF-8') as f:
+    urls = eval(f.read())
+
+for i, url in enumerate(urls, start=1):
+    # Output file is opened before the fetch, so a failure leaves it empty.
+    with open('信息\\'+str(keyword)+'\\'+str(keyword)+str(i)+'.txt','w', encoding='UTF-8') as f:
+        try:
+            soup = BeautifulSoup(get_html(str(url)),'lxml')
+            f.write(soup.text)
+        except Exception:
+            print('false')
+