# -*- coding: utf-8 -*-
"""Save the text of every page listed for a keyword.

Reads a keyword from standard input, loads the URL list stored in
``urls/<keyword>urls.txt``, downloads each page and writes the text that
BeautifulSoup extracts from it to ``信息/<keyword>/<keyword><n>.txt``, where
``n`` counts the URLs from 1.

Originally added as ``src/数据库信息爬取.py`` by commit ab0cd734 ("data3").
"""

import ast
from pathlib import Path

from bs4 import BeautifulSoup
import requests

# Browser-like User-Agent sent with every request made by get_html().
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
}


def get_html(url, timeout=30):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a single stalled server would block the whole run.

    Returns:
        The response text, or ``None`` when the request fails (a message
        is printed in that case).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        print('请求网址出错')
        return None
    # The earlier version assigned 'GBK' and then 'utf-8' back to back;
    # only the last assignment ever took effect, so only it is kept.
    response.encoding = 'utf-8'
    return response.text


def write(txt, txtname):
    """Write ``str(txt)`` followed by a newline to ``<txtname>.txt``.

    The file is encoded as UTF-8 and any existing content is replaced.
    """
    with open(txtname + '.txt', 'w', encoding='UTF-8') as f:
        f.write(str(txt) + '\n')


def main():
    """Read a keyword from stdin and dump the text of each of its URLs."""
    keyword = input()

    url_list_path = Path('urls') / (str(keyword) + 'urls.txt')
    # NOTE(security): this used to be eval(), which executes whatever code
    # the file contains. ast.literal_eval only accepts Python literals,
    # which is all a stored list of URL strings needs.
    urls = ast.literal_eval(url_list_path.read_text(encoding='UTF-8'))

    out_dir = Path('信息') / str(keyword)
    # Create the output directory up front instead of crashing on the
    # first open() when it does not exist yet.
    out_dir.mkdir(parents=True, exist_ok=True)

    for number, url in enumerate(urls, start=1):
        out_path = out_dir / (str(keyword) + str(number) + '.txt')
        # The file is opened before fetching, so a page that fails still
        # leaves an empty numbered file, exactly as before.
        with out_path.open('w', encoding='UTF-8') as f:
            try:
                html = get_html(str(url))
                if html is None:
                    print('false')
                    continue
                f.write(BeautifulSoup(html, 'lxml').text)
            except Exception:
                # Best effort: report the failure and move on to the next
                # URL, but let Ctrl-C / SystemExit propagate.
                print('false')


if __name__ == '__main__':
    main()