# -*- coding: utf-8 -*-
"""Scrape result links from saved search-result page URLs.

Reads ``page_urls\\<keyword>page_urls.txt`` (first line is a Python list
literal of result-page URLs), fetches each page, extracts the ``href`` of
the ``<a>`` inside the ``<h3>`` of every numbered result entry, and writes
the collected links to ``urls\\<keyword>urls.txt``.
"""

import ast

from bs4 import BeautifulSoup
import requests

# Desktop Chrome UA so the site serves the regular HTML layout.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
}


def get_html(url):
    """Fetch *url* and return its body decoded as UTF-8.

    Returns ``None`` (and prints a message) when the request fails.
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Force UTF-8: the server may mislabel the charset (original code
        # assigned 'GBK' and then immediately overwrote it with 'utf-8').
        response.encoding = 'utf-8'
        return response.text
    except requests.RequestException:
        print('请求网址出错')
        return None


def write(txt, txtname):
    """Write ``str(txt)`` plus a newline to ``<txtname>.txt`` (overwrites)."""
    with open(txtname + '.txt', 'w', encoding='UTF-8') as f:
        f.write(str(txt) + '\n')


def main():
    """Read the keyword's page-URL list, scrape links, save the results."""
    keyword = input()
    o_urls = []

    with open('page_urls\\' + str(keyword) + 'page_urls.txt', 'r',
              encoding='UTF-8') as f:
        lines = f.readlines()

    # First line holds a Python list literal; literal_eval parses it
    # without the arbitrary-code-execution risk of eval().
    page_urls = ast.literal_eval(lines[0])

    for cnt, url in enumerate(page_urls, start=1):
        html = get_html(url)
        if html is None:
            # Request failed; keep going with the remaining pages.
            print('false')
            continue
        try:
            soup = BeautifulSoup(html, 'lxml')
            # Result entries on page ``cnt`` carry ids 10*(cnt-1)+1 .. 10*cnt+1.
            for i in range(10 * (cnt - 1) + 1, 10 * cnt + 2):
                subs = soup.find_all(id=str(i))
                if subs:
                    tmp = subs[0].find('h3')
                    if tmp:
                        tmp = tmp.find('a')
                        if tmp:
                            tmp = tmp.get('href')
                            o_urls.append(tmp)
                            print(tmp)
        except Exception:
            # Parsing problem on this page only — report and continue,
            # matching the original best-effort behavior.
            print('false')

    with open('urls\\' + str(keyword) + 'urls.txt', 'w',
              encoding='UTF-8') as f:
        f.write(str(o_urls))


if __name__ == '__main__':
    main()