forked from p54670231/Idea
parent
78b2b35fce
commit
cb28d808eb
@ -0,0 +1,51 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
|
||||
import ast

import requests
from bs4 import BeautifulSoup
|
||||
|
||||
def get_html(url):
    """Fetch *url* with the module-level browser headers and return its HTML.

    Returns the decoded response text on success, or ``None`` when the
    request fails for any network-related reason.
    """
    try:
        # timeout prevents the script from hanging forever on a dead host
        response = requests.get(url, headers=headers, timeout=10)
        # Original code assigned 'GBK' and then immediately overwrote it
        # with 'utf-8'; only the utf-8 assignment ever took effect.
        response.encoding = 'utf-8'
        return response.text
    except requests.RequestException:
        # Narrowed from a bare except: only network/HTTP errors are expected.
        print('请求网址出错')
        return None
|
||||
|
||||
def write(txt, txtname):
    """Write ``str(txt)`` plus a trailing newline to ``<txtname>.txt`` (UTF-8).

    The file is created or truncated; the with-statement closes it, so the
    original redundant ``f.close()`` has been removed.
    """
    with open(txtname + '.txt', 'w', encoding='UTF-8') as f:
        f.write(str(txt) + '\n')
|
||||
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',}

# Collected target links extracted from the search result pages.
o_urls = list()

# Search keyword; selects which page_urls file to read and urls file to write.
keyword = input()

# page_urls\<keyword>page_urls.txt contains one line: a Python-literal list
# of search-result-page URLs produced by an earlier step of this pipeline.
with open('page_urls\\' + str(keyword) + 'page_urls.txt', 'r', encoding='UTF-8') as f:
    lines = f.readlines()
    cnt = 0
    # ast.literal_eval replaces eval(): the file is data, and eval() would
    # execute arbitrary code if the file were tampered with.
    for url in ast.literal_eval(lines[0]):
        cnt += 1
        html = get_html(url)
        if html is None:
            # Request failed; original code hit this via BeautifulSoup(None)
            # raising inside the bare except and printing the same marker.
            print('false')
            continue
        try:
            soup = BeautifulSoup(html, 'lxml')
            # Results appear to be numbered sequentially across pages:
            # page cnt holds element ids 10*(cnt-1)+1 .. 10*cnt+1.
            for i in range(10 * (cnt - 1) + 1, 10 * cnt + 2):
                subs = soup.find_all(id=str(i))
                if subs:
                    # result title lives in <h3><a href=...>
                    tmp = subs[0].find('h3')
                    if tmp:
                        tmp = tmp.find('a')
                        if tmp:
                            tmp = tmp.get('href')
                            o_urls.append(tmp)
                            print(tmp)
        except Exception:
            # Narrowed from a bare except: keep the best-effort behavior of
            # skipping unparseable pages without killing the whole run.
            print('false')

# Persist the collected links as a Python-literal list for the next stage.
with open('urls\\' + str(keyword) + 'urls.txt', 'w', encoding='UTF-8') as f:
    f.write(str(o_urls))
|
||||
|
||||
|
||||
Loading…
Reference in new issue