You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
48 lines
1.8 KiB
48 lines
1.8 KiB
import requests
|
|
import re
|
|
from bs4 import BeautifulSoup
|
|
import parsel
|
|
import os
|
|
import concurrent.futures
|
|
def get_response(html_url, timeout=10):
    """Fetch ``html_url`` with a browser-like User-Agent header.

    Args:
        html_url: URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would pin a worker
            thread forever when the server stalls.

    Returns:
        The ``requests.Response`` object. The HTTP status code is not
        checked here.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'
    }
    response = requests.get(url=html_url, headers=headers, timeout=timeout)
    return response
|
|
def get_list_url(html_url):
    """Read a book's index page and collect its name and chapter links.

    Args:
        html_url: URL of the index page.

    Returns:
        A ``(name, url_list)`` tuple: ``name`` is the text of the first
        ``<h1>`` on the page, and ``url_list`` holds the ``href`` of the
        first ``<a>`` inside each ``<dd>`` element, in document order.

    Raises:
        IndexError: If the page contains no ``<h1>...</h1>``.
    """
    html_data = get_response(html_url).text
    name = re.findall('<h1>(.*?)</h1>', html_data)[0]
    soup = BeautifulSoup(html_data, 'html.parser')
    url_list = []
    for tag in soup.find_all('dd'):
        # Look the anchor up once instead of twice per <dd>.
        anchor = tag.find('a')
        if anchor is None:
            continue
        # An <a> without href would raise KeyError and abort the whole
        # listing; skip it so the remaining chapters are still collected.
        href = anchor.get('href')
        if href:
            url_list.append(href)
    return name, url_list
|
|
def get_content(html_url):
    """Download a chapter page and extract its title and text.

    Args:
        html_url: URL of the chapter page.

    Returns:
        A ``(title, content)`` tuple. ``title`` is the text of the first
        ``<h1 class="wap_none">``. ``content`` is the markup found between
        the ``chaptercontent`` div and the ``readinline`` paragraph, with
        every ``<br /><br />`` replaced by a newline.

    Raises:
        IndexError: If either pattern is not found in the page.
    """
    page_source = get_response(html_url).text

    heading_hits = re.findall(r'<h1 class="wap_none">(.*?)</h1>', page_source)
    title = heading_hits[0]

    # re.S lets '.' span line breaks, since the chapter body covers many lines.
    body_hits = re.findall(
        '<div id="chaptercontent" class="Readarea ReadAjax_content">(.*?)<p class="readinline">',
        page_source,
        re.S,
    )
    raw_body = body_hits[0]
    content = raw_body.replace('<br /><br />', '\n')

    return title, content
|
|
def save(name, title, content):
    """Append one chapter to ``<name>/<title>.txt``, creating the folder if needed.

    Args:
        name: Directory that collects the book's chapters.
        title: Chapter title; used as the file name and as the first line
            written.
        content: Chapter text written after the title.

    The file is opened in append mode, so saving the same title twice adds a
    second copy of the chapter to the same file.
    """
    # exist_ok=True: this function runs in several worker threads at once,
    # and a separate exists() check followed by mkdir() can raise
    # FileExistsError when two threads race to create the folder.
    os.makedirs(name, exist_ok=True)
    # os.path.join uses the platform separator; a hard-coded backslash only
    # works on Windows and elsewhere puts the file outside the folder.
    path = os.path.join(name, title + '.txt')
    with open(path, mode='a', encoding='utf-8') as f:
        f.write(title)
        f.write('\n')
        f.write(content)
        f.write('\n')
    print(title, '已经保存')
|
|
def main(home_url):
    """Fetch a single chapter page and write it to disk.

    Args:
        home_url: URL of the chapter page to download.

    Note:
        The destination folder comes from the module-level variable
        ``name``, which is assigned in the ``__main__`` section; calling
        this function before that assignment raises ``NameError``.
    """
    chapter = get_content(html_url=home_url)
    chapter_title, chapter_text = chapter
    save(name, chapter_title, chapter_text)
|
|
if __name__ == '__main__':
    from urllib.parse import urljoin

    site_root = 'https://www.bqguu.cc/'
    url = 'https://www.bqguu.cc/book/176453/'
    # main() reads `name` as a module global, so this binding must keep
    # exactly this name.
    name, url_list = get_list_url(html_url=url)

    # The context manager waits for every submitted task and shuts the pool
    # down even if something below raises.
    with concurrent.futures.ThreadPoolExecutor(max_workers=7) as exe:
        pending = {}
        for href in url_list:
            # urljoin avoids the doubled slash that plain concatenation
            # produces for root-relative hrefs ('/book/...') and leaves
            # absolute hrefs intact.
            index_url = urljoin(site_root, href)
            pending[exe.submit(main, index_url)] = index_url

        # An exception raised inside a worker is stored on its future and
        # would otherwise vanish; report each failed chapter.
        for future in concurrent.futures.as_completed(pending):
            error = future.exception()
            if error is not None:
                print(pending[future], 'failed:', repr(error))