import concurrent.futures
import os
import re
from urllib.parse import urljoin

import parsel  # NOTE(review): imported in the original but never used here
import requests
from bs4 import BeautifulSoup

# NOTE(review): the regex pattern strings in this file were mangled in transit
# (their HTML-tag anchors were stripped, leaving broken literals). The patterns
# below are reconstructions based on the surrounding code and typical
# biquge-clone page markup — verify each one against the live site.


def get_response(html_url):
    """GET *html_url* with a desktop-browser User-Agent and return the Response."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'
    }
    response = requests.get(url=html_url, headers=headers)
    return response


def get_list_url(html_url):
    """Fetch the book index page.

    Returns ``(book_name, chapter_hrefs)`` where *chapter_hrefs* are the raw
    ``href`` values of the ``<a>`` tags found inside ``<dd>`` elements.
    """
    html_data = get_response(html_url).text
    # Book title — reconstructed pattern; presumably the page's <h1>. TODO confirm.
    name = re.findall(r'<h1>(.*?)</h1>', html_data)[0]
    soup = BeautifulSoup(html_data, 'html.parser')
    url_list = []
    for tag in soup.find_all('dd'):
        link = tag.find('a')
        if link:
            url_list.append(link['href'])
    return name, url_list


def get_content(html_url):
    """Fetch one chapter page and return ``(title, plain_text_content)``."""
    html_data = get_response(html_url).text
    # Chapter title — reconstructed pattern. TODO confirm tag/class on the site.
    title = re.findall(r'<h1 class="wap_none">(.*?)</h1>', html_data)[0]
    # Chapter body — reconstructed; <br/><br/> separators become newlines.
    content = re.findall(
        r'<div id="chaptercontent"[^>]*>(.*?)</div>', html_data, re.S
    )[0].replace('<br/><br/>', '\n')
    return title, content


def save(name, title, content):
    """Write one chapter to ``<name>/<title>.txt`` (UTF-8)."""
    # makedirs(..., exist_ok=True) is safe under concurrent callers; the
    # original exists()/mkdir() pair could raise FileExistsError when the
    # thread-pool workers raced each other on first save.
    os.makedirs(name, exist_ok=True)
    # os.path.join instead of the original Windows-only f'{name}\\' prefix.
    path = os.path.join(name, title + '.txt')
    # 'w' instead of 'a': re-running the script must not duplicate chapters.
    with open(path, mode='w', encoding='utf-8') as f:
        f.write(title)
        f.write('\n')
        f.write(content)
        f.write('\n')
    print(title, '已经保存')


def main(home_url):
    """Download one chapter page and save it under the book's directory."""
    title, content = get_content(html_url=home_url)
    # NOTE(review): relies on the module-level `name` bound in the __main__
    # block — kept as-is to preserve the original call interface.
    save(name, title, content)


if __name__ == '__main__':
    url = 'https://www.bqguu.cc/book/176453/'
    name, url_list = get_list_url(html_url=url)
    # Downloads are I/O-bound, so a small thread pool overlaps the waits;
    # the context manager guarantees shutdown even if a submit raises.
    with concurrent.futures.ThreadPoolExecutor(max_workers=7) as exe:
        for chapter_href in url_list:
            # urljoin handles absolute ('/book/..') and relative hrefs alike,
            # where plain '+' concatenation could produce '//' in the path.
            exe.submit(main, urljoin('https://www.bqguu.cc/', chapter_href))