"""Download every chapter of a novel from bqguu.cc into text files.

The book's index page is fetched once to obtain the book name and the chapter
links.  The chapters are then downloaded concurrently by a thread pool and
each one is written to ``<book name>/<chapter title>.txt``.
"""
import concurrent.futures
import os
import re
from urllib.parse import urljoin

import parsel  # noqa: F401  (not used below; kept from the original imports)
import requests
from bs4 import BeautifulSoup

USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'
)

# Seconds to wait for the server before giving up; without a timeout a stalled
# connection would block a worker thread forever.
REQUEST_TIMEOUT = 15

# NOTE(review): in the copy of this script that was reviewed, the three
# patterns and the replacement target below contain nothing but newlines
# around the capture group.  They look like HTML-tag patterns whose markup was
# stripped somewhere along the way -- confirm them against the site's real
# markup before relying on this script.
BOOK_NAME_PATTERN = re.compile('\n\n(.*?)\n\n')
CHAPTER_TITLE_PATTERN = re.compile(r'\n\n(.*?)\n\n')
CHAPTER_BODY_PATTERN = re.compile('\n(.*?)\n\n', re.S)
PARAGRAPH_BREAK = '\n\n'

# Characters that cannot appear in a file name on Windows (this set also
# covers the POSIX path separator).
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def _first_group(pattern, text, what, html_url):
    """Return the first capture of *pattern* in *text*.

    Raises:
        IndexError: if the pattern does not match.  This is the exception type
            the previous ``re.findall(...)[0]`` form raised, now with a message
            saying what was being looked for and where.
    """
    match = pattern.search(text)
    if match is None:
        raise IndexError(f'could not find {what} in {html_url}')
    return match.group(1)


def _safe_filename(text):
    """Replace characters that are not allowed in file names with ``_``."""
    return _UNSAFE_FILENAME_CHARS.sub('_', text)


def get_response(html_url, timeout=REQUEST_TIMEOUT):
    """Fetch *html_url* with a browser-like User-Agent header.

    Args:
        html_url: URL to request.
        timeout: seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        The ``requests.Response`` object; the status code is not checked.

    Raises:
        requests.exceptions.RequestException: on connection errors or when the
            timeout expires.
    """
    headers = {'User-Agent': USER_AGENT}
    return requests.get(url=html_url, headers=headers, timeout=timeout)


def get_list_url(html_url):
    """Read a book's index page.

    Args:
        html_url: URL of the index page.

    Returns:
        ``(name, url_list)``: the book name and the ``href`` value of the
        first ``<a>`` inside every ``<dd>`` element, exactly as written in the
        page (not resolved to absolute URLs).

    Raises:
        IndexError: if the book name cannot be found in the page.
    """
    html_data = get_response(html_url).text
    name = _first_group(BOOK_NAME_PATTERN, html_data, 'the book name', html_url)
    soup = BeautifulSoup(html_data, 'html.parser')
    url_list = []
    for tag in soup.find_all('dd'):
        anchor = tag.find('a')
        # Skip <dd> elements without a link, and links without an href
        # (the latter used to raise KeyError).
        if anchor is not None and anchor.has_attr('href'):
            url_list.append(anchor['href'])
    return name, url_list


def get_content(html_url):
    """Read one chapter page.

    Args:
        html_url: URL of the chapter page.

    Returns:
        ``(title, content)`` where every ``PARAGRAPH_BREAK`` in the extracted
        body has been replaced by a single newline.

    Raises:
        IndexError: if the title or the body cannot be found in the page.
    """
    html_data = get_response(html_url).text
    title = _first_group(
        CHAPTER_TITLE_PATTERN, html_data, 'the chapter title', html_url)
    body = _first_group(
        CHAPTER_BODY_PATTERN, html_data, 'the chapter body', html_url)
    return title, body.replace(PARAGRAPH_BREAK, '\n')


def save(name, title, content):
    """Append a chapter to ``<name>/<title>.txt`` and print a confirmation.

    Args:
        name: book name, used as the folder name.
        title: chapter title, used as the file name and as the first line.
        content: chapter text.
    """
    folder = _safe_filename(name)
    if folder:
        # exist_ok avoids the check-then-create race between worker threads
        # that all try to create the same folder.
        os.makedirs(folder, exist_ok=True)
    # os.path.join instead of a hard-coded backslash so the file ends up
    # inside the folder on every platform.
    path = os.path.join(folder, _safe_filename(title) + '.txt')
    # Append mode: saving the same chapter again adds to the existing file.
    with open(path, mode='a', encoding='utf-8') as f:
        f.write(f'{title}\n{content}\n')
    print(title, '已经保存')


def main(home_url, book_name=None):
    """Download one chapter and save it.

    Args:
        home_url: URL of the chapter page.
        book_name: folder to save into.  When omitted, the module-level
            variable ``name`` is used, which is how this function obtained
            the book name before the parameter existed.
    """
    if book_name is None:
        book_name = name  # noqa: F821  (legacy module-level global)
    title, content = get_content(html_url=home_url)
    save(book_name, title, content)


def download_book(book_url, max_workers=7):
    """Download all chapters linked from *book_url* using a thread pool.

    Args:
        book_url: URL of the book's index page.
        max_workers: number of worker threads.

    Returns:
        The list of chapter hrefs whose download raised an exception.
    """
    book_name, hrefs = get_list_url(html_url=book_url)
    # Chapter links are resolved against the site root; urljoin copes with
    # both '/path' and 'path' style hrefs without producing a double slash.
    site_root = urljoin(book_url, '/')
    failed = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        jobs = {
            pool.submit(main, urljoin(site_root, href), book_name): href
            for href in hrefs
        }
        # An exception raised inside a worker is stored on its future and is
        # lost unless someone asks for it, so check every job.
        for job in concurrent.futures.as_completed(jobs):
            error = job.exception()
            if error is not None:
                failed.append(jobs[job])
                print(jobs[job], 'failed:', repr(error))
    return failed


if __name__ == '__main__':
    download_book('https://www.bqguu.cc/book/176453/')