# -*- coding: utf-8 -*-
"""Web-novel scraper for biqugen.net.

Recovered from a git patch (sousuo.py). Flow: `search(word)` prints matching
novels (name / id / author); `main(home_url)` walks a listing page, fetches
every chapter of every novel found, and appends each chapter to a text file
under a folder named after the novel.

NOTE(review): this file was extracted from a mangled patch in which HTML tags
inside string literals were stripped; the regex in `get_list_url` and the
replace target in `get_content` are reconstructions — confirm against the
original upload.
"""

# Data-request module --> third-party, must be installed
import requests
# Regular-expression module --> built-in, no install needed
import re
from bs4 import BeautifulSoup
# Data-parsing module --> third-party, must be installed
import parsel
# File-system module --> built-in, no install needed
import os


def get_response(html_url):
    """GET `html_url` with a browser User-Agent and return the Response.

    :param html_url: absolute URL to fetch.
    :return: `requests.Response` (caller reads `.text`).
    """
    headers = {
        # user-agent: identifies us as a regular desktop browser so the
        # site serves normal HTML instead of blocking the scraper
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0'
    }
    response = requests.get(url=html_url, headers=headers)
    return response


def get_list_url(html_url):
    """Fetch a novel's index page; return (novel name, chapter hrefs).

    :param html_url: URL of the novel's table-of-contents page.
    :return: tuple ``(name, url_list)`` where `url_list` holds the relative
             href of each chapter link found inside ``<dd>`` elements.
    """
    html_data = get_response(html_url).text
    # Extract the novel name.
    # NOTE(review): the original pattern's HTML tags were lost in extraction;
    # <h1>(.*?)</h1> matches the page-title element on this site — confirm.
    name = re.findall(r'<h1>(.*?)</h1>', html_data)[0]
    # Extract the chapter URLs: every <dd> that wraps an <a href=...>.
    soup = BeautifulSoup(html_data, 'html.parser')
    url_list = []
    for tag in soup.find_all('dd'):
        link = tag.find('a')
        if link:  # some <dd> entries are separators with no anchor
            url_list.append(link['href'])
    return name, url_list


def get_content(html_url):
    """Fetch one chapter page; return (chapter title, chapter body text).

    Falls back to placeholder strings when the expected elements are absent
    so the caller can still write a file instead of crashing.
    """
    html_data = get_response(html_url).text
    soup = BeautifulSoup(html_data, 'html.parser')
    # Extract the chapter title.
    title_tag = soup.find('h1', class_='wap_none')
    if title_tag:
        title = title_tag.text.strip()
    else:
        title = "未找到标题"
    # Extract the chapter body.
    content_tag = soup.find('div', id='chaptercontent')
    if content_tag:
        # NOTE(review): the replace target's tag was lost in extraction;
        # the site delimits lines with <br/> markers — confirm.
        content = content_tag.text.strip().replace('<br/>', '\n')
    else:
        content = "未找到内容"
    return title, content


def save(name, title, content):
    """Append one chapter (`title` + `content`) to `<name>/<title>.txt`.

    Creates the novel's folder on first use. Uses os.path.join instead of a
    hard-coded backslash so the path works on any OS, and makedirs with
    exist_ok=True instead of a check-then-mkdir race.
    """
    os.makedirs(name, exist_ok=True)
    path = os.path.join(name, title + '.txt')
    # mode='a': chapters accumulate; re-running appends duplicates.
    with open(path, mode='a', encoding='utf-8') as f:
        f.write(title)
        f.write('\n')
        f.write(content)
        f.write('\n')
    print(title, '已经保存')


def get_novel_id(html_url):
    """Return the hrefs of every novel linked from a `.blocks` listing page."""
    novel_data = get_response(html_url=html_url).text
    selector = parsel.Selector(novel_data)
    href = selector.css('.blocks a::attr(href)').getall()
    return href


def search(word):
    """Search the site for `word`; print name, novel id and author per hit.

    :param word: novel title (or fragment) typed by the user.
    """
    search_url = f'https://www.biqugen.net/modules/article/search.php?searchkey={word}'
    search_data = get_response(html_url=search_url).text
    selector = parsel.Selector(search_data)
    rows = selector.css('.grid tr')
    for row in rows:
        name = row.css('td.odd a::text').get()
        href = row.css('td.odd a::attr(href)').get()
        if not href:
            # header row / empty row: no link, nothing to report
            continue
        novel_id = href.split('/')[-2]
        writer = row.css('td.odd:nth-child(2)::text').get()
        print(name, novel_id, writer)


def main(home_url):
    """Download every chapter of every novel linked from `home_url`."""
    href = get_novel_id(html_url=home_url)
    for novel_id in href:
        novel_url = f'https://www.biqugen.net{novel_id}'
        name, url_list = get_list_url(html_url=novel_url)
        print(name, url_list)
        for url in url_list:
            index_url = 'https://www.biqugen.net/' + url
            title, content = get_content(html_url=index_url)
            save(name, title, content)


if __name__ == '__main__':
    word = input('请输入你搜索的小说名:')
    search(word)