From cddb5567235937e4d3a03b9e4bdd93039c87bd96 Mon Sep 17 00:00:00 2001
From: pnth2uekf <707689196@qq.com>
Date: Sat, 1 Jun 2024 18:02:15 +0800
Subject: [PATCH] ADD file via upload
---
sousuo.py | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 94 insertions(+)
create mode 100644 sousuo.py
diff --git a/sousuo.py b/sousuo.py
new file mode 100644
index 0000000..7093e5f
--- /dev/null
+++ b/sousuo.py
@@ -0,0 +1,94 @@
+# 导入数据请求模块 --> 第三方模块, 需要安装
+import requests
+# 导入正则表达式模块 --> 内置模块, 不需要安装
+import re
+from bs4 import BeautifulSoup
+# 导入数据解析模块 --> 第三方模块, 需要安装
+import parsel
+# 导入文件操作模块 --> 内置模块, 不需要安装
+import os
+
def get_response(html_url, timeout=10):
    """Fetch *html_url* and return the ``requests.Response``.

    A browser User-Agent header is sent so the novel site serves the
    normal page instead of rejecting the client as a bot.

    :param html_url: absolute URL to fetch.
    :param timeout: seconds before the request aborts; the original had
        no timeout and could hang forever on a dead connection.
    :return: the raw ``requests.Response`` (caller reads ``.text``).
    """
    headers = {
        # Basic browser identity string (copied from a real Edge browser)
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0'
    }
    response = requests.get(url=html_url, headers=headers, timeout=timeout)
    return response
def get_list_url(html_url):
    """Return ``(novel_name, chapter_hrefs)`` scraped from a novel index page.

    The original extracted the name with an ``re.findall`` call whose
    pattern string was corrupted in transit (the markup inside the quotes
    was lost).  The name is now taken from the page's ``<h1>`` heading via
    the BeautifulSoup tree that is built anyway for the chapter links.
    NOTE(review): assumes the index page carries the title in ``<h1>`` —
    confirm against the live site markup.

    :param html_url: URL of the novel's table-of-contents page.
    :return: tuple of the novel title and a list of chapter href strings.
    """
    html_data = get_response(html_url).text
    soup = BeautifulSoup(html_data, 'html.parser')
    # Novel title from the main page heading (placeholder when absent so
    # one odd page does not abort the whole run)
    name_tag = soup.find('h1')
    name = name_tag.text.strip() if name_tag else '未知书名'
    # Chapter links live inside <dd> elements; skip <dd>s without a link
    url_list = []
    for tag in soup.find_all('dd'):
        link = tag.find('a')
        if link and link.get('href'):
            url_list.append(link['href'])
    return name, url_list
def get_content(html_url):
    """Return ``(title, content)`` for one chapter page.

    Missing elements yield placeholder strings instead of raising, so a
    single malformed page does not abort the whole download.

    :param html_url: URL of a chapter page.
    :return: tuple of chapter title and chapter body text.
    """
    html_data = get_response(html_url).text
    soup = BeautifulSoup(html_data, 'html.parser')
    # Chapter title
    title_tag = soup.find('h1', class_='wap_none')
    title = title_tag.text.strip() if title_tag else '未找到标题'
    # Chapter body.  The original ``.replace(…)`` argument string was
    # corrupted in transit; biquge-style pages indent paragraphs with
    # full-width spaces, so a pair of them is treated as a paragraph break.
    # NOTE(review): confirm the separator against the live page markup.
    content_tag = soup.find('div', id='chaptercontent')
    if content_tag:
        content = content_tag.text.strip().replace('\u3000\u3000', '\n')
    else:
        content = '未找到内容'
    return title, content
def save(name, title, content):
    """Append one chapter to ``<name>/<title>.txt``, creating the folder first.

    Fixes in this revision:
    - the directory path was built with a hard-coded Windows ``\\``;
      ``os.path.join`` makes it portable,
    - ``os.makedirs(..., exist_ok=True)`` removes the race between the
      ``exists()`` check and ``mkdir()`` and handles nested names,
    - characters illegal in file names are stripped from the title.

    :param name: novel name, used as the output directory.
    :param title: chapter title, used as file name and written as heading.
    :param content: chapter body text.
    """
    os.makedirs(name, exist_ok=True)
    # Strip characters that are illegal in file names on common OSes
    safe_title = re.sub(r'[\\/:*?"<>|]', '', title)
    path = os.path.join(name, safe_title + '.txt')
    # mode 'a' keeps the original append semantics
    with open(path, mode='a', encoding='utf-8') as f:
        f.write(title)
        f.write('\n')
        f.write(content)
        f.write('\n')
    print(title, '已经保存')
def get_novel_id(html_url):
    """Collect every novel link href from the site's home/section page.

    :param html_url: URL of the page listing novels.
    :return: list of href strings (site-relative novel paths).
    """
    page_text = get_response(html_url=html_url).text
    sel = parsel.Selector(page_text)
    # Every anchor under the .blocks container points at a novel page
    return sel.css('.blocks a::attr(href)').getall()
+
def search(word):
    """Search the site for *word* and print ``name, novel_id, writer`` per hit.

    Rows without a result link (e.g. the table header row) are now skipped;
    the original called ``.get().split(...)`` unconditionally and raised
    ``AttributeError`` when ``.get()`` returned ``None``.

    :param word: search keyword typed by the user.
    """
    search_url = f'https://www.biqugen.net/modules/article/search.php?searchkey={word}'
    search_data = get_response(html_url=search_url).text
    selector = parsel.Selector(search_data)
    for row in selector.css('.grid tr'):
        href = row.css('td.odd a::attr(href)').get()
        if not href:
            # header row or malformed entry — nothing to extract
            continue
        name = row.css('td.odd a::text').get()
        # the novel id is the second-to-last path segment of the link
        novel_id = href.split('/')[-2]
        writer = row.css('td.odd:nth-child(2)::text').get()
        print(name, novel_id, writer)
+
+
+
def main(home_url):
    """Download every novel linked from *home_url*, chapter by chapter.

    :param home_url: page whose ``.blocks`` anchors list the novels.
    """
    for novel_path in get_novel_id(html_url=home_url):
        novel_url = f'https://www.biqugen.net{novel_path}'
        name, url_list = get_list_url(html_url=novel_url)
        print(name, url_list)
        for chapter_href in url_list:
            chapter_url = 'https://www.biqugen.net/' + chapter_href
            title, content = get_content(html_url=chapter_url)
            save(name, title, content)
+
if __name__ == '__main__':
    # Prompt for a keyword and show matching novels with their ids
    keyword = input('请输入你搜索的小说名:')
    search(keyword)
\ No newline at end of file