parent
9009811b3e
commit
cddb556723
@ -0,0 +1,94 @@
|
||||
# 导入数据请求模块 --> 第三方模块, 需要安装
|
||||
import requests
|
||||
# 导入正则表达式模块 --> 内置模块, 不需要安装
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
# 导入数据解析模块 --> 第三方模块, 需要安装
|
||||
import parsel
|
||||
# 导入文件操作模块 --> 内置模块, 不需要安装
|
||||
import os
|
||||
|
||||
def get_response(html_url, timeout=10):
    """Send a GET request to *html_url* and return the raw Response.

    Args:
        html_url: URL to fetch.
        timeout: seconds to wait for the server before giving up
            (added with a default so existing callers are unchanged;
            without it a stalled server hangs the crawler forever).

    Returns:
        requests.Response: the completed GET response.
    """
    # Spoof a desktop-browser User-Agent so the site serves normal pages
    # instead of blocking the default python-requests identity.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0'
    }
    response = requests.get(url=html_url, headers=headers, timeout=timeout)
    return response
|
||||
def get_list_url(html_url):
    """Fetch a novel's table-of-contents page and extract name + chapter links.

    Args:
        html_url: URL of the novel's index (chapter list) page.

    Returns:
        tuple[str, list[str]]: the novel name taken from the page's <h1>
        (empty string when no <h1> exists), and the href of every
        <a> found inside a <dd> element, in document order.
    """
    html_data = get_response(html_url).text
    soup = BeautifulSoup(html_data, 'html.parser')
    # Novel title lives in the page's <h1>.  The original code ran
    # re.findall('<h1>(.*?)</h1>')[0] on the raw HTML, which raised
    # IndexError whenever the tag was missing or split across lines;
    # parsing via the soup we already built is robust to both.
    h1 = soup.find('h1')
    name = h1.get_text(strip=True) if h1 else ''
    # Each chapter link is an <a> nested in a <dd>; skip <dd> without one.
    url_list = [dd.find('a')['href'] for dd in soup.find_all('dd') if dd.find('a')]
    return name, url_list
|
||||
def get_content(html_url):
    """Fetch a chapter page and extract its title and body text.

    Args:
        html_url: URL of a single chapter page.

    Returns:
        tuple[str, str]: (title, content).  The original Chinese
        fallback strings are returned unchanged when the expected
        tags are absent, so callers see the same sentinel values.
    """
    html_data = get_response(html_url).text
    soup = BeautifulSoup(html_data, 'html.parser')
    # Chapter title is rendered in <h1 class="wap_none">.
    title_tag = soup.find('h1', class_='wap_none')
    if title_tag:
        title = title_tag.text.strip()
    else:
        title = "未找到标题"
    # Chapter body is the <div id="chaptercontent"> element.
    content_tag = soup.find('div', id='chaptercontent')
    if content_tag:
        # BUG FIX: `.text` has already dropped every tag, so the old
        # .replace('<br>', '\n') never matched anything and the whole
        # chapter ran together on one line.  get_text(separator='\n')
        # converts each <br>/tag boundary into a real newline instead.
        content = content_tag.get_text(separator='\n').strip()
    else:
        content = "未找到内容"
    return title, content
|
||||
def save(name, title, content):
    """Append one chapter to '<name>/<title>.txt', creating the folder first.

    Args:
        name: novel name, used as the output directory.
        title: chapter title, used as the file name and written as the
            first line of the file.
        content: chapter body text, written after the title.
    """
    # os.path.join keeps this portable: the original f'{name}\\' hard-coded
    # the Windows separator, so on POSIX systems it created files literally
    # named 'name\title.txt' in the working directory instead of a folder.
    # makedirs(exist_ok=True) also removes the exists-check/mkdir race.
    os.makedirs(name, exist_ok=True)
    file_path = os.path.join(name, title + '.txt')
    with open(file_path, mode='a', encoding='utf-8') as f:
        # Title on its own line, then the chapter body.
        f.write(title)
        f.write('\n')
        f.write(content)
        f.write('\n')
    print(title, '已经保存')
|
||||
def get_novel_id(html_url):
    """Return the href of every novel link on the site's listing page.

    Args:
        html_url: URL of the page whose '.blocks' section lists novels.

    Returns:
        list[str]: all hrefs found under elements with class 'blocks'.
    """
    # Download the listing page and hand the HTML to parsel.
    page_text = get_response(html_url=html_url).text
    sel = parsel.Selector(page_text)
    # Every <a> beneath a .blocks container points at one novel.
    return sel.css('.blocks a::attr(href)').getall()
|
||||
|
||||
def search(word):
    """Search the site for *word* and print one (name, id, author) line per hit.

    Args:
        word: novel-name keyword typed by the user.
    """
    search_url = f'https://www.biqugen.net/modules/article/search.php?searchkey={word}'
    search_data = get_response(html_url=search_url).text
    selector = parsel.Selector(search_data)
    # The results table (.grid) mixes a header row in with the data rows.
    for row in selector.css('.grid tr'):
        link = row.css('td.odd a')
        # Header/blank rows carry no <td class="odd"><a>; the original
        # called .get().split('/') there and crashed with AttributeError
        # because .get() returns None when nothing matches.
        if not link:
            continue
        name = link.css('::text').get()
        # The novel id is the second-to-last path segment of the href.
        novel_id = link.css('::attr(href)').get().split('/')[-2]
        writer = row.css('td.odd:nth-child(2)::text').get()
        print(name, novel_id, writer)
|
||||
|
||||
|
||||
|
||||
def main(home_url):
    """Crawl every novel linked from *home_url* and save all of their chapters.

    Args:
        home_url: URL of the listing page to start from.
    """
    # One href per novel on the listing page.
    for novel_path in get_novel_id(html_url=home_url):
        novel_url = f'https://www.biqugen.net{novel_path}'
        # Resolve the novel's name and its full chapter URL list.
        name, url_list = get_list_url(html_url=novel_url)
        print(name, url_list)
        # Download and persist each chapter in order.
        for chapter_path in url_list:
            chapter_url = 'https://www.biqugen.net/' + chapter_path
            title, content = get_content(html_url=chapter_url)
            save(name, title, content)
|
||||
|
||||
if __name__ == '__main__':
    # Interactive entry point: prompt for a keyword and run the site search.
    keyword = input('请输入你搜索的小说名:')
    search(keyword)
|
Loading…
Reference in new issue