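# Learning exercise: scrape a web novel from biquge7.xyz chapter by chapter,
# save each chapter's text to a local .txt file, and record per-chapter
# metadata (URL, title, body length) in a CSV file.
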
import requests
import csv
from bs4 import BeautifulSoup

url = "https://www.biquge7.xyz/50227"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0"
}


# 1. Write the get_source function to download a page's HTML
def get_source(url, headers):
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    return None
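
# Note (an addition, not in the original): requests.get can also raise
# exceptions such as timeouts or connection errors. A more defensive variant
# of get_source would be:
#
#     try:
#         response = requests.get(url, headers=headers, timeout=10)
#         response.raise_for_status()
#         return response.text
#     except requests.RequestException:
#         return None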


# 2. Parse the page source and extract the URLs of all chapters
def parse_chapter_links(source, base_url):
    soup = BeautifulSoup(source, 'html.parser')
    # Assumes the chapter links sit in the href attributes of <a> tags inside
    # a specific div/list; adapt the CSS selector to the actual HTML structure.
    chapter_links = [base_url + link.get('href') for link in soup.select('div.list > ul > li > a')]
    return chapter_links


# 3. Visit each chapter page and fetch its source code
def fetch_chapter_sources(chapter_links, headers):
    chapter_sources = {}
    for link in chapter_links:
        source = get_source(link, headers)
        if source:
            chapter_sources[link] = source
    return chapter_sources
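
# Note (an addition, not in the original): to be polite to the server, you
# could pause between requests, e.g. time.sleep(0.5) at the top of the loop
# above; that would also need "import time" alongside the other imports.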


# 4. Parse each chapter's title and body text
def parse_chapter_content(source):
    soup = BeautifulSoup(source, 'html.parser')
    # Assumes the title and body live in specific HTML elements;
    # adapt the CSS selectors to the actual HTML structure.
    title = soup.select_one('h1.list_tit').get_text(strip=True)
    content = soup.select_one('div.text').get_text(strip=True, separator='\n')
    print(title)  # progress indicator: shows which chapter was just parsed
    return title, content
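
# Note (an addition, not in the original): select_one returns None when a
# selector matches nothing, and the chained .get_text would then raise
# AttributeError. A sturdier variant checks for None first:
#
#     title_tag = soup.select_one('h1.list_tit')
#     title = title_tag.get_text(strip=True) if title_tag else 'untitled'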


# 5. Save each chapter's body text to a local .txt file
def save_chapters_to_txt(chapters):
    for link, source in chapters.items():
        title, content = parse_chapter_content(source)
        with open(f'{title}.txt', 'w', encoding='utf-8') as f:
            f.write(content)
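
# Note (an addition, not in the original): chapter titles can contain
# characters that are illegal in filenames ('/', ':', '?', ...). A minimal
# sanitizer sketch with a hypothetical helper, using only the standard library:
#
#     import re
#     def safe_filename(title):
#         return re.sub(r'[\\/:*?"<>|]', '_', title)
#
# Applying safe_filename(title) before opening the file avoids OSError on
# awkward titles.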


# 6. Write each chapter's URL, title, and body length to a CSV file
def save_to_csv(chapters, csv_filename):
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['link', 'title', 'content_length']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for link, source in chapters.items():
            title, content = parse_chapter_content(source)
            writer.writerow({'link': link, 'title': title, 'content_length': len(content)})
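
# Note (an addition, not in the original): save_chapters_to_txt and
# save_to_csv each call parse_chapter_content, so every chapter is parsed
# twice. Parsing once into a list of (link, title, content) tuples and
# passing that to both writers would halve the work.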


# Main program
if __name__ == "__main__":
    base_url = "https://www.biquge7.xyz"
    source = get_source(url, headers)
    if source:
        chapter_links = parse_chapter_links(source, base_url)
        chapters = fetch_chapter_sources(chapter_links, headers)
        save_chapters_to_txt(chapters)
        save_to_csv(chapters, '偷偷藏不住.csv')