You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

84 lines
3.0 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import csv
import re

import requests
from bs4 import BeautifulSoup
# Table-of-contents page for the target novel on biquge7.xyz.
url = "https://www.biquge7.xyz/50227"
# Browser-like User-Agent so the site does not reject the requests.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0"
}
# 1. Fetch a page's HTML.
def get_source(url, headers):
    """Fetch *url* and return its HTML text, or None on a non-200 response.

    Args:
        url: Page URL to download.
        headers: HTTP request headers (User-Agent etc.).

    Returns:
        The response body as text, or None if the request did not
        succeed with HTTP 200.
    """
    response = requests.get(url, headers=headers)
    # The original had an unreachable `return None` after the first
    # return; the intent (callers test `if source:`) was clearly to
    # signal failure, so gate on the status code.
    if response.status_code == 200:
        return response.text
    return None
# 2. Parse the table-of-contents page and collect all chapter URLs.
def parse_chapter_links(source, base_url="https://www.biquge7.xyz"):
    """Parse the index page HTML and return absolute chapter URLs.

    Args:
        source: HTML of the novel's table-of-contents page.
        base_url: Site root prepended to each relative chapter href.
            Previously this was read from a global assigned only inside
            the __main__ block, which raised NameError when the function
            was used anywhere else; the default keeps callers working.

    Returns:
        list[str]: Absolute chapter page URLs in page order.
    """
    soup = BeautifulSoup(source, 'html.parser')
    # NOTE: the CSS selector is site-specific; adjust it if the page
    # layout changes.
    return [base_url + link.get('href')
            for link in soup.select('div.list > ul > li > a')]
# 3. Download every chapter page.
def fetch_chapter_sources(chapter_links, headers):
    """Download each chapter page; return {url: html} for successful fetches.

    Chapters whose download yields a falsy result are silently skipped.
    """
    pages = {}
    for chapter_url in chapter_links:
        html = get_source(chapter_url, headers)
        if not html:
            continue
        pages[chapter_url] = html
    return pages
# 4. Extract the title and body text of one chapter.
def parse_chapter_content(source):
    """Extract the chapter title and body text from a chapter page.

    Args:
        source: HTML of a single chapter page.

    Returns:
        tuple[str, str]: (title, content); content joins text blocks
        with newline separators.
    """
    soup = BeautifulSoup(source, 'html.parser')
    # Selectors are site-specific; adjust if the page layout changes.
    title = soup.select_one('h1.list_tit').get_text(strip=True)
    content = soup.select_one('div.text').get_text(strip=True, separator='\n')
    # Removed leftover debug `print(title)` that wrote to stdout on
    # every chapter parsed.
    return title, content
# 5. Write each chapter's body text to a local .txt file.
def save_chapters_to_txt(chapters):
    """Write each chapter's body to '<title>.txt' in the working directory.

    Args:
        chapters: Mapping of chapter URL -> chapter page HTML.
    """
    for link, source in chapters.items():
        title, content = parse_chapter_content(source)
        # Titles may contain characters that are illegal in filenames
        # (e.g. / \ : * ? " < > |); replace them so open() cannot fail.
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
        with open(f'{safe_title}.txt', 'w', encoding='utf-8') as f:
            f.write(content)
# 6. Record each chapter's URL, title and body length as one CSV row.
def save_to_csv(chapters, csv_filename):
    """Summarize the downloaded chapters into a CSV file.

    Each row holds the chapter URL, its title, and the length of its
    body text, under the headers 链接 / 标题 / 正文长度.
    """
    with open(csv_filename, 'w', newline='', encoding='utf-8') as handle:
        columns = ['链接', '标题', '正文长度']
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        for chapter_url, page_html in chapters.items():
            chapter_title, body = parse_chapter_content(page_html)
            writer.writerow({'链接': chapter_url,
                             '标题': chapter_title,
                             '正文长度': len(body)})
# Entry point: download the table of contents, then every chapter,
# and persist them as per-chapter .txt files plus a summary CSV.
if __name__ == "__main__":
    base_url = "https://www.biquge7.xyz"
    index_html = get_source(url, headers)
    if index_html:
        links = parse_chapter_links(index_html)
        pages = fetch_chapter_sources(links, headers)
        save_chapters_to_txt(pages)
        save_to_csv(pages, '偷偷藏不住.csv')