import csv

import requests
from bs4 import BeautifulSoup

# Index page of the novel, and the site root used to resolve relative links.
# (base_url is defined here rather than in the main block so that
# parse_chapter_links does not depend on a global assigned elsewhere.)
url = "https://www.biquge7.xyz/50227"
base_url = "https://www.biquge7.xyz"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0"
}


# 1. Fetch the HTML source of a page; return None on a non-200 response
#    so callers can skip failed requests.
def get_source(url, headers):
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    return None
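
# Note (assumption, not in the original): some sites mis-declare their charset.
# If the downloaded text comes out garbled, letting requests sniff the encoding
# inside get_source, before reading response.text, usually helps:
#     response.encoding = response.apparent_encoding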


# 2. Parse the index page to collect the preface and every chapter's URL.
def parse_chapter_links(source):
    soup = BeautifulSoup(source, 'html.parser')
    # The chapter links are assumed to sit in the href attributes of <a> tags
    # inside a list container; adjust this CSS selector to match the site's
    # actual HTML structure.
    chapter_links = [base_url + link.get('href') for link in soup.select('div.list > ul > li > a')]
    return chapter_links
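
# Assumption: the hrefs on this site are relative paths, so plain string
# concatenation with base_url works. urllib.parse.urljoin is a safer choice
# if some hrefs turn out to be absolute:
#     from urllib.parse import urljoin
#     chapter_links = [urljoin(base_url, a.get('href'))
#                      for a in soup.select('div.list > ul > li > a')]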


# 3. Visit every chapter page and collect each chapter's HTML source.
def fetch_chapter_sources(chapter_links, headers):
    chapter_sources = {}
    for link in chapter_links:
        source = get_source(link, headers)
        if source:
            chapter_sources[link] = source
    return chapter_sources
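
# Assumption: fetching hundreds of chapters back-to-back may get the client
# throttled or blocked. A short pause inside the loop is a polite default:
#     import time
#     time.sleep(0.5)  # after each get_source(link, headers) call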


# 4. Parse one chapter's title and body text.
def parse_chapter_content(source):
    soup = BeautifulSoup(source, 'html.parser')
    # The title and body are assumed to live in these elements; adjust the
    # CSS selectors to match the site's actual HTML structure.
    title = soup.select_one('h1.list_tit').get_text(strip=True)
    content = soup.select_one('div.text').get_text(strip=True, separator='\n')
    print(title)  # progress indicator
    return title, content
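
# Note: select_one returns None when a selector matches nothing, so a stale
# selector surfaces here as an AttributeError. An explicit guard makes the
# failure clearer (sketch):
#     node = soup.select_one('h1.list_tit')
#     if node is None:
#         raise ValueError('title selector matched nothing -- update it')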


# 5. Save each chapter's body text to a local txt file named after its title.
def save_chapters_to_txt(chapters):
    for link, source in chapters.items():
        title, content = parse_chapter_content(source)
        with open(f'{title}.txt', 'w', encoding='utf-8') as f:
            f.write(content)
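
# Assumption: chapter titles can contain characters that are illegal in file
# names (e.g. '?', ':', '/'). A minimal sanitizer -- hypothetical helper, not
# part of the original flow -- could be applied before building the path:
#     import re
#     def safe_filename(title):
#         return re.sub(r'[\\/:*?"<>|]', '_', title)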


# 6. Write each chapter's URL, title, and body length to a CSV file.
def save_to_csv(chapters, csv_filename):
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['链接', '标题', '正文长度']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for link, source in chapters.items():
            title, content = parse_chapter_content(source)
            writer.writerow({'链接': link, '标题': title, '正文长度': len(content)})
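
# Note: Excel does not auto-detect plain UTF-8, so the Chinese headers may
# display garbled there; writing with encoding='utf-8-sig' (UTF-8 with BOM)
# is the usual fix if Excel compatibility matters.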


# Main program
if __name__ == "__main__":
    source = get_source(url, headers)
    if source:
        chapter_links = parse_chapter_links(source)
        chapters = fetch_chapter_sources(chapter_links, headers)
        save_chapters_to_txt(chapters)
        save_to_csv(chapters, '偷偷藏不住.csv')