import requests
from lxml import etree
import os
import csv

def get_source(url):
    """
    Fetch the HTML source of a page.
    :param url: page URL
    :return: page source as text
    """
    response = requests.get(url)
    response.raise_for_status()  # check that the request succeeded
    # requests appears to decode the body as ISO-8859-1 (its fallback when the
    # response headers give no charset), so re-encode and decode as UTF-8 to
    # get readable Chinese text.
    text = response.text.encode('iso-8859-1').decode('utf-8')
    return text

def get_chapter_links(source):
    """
    Extract the URL of every chapter from the index page.
    :param source: HTML source of the index page
    :return: list of chapter URLs
    """
    html = etree.HTML(source)
    chapter_links = html.xpath('//div[@class="book-list clearfix"]//a/@href')
    # print(chapter_links)
    return chapter_links

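# Note: the XPath above returns the href attributes exactly as they appear in
# the page. If the site ever served relative paths instead of full URLs, they
# could be resolved against the index page before fetching, e.g. (sketch only,
# not part of the original flow):
#   from urllib.parse import urljoin
#   chapter_links = [urljoin("https://www.xingyueboke.com/sudongpozhuan/", link)
#                    for link in chapter_links]
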
def get_chapter_content(chapter_url):
    """
    Fetch a chapter page and extract its title and body text.
    :param chapter_url: URL of the chapter page
    :return: (title, content) tuple
    """
    chapter_source = get_source(chapter_url)
    html = etree.HTML(chapter_source)
    title = html.xpath('//*[@id="nr_title"]/text()')[0]
    content = html.xpath('//*[@id="nr1"]/div/p/text()')
    content = '\n'.join(content)  # join the paragraphs into one block of text
    return title, content


# print(get_chapter_content("https://www.xingyueboke.com/sudongpozhuan/85218.html"))

def save_chapter_to_txt(title, content):
    """
    Write a chapter's content to a local text file named after its title.
    :param title: chapter title, used as the file name
    :param content: chapter body text
    :return: None
    """
    with open(f"D:/{title}.txt", "w", encoding="utf-8") as file:
        file.write(content)

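# Optional helper (a sketch; safe_filename is a hypothetical addition that the
# rest of the script does not call): chapter titles are used verbatim as
# Windows file names above, so a title containing any of \ / : * ? " < > |
# would make open() raise OSError. This shows one way to guard against that,
# assuming '_' is an acceptable replacement character.
def safe_filename(title):
    import re
    return re.sub(r'[\\/:*?"<>|]', '_', title)
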
def write_to_csv(chapter_links, chapter_data):
    """
    Write one CSV row per chapter: its URL, title, and content length.
    """
    with open('苏东坡传.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['url', 'title', 'content_length']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for url, (title, content) in zip(chapter_links, chapter_data):
            writer.writerow({'url': url, 'title': title, 'content_length': len(content)})

def main():
    novel_url = "https://www.xingyueboke.com/sudongpozhuan/"
    source = get_source(novel_url)
    chapter_links = get_chapter_links(source)
    chapter_data = []

    for link in chapter_links:
        title, content = get_chapter_content(link)
        save_chapter_to_txt(title, content)
        chapter_data.append((title, content))

    write_to_csv(chapter_links, chapter_data)

if __name__ == "__main__":
    main()