import csv
import re

import requests
from lxml import etree
from urllib.parse import urljoin


def get_source(url):
    """Fetch the HTML source of a page.

    :param url: page URL
    :return: decoded HTML source
    """
    response = requests.get(url)
    response.raise_for_status()  # raise if the request failed
    # The site serves UTF-8 but does not declare a charset, so requests
    # would otherwise fall back to ISO-8859-1; set the encoding explicitly.
    response.encoding = 'utf-8'
    return response.text


def get_chapter_links(source):
    """Extract every chapter URL from the book's index page.

    :param source: HTML source of the index page
    :return: list of chapter URLs
    """
    html = etree.HTML(source)
    chapter_links = html.xpath('//div[@class="book-list clearfix"]//a/@href')
    return chapter_links


def get_chapter_content(chapter_url):
    """Fetch a chapter page and extract its title and body text.

    :param chapter_url: URL of the chapter page
    :return: (title, content) tuple
    """
    chapter_source = get_source(chapter_url)
    html = etree.HTML(chapter_source)
    title = html.xpath('//*[@id="nr_title"]/text()')[0]
    content = html.xpath('//*[@id="nr1"]/div/p/text()')
    content = '\n'.join(content)  # join the paragraphs into one string
    return title, content

# print(get_chapter_content("https://www.xingyueboke.com/sudongpozhuan/85218.html"))


def save_chapter_to_txt(title, content):
    """Write one chapter to a local text file named after its title.

    :param title: chapter title (used as the file name)
    :param content: chapter body text
    """
    # The title becomes part of the path, so strip characters that are
    # invalid in Windows file names before opening the file.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
    with open(f"D:/{safe_title}.txt", "w", encoding="utf-8") as file:
        file.write(content)


def write_to_csv(chapter_links, chapter_data):
    """Write a per-chapter summary (url, title, content length) to a CSV file."""
    with open('苏东坡传.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['url', 'title', 'content_length']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for url, (title, content) in zip(chapter_links, chapter_data):
            writer.writerow({'url': url,
                             'title': title,
                             'content_length': len(content)})


def main():
    novel_url = "https://www.xingyueboke.com/sudongpozhuan/"
    source = get_source(novel_url)
    chapter_links = get_chapter_links(source)
    # The hrefs on the index page may be relative; urljoin resolves both
    # relative and absolute links against the index URL.
    chapter_links = [urljoin(novel_url, link) for link in chapter_links]

    chapter_data = []
    for link in chapter_links:
        title, content = get_chapter_content(link)
        save_chapter_to_txt(title, content)
        chapter_data.append((title, content))

    write_to_csv(chapter_links, chapter_data)


if __name__ == "__main__":
    main()
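
# Usage note: a sketch of what running this file does, assuming the site's
# layout is unchanged and D:/ is writable. It fetches the chapter index at
# https://www.xingyueboke.com/sudongpozhuan/, saves each chapter to
# "D:/<title>.txt", and records url/title/content_length for every chapter
# in 苏东坡传.csv. For a quick single-chapter smoke test, uncomment the
# get_chapter_content(...) call below the function definition and run it
# on its own before crawling the whole book.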