"""Scrape a novel from xingyueboke.com.

Each chapter is saved as its own UTF-8 text file, and a CSV summary
(chapter URL, title, content length) is written for the whole book.
"""

import csv
import os
import re
from urllib.parse import urljoin

import requests
from lxml import etree

# Seconds to wait for the server before a request is abandoned.
REQUEST_TIMEOUT = 10

# Characters that Windows refuses in file names, plus ASCII control characters.
_INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def get_source(url, timeout=REQUEST_TIMEOUT):
    """Fetch a page and return its body decoded as UTF-8.

    :param url: address of the page to download
    :param timeout: seconds to wait for the server before giving up
    :return: page source as text
    :raises requests.HTTPError: if the server answers with an error status
    :raises UnicodeDecodeError: if the body is not valid UTF-8
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # fail loudly on 4xx/5xx responses
    # Decode the raw bytes directly. Round-tripping ``response.text`` through
    # ISO-8859-1 only works when requests guessed that encoding and raises
    # UnicodeEncodeError whenever the server declares a different charset.
    return response.content.decode('utf-8')


def get_chapter_links(source, base_url=None):
    """Extract the chapter URLs from the book's index page.

    :param source: HTML source of the index page
    :param base_url: optional URL of the index page; when given, every link
        is resolved against it so relative hrefs become absolute URLs
    :return: list of chapter URLs in document order
    """
    html = etree.HTML(source)
    chapter_links = html.xpath('//div[@class="book-list clearfix"]//a/@href')
    if base_url is None:
        return chapter_links
    # urljoin leaves links that are already absolute untouched.
    return [urljoin(base_url, link) for link in chapter_links]


def get_chapter_content(chapter_url):
    """Download one chapter and return its title and body text.

    :param chapter_url: URL of the chapter page
    :return: ``(title, content)``; paragraphs in ``content`` are joined
        with newlines
    :raises IndexError: if the page contains no title element
    """
    chapter_source = get_source(chapter_url)
    html = etree.HTML(chapter_source)
    titles = html.xpath('//*[@id="nr_title"]/text()')
    if not titles:
        # Same exception type a bare ``[0]`` would raise, but it names the page.
        raise IndexError(f"no chapter title found at {chapter_url}")
    paragraphs = html.xpath('//*[@id="nr1"]/div/p/text()')
    return titles[0], '\n'.join(paragraphs)


def save_chapter_to_txt(title, content, output_dir="D:/"):
    """Write a chapter's text to ``<output_dir>/<title>.txt``.

    :param title: chapter title, used as the file name; characters that are
        invalid in file names are replaced with ``_``
    :param content: chapter text to write
    :param output_dir: directory to write into; created if it is missing
    :return: None
    """
    safe_title = _INVALID_FILENAME_CHARS.sub("_", title)
    os.makedirs(output_dir, exist_ok=True)
    path = os.path.join(output_dir, f"{safe_title}.txt")
    with open(path, "w", encoding="utf-8") as file:
        file.write(content)


def write_to_csv(chapter_links, chapter_data):
    """Write a per-chapter summary to ``苏东坡传.csv`` in the working directory.

    :param chapter_links: chapter URLs
    :param chapter_data: ``(title, content)`` tuples, in the same order as
        ``chapter_links``
    :return: None
    """
    with open('苏东坡传.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['url', 'title', 'content_length']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for url, (title, content) in zip(chapter_links, chapter_data):
            writer.writerow({'url': url, 'title': title, 'content_length': len(content)})


def main():
    """Download every chapter, save each to disk, then write the CSV summary."""
    novel_url = "https://www.xingyueboke.com/sudongpozhuan/"
    source = get_source(novel_url)
    chapter_links = get_chapter_links(source, base_url=novel_url)
    chapter_data = []

    for link in chapter_links:
        title, content = get_chapter_content(link)
        save_chapter_to_txt(title, content)
        chapter_data.append((title, content))

    write_to_csv(chapter_links, chapter_data)


if __name__ == "__main__":
    main()