ADD file via upload

10 months ago · 13a93b11d4
parent 91951c7758
commit 13a93b11d4
1 changed files with 82 additions and 0 deletions
--- a/苏东坡.py
+++ b/苏东坡.py
@@ -0,0 +1,82 @@
+import requests
+from lxml import etree
+import os
+import csv
+
+
+def get_source(url):
+    """
+    获取网页源代码
+    :param url:
+    :return: 网页源代码
+    """
+    response = requests.get(url)
+    response.raise_for_status()  # 检查请求是否成功
+    response = response.text.encode('iso-8859-1').decode('utf-8')
+    return response
+
+
+def get_chapter_links(source):
+    """
+    获取每一章节的url
+    :param source:
+    :return: 章节的url
+    """
+    html = etree.HTML(source)
+    chapter_links = html.xpath('//div[@class="book-list clearfix"]//a/@href')
+    # print(chapter_links)
+    return chapter_links
+
+
+def get_chapter_content(chapter_url):
+    """
+    获取章节的标题和正文
+    :param chapter_url:
+    :return:
+    """
+    chapter_source = get_source(chapter_url)
+    html = etree.HTML(chapter_source)
+    title = html.xpath('//*[@id="nr_title"]/text()')[0]
+    content = html.xpath('//*[@id="nr1"]/div/p/text()')
+    content = '\n'.join(content)  # 合并段落文本
+    return title, content
+
+
+# print(get_chapter_content("https://www.xingyueboke.com/sudongpozhuan/85218.html"))
+
+def save_chapter_to_txt(title, content):
+    """
+    将每一章节的标题和内容写入本地
+    :param title:
+    :param content:
+    :return:
+    """
+    with open(f"D:/{title}.txt", "w", encoding="utf-8") as file:
+        file.write(content)
+
+
+def write_to_csv(chapter_links, chapter_data):
+    with open('苏东坡传.csv', 'w', newline='', encoding='utf-8') as csvfile:
+        fieldnames = ['url', 'title', 'content_length']
+        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+        writer.writeheader()
+        for url, (title, content) in zip(chapter_links, chapter_data):
+            writer.writerow({'url': url, 'title': title, 'content_length': len(content)})
+
+
+def main():
+    novel_url = "https://www.xingyueboke.com/sudongpozhuan/"
+    source = get_source(novel_url)
+    chapter_links = get_chapter_links(source)
+    chapter_data = []
+
+    for link in chapter_links:
+        title, content = get_chapter_content(link)
+        save_chapter_to_txt(title, content)
+        chapter_data.append((title, content))
+
+    write_to_csv(chapter_links, chapter_data)
+
+
+# Run the scraper only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()