From fbb88fcb5087459a98f009ede493cdb7dec98db5 Mon Sep 17 00:00:00 2001
From: ph275ue6c <2370007971@qq.com>
Date: Wed, 17 Apr 2024 17:18:27 +0800
Subject: [PATCH] Finish the code for 苏东坡传
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 苏东坡传.py | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 苏东坡传.py

diff --git a/苏东坡传.py b/苏东坡传.py
new file mode 100644
index 0000000..cad27a8
--- /dev/null
+++ b/苏东坡传.py
@@ -0,0 +1,92 @@
+import csv
+import os
+
+import requests
+from lxml import etree
+
+start_url = "https://www.xingyueboke.com/sudongpozhuan/"
+
+# Fetch the source of the index page.
+def get_source(url=start_url):
+    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"}
+    response = requests.get(url, headers=headers, timeout=10)
+    if response.status_code == 200:
+        return response.text
+    else:
+        print("Request failed with status code {}".format(response.status_code))
+        return ''
+
+# Build the URLs of the preface and all 28 chapters (pages 85210-85237).
+def get_html():
+    urls = []
+    for i in range(10, 38):
+        url = 'https://www.xingyueboke.com/sudongpozhuan/852' + str(i) + '.html'
+        urls.append(url)
+    return urls
+
+# Download the page source of every chapter.
+def get_text():
+    urls = get_html()
+    html = [''] * len(urls)  # one slot per chapter, indexes aligned with urls
+    headers = {
+        'User-Agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Mobile Safari/537.36"
+    }
+    for index, url in enumerate(urls):
+        response = requests.get(url=url, headers=headers, timeout=10)
+        response.encoding = 'utf-8'
+        if response.status_code == 200:
+            html[index] = response.text
+        else:
+            # Leave the empty placeholder so later indexes stay aligned.
+            print("Request failed with status code {}".format(response.status_code))
+    return html
+
+# Parse the title and body text of each chapter.
+def get_article():
+    html_list = get_text()
+    articles = []
+    for html in html_list:
+        if not html:
+            # etree.HTML('') raises ParserError, so keep a placeholder instead.
+            articles.append(('', ''))
+            continue
+        selector = etree.HTML(html)
+        title = selector.xpath('//*[@id="nr_title"]/text()')
+        title = title[0].strip() if title else ''
+        # string() already flattens the div to plain text, so no [0] indexing.
+        content = selector.xpath('string(//*[@id="nr1"]/div)')
+        articles.append((title, content))
+    return articles
+
+# Save each chapter's body text to a local txt file named after its title.
+def save(articles):
+    for title, content in articles:
+        if not title:
+            continue  # skip chapters whose download or parse failed
+        filename = title + '.txt'
+        # Check first so an existing file is not overwritten.
+        if not os.path.exists(filename):
+            with open(filename, 'w', encoding='utf-8') as f:
+                f.write(content)
+        else:
+            print(f"File {filename} already exists, skipping.")
+
+# Write each chapter's URL, title and body length to a CSV file.
+def save_to_csv(filename, urls, articles):
+    fieldnames = ['url', 'title', 'content_length']
+    with open(filename, mode='w', newline='', encoding='utf-8') as file:
+        writer = csv.DictWriter(file, fieldnames=fieldnames)
+        writer.writeheader()
+        for url, (title, content) in zip(urls, articles):
+            writer.writerow({
+                'url': url,
+                'title': title,
+                'content_length': len(content) if content else 0,
+            })
+
+if __name__ == '__main__':
+    # Crawl once and reuse the result; calling get_article() from both
+    # save() and save_to_csv() would download all 28 chapters twice.
+    articles = get_article()
+    save(articles)
+    save_to_csv('苏东坡传.csv', get_html(), articles)
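
Note on the hard-coded URL range: get_source() is defined in the patch but never
called, while get_html() hard-codes page IDs 85210-85237. A minimal sketch of how
the chapter list could instead be derived from the index page, reusing the patch's
get_source() and start_url (the '//a/@href' XPath and the '.html' filter are
assumptions, not taken from the site's actual markup):

    from urllib.parse import urljoin
    from lxml import etree

    def get_chapter_urls():
        source = get_source()  # index page HTML via the patch's helper
        if not source:
            return []
        selector = etree.HTML(source)
        # Assumption: chapter pages appear as relative <a href="NNNNN.html">
        # links; confirm the real container id/class against the page source.
        hrefs = selector.xpath('//a/@href')
        return [urljoin(start_url, h) for h in hrefs if h.endswith('.html')]

This costs one extra request but would keep the crawler working if the site ever
renumbers its chapter pages.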