From cefbd68281235ed39fdb91e650d891a037dadf64 Mon Sep 17 00:00:00 2001
From: ph275ue6c <2370007971@qq.com>
Date: Wed, 17 Apr 2024 16:16:04 +0800
Subject: [PATCH] ADD file via upload

---
 2.py | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 2.py

diff --git a/2.py b/2.py
new file mode 100644
index 0000000..c49aa69
--- /dev/null
+++ b/2.py
@@ -0,0 +1,63 @@
+import requests
+import os
+import csv
+from lxml import etree
+
+start_url="https://www.xingyueboke.com/sudongpozhuan/"  # index page; default URL for get_source(), scanned for chapter links
+h={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"}  # headers passed to requests.get() in get_source()
+
+def get_source(url=start_url):
+    """Fetch url and return its body decoded as UTF-8, or "" on a non-200 status; request errors propagate."""
+    response = requests.get(url, headers=h, timeout=10)  # timeout: never hang forever on a stalled server
+    if response.status_code == 200:
+        return response.content.decode('utf-8')
+    print("请求失败,状态码为{}".format(response.status_code))
+    return ""
+
+source = get_source()  # runs at import time: downloads the page at the default start_url
+
+def get_chapter_urls(start_source):
+    """Return the chapter link hrefs found in the index-page HTML ([] for empty input)."""
+    if not start_source:
+        return []  # get_source() returns "" on failure, leaving nothing to parse
+    selector = etree.HTML(start_source)
+    # xpath() already returns a fresh list, so the old element-by-element copy was redundant.
+    return selector.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href')
+
+urls = get_chapter_urls(source)  # chapter hrefs extracted from the index page; iterated by the loop below
+
+def get_article(article_html):
+    """Return (first <h1> text, string value of the div under div#nr1); IndexError if no <h1> text exists."""
+    tree = etree.HTML(article_html)
+    heading = tree.xpath('//h1/text()')[0]
+    return heading, tree.xpath('string(//div[@id="nr1"]/div)')
+
+def save(title,content):
+    """Write content as UTF-8 to <title>.txt inside the 苏东坡传 directory, creating it if needed."""
+    os.makedirs("苏东坡传", exist_ok=True)  # race-free replacement for exists() + makedirs()
+    # The title is scraped text: neutralise path separators so the file stays inside the directory.
+    safe_title = title.replace("/", "_").replace("\\", "_")
+    with open(os.path.join("苏东坡传", safe_title + ".txt"), "w", encoding='utf-8') as f:
+        f.write(content)
+
+def saveCsv(list):
+    """Overwrite 苏东坡传/苏东坡传.csv with a header row followed by the given rows."""
+    # NOTE: the parameter shadows the builtin `list`; the name is kept so existing callers still work.
+    os.makedirs("苏东坡传", exist_ok=True)  # race-free replacement for exists() + makedirs()
+    with open("苏东坡传/苏东坡传.csv", "w", encoding="utf-8", newline='') as f:
+        writer = csv.writer(f)  # newline='' above leaves line endings to the csv module
+        writer.writerow(["网页地址", "标题", "正文长度"])
+        writer.writerows(list)
+rows = []  # one [url, title, content length] record per chapter; renamed from `list`, which hid the builtin
+for url in urls:
+    article_html = get_source(url)
+    if article_html:  # "" means the download failed and get_source() already reported it
+        title, content = get_article(article_html)
+        print(title)
+        rows.append([url, title, len(content)])
+        save(title, content)
+        saveCsv(rows)  # rewrites the whole CSV each pass (presumably to keep partial progress; confirm)
+# if __name__=='__main__':
+#     source = get_source()
+#     urls=get_chapter_urls(source)
+