From 184a7251489d27ec35488ddaed54f29373c65359 Mon Sep 17 00:00:00 2001
From: pqk5c82f3 <2042437448@qq.com>
Date: Sat, 13 Apr 2024 12:29:28 +0800
Subject: [PATCH] ADD file via upload

---
 xiaozu.py | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 xiaozu.py

diff --git a/xiaozu.py b/xiaozu.py
new file mode 100644
index 0000000..56616b9
--- /dev/null
+++ b/xiaozu.py
@@ -0,0 +1,73 @@
+import csv
+import os
+from urllib.parse import urljoin
+
+import requests
+from lxml import etree
+
+start_url = "https://www.xingyueboke.com/sudongpozhuan/"
+headers = {
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+                  " (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
+                  " SLBrowser/9.0.3.1311 SLBChan/109"
+}
+
+
+def get_source(url):
+    """Fetch a page and return its decoded HTML, or "" on failure."""
+    response = requests.get(url, headers=headers)
+    if response.status_code == 200:
+        response.encoding = "utf-8"
+        return response.text
+    print("Request failed with status code {}".format(response.status_code))
+    return ""
+
+
+def get_chapter_urls(index_source):
+    """Extract chapter links from the index page; urljoin resolves relative hrefs."""
+    selector = etree.HTML(index_source)
+    hrefs = selector.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href')
+    return [urljoin(start_url, href) for href in hrefs]
+
+
+def get_article(article_html):
+    """Return (title, body text) for a single chapter page."""
+    selector = etree.HTML(article_html)
+    title = selector.xpath('//h1/text()')[0]
+    content = selector.xpath('string(//div[@id="nr1"]/div)')
+    return title, content
+
+
+def save(title, content):
+    """Write one chapter to 苏东坡传/<title>.txt."""
+    filename = "苏东坡传/" + title + ".txt"
+    dirname = os.path.dirname(filename)
+    if not os.path.exists(dirname):
+        os.makedirs(dirname)
+    with open(filename, "w", encoding="utf-8") as f:
+        f.write(content)
+
+
+def save_csv(articles_list):
+    """Write one [url, title, length] row per chapter to a summary CSV."""
+    filename = "苏东坡传/苏东坡传.csv"
+    if not os.path.exists("苏东坡传"):
+        os.makedirs("苏东坡传")
+    with open(filename, "w", encoding="utf-8", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerows(articles_list)
+
+
+source = get_source(start_url)
+chapter_urls = get_chapter_urls(source)
+print(chapter_urls)
+
+articles_list = []
+for chapter_url in chapter_urls:
+    article_html = get_source(chapter_url)
+    if article_html:
+        title, content = get_article(article_html)
+        print(title)
+        save(title, content)
+        articles_list.append([chapter_url, title, len(content)])
+save_csv(articles_list)
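
Two gaps worth flagging for a follow-up patch: requests.get is called with no timeout, so a stalled connection hangs the crawl, and the chapter loop fires requests back-to-back against the site. Below is a minimal sketch of a politer fetch helper; it assumes the same module-level headers dict as the patch, and the 0.5-second delay and 10-second timeout are illustrative values, not anything the site documents.

import time
import requests

def get_source(url, delay=0.5, timeout=10):
    """Fetch a page politely: pause before the request and bound the wait."""
    time.sleep(delay)  # illustrative pause so the crawl does not hammer the site
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Covers timeouts, connection errors, and other transport failures.
        print("Request failed: {}".format(e))
        return ""
    if response.status_code == 200:
        response.encoding = "utf-8"
        return response.text
    print("Request failed with status code {}".format(response.status_code))
    return ""

This drops in as a replacement for the patch's get_source with the same signature for existing callers, so the index fetch and the chapter loop need no changes.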