# sudongpozhuan

import csv
import os

import requests
from lxml import etree

start_url = "https://www.xingyueboke.com/sudongpozhuan/"
h = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"}
def get_source(url=start_url):
    # Download a page and return its HTML as text; return an empty string on failure.
    r = requests.get(url, headers=h)
    if r.status_code == 200:
        return r.content.decode('utf-8')
    else:
        print("Request failed:", url, r.status_code)
        return ""
def get_chapter_urls(start_source):
    # Extract the chapter links from the table-of-contents page.
    se = etree.HTML(start_source)
    urls = se.xpath('//*[@id="content-list"]/div[2]/ul/li/a/@href')
    return urls
def get_article(article_html):
    # Extract the chapter title and body text from a chapter page.
    selector = etree.HTML(article_html)
    title = selector.xpath('//*[@id="nr_title"]/text()')
    content = selector.xpath('string(//*[@id="nr1"]/div)')
    return title[0].strip() if title else "", content
def save(title, content):
    # Save each chapter as its own text file under the 苏东坡传/ directory.
    dirname = "苏东坡传"
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    filename = os.path.join(dirname, title + ".txt")
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(content)
def czd(urls, titles, contents):
    # Write one CSV row per chapter: URL, title, and article length.
    data = []
    for i in range(len(urls)):
        data.append([urls[i], titles[i], contents[i]])
    with open("苏东坡传.csv", 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["网站", "标题", "文章长度"])
        writer.writerows(data)
if __name__ == "__main__":
    source = get_source()                # HTML of the table-of-contents page
    urls = get_chapter_urls(source)      # chapter URLs
    titles = []
    contents = []
    for url in urls:
        article_html = get_source(url)
        title, content = get_article(article_html)
        titles.append(title)
        contents.append(len(content))    # record the article length
        save(title, content)
    czd(urls, titles, contents)
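
A minimal usage sketch, assuming the script above is saved as spider.py (a hypothetical filename) next to this README; it exercises the individual functions on a single chapter before running the full crawl:

# Hypothetical smoke test; spider.py is an assumed filename for the script above.
from spider import get_source, get_chapter_urls, get_article

index_html = get_source()                      # table-of-contents page
chapter_urls = get_chapter_urls(index_html)    # list of chapter URLs
title, content = get_article(get_source(chapter_urls[0]))
print(title, len(content))                     # first chapter's title and length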
