From f5f105df14e197eb2f33fca5c3a9d4b910af5f64 Mon Sep 17 00:00:00 2001
From: pj8pb7f2s <2891634653@qq.com>
Date: Fri, 12 Apr 2024 12:33:22 +0800
Subject: [PATCH] Add sdpz

---
 sdpz | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 sdpz

diff --git a/sdpz b/sdpz
new file mode 100644
index 0000000..36fd29e
--- /dev/null
+++ b/sdpz
@@ -0,0 +1,66 @@
+import requests
+from lxml import etree
+import csv
+import os
+
+# Index page of the book, plus a desktop User-Agent so the site serves normal pages.
+start_url = 'https://www.xingyueboke.com/sudongpozhuan/'
+header = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
+                        "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"}
+
+
+def get_source(url=start_url):
+    """Fetch a page and return its HTML, or an empty string on failure."""
+    response = requests.get(url, headers=header, timeout=10)
+    if response.status_code == 200:
+        response.encoding = 'utf-8'
+        return response.text
+    print("Request failed with status code {}".format(response.status_code))
+    return ""
+
+
+def get_chapter_urls(index_source):
+    """Extract the chapter links from the book's index page."""
+    selector = etree.HTML(index_source)
+    return selector.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href')
+
+
+def get_article(article_html):
+    """Parse one chapter page and return its title and body text."""
+    selector = etree.HTML(article_html)
+    title = selector.xpath('//h1/text()')[0]
+    content = selector.xpath('string(//div[@id="nr1"]/div)')
+    return title, content
+
+
+def save(title, content):
+    """Write one chapter's text to 苏东坡传/<title>.txt."""
+    os.makedirs("苏东坡传", exist_ok=True)
+    filename = os.path.join("苏东坡传", title + ".txt")
+    with open(filename, "w", encoding='utf-8') as f:
+        f.write(content)
+
+
+def save_csv(rows):
+    """Write a summary CSV with one row per chapter: URL, title, body length."""
+    os.makedirs("苏东坡传", exist_ok=True)
+    filename = os.path.join("苏东坡传", "苏东坡传.csv")
+    with open(filename, "w", encoding="utf-8", newline='') as f:
+        w = csv.writer(f)
+        w.writerow(["网页地址", "标题", "正文长度"])  # URL, title, body length
+        w.writerows(rows)
+
+
+source = get_source()
+chapter_urls = get_chapter_urls(source)
+
+rows = []
+for url in chapter_urls:
+    article_html = get_source(url)
+    if not article_html:
+        continue  # skip chapters whose pages failed to download
+    title, content = get_article(article_html)
+    print(title)
+    rows.append([url, title, len(content)])
+    save(title, content)
+save_csv(rows)