From 184a7251489d27ec35488ddaed54f29373c65359 Mon Sep 17 00:00:00 2001
From: pqk5c82f3 <2042437448@qq.com>
Date: Sat, 13 Apr 2024 12:29:28 +0800
Subject: [PATCH] ADD file via upload

---
 xiaozu.py | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 xiaozu.py

diff --git a/xiaozu.py b/xiaozu.py
new file mode 100644
index 0000000..56616b9
--- /dev/null
+++ b/xiaozu.py
@@ -0,0 +1,73 @@
+import csv
+import os
+from urllib.parse import urljoin
+
+import requests
+from lxml import etree
+
+start_url = "https://www.xingyueboke.com/sudongpozhuan/"
+headers = {
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+                  " (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
+                  " SLBrowser/9.0.3.1311 SLBChan/109"
+}
+
+
+def get_source(url):
+    """Fetch a page and return its decoded HTML, or "" on failure."""
+    response = requests.get(url, headers=headers)
+    if response.status_code == 200:
+        response.encoding = "utf-8"
+        return response.text
+    print("Request failed with status code {}".format(response.status_code))
+    return ""
+
+
+def get_chapter_urls(index_source):
+    """Extract chapter links from the index page; urljoin resolves relative hrefs."""
+    selector = etree.HTML(index_source)
+    hrefs = selector.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href')
+    return [urljoin(start_url, href) for href in hrefs]
+
+
+def get_article(article_html):
+    """Return (title, body text) for a single chapter page."""
+    selector = etree.HTML(article_html)
+    title = selector.xpath('//h1/text()')[0]
+    content = selector.xpath('string(//div[@id="nr1"]/div)')
+    return title, content
+
+
+def save(title, content):
+    """Write one chapter to 苏东坡传/<title>.txt."""
+    filename = "苏东坡传/" + title + ".txt"
+    dirname = os.path.dirname(filename)
+    if not os.path.exists(dirname):
+        os.makedirs(dirname)
+    with open(filename, "w", encoding="utf-8") as f:
+        f.write(content)
+
+
+def save_csv(articles_list):
+    """Write one [url, title, length] row per chapter to a summary CSV."""
+    filename = "苏东坡传/苏东坡传.csv"
+    if not os.path.exists("苏东坡传"):
+        os.makedirs("苏东坡传")
+    with open(filename, "w", encoding="utf-8", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerows(articles_list)
+
+
+source = get_source(start_url)
+chapter_urls = get_chapter_urls(source)
+print(chapter_urls)
+
+articles_list = []
+for chapter_url in chapter_urls:
+    article_html = get_source(chapter_url)
+    if article_html:
+        title, content = get_article(article_html)
+        print(title)
+        save(title, content)
+        articles_list.append([chapter_url, title, len(content)])
+save_csv(articles_list)
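
Two gaps worth flagging for a follow-up patch: requests.get is called with no timeout, so a stalled connection hangs the crawl, and the chapter loop fires requests back-to-back against the site. Below is a minimal sketch of a politer fetch helper; it assumes the same module-level headers dict as the patch, and the 0.5-second delay and 10-second timeout are illustrative values, not anything the site documents.

import time
import requests

def get_source(url, delay=0.5, timeout=10):
    """Fetch a page politely: pause before the request and bound the wait."""
    time.sleep(delay)  # illustrative pause so the crawl does not hammer the site
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Covers timeouts, connection errors, and other transport failures.
        print("Request failed: {}".format(e))
        return ""
    if response.status_code == 200:
        response.encoding = "utf-8"
        return response.text
    print("Request failed with status code {}".format(response.status_code))
    return ""

This drops in as a replacement for the patch's get_source with the same signature for existing callers, so the index fetch and the chapter loop need no changes.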