"""Download every chapter linked from a book index page.

For each chapter the script writes the body text to ``<title>.txt`` in the
current directory and finally records the chapter URL, title and body
length in ``苏东坡传.csv``.
"""

import csv
import re
from urllib.parse import urljoin

import requests
from lxml import etree

BOOK_INDEX_URL = "https://www.xingyueboke.com/sudongpozhuan/"
REQUEST_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
    ),
}
# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block indefinitely.
REQUEST_TIMEOUT = 10

# Characters that cannot appear in a file name (path separators, Windows
# reserved punctuation, control whitespace).
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def _fetch_dom(url, headers):
    """Download *url* and return the parsed lxml HTML tree.

    Raises:
        requests.RequestException: on network failure, timeout or an HTTP
            error status.
        UnicodeDecodeError: if the response body is not valid UTF-8.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly instead of parsing a 404/500 error page as if it were content.
    response.raise_for_status()
    # Decode explicitly: response.text relies on requests' guessed encoding,
    # which can garble the text when the server does not declare a charset.
    return etree.HTML(response.content.decode("utf-8"))


def get_source(url, headers):
    """Return the chapter URLs listed on the book index page at *url*.

    Args:
        url: Address of the index page.
        headers: HTTP headers sent with the request.

    Returns:
        A list of absolute chapter URLs, in page order.
    """
    dom = _fetch_dom(url, headers)
    hrefs = dom.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href')
    # urljoin leaves absolute links untouched and resolves relative ones
    # against the index page.
    return [urljoin(url, href) for href in hrefs]


def get_title(url, headers):
    """Fetch one chapter page and extract its title and body text.

    Args:
        url: Address of the chapter page.
        headers: HTTP headers sent with the request.

    Returns:
        A ``(biaoti, zw)`` tuple: the text of the first ``<h1>`` and the
        list of text fragments found under ``//article/div[1]/div``.

    Raises:
        IndexError: if the page contains no ``<h1>`` text.
    """
    dom = _fetch_dom(url, headers)
    headings = dom.xpath('//h1/text()')
    if not headings:
        raise IndexError(f"no <h1> title found at {url}")
    zw = dom.xpath('//article/div[1]/div//text()')
    return headings[0], zw


def save_txt(biaoti, zw):
    """Write the chapter text to ``<biaoti>.txt`` in the current directory.

    Args:
        biaoti: Chapter title; used (after sanitising) as the file name.
        zw: Iterable of text fragments that make up the chapter body.
    """
    # The title comes from a web page, so strip anything that is not a
    # legal file-name character before using it as a path.
    stem = _UNSAFE_FILENAME_CHARS.sub("_", biaoti).strip() or "untitled"
    # "w" rather than append: re-running the script must not duplicate
    # the chapter text inside an existing file.
    with open(stem + ".txt", "w", encoding="utf-8") as f:
        f.write("".join(zw))


def save_csv(list):  # parameter name kept for backward compatibility
    """Write the chapter index to ``苏东坡传.csv``.

    Args:
        list: Iterable of ``[url, title, body_length]`` rows.
    """
    # Column headers: URL, title, body length.
    headers = ["网址", "标题", "正文长度"]
    # newline="" is required by the csv module; without it Windows output
    # gets a blank line after every row.
    with open("苏东坡传.csv", "w", encoding="utf-8", newline="") as f:
        w = csv.writer(f)
        w.writerow(headers)
        w.writerows(list)


def main():
    """Scrape all chapters and write the per-chapter files and the CSV index."""
    chapter_urls = get_source(BOOK_INDEX_URL, REQUEST_HEADERS)
    rows = []
    try:
        for chapter_url in chapter_urls:
            biaoti, zw = get_title(chapter_url, REQUEST_HEADERS)
            save_txt(biaoti, zw)
            # Body length is the character count of the saved text, not the
            # number of text fragments the XPath query happened to return.
            rows.append([chapter_url, biaoti, len("".join(zw))])
    finally:
        # Write the index even if a chapter fails part-way through, so the
        # chapters already downloaded are still recorded.
        save_csv(rows)


if __name__ == "__main__":
    main()