"""Download every chapter linked from a book index page.

For each chapter the script appends the chapter text to "{title}.txt" and
records one summary row (URL, title, text length). The rows are written to
"苏东坡传.csv" in the current working directory.
"""
import csv
from urllib.parse import urljoin

import requests
from lxml import etree

INDEX_URL = "https://www.xingyueboke.com/sudongpozhuan/"
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
    ),
}
# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10


def get_source(url, headers, timeout=REQUEST_TIMEOUT):
    """Return the chapter URLs listed on the index page at *url*.

    Args:
        url: Address of the index page.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of chapter URLs in page order. Relative links are resolved
        against *url*.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    dom = etree.HTML(response.text)
    hrefs = dom.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href')
    return [urljoin(url, href) for href in hrefs]


def get_title(url, headers, timeout=REQUEST_TIMEOUT):
    """Fetch one chapter page and extract its title and text fragments.

    Args:
        url: Address of the chapter page.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(title, fragments)`` tuple: the text of the first h1 element and
        the list of text nodes found under ``//article/div[1]/div``.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        ValueError: If the page contains no h1 text to use as the title.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # The body is decoded explicitly as UTF-8 rather than relying on the
    # encoding requests guesses from the response headers.
    dom = etree.HTML(response.content.decode("utf-8"))
    titles = dom.xpath('//h1/text()')
    if not titles:
        raise ValueError(f"no h1 title found at {url}")
    fragments = dom.xpath('//article/div[1]/div//text()')
    return titles[0], fragments


def save_txt(biaoti, zw):
    """Append the chapter text to "{biaoti}.txt".

    Args:
        biaoti: Chapter title, used as the file name stem.
        zw: Iterable of text fragments, concatenated without a separator.

    The file is opened in append mode, so running the script again adds the
    text to an existing file instead of replacing it.
    """
    with open(biaoti + ".txt", "a", encoding="utf-8") as f:
        f.write("".join(zw))


def save_csv(list):
    """Write the chapter summary rows to "苏东坡传.csv", replacing the file.

    Args:
        list: Iterable of ``[url, title, text_length]`` rows. The parameter
            name is kept for compatibility with existing callers; it shadows
            the builtin only inside this function.
    """
    headers = ["网址", "标题", "正文长度"]
    # newline="" lets the csv module control line endings; without it every
    # row is followed by a blank line on Windows.
    with open("苏东坡传.csv", "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(list)


def main():
    """Scrape all chapters and write the per-chapter files and the CSV."""
    chapter_urls = get_source(INDEX_URL, HEADERS)
    rows = []
    try:
        for chapter_url in chapter_urls:
            title, fragments = get_title(chapter_url, HEADERS)
            save_txt(title, fragments)
            # The length column is the number of characters saved, not the
            # number of text fragments the XPath query returned.
            rows.append([chapter_url, title, len("".join(fragments))])
    finally:
        # Write the CSV once. Doing it in ``finally`` still records the
        # chapters finished so far if a later chapter fails.
        if rows:
            save_csv(rows)


if __name__ == "__main__":
    main()