"""Scrape every chapter linked from the index page at ``url``.

Each chapter body is appended to ``<title>.txt`` and one row per chapter
(URL, title, body length) is written to ``苏东坡传.csv``.
"""
import csv

import requests
from lxml import etree

# Index page whose chapter links are followed.
url = "https://www.xingyueboke.com/sudongpozhuan/"
headers = {"User-Agent":
               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"}

# Seconds to wait for the server; without a timeout requests.get can block
# indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30


def get_source(url, headers, timeout=REQUEST_TIMEOUT):
    """Return the chapter links found on the index page.

    Args:
        url: Address of the index page.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``href`` values of the anchors inside the
        ``div.book-list.clearfix`` list, in document order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    dom = etree.HTML(r.text)
    # NOTE(review): the hrefs are passed straight to requests.get by the
    # caller, so they are presumably absolute URLs -- confirm on the site.
    return dom.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href')


def get_title(url, headers, timeout=REQUEST_TIMEOUT):
    """Fetch one chapter page and extract its title and body text.

    Args:
        url: Address of the chapter page.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(biaoti, zw)`` tuple: the first ``<h1>`` text and the list of
        text nodes found under ``//article/div[1]/div``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If the page has no ``<h1>`` text.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    # The body is decoded explicitly as UTF-8 instead of relying on the
    # encoding requests guesses from the response headers.
    dom = etree.HTML(r.content.decode("utf-8"))
    biaoti = dom.xpath('//h1/text()')[0]
    zw = dom.xpath('//article/div[1]/div//text()')
    return biaoti, zw


def save_txt(biaoti, zw):
    """Append the chapter body to ``<biaoti>.txt``.

    Args:
        biaoti: Chapter title, used verbatim as the file stem.
        zw: Iterable of text fragments making up the chapter body.
    """
    # The file is opened in append mode, so running the script again adds
    # to an existing file rather than replacing it.
    with open(biaoti + ".txt", 'a+', encoding='utf-8') as f:
        f.write("".join(zw))


def save_csv(list):
    """Write the chapter index to ``苏东坡传.csv``, replacing any old file.

    Args:
        list: Rows of ``[url, title, body length]``. The parameter name
            shadows the builtin but is kept so keyword callers still work.
    """
    headers = ["网址", "标题", "正文长度"]
    # newline="" is required by the csv module; without it every row is
    # followed by a blank line on Windows.
    with open("苏东坡传.csv", 'w+', encoding='utf-8', newline="") as f:
        w = csv.writer(f)
        w.writerow(headers)
        w.writerows(list)


def main():
    """Download every chapter and record it in the text files and the CSV."""
    rows = []
    for chapter_url in get_source(url, headers):
        biaoti, zw = get_title(chapter_url, headers)
        save_txt(biaoti, zw)
        # Record the length of the joined body (exactly what save_txt
        # writes) rather than the number of XPath text nodes.
        rows.append([chapter_url, biaoti, len("".join(zw))])
        # Rewriting the CSV after every chapter keeps the rows collected so
        # far on disk if a later request fails.
        save_csv(rows)


if __name__ == "__main__":
    main()