import csv

import requests
from lxml import etree

def get_source(url, headers, timeout=10):
    """Fetch an index page and return the links listed in its book list.

    Args:
        url: Address of the page to download.
        headers: HTTP request headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The ``href`` values of every ``<a>`` that is a child of an ``<li>``
        in a ``<ul>`` directly inside ``<div class="book-list clearfix">``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Stop here on an HTTP error rather than parsing an error page as if it
    # were the index.
    response.raise_for_status()
    dom = etree.HTML(response.text)
    return dom.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href')

def get_title(url, headers, timeout=10):
    """Download one page and extract its heading and body text fragments.

    Args:
        url: Address of the page to download.
        headers: HTTP request headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        A ``(biaoti, zw)`` tuple: ``biaoti`` is the first text node found
        under any ``<h1>``; ``zw`` is the list of all text nodes matched by
        ``//article/div[1]/div//text()``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
        UnicodeDecodeError: If the response body is not valid UTF-8.
        IndexError: If the page contains no ``<h1>`` text node.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Stop here on an HTTP error rather than scraping an error page.
    response.raise_for_status()
    # The raw bytes are decoded explicitly as UTF-8 instead of using the
    # encoding requests would pick for ``response.text``.
    dom = etree.HTML(response.content.decode("utf-8"))
    biaoti = dom.xpath('//h1/text()')[0]
    zw = dom.xpath('//article/div[1]/div//text()')
    return biaoti, zw

def save_txt(biaoti, zw):
    """Append the joined text fragments to the file ``<biaoti>.txt``.

    The file is opened in append mode, so calling this repeatedly with the
    same title keeps adding to the same file.

    Args:
        biaoti: Title; used verbatim as the file name stem (relative path).
        zw: Iterable of strings, concatenated without any separator.
    """
    # str.join assembles the text in a single pass instead of growing a
    # string one fragment at a time.
    text = "".join(zw)
    with open(biaoti + ".txt", "a+", encoding="utf-8") as f:
        f.write(text)


def save_csv(list):
    """Write a header row plus the given rows to ``苏东坡传.csv``.

    Any existing file of that name is overwritten.

    Args:
        list: Iterable of rows; each row is an iterable of cell values.
            (The name shadows the builtin but is kept so that existing
            keyword callers continue to work.)
    """
    headers = ["网址", "标题", "正文长度"]
    # newline="" hands line-ending control to the csv module. Without it,
    # text mode on Windows rewrites the writer's "\r\n" as "\r\r\n" and a
    # blank line appears after every row.
    with open("苏东坡传.csv", "w+", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(list)

# Script body: fetch the index page, download every linked page, save each
# one as a text file, then write a CSV summary of what was fetched.

# NOTE(review): the URL is an empty string in this source — presumably it was
# redacted; set it to the index page address before running.
url = ""
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Safari/537.36 Edg/"}
url_list = get_source(url, headers)

# Summary rows are collected under the name ``rows`` so the builtin ``list``
# is not rebound at module level.
rows = []
for page_url in url_list:
    biaoti, zw = get_title(page_url, headers)
    save_txt(biaoti, zw)
    # NOTE(review): len(zw) is the number of text fragments, not a character
    # count — confirm this is what the "正文长度" CSV column should hold.
    rows.append([page_url, biaoti, len(zw)])
save_csv(rows)