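The script below scrapes 《苏东坡传》 from https://www.xingyueboke.com/sudongpozhuan/ using `requests` and `lxml`. It collects the chapter links from the book's index page, saves each chapter's body text to a `.txt` file named after the chapter title, and writes a summary CSV (`苏东坡传.csv`) listing every chapter's URL, title, and body length.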

```python
import requests
from lxml import etree
import csv
```
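`get_source` requests the index page and extracts the chapter URLs from the `book-list` block with an XPath query: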

```python
def get_source(url, headers):
    r = requests.get(url, headers=headers)
    dom = etree.HTML(r.text)
    url_list = dom.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href')
    return url_list
```
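`get_title` downloads one chapter page, decodes it as UTF-8, and returns the chapter title (`biaoti`, taken from the `<h1>` element) along with the list of text nodes that make up the body (`zw`):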

```python
def get_title(url, headers):
    r = requests.get(url, headers=headers)
    dom = etree.HTML(r.content.decode("utf-8"))
    biaoti = dom.xpath('//h1/text()')[0]
    zw = dom.xpath('//article/div[1]/div//text()')
    return biaoti, zw
```
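`save_txt` joins the body text nodes and appends them to a `.txt` file named after the chapter title, while `save_csv` writes one summary row per chapter (网址 = URL, 标题 = title, 正文长度 = body length):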

def save_txt(biaoti, zw): a="" for i in zw: a=a+i with open(biaoti+".txt",'a+',encoding='utf-8') as f: f.write(a) def save_csv(list): headers=["网址","标题","正文长度"] with open("苏东坡传.csv",'w+',encoding='utf-8') as f: w = csv.writer(f) w.writerow(headers) w.writerows(list)
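The driver code ties everything together: fetch the chapter list, save each chapter to disk, and collect a summary row for the CSV: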

url = "https://www.xingyueboke.com/sudongpozhuan/" headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"} url_list = get_source(url, headers) list =[] for i in url_list: biaoti, zw = get_title(i, headers) save_txt(biaoti,zw) list.append([i,biaoti,len(zw)]) save_csv(list)