You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
ncccu94970 042b822531
ADD file via upload
11 months ago
2.py ADD file via upload 11 months ago
README.md Update README.md 11 months ago

README.md

import csv

import requests
from lxml import etree

def get_source(url, headers, timeout=10):
    """Fetch the book's index page and return the list of chapter URLs.

    Parameters
    ----------
    url : str
        URL of the book's chapter-index page.
    headers : dict
        HTTP headers (User-Agent etc.) sent with the request.
    timeout : float, optional
        Seconds to wait for the server before aborting; without a timeout
        a stalled connection would hang the whole scrape.

    Returns
    -------
    list[str]
        The href of every chapter link inside the book-list container.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    dom = etree.HTML(r.text)
    url_list = dom.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href')
    return url_list

def get_title(url, headers):
    """Download a single chapter page and return its heading and body text.

    Returns a ``(title, fragments)`` pair: the first ``<h1>`` text node,
    and the list of text nodes found under the article body div.
    """
    response = requests.get(url, headers=headers)
    # Decode the raw bytes explicitly so non-ASCII (Chinese) text is
    # interpreted as UTF-8 regardless of the declared response encoding.
    page = etree.HTML(response.content.decode("utf-8"))
    title = page.xpath('//h1/text()')[0]
    fragments = page.xpath('//article/div[1]/div//text()')
    return title, fragments

def save_txt(biaoti, zw):
    """Append one chapter's text to ``<biaoti>.txt``.

    Parameters
    ----------
    biaoti : str
        Chapter title, used as the output file name.
    zw : list[str]
        Text fragments extracted from the chapter body.

    The fragments are joined once with ``str.join`` instead of the
    original quadratic ``+=`` concatenation loop.
    """
    with open(biaoti + ".txt", 'a+', encoding='utf-8') as f:
        f.write("".join(zw))


def save_csv(rows):
    """Write the scrape summary to ``苏东坡传.csv``.

    Parameters
    ----------
    rows : list[list]
        One ``[url, title, text_length]`` record per chapter.
        (Renamed from ``list``, which shadowed the builtin.)

    ``newline=''`` is required by the csv module; without it every row is
    followed by a blank line on Windows.
    """
    headers = ["网址", "标题", "正文长度"]
    with open("苏东坡传.csv", 'w+', encoding='utf-8', newline='') as f:
        w = csv.writer(f)
        w.writerow(headers)
        w.writerows(rows)

# Scrape the whole book: fetch the chapter index, save each chapter to a
# text file, and record a [url, title, text-length] summary row per chapter.
url = "https://www.xingyueboke.com/sudongpozhuan/"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"}
url_list = get_source(url, headers)
rows = []  # renamed from `list`, which shadowed the builtin
for chapter_url in url_list:
    biaoti, zw = get_title(chapter_url, headers)
    save_txt(biaoti, zw)
    rows.append([chapter_url, biaoti, len(zw)])
save_csv(rows)