You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
|
import requests
|
|
|
|
from lxml import etree
|
|
|
|
import csv
|
|
|
|
|
|
|
|
def get_source(url, headers, timeout=10):
    """Download an index page and return the chapter links found on it.

    Args:
        url: Address of the index page to fetch.
        headers: HTTP request headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        The list of ``href`` attribute values selected by the XPath
        ``//div[@class="book-list clearfix"]/ul/li/a/@href``; empty when
        nothing matches.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error response instead of parsing an error page and
    # silently returning an empty link list.
    r.raise_for_status()
    dom = etree.HTML(r.text)
    return dom.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href')
|
|
|
|
|
|
|
|
|
|
|
|
def get_title(url, headers, timeout=10):
    """Download one page and extract its heading and body text fragments.

    Args:
        url: Address of the page to fetch.
        headers: HTTP request headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        A ``(biaoti, zw)`` tuple: ``biaoti`` is the first text node matched
        by ``//h1/text()`` and ``zw`` is the list of text nodes matched by
        ``//article/div[1]/div//text()``.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response.
        UnicodeDecodeError: If the response body is not valid UTF-8.
        IndexError: If the page contains no ``<h1>`` text.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    # Do not treat an HTTP error page as if it were a chapter.
    r.raise_for_status()
    # The body is decoded explicitly as UTF-8 rather than relying on the
    # encoding that requests guesses for ``r.text``.
    dom = etree.HTML(r.content.decode("utf-8"))

    headings = dom.xpath('//h1/text()')
    if not headings:
        # Same exception type as the bare ``[0]`` lookup would raise, but
        # with enough context to find the offending page.
        raise IndexError("no <h1> text found at {}".format(url))
    biaoti = headings[0]

    zw = dom.xpath('//article/div[1]/div//text()')
    return biaoti, zw
|
|
|
|
|
|
|
|
|
|
|
|
def save_txt(biaoti, zw):
    """Append the concatenated text fragments to ``<biaoti>.txt``.

    Args:
        biaoti: Title string; used verbatim as the file name stem, so it
            must be a valid file name on the current platform.
        zw: Iterable of strings that are concatenated without a separator.

    The file is created in the current working directory, encoded as UTF-8
    and opened in append mode, so calling this twice with the same title
    adds the text a second time.
    """
    # str.join builds the text in one pass instead of repeated ``a = a + i``.
    text = "".join(zw)
    with open(biaoti + ".txt", "a", encoding="utf-8") as f:
        f.write(text)
|
|
|
|
def save_csv(list):
    """Write the collected rows to ``苏东坡传.csv`` in the working directory.

    Args:
        list: Iterable of rows, each a sequence of three values matching
            the header columns. The parameter name shadows the builtin but
            is kept so existing keyword callers continue to work.

    The file is overwritten on every call and encoded as UTF-8. The first
    line is the fixed header row.
    """
    headers = ["网址", "标题", "正文长度"]
    # newline="" is required by the csv module: without it the writer's
    # "\r\n" terminator is translated again on Windows, producing blank rows.
    with open("苏东坡传.csv", "w", encoding="utf-8", newline="") as f:
        w = csv.writer(f)
        w.writerow(headers)
        w.writerows(list)
|
|
|
|
|
|
|
|
# Index page that is passed to get_source to obtain the chapter links.
url = "https://www.xingyueboke.com/sudongpozhuan/"
# Request headers sent with every HTTP request made by this script.
headers = {
    "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
}


def main():
    """Fetch every linked page, save its text, and write the CSV summary.

    For each link returned by ``get_source`` the page is downloaded with
    ``get_title``, its text is stored with ``save_txt``, and one row
    ``[link, title, len(zw)]`` is collected; all rows are then written
    with ``save_csv``.
    """
    # Named ``rows`` rather than ``list`` so the builtin is not shadowed.
    rows = []
    for chapter_url in get_source(url, headers):
        biaoti, zw = get_title(chapter_url, headers)
        save_txt(biaoti, zw)
        # NOTE(review): len(zw) is the number of text fragments, not the
        # number of characters; confirm this is what the CSV column should
        # contain.
        rows.append([chapter_url, biaoti, len(zw)])
    save_csv(rows)


# Run the scrape only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
|
|
|