|
|
|
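# sudongpozhuan: scraper for the book "Su Dongpo Zhuan" (苏东坡传) hosted on
# xingyueboke.com; saves each chapter as a text file and writes a CSV summary.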
|
|
|
|
import requests
|
|
|
|
from lxml import etree
|
|
|
|
import csv
|
|
|
|
# Index page of the book; used as the default download target.
start_url = "https://www.xingyueboke.com/sudongpozhuan/"

# HTTP headers sent with each request (a desktop browser User-Agent).
h = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"
    ),
}
|
|
|
|
def get_source(url=start_url):
    """Download *url* and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch. Defaults to ``start_url``.

    Returns:
        The decoded page text when the server answers with HTTP 200.
        Otherwise prints "失败" and returns an empty string; the same
        happens when the request itself fails (connection error, timeout).
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        r = requests.get(url, headers=h, timeout=10)
    except requests.RequestException:
        print("失败")
        return ""

    if r.status_code != 200:
        print("失败")
        return ""

    # Decode the raw bytes explicitly as UTF-8 instead of relying on the
    # encoding guessed by requests.
    return r.content.decode('utf-8')
|
|
|
|
def get_chapter_urls(start_source):
    """Extract the chapter links from the index page HTML.

    Args:
        start_source: HTML text of the index page.

    Returns:
        A list with the ``href`` value of every ``<a>`` found at
        ``ul/li/a`` inside the second ``div`` of the element whose id is
        ``content-list``. An empty list is returned when *start_source* is
        empty or does not parse into a tree.
    """
    # An empty page (e.g. a failed download) has nothing to parse.
    if not start_source:
        return []

    se = etree.HTML(start_source)
    if se is None:
        return []

    return se.xpath('//*[@id="content-list"]/div[2]/ul/li/a/@href')
|
|
|
|
def get_article(article_html):
    """Extract the title and body text from a chapter page.

    Args:
        article_html: HTML text of one chapter page.

    Returns:
        A ``(title, content)`` tuple. ``title`` is the list of text nodes
        directly under the element with id ``nr_title``; ``content`` is the
        string value of the ``div`` child of the element with id ``nr1``.
        ``([], "")`` is returned when *article_html* is empty or does not
        parse into a tree.
    """
    # An empty page (e.g. a failed download) has nothing to parse.
    if not article_html:
        return [], ""

    selector = etree.HTML(article_html)
    if selector is None:
        return [], ""

    title = selector.xpath('//*[@id="nr_title"]/text()')
    content = selector.xpath('string(//*[@id="nr1"]/div)')
    return title, content
|
|
|
|
def save(title, content):
    """Append a chapter's text to its own file in the working directory.

    The file is named ``苏东坡传<chapter title>.txt``. When *title* is empty
    the chapter part is left out and the text goes to ``苏东坡传.txt``.

    Args:
        title: Sequence whose first item is the chapter title; may be empty.
        content: Text to append to the file.
    """
    # Surrounding whitespace from the HTML text node must not end up in the
    # file name; an empty title list must not raise IndexError.
    chapter = title[0].strip() if title else ""
    # The prefix was misspelled "苏东波传"; the CSV file name in this script
    # spells the book title "苏东坡传".
    fi = "苏东坡传" + chapter + ".txt"
    # The file is only written, so plain append mode is enough.
    with open(fi, 'a', encoding='utf-8') as f:
        f.write(content)
|
|
|
|
def czd(urls, titles, contents):
    """Append one row per chapter to ``苏东坡传.csv``.

    The header row is written only when the file is still empty, so
    repeated calls do not duplicate it.

    Args:
        urls: Chapter page addresses.
        titles: Chapter titles, one per URL. An entry may be a string or a
            list/tuple of strings, in which case its first item is used
            (empty string when it has no items).
        contents: Values for the "文章长度" column, one per URL.
    """
    data = []
    # zip() covers every chapter (the previous index loop stopped one short)
    # and cannot raise IndexError if the lists differ in length.
    for url, title, length in zip(urls, titles, contents):
        if isinstance(title, (list, tuple)):
            # Write the title text itself, not the repr of a list.
            title = title[0] if title else ""
        data.append([url, title, length])

    with open("苏东坡传.csv", 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        # In append mode the position starts at the end of the file, so a
        # position of 0 means the file has no content yet.
        if f.tell() == 0:
            writer.writerow(["网站", "标题", "文章长度"])
        writer.writerows(data)
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: fetch the index page, then download every chapter,
    # save its text and finally write the CSV summary.
    source = get_source()  # HTML of the index page
    # get_source() returns "" on failure; there is nothing to parse then.
    urls = get_chapter_urls(source) if source else []  # chapter page URLs

    fetched_urls = []  # URLs that were actually downloaded
    titles = []
    contents = []  # length of each chapter's text
    for url in urls:
        article_html = get_source(url)
        if not article_html:
            # The download failed (already reported by get_source). Skip the
            # page so one bad chapter does not abort the run, and so the
            # three lists stay aligned for the CSV.
            continue
        title, content = get_article(article_html)
        fetched_urls.append(url)
        titles.append(title)
        contents.append(len(content))
        save(title, content)

    czd(fetched_urls, titles, contents)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|