parent
a38ca4dd3c
commit
1482e60748
@ -0,0 +1,56 @@
|
|||||||
|
import requests
|
||||||
|
from lxml import etree
|
||||||
|
import csv
|
||||||
|
# Index page that the script fetches first and scrapes chapter links from.
start_url = "https://www.xingyueboke.com/sudongpozhuan/"

# HTTP headers passed to every requests.get call made by get_source():
# a desktop browser User-Agent string.
h = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
    ),
}
|
||||||
|
def get_source(url=start_url, timeout=10):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Address to request; defaults to the module-level ``start_url``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The page text when the server answers with HTTP 200. An empty string
        when the request fails or any other status code comes back; a
        diagnostic line is printed in both failure cases.
    """
    try:
        response = requests.get(url, headers=h, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures use the same "" signal as a bad status code
        # so callers only have one failure shape to check for.
        print("请求失败,{}".format(exc))
        return ""

    if response.status_code != 200:
        print("请求失败,状态码为{}".format(response.status_code))
        return ""

    # Force UTF-8 instead of relying on the encoding requests would guess.
    response.encoding = "utf-8"
    return response.text
|
||||||
|
def get_chapter_urls(start_source):
    """Extract the chapter link targets from the index page HTML.

    Args:
        start_source: HTML text of the index page, as returned by
            ``get_source``.

    Returns:
        A list with the ``href`` value of every ``<a>`` matched by the
        chapter-list XPath, in document order. An empty list when there is
        no HTML to parse.
    """
    # get_source() returns "" for a failed request; there is nothing to parse.
    if not start_source:
        return []

    selector = etree.HTML(start_source)
    if selector is None:
        # Defensive: the parser produced no tree for this input.
        return []

    return selector.xpath('//*[@id="content-list"]/div[2]/ul/li/a/@href')
|
||||||
|
def get_article(acticle_html):
    """Pull the chapter title and body text out of a chapter page.

    Args:
        acticle_html: HTML text of one chapter page.

    Returns:
        A ``(title, content)`` tuple. ``title`` is the first text node of the
        first ``<h1>`` match and ``content`` is the string value of the
        ``div`` inside ``#nr1``. Either element is ``""`` when it cannot be
        found, and both are ``""`` when there is no HTML to parse.
    """
    # A failed download arrives here as ""; report "nothing found" instead
    # of letting the parser or the [0] lookup raise.
    if not acticle_html:
        return "", ""

    selector = etree.HTML(acticle_html)
    if selector is None:
        return "", ""

    headings = selector.xpath('//h1/text()')
    title = headings[0] if headings else ""
    content = selector.xpath('string(//div[@id="nr1"]/div)')
    return title, content
|
||||||
|
def save(title, content):
    """Append *content* to ``苏东坡传/<title>.txt``, creating the folder if needed.

    Args:
        title: Chapter title, used as the file name. Surrounding whitespace
            is stripped and characters that cannot appear in a file name
            are replaced with ``_``.
        content: Chapter text to write.
    """
    import os

    # Titles come from scraped HTML: a "/" would be read as a sub-directory
    # and the other characters are rejected by Windows file systems.
    unsafe_to_underscore = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})
    safe_title = title.strip().translate(unsafe_to_underscore)

    filename = os.path.join("苏东坡传", safe_title + ".txt")
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Append mode is kept from the original: saving the same title again
    # adds to the existing file rather than replacing it.
    with open(filename, "a", encoding="utf-8") as f:
        f.write(content)
|
||||||
|
def save_to_csv(chapter_data):
    """Write the chapter summary table to ``苏东坡传.csv``.

    The file is recreated on every call: a header row first, then one row
    per entry of *chapter_data*.

    Args:
        chapter_data: Iterable of rows; the caller passes
            ``[url, title, content_length]`` lists.
    """
    header = ["网页地址", "标题", "正文长度"]
    # newline="" leaves line endings to the csv module.
    with open("苏东坡传.csv", "w", newline="", encoding="utf-8") as out:
        table = csv.writer(out)
        table.writerow(header)
        table.writerows(chapter_data)
|
||||||
|
if __name__ == '__main__':
    # Script entry point: download the index page, then every chapter it
    # links to, saving each chapter as a text file and a summary as CSV.
    source = get_source(start_url)
    # get_source() returns "" when the request did not succeed.
    urls = get_chapter_urls(source) if source else []

    chapter_data = []
    for url in urls:
        article_html = get_source(url)
        if not article_html:
            # Nothing was downloaded for this chapter; move on to the next.
            continue
        title, content = get_article(article_html)
        print(title)
        print(content)
        if title and content:
            save(title, content)
            chapter_data.append([url, title, len(content)])
    save_to_csv(chapter_data)

    # Debug dump of the index page and the chapter links found in it.
    # The original fetched the index page a second time and then called
    # get_source(html), i.e. used the page HTML as a URL; the link list
    # comes from get_chapter_urls, so the results above are reused.
    html = source
    print(html)
    url = urls
    print(url)
|
Loading…
Reference in new issue