You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

70 lines
1.9 KiB

8 months ago
import requests
from lxml import etree
import csv
import os
# Page fetched by default; the chapter links are extracted from it.
start_url = "https://www.xingyueboke.com/sudongpozhuan/"

# Browser-style User-Agent string sent with every request made by get_source.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
)
header = {"User-Agent": _USER_AGENT}
def get_source(url=start_url, timeout=10):
    """Download *url* and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to fetch. Defaults to ``start_url``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The page text, or ``""`` when the request raises a
        ``requests.RequestException`` or the status code is not 200. A message
        is printed in both failure cases.
    """
    try:
        response = requests.get(url, headers=header, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures follow the same "print and return ''" contract
        # as a bad status code, so callers only have one failure value to check.
        print("请求失败,异常为{}".format(exc))
        return ""

    if response.status_code != 200:
        print("请求失败,状态码为{}".format(response.status_code))
        return ""

    # Decode the body as UTF-8 regardless of what the response headers declare.
    response.encoding = 'utf-8'
    return response.text
# Fetch the default page once at start-up; get_source returns "" if that fails.
source = get_source(start_url)
def get_chapter_urls(start_source):
    """Return the chapter links found in the index page HTML.

    Args:
        start_source: HTML text of the index page. May be ``""`` when the
            download failed.

    Returns:
        A list with the ``href`` of every ``<a>`` located at
        ``div.book-list.clearfix > ul > li``. The list is empty when
        *start_source* is empty or cannot be parsed into a tree.
    """
    if not start_source:
        # Nothing was downloaded, so there is no document to hand to lxml.
        return []

    selector = etree.HTML(start_source)
    if selector is None:
        return []

    # xpath() already returns a fresh list, so no element-by-element copy is needed.
    return selector.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href')
# Chapter links extracted from the page downloaded above.
url1 = get_chapter_urls(start_source=source)
def get_article(article_html):
    """Extract the title and body text from one chapter page.

    Args:
        article_html: HTML text of a chapter page.

    Returns:
        A ``(title, content)`` tuple. ``title`` is the first text node found
        under any ``<h1>``. ``content`` is the string value of the first
        ``<div>`` directly inside ``div#nr1`` (``""`` if there is none).

    Raises:
        ValueError: If *article_html* is empty, cannot be parsed into a tree,
            or contains no ``<h1>`` text. This replaces the bare ``IndexError``
            that indexing an empty result list used to produce.
    """
    if not article_html:
        raise ValueError("article HTML is empty")

    selector = etree.HTML(article_html)
    if selector is None:
        raise ValueError("article HTML could not be parsed")

    titles = selector.xpath('//h1/text()')
    if not titles:
        raise ValueError("no <h1> title found in article HTML")

    content = selector.xpath('string(//div[@id="nr1"]/div)')
    return titles[0], content
# Characters that cannot be used in a file name on common file systems
# (path separators, Windows-reserved punctuation, ASCII control characters).
_UNSAFE_FILENAME_CHARS = str.maketrans(
    {ch: "_" for ch in '\\/:*?"<>|' + "".join(map(chr, range(32)))}
)


def save(title, content):
    """Write one chapter to ``苏东坡传/<title>.txt`` as UTF-8 text.

    Args:
        title: Chapter title, used as the file name. Surrounding whitespace is
            removed and characters that are unsafe in file names are replaced
            with ``_`` so that a title such as ``"a/b"`` cannot point outside
            the output directory or make ``open`` fail.
        content: Text written to the file. An existing file is overwritten.
    """
    out_dir = "苏东坡传"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(out_dir, exist_ok=True)

    safe_title = title.strip().translate(_UNSAFE_FILENAME_CHARS) or "untitled"
    filename = os.path.join(out_dir, safe_title + ".txt")
    with open(filename, "w", encoding='utf-8') as f:
        f.write(content)
def savaCsv(list):
    """Write the chapter summary table to ``苏东坡传/苏东坡传.csv``.

    Args:
        list: Iterable of rows, each ``[url, title, content_length]``. The
            parameter name shadows the builtin; it is kept so existing callers
            are unaffected, and the builtin is not used inside this function.

    The file is rewritten from scratch on every call: a header row followed by
    the given rows, encoded as UTF-8.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("苏东坡传", exist_ok=True)

    filename = "苏东坡传/苏东坡传.csv"
    # newline='' lets the csv module control line endings itself.
    with open(filename, "w", encoding="utf-8", newline='') as f:
        w = csv.writer(f)
        w.writerow(["网页地址", "标题", "正文长度"])
        w.writerows(list)
# One CSV row per saved chapter: [url, title, length of the body text].
# Named ``rows`` rather than ``list`` so the builtin is not rebound module-wide.
rows = []
for url in url1:
    article_html = get_source(url)
    if not article_html:
        # get_source returns "" on failure; skip this chapter so one bad
        # download does not abort the run before the CSV is written.
        continue
    title, content = get_article(article_html)
    print(title)
    rows.append([url, title, len(content)])
    save(title, content)
savaCsv(rows)