
import requests
from lxml import etree
import csv
import os

start_url = "https://www.xingyueboke.com/sudongpozhuan/"

# Fetch the source code of the site's index page
def get_source(url=start_url):
    h = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"}
    response = requests.get(url, headers=h)
    if response.status_code == 200:
        return response.text
    else:
        print("Request failed with status code {}".format(response.status_code))
        return ''

# Build the page URLs for the original preface and all 28 chapters
def get_html():
    urls = []  # start with an empty list
    for i in range(10, 38):
        url = 'https://www.xingyueboke.com/sudongpozhuan/852' + str(i) + '.html'
        urls.append(url)  # append the URL string to the list
    return urls

# for url in get_html():
#     print(url)
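
# Optional alternative, not used by the rest of this script: the chapter links could
# be scraped from the index page via get_source() instead of hard-coding the
# 85210-85237 ID range. This is only a sketch; the XPath below is an assumption
# about the index page's layout and would need checking against the real HTML.
def get_html_from_index():
    from urllib.parse import urljoin
    source = get_source()
    if not source:
        return []
    selector = etree.HTML(source)
    hrefs = selector.xpath('//a[contains(@href, "/sudongpozhuan/")]/@href')  # assumed link pattern
    # turn relative links into absolute URLs and drop duplicates while keeping order
    return list(dict.fromkeys(urljoin(start_url, href) for href in hrefs))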

# Fetch the page source of every chapter
def get_text():
    html = [''] * 28  # a list of 28 empty strings, one slot per chapter
    urls = get_html()
    for index, url in enumerate(urls):
        head = {
            'User-Agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Mobile Safari/537.36"
        }
        response = requests.get(url=url, headers=head)
        response.encoding = 'utf-8'
        if response.status_code == 200:
            html[index] = response.text  # store by integer index
        else:
            print("Request failed with status code {}".format(response.status_code))
            html[index] = ''  # keep an empty string in that slot on failure
    return html

# for index, text in enumerate(get_text()):
#     print(f"Page source of chapter {index + 1}:")
#     print(text)
#     print("\n")

# Parse the body text and the title of every chapter
def get_article():
    html_list = get_text()  # page source of every chapter
    articles = []  # collects (title, content) tuples for all chapters
    for html in html_list:
        if not html:  # a failed download leaves an empty string; keep the slot
            articles.append(('', ''))
            continue
        selector = etree.HTML(html)
        # extract the title
        title = selector.xpath('//*[@id="nr_title"]/text()')
        if title:
            title = title[0].strip()  # drop surrounding whitespace
        else:
            title = ''
        # extract the body text; string() already returns a plain str,
        # so no list indexing is needed here
        content = selector.xpath('string(//*[@id="nr1"]/div)')
        articles.append((title, content))  # store title and content as a tuple
    return articles

# Print the title and content of every chapter
# for title, content in get_article():
#     print(f"Title: {title}")
#     print(f"Content: {content}")
#     print("\n")  # blank line between chapters

# Save each chapter's body text to a local txt file named after the chapter title
def save():
    articles = get_article()  # titles and contents of all chapters
    for title, content in articles:
        if not title:  # skip chapters whose title could not be parsed
            continue
        filename = title + '.txt'
        # check whether the file already exists to avoid overwriting it
        if not os.path.exists(filename):
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(content)
        else:
            print(f"File {filename} already exists, skipping.")

save()
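
# Optional: titles scraped from the page could contain characters that are not valid
# in filenames (e.g. '/', ':', '?'). A small sanitizer like this one could be applied
# to `title` inside save(); replacing with '_' is an arbitrary choice, not part of the
# original script.
def safe_filename(title):
    import re
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip() or 'untitled'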

# Write each chapter's URL, title, and body length to a CSV file named 苏东坡传.csv
def save_to_csv(filename):
    fieldnames = ['url', 'title', 'content_length']  # CSV column names
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        articles = get_article()  # titles and contents of all chapters
        urls = get_html()  # URLs of all chapters
        for url, (title, content) in zip(urls, articles):
            content_length = len(content) if content else 0  # 0 when content is empty
            row = {
                'url': url,
                'title': title,
                'content_length': content_length
            }
            writer.writerow(row)

# Call save_to_csv with the filename '苏东坡传.csv'
save_to_csv('苏东坡传.csv')
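
# Note: save() and save_to_csv() each call get_article(), so all 28 chapters are
# downloaded twice when this script runs. A possible variant, sketched here and not
# part of the original script, fetches once and writes both outputs from the same data.
def export_all(csv_name='苏东坡传.csv'):
    urls = get_html()
    articles = get_article()  # the network fetch happens only here
    with open(csv_name, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['url', 'title', 'content_length'])
        writer.writeheader()
        for url, (title, content) in zip(urls, articles):
            if title and not os.path.exists(title + '.txt'):
                with open(title + '.txt', 'w', encoding='utf-8') as f:
                    f.write(content)
            writer.writerow({'url': url,
                             'title': title,
                             'content_length': len(content) if content else 0})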