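"""Scrape the biography 苏东坡传 (The Life of Su Dongpo) from xingyueboke.com.

The script collects the preface and all 28 chapter pages, saves each
chapter's body text to a .txt file named after its title, and writes a
per-chapter summary (URL, title, body length) to 苏东坡传.csv.
"""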
import requests
from lxml import etree
import csv
import os

start_url = "https://www.xingyueboke.com/sudongpozhuan/"


# Fetch the source code of a page (the book's index page by default).
def get_source(url=start_url):
    h = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"}
    response = requests.get(url, headers=h, timeout=10)  # timeout so a stalled request cannot hang the script
    if response.status_code == 200:
        return response.text
    else:
        print("Request failed with status code {}".format(response.status_code))
        return ''
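
# Quick check (mirrors the commented tests below): fetch the index page once.
# index_html = get_source()
# print(len(index_html))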

# Collect the page URLs of the original preface and all 28 chapters.
# The chapter pages are numbered 85210.html through 85237.html.
def get_html():
    urls = []  # start with an empty list
    for i in range(10, 38):
        url = 'https://www.xingyueboke.com/sudongpozhuan/852' + str(i) + '.html'
        urls.append(url)  # append each URL string to the list
    return urls


# Quick check: print every chapter URL.
# for url in get_html():
#     print(url)

# Fetch the page source of every chapter.
def get_text():
    html = [''] * 28  # pre-fill a list of 28 empty strings, one per chapter
    urls = get_html()
    head = {
        'User-Agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Mobile Safari/537.36"
    }  # the headers never change, so build them once outside the loop
    for index, url in enumerate(urls):
        response = requests.get(url=url, headers=head, timeout=10)
        response.encoding = 'utf-8'
        if response.status_code == 200:
            html[index] = response.text  # store the page source at its integer index
        else:
            print("Request failed with status code {}".format(response.status_code))
            html[index] = ''  # leave an empty string in place when the request fails
    return html


# Quick check: print every chapter's page source.
# for index, text in enumerate(get_text()):
#     print(f"Chapter {index + 1} page source:")
#     print(text)
#     print("\n")

# Parse each chapter's title and body text.
def get_article():
    html_list = get_text()  # page source of every chapter
    articles = []  # (title, content) tuples for every chapter
    for html in html_list:
        if not html:  # a failed download leaves an empty string, which etree.HTML cannot parse
            articles.append(('', ''))
            continue
        selector = etree.HTML(html)
        # Extract the title.
        title = selector.xpath('//*[@id="nr_title"]/text()')
        if title:
            title = title[0].strip()  # remove any surrounding whitespace
        else:
            title = ''
        # Extract the body text; the XPath string() function already returns a plain str,
        # so no list indexing is needed here.
        content = selector.xpath('string(//*[@id="nr1"]/div)')
        articles.append((title, content))  # store title and content as a tuple
    return articles


# Print every chapter's title and content.
# for title, content in get_article():
#     print(f"Title: {title}")
#     print(f"Content: {content}")
#     print("\n")  # blank line between chapters

# Save each chapter's body text to a local .txt file named after the chapter title.
def save():
    articles = get_article()  # titles and contents of every chapter
    for title, content in articles:
        if not title:  # skip chapters that failed to download or parse
            continue
        filename = title + '.txt'
        # Check whether the file already exists, to avoid overwriting it.
        if not os.path.exists(filename):
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(content)
        else:
            print(f"File {filename} already exists; skipping.")


save()

# Write each chapter's URL, title, and body length to a CSV file named after the book, 苏东坡传.
def save_to_csv(filename):
    fieldnames = ['url', 'title', 'content_length']  # CSV column names
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        articles = get_article()  # note: this fetches and parses every chapter again
        urls = get_html()  # chapter URLs (built locally, no network requests)

        for url, (title, content) in zip(urls, articles):
            content_length = len(content) if content else 0  # 0 when the content is empty
            row = {
                'url': url,
                'title': title,
                'content_length': content_length
            }
            writer.writerow(row)


# Write the summary CSV; the file is named after the book's title.
save_to_csv('苏东坡传.csv')
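
# Note: save() and save_to_csv() each call get_article(), so all 28 chapters
# are downloaded and parsed twice. A sketch of a single-fetch flow (assumes
# hypothetical variants of save() and save_to_csv() that accept the parsed
# articles as a parameter instead of re-fetching):
#
# articles = get_article()
# save(articles)
# save_to_csv('苏东坡传.csv', articles)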