From fbb88fcb5087459a98f009ede493cdb7dec98db5 Mon Sep 17 00:00:00 2001
From: ph275ue6c <2370007971@qq.com>
Date: Wed, 17 Apr 2024 17:18:27 +0800
Subject: [PATCH] Finish the code for 苏东坡传
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 苏东坡传.py | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 苏东坡传.py

diff --git a/苏东坡传.py b/苏东坡传.py
new file mode 100644
index 0000000..cad27a8
--- /dev/null
+++ b/苏东坡传.py
@@ -0,0 +1,92 @@
+import csv
+import os
+
+import requests
+from lxml import etree
+
+start_url = "https://www.xingyueboke.com/sudongpozhuan/"
+
+# Fetch the source of the index page.
+def get_source(url=start_url):
+    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"}
+    response = requests.get(url, headers=headers, timeout=10)
+    if response.status_code == 200:
+        return response.text
+    else:
+        print("Request failed with status code {}".format(response.status_code))
+        return ''
+
+# Build the URLs of the preface and all 28 chapters (pages 85210-85237).
+def get_html():
+    urls = []
+    for i in range(10, 38):
+        url = 'https://www.xingyueboke.com/sudongpozhuan/852' + str(i) + '.html'
+        urls.append(url)
+    return urls
+
+# Download the page source of every chapter.
+def get_text():
+    urls = get_html()
+    html = [''] * len(urls)  # one slot per chapter, indexes aligned with urls
+    headers = {
+        'User-Agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Mobile Safari/537.36"
+    }
+    for index, url in enumerate(urls):
+        response = requests.get(url=url, headers=headers, timeout=10)
+        response.encoding = 'utf-8'
+        if response.status_code == 200:
+            html[index] = response.text
+        else:
+            # Leave the empty placeholder so later indexes stay aligned.
+            print("Request failed with status code {}".format(response.status_code))
+    return html
+
+# Parse the title and body text of each chapter.
+def get_article():
+    html_list = get_text()
+    articles = []
+    for html in html_list:
+        if not html:
+            # etree.HTML('') raises ParserError, so keep a placeholder instead.
+            articles.append(('', ''))
+            continue
+        selector = etree.HTML(html)
+        title = selector.xpath('//*[@id="nr_title"]/text()')
+        title = title[0].strip() if title else ''
+        # string() already flattens the div to plain text, so no [0] indexing.
+        content = selector.xpath('string(//*[@id="nr1"]/div)')
+        articles.append((title, content))
+    return articles
+
+# Save each chapter's body text to a local txt file named after its title.
+def save(articles):
+    for title, content in articles:
+        if not title:
+            continue  # skip chapters whose download or parse failed
+        filename = title + '.txt'
+        # Check first so an existing file is not overwritten.
+        if not os.path.exists(filename):
+            with open(filename, 'w', encoding='utf-8') as f:
+                f.write(content)
+        else:
+            print(f"File {filename} already exists, skipping.")
+
+# Write each chapter's URL, title and body length to a CSV file.
+def save_to_csv(filename, urls, articles):
+    fieldnames = ['url', 'title', 'content_length']
+    with open(filename, mode='w', newline='', encoding='utf-8') as file:
+        writer = csv.DictWriter(file, fieldnames=fieldnames)
+        writer.writeheader()
+        for url, (title, content) in zip(urls, articles):
+            writer.writerow({
+                'url': url,
+                'title': title,
+                'content_length': len(content) if content else 0,
+            })
+
+if __name__ == '__main__':
+    # Crawl once and reuse the result; calling get_article() from both
+    # save() and save_to_csv() would download all 28 chapters twice.
+    articles = get_article()
+    save(articles)
+    save_to_csv('苏东坡传.csv', get_html(), articles)
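
Note on the hard-coded URL range: get_source() is defined in the patch but never
called, while get_html() hard-codes page IDs 85210-85237. A minimal sketch of how
the chapter list could instead be derived from the index page, reusing the patch's
get_source() and start_url (the '//a/@href' XPath and the '.html' filter are
assumptions, not taken from the site's actual markup):

    from urllib.parse import urljoin
    from lxml import etree

    def get_chapter_urls():
        source = get_source()  # index page HTML via the patch's helper
        if not source:
            return []
        selector = etree.HTML(source)
        # Assumption: chapter pages appear as relative <a href="NNNNN.html">
        # links; confirm the real container id/class against the page source.
        hrefs = selector.xpath('//a/@href')
        return [urljoin(start_url, h) for h in hrefs if h.endswith('.html')]

This costs one extra request but would keep the crawler working if the site ever
renumbers its chapter pages.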