You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
83 lines
2.3 KiB
83 lines
2.3 KiB
import requests
|
|
from lxml import etree
|
|
import os
|
|
import csv
|
|
|
|
|
|
def get_source(url, timeout=10):
    """
    Download a page and return its body decoded as UTF-8 text.

    :param url: address of the page to fetch
    :param timeout: seconds to wait for the server before giving up
    :return: the page source as a str
    :raises requests.RequestException: on connection failure, timeout, or
        an HTTP error status (via ``raise_for_status``)
    :raises UnicodeDecodeError: if the response body is not valid UTF-8
    """
    # Without a timeout a stalled server would block the scraper forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # fail fast on 4xx/5xx responses
    # Decode the raw bytes directly. Round-tripping response.text through
    # ISO-8859-1 only works when requests happened to guess that charset;
    # it raises UnicodeEncodeError once the server declares UTF-8 itself.
    return response.content.decode('utf-8')
|
|
|
|
|
|
def get_chapter_links(source):
    """
    Collect the chapter URLs listed on the book's index page.

    :param source: HTML text of the index page
    :return: the href values of every link found inside the
        ``div`` whose class is "book-list clearfix"
    """
    link_xpath = '//div[@class="book-list clearfix"]//a/@href'
    return etree.HTML(source).xpath(link_xpath)
|
|
|
|
|
|
def get_chapter_content(chapter_url):
    """
    Fetch one chapter page and extract its title and body text.

    :param chapter_url: address of the chapter page
    :return: ``(title, content)``; content is the text of the paragraphs
        under the ``nr1`` element, joined with newlines
    :raises IndexError: if the page has no text under the ``nr_title`` element
    """
    chapter_source = get_source(chapter_url)
    html = etree.HTML(chapter_source)

    titles = html.xpath('//*[@id="nr_title"]/text()')
    if not titles:
        # Same exception type a bare ``[0]`` lookup would raise, but with a
        # message that names the page that could not be parsed.
        raise IndexError(f"no chapter title found at {chapter_url}")

    paragraphs = html.xpath('//*[@id="nr1"]/div/p/text()')
    return titles[0], '\n'.join(paragraphs)  # merge paragraph texts
|
|
|
|
|
|
# print(get_chapter_content("https://www.xingyueboke.com/sudongpozhuan/85218.html"))
|
|
|
|
def save_chapter_to_txt(title, content, output_dir="D:/"):
    """
    Write a chapter's text to ``<output_dir>/<title>.txt`` as UTF-8.

    The title only names the file; it is not written into the file.

    :param title: chapter title, used (after sanitising) as the file name
    :param content: chapter text to write
    :param output_dir: directory to write into; must already exist
        (defaults to "D:/", the original hard-coded location)
    :return: None
    """
    # Scraped titles may carry surrounding whitespace or characters that are
    # path separators or are rejected in file names (notably on Windows).
    # Replace them so the write cannot fail or land in another directory.
    forbidden = '\\/:*?"<>|\r\n\t'
    safe_title = title.strip().translate(
        str.maketrans(forbidden, '_' * len(forbidden))
    )
    target = os.path.join(output_dir, f"{safe_title}.txt")
    with open(target, "w", encoding="utf-8") as file:
        file.write(content)
|
|
|
|
|
|
def write_to_csv(chapter_links, chapter_data, csv_path='苏东坡传.csv'):
    """
    Write a one-row-per-chapter summary table to a UTF-8 CSV file.

    Columns are ``url``, ``title`` and ``content_length`` (the number of
    characters in the chapter text). Links and chapter data are paired by
    position; if one is longer, the unmatched tail is ignored.

    :param chapter_links: iterable of chapter URLs
    :param chapter_data: iterable of ``(title, content)`` pairs
    :param csv_path: file to create or overwrite (defaults to the original
        hard-coded name)
    :return: None
    """
    fieldnames = ['url', 'title', 'content_length']
    # newline='' lets the csv module control line endings itself.
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(
            {'url': url, 'title': title, 'content_length': len(content)}
            for url, (title, content) in zip(chapter_links, chapter_data)
        )
|
|
|
|
|
|
def main():
    """
    Scrape the whole book: fetch the index page, download each chapter,
    save it as a text file, then write the CSV summary of all chapters.
    """
    index_html = get_source("https://www.xingyueboke.com/sudongpozhuan/")
    chapter_links = get_chapter_links(index_html)

    chapters = []
    for chapter_url in chapter_links:
        chapter = get_chapter_content(chapter_url)  # (title, content)
        save_chapter_to_txt(*chapter)
        chapters.append(chapter)

    write_to_csv(chapter_links, chapters)
|
|
|
|
|
|
# Run the scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|