# Uploaded file — branch main, parent 3b77022b59, commit 26071462cf (p7wtmgvel)
import csv
import re

import requests
from bs4 import BeautifulSoup
# Index page of the target novel on the biquge mirror site.
url = "https://www.biquge7.xyz/50227"
# Desktop-browser User-Agent so the site serves normal HTML instead of blocking the bot.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0"
}
# 1. Fetch a page's HTML source.
def get_source(url, headers):
    """Download *url* and return its HTML text, or None on failure.

    Args:
        url: Page URL to fetch.
        headers: HTTP header dict (User-Agent etc.) passed to requests.

    Returns:
        The response body as text on HTTP 200, otherwise None.
        (The original had an unreachable ``return None`` after the
        unconditional return — the status check it implied is restored,
        matching the ``if source:`` checks every caller performs.)
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Network errors are reported as "no source" so callers can skip the page.
        return None
    if response.status_code == 200:
        return response.text
    return None
# 2. Parse the index page source to collect every chapter's URL.
def parse_chapter_links(source, base_url="https://www.biquge7.xyz"):
    """Extract absolute chapter URLs from the novel's index page.

    Args:
        source: HTML text of the index page.
        base_url: Site root prepended to each relative href. Defaults to
            the value the main script uses, so existing calls are
            unchanged; previously this was a global defined only inside
            the ``__main__`` guard, which made the function fail with
            NameError when used outside the script path.

    Returns:
        List of absolute chapter URLs in page order.
    """
    soup = BeautifulSoup(source, 'html.parser')
    # NOTE: the CSS selector encodes the site's current layout
    # (chapter anchors under div.list > ul > li); adjust it here if
    # the site's HTML structure changes.
    return [base_url + link.get('href')
            for link in soup.select('div.list > ul > li > a')]
# 3. Visit each chapter page and collect its HTML source.
def fetch_chapter_sources(chapter_links, headers):
    """Download every chapter page.

    Args:
        chapter_links: Iterable of chapter URLs.
        headers: HTTP header dict forwarded to ``get_source``.

    Returns:
        Dict mapping each successfully fetched URL to its HTML text;
        pages whose download yields a falsy source are skipped.
    """
    fetched = ((chapter_url, get_source(chapter_url, headers))
               for chapter_url in chapter_links)
    return {chapter_url: html for chapter_url, html in fetched if html}
# 4. Parse a chapter page into its title and body text.
def parse_chapter_content(source):
    """Extract the chapter title and body from one chapter's HTML.

    Args:
        source: HTML text of a single chapter page.

    Returns:
        Tuple ``(title, content)`` of stripped strings; body fragments
        are joined with newlines.

    Raises:
        ValueError: if the expected elements are missing, i.e. the CSS
            selectors below no longer match the site's layout.
    """
    soup = BeautifulSoup(source, 'html.parser')
    # Site-layout-dependent selectors: title in h1.list_tit, body in
    # div.text — update here if the site changes its HTML structure.
    title_node = soup.select_one('h1.list_tit')
    body_node = soup.select_one('div.text')
    if title_node is None or body_node is None:
        # Fail loudly with a clear message instead of the opaque
        # AttributeError the original raised on a layout change.
        raise ValueError('chapter page did not match the expected HTML structure')
    title = title_node.get_text(strip=True)
    content = body_node.get_text(strip=True, separator='\n')
    # Debug print of the title removed — progress logging belongs to the caller.
    return title, content
# 5. Store each chapter's body text in a local txt file.
def save_chapters_to_txt(chapters):
    """Write each chapter's body to ``<title>.txt`` in the working directory.

    Args:
        chapters: Mapping of chapter URL -> chapter page HTML.
    """
    for link, source in chapters.items():
        title, content = parse_chapter_content(source)
        # Titles come straight from the web page; replace characters
        # that are illegal in Windows/Unix filenames so open() cannot
        # fail or escape the directory via '/'.
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
        with open(f'{safe_title}.txt', 'w', encoding='utf-8') as f:
            f.write(content)
# 6. Record each chapter's URL, title, and body length in a CSV file.
def save_to_csv(chapters, csv_filename):
    """Write one CSV row per chapter: URL, title, and body length.

    Args:
        chapters: Mapping of chapter URL -> chapter page HTML.
        csv_filename: Destination CSV path (UTF-8, header row included).
    """
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['链接', '标题', '正文长度'])
        writer.writeheader()
        for page_url, page_html in chapters.items():
            chapter_title, chapter_body = parse_chapter_content(page_html)
            writer.writerow({
                '链接': page_url,
                '标题': chapter_title,
                '正文长度': len(chapter_body),
            })
# Main program
if __name__ == "__main__":
    base_url = "https://www.biquge7.xyz"
    # Step 1: download the novel's index page.
    source = get_source(url, headers)
    if source:
        # Step 2: collect every chapter URL from the index.
        chapter_links = parse_chapter_links(source)
        # Step 3: download each chapter's page source.
        chapters = fetch_chapter_sources(chapter_links, headers)
        # Steps 4-5: parse and write each chapter body to a txt file.
        save_chapters_to_txt(chapters)
        # Step 6: summarize URL/title/body-length per chapter in a CSV.
        save_to_csv(chapters, '偷偷藏不住.csv')