import requests
import csv
import re
from bs4 import BeautifulSoup

# Index page of the novel on biquge7.xyz; the User-Agent mimics a desktop
# browser so the site serves the normal page instead of blocking the bot.
url = "https://www.biquge7.xyz/50227"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0"
}


# 1. Fetch a page's HTML.
def get_source(url, headers):
    """Download *url* and return its HTML text, or None on failure.

    The original version had an unreachable ``return None`` after the first
    ``return`` and never checked the response status, so callers' ``if
    source:`` guards could never trigger. Now a network error or a non-200
    response genuinely yields None instead of raising or returning an
    error-page body.
    """
    try:
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


# 2. Parse the index page and collect the absolute URL of every chapter.
def parse_chapter_links(source, base_url="https://www.biquge7.xyz"):
    """Return a list of absolute chapter URLs found on the index page.

    *base_url* defaults to the site root (previously read from a global set
    in ``__main__``; the parameter keeps the same value while making the
    function usable on its own). The CSS selector assumes chapter anchors
    live under ``div.list > ul > li`` — adjust if the site's markup changes.
    """
    soup = BeautifulSoup(source, 'html.parser')
    return [base_url + link.get('href')
            for link in soup.select('div.list > ul > li > a')]


# 3. Download the HTML of every chapter page.
def fetch_chapter_sources(chapter_links, headers):
    """Return a dict mapping each chapter URL to its page HTML.

    Chapters whose download fails (``get_source`` returned None) are
    skipped rather than stored, so downstream parsing never sees None.
    """
    chapter_sources = {}
    for link in chapter_links:
        source = get_source(link, headers)
        if source:
            chapter_sources[link] = source
    return chapter_sources


# 4. Extract one chapter's title and body text from its page HTML.
def parse_chapter_content(source):
    """Return ``(title, content)`` parsed from a chapter page.

    Assumes the title sits in ``h1.list_tit`` and the body in ``div.text``
    (site-specific selectors; raises AttributeError if the markup differs).
    The title is printed as simple crawl-progress output.
    """
    soup = BeautifulSoup(source, 'html.parser')
    title = soup.select_one('h1.list_tit').get_text(strip=True)
    content = soup.select_one('div.text').get_text(strip=True, separator='\n')
    print(title)
    return title, content


# 5. Save each chapter's body text into its own local .txt file.
def save_chapters_to_txt(chapters):
    """Write one ``<title>.txt`` file per chapter in *chapters*.

    Characters that are illegal in Windows filenames are replaced with
    ``_`` so a title such as ``第1章: 开始?`` cannot crash ``open()``.
    """
    for link, source in chapters.items():
        title, content = parse_chapter_content(source)
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
        with open(f'{safe_title}.txt', 'w', encoding='utf-8') as f:
            f.write(content)
# 6. Record each chapter's URL, title, and body length in a CSV summary.
def save_to_csv(chapters, csv_filename):
    """Write one CSV row (link, title, body length) per chapter.

    *chapters* maps chapter URLs to their page HTML; each page is parsed
    with ``parse_chapter_content`` to recover its title and body text.
    """
    fieldnames = ['链接', '标题', '正文长度']
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for link, source in chapters.items():
            title, content = parse_chapter_content(source)
            row = {'链接': link, '标题': title, '正文长度': len(content)}
            writer.writerow(row)


# Entry point: crawl the index page, download every chapter, then persist
# the chapter texts as .txt files and a CSV summary.
if __name__ == "__main__":
    base_url = "https://www.biquge7.xyz"  # site root used to absolutize chapter hrefs
    source = get_source(url, headers)
    if source:
        chapter_links = parse_chapter_links(source)
        chapters = fetch_chapter_sources(chapter_links, headers)
        save_chapters_to_txt(chapters)
        save_to_csv(chapters, '偷偷藏不住.csv')