"""Scrape "苏东坡传" (The Biography of Su Dongpo) from xingyueboke.com.

Downloads the chapter index page, then every chapter page; saves each
chapter as 苏东坡传/<title>.txt and writes a 苏东坡传/苏东坡传.csv index
containing (chapter url, title, content length) rows.
"""

import csv
import os
import re

import requests
from lxml import etree

# Index page that lists every chapter of the book.
url = "https://www.xingyueboke.com/sudongpozhuan/"
# Browser-like UA so the site does not reject the scraper outright.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"
}


def get_source(page_url):
    """Return the UTF-8 decoded body of *page_url*, or "" on any failure.

    Network errors and non-200 responses are reported to stdout and yield
    an empty string so callers can simply skip the page instead of crashing.
    """
    try:
        # timeout: requests never times out by default — a stalled server
        # would otherwise hang the whole crawl.
        response = requests.get(page_url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print("请求失败: {}".format(exc))
        return ""
    if response.status_code != 200:
        print("请求失败,状态码为{}".format(response.status_code))
        return ""
    # The site serves UTF-8; set it explicitly so .text decodes correctly.
    response.encoding = "utf-8"
    return response.text


def get_page_source(page_url):
    """Like get_source() but returns None (not "") on failure.

    Kept for backward compatibility with the original API; now sends the
    same browser headers and shares get_source()'s error handling instead
    of duplicating the request logic without them.
    """
    return get_source(page_url) or None


def get_chapter_urls(start_source):
    """Extract and return the list of chapter URLs from the index HTML."""
    selector = etree.HTML(start_source)
    # hrefs in the chapter list are already absolute URLs on this site.
    return selector.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href')


def get_article(article_html):
    """Parse a chapter page and return (title, content).

    Returns ("", "") when the expected markup is missing, instead of
    raising IndexError on the empty xpath result.
    """
    selectors = etree.HTML(article_html)
    titles = selectors.xpath('//h1/text()')
    if not titles:
        return "", ""
    content = selectors.xpath('string(//div[@id="nr1"]/div)')
    return titles[0], content


def save(title, content):
    """Write *content* to 苏东坡传/<title>.txt, creating the directory."""
    # Strip characters that are illegal in Windows file names so the
    # scraped title cannot make open() fail.
    safe_title = re.sub(r'[\\/:*?"<>|]', "", title)
    path = "苏东坡传/" + safe_title + ".txt"
    directory = os.path.dirname(path)
    if not os.path.exists(directory):
        os.makedirs(directory)
    # "w" (not the original "a+"): re-running the scraper must overwrite,
    # not append a duplicate copy of every chapter.
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)


def saveCsv(articles_list):
    """Write [url, title, length] rows to 苏东坡传/苏东坡传.csv."""
    if not os.path.exists("苏东坡传"):
        os.makedirs("苏东坡传")
    # newline="": required by the csv module to avoid blank lines on Windows.
    with open("苏东坡传/苏东坡传.csv", "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(articles_list)


def main():
    """Crawl the index, then each chapter; save the texts and CSV index."""
    start_html = get_source(url)
    if not start_html:
        return
    articles_list = []
    for chapter_url in get_chapter_urls(start_html):
        article_html = get_source(chapter_url)
        if not article_html:
            continue
        title, content = get_article(article_html)
        if not title:
            continue
        print(title)
        save(title, content)
        articles_list.append([chapter_url, title, len(content)])
    saveCsv(articles_list)


if __name__ == "__main__":
    # Guard so importing this module no longer triggers a full crawl
    # (the original ran everything — including debug page dumps — at import).
    main()