"""Scrape the serialized book "苏东坡传" (The Gay Genius) from xingyueboke.com.

Downloads the chapter index, then each chapter page; saves every chapter as a
UTF-8 .txt file under the book directory and writes a summary CSV
(url, title, content length) alongside them.
"""

import csv
import os
from urllib.parse import urljoin

import requests
from lxml import etree

START_URL = "https://www.xingyueboke.com/sudongpozhuan/"
# NOTE: was "苏坡东传" in save() and "苏东坡传" in saveCsv() — an inconsistent
# typo that split the output across two directories. Unified here.
BOOK_DIR = "苏东坡传"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 "
                  "SLBrowser/9.0.3.1311 SLBChan/109"
}


def get_source(url):
    """Fetch *url* and return its HTML as a UTF-8 decoded string.

    Returns "" (and prints the status code) on any non-200 response, so
    callers can test truthiness instead of handling exceptions.
    """
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        response.encoding = "utf-8"  # site serves UTF-8; override charset guess
        return response.text
    print("请求失败,状态码为{}".format(response.status_code))
    return ""


def get_chapter_urls(start_source):
    """Extract chapter page URLs from the book's index-page HTML.

    Hrefs are resolved against START_URL so relative links also work
    (urljoin is a no-op for already-absolute URLs).
    """
    selector = etree.HTML(start_source)
    hrefs = selector.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href')
    return [urljoin(START_URL, href) for href in hrefs]


def get_article(article_html):
    """Return (title, content) parsed from a chapter page's HTML.

    Title falls back to "" when the page has no <h1>, so a malformed
    page cannot crash the whole run with an IndexError.
    """
    selectors = etree.HTML(article_html)
    titles = selectors.xpath('//h1/text()')
    title = titles[0] if titles else ""
    content = selectors.xpath('string(//div[@id="nr1"]/div)')
    return title, content


def save(title, content):
    """Write one chapter's *content* to BOOK_DIR/<title>.txt.

    Uses mode "w" (was "a+"): re-running the scraper overwrites instead
    of appending duplicate copies of the chapter text.
    """
    filename = os.path.join(BOOK_DIR, title + ".txt")
    dirname = os.path.dirname(filename)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(content)


def saveCsv(articles_list):
    """Write the (url, title, length) rows to BOOK_DIR/苏东坡传.csv."""
    if not os.path.exists(BOOK_DIR):
        os.makedirs(BOOK_DIR)
    filename = os.path.join(BOOK_DIR, "苏东坡传.csv")
    with open(filename, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(articles_list)


def main():
    """Drive the scrape: index -> chapters -> per-chapter txt + summary CSV."""
    source = get_source(START_URL)
    print(source)
    chapter_urls = get_chapter_urls(source)
    print(chapter_urls)

    articles_list = []
    for chapter_url in chapter_urls:
        article_html = get_source(chapter_url)
        if not article_html:  # skip chapters that failed to download
            continue
        title, content = get_article(article_html)
        print(title)
        print(content)
        save(title, content)
        articles_list.append([chapter_url, title, len(content)])

    saveCsv(articles_list)


# Guarded entry point: importing this module no longer triggers the scrape.
if __name__ == "__main__":
    main()