import csv
import os
from urllib.parse import urljoin

import requests
from lxml import etree

# Index page that lists every chapter of the book.
START_URL = 'https://www.xingyueboke.com/sudongpozhuan/'
# Output directory for the per-chapter .txt files and the summary CSV.
OUT_DIR = "苏东坡传"
HEADERS = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
}


def get_source(url=START_URL):
    """Fetch *url* and return its HTML text, or '' on a non-200 response."""
    response = requests.get(url, headers=HEADERS)
    if response.status_code == 200:
        # The site serves UTF-8; set it explicitly so .text decodes correctly.
        response.encoding = 'utf-8'
        return response.text
    # Original user-facing message kept verbatim: "request failed, status code {}".
    print("请求失败,状态码为{}".format(response.status_code))
    return ""


def get_chapter_urls(index_html):
    """Extract all chapter links from the book's index page.

    Hrefs are resolved against START_URL with urljoin, so relative links
    work and absolute links pass through unchanged.
    """
    selector = etree.HTML(index_html)
    hrefs = selector.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href')
    return [urljoin(START_URL, href) for href in hrefs]


def get_article(article_html):
    """Return (title, body_text) parsed from one chapter page.

    Raises IndexError if the page contains no <h1> element.
    """
    selector = etree.HTML(article_html)
    title = selector.xpath('//h1/text()')[0]
    # string() flattens every text node of the content div into one string.
    content = selector.xpath('string(//div[@id="nr1"]/div)')
    return title, content


def save(title, content):
    """Write one chapter's text to OUT_DIR/<title>.txt."""
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(OUT_DIR, exist_ok=True)
    # NOTE(review): assumes chapter titles contain no characters that are
    # invalid in filenames — verify against the site's actual titles.
    filename = os.path.join(OUT_DIR, title + ".txt")
    with open(filename, "w", encoding='utf-8') as f:
        f.write(content)


def save_csv(rows):
    """Write the per-chapter summary rows (url, title, length) to a CSV file."""
    os.makedirs(OUT_DIR, exist_ok=True)
    filename = os.path.join(OUT_DIR, "苏东坡传.csv")
    with open(filename, "w", encoding="utf-8", newline='') as f:
        writer = csv.writer(f)
        # Header columns: page URL, title, body length.
        writer.writerow(["网页地址", "标题", "正文长度"])
        writer.writerows(rows)


def main():
    """Crawl every chapter, save each as a .txt file, then write the summary CSV."""
    index_html = get_source()
    rows = []
    for url in get_chapter_urls(index_html):
        article_html = get_source(url)
        if not article_html:
            # Failed download: skip instead of crashing in get_article.
            continue
        title, content = get_article(article_html)
        print(title)
        rows.append([url, title, len(content)])
        save(title, content)
    save_csv(rows)


if __name__ == "__main__":
    main()