import requests
import os
import csv
from lxml import etree

# Table-of-contents page for the book (苏东坡传 / "The Life of Su Dongpo").
start_url = 'https://www.xingyueboke.com/sudongpozhuan/'

# Browser-like User-Agent so the site serves the page instead of rejecting the bot.
h = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
}


def get_source(url=start_url):
    """Fetch *url* and return its HTML decoded as UTF-8.

    Returns an empty string (after printing the status code) on any
    non-200 response.
    """
    response = requests.get(url, headers=h)
    if response.status_code == 200:
        # BUG FIX: the original had a second, unreachable
        # `return response.text` after this line — removed.
        return response.content.decode('utf-8')
    print("请求失败,状态码为{}".format(response.status_code))
    return ""


def get_chapter_urls(start_source):
    """Return the list of per-chapter URLs.

    NOTE(review): the original parsed the TOC with XPath but discarded the
    result and used a hardcoded id range instead (the site's chapter pages
    are numbered 85209..85237 consecutively). The dead XPath work has been
    removed; *start_source* is kept in the signature for compatibility.
    """
    return [
        'https://www.xingyueboke.com/sudongpozhuan/{}.html'.format(i)
        for i in range(85209, 85238)
    ]


def get_article(article_html):
    """Extract (title, content) from one chapter page.

    *title* is the raw XPath result (a list of text nodes); *content* is a
    single string from the article body div.
    """
    selector = etree.HTML(article_html)
    title = selector.xpath('//*[@id="nr_title"]/text()')
    content = selector.xpath('string(//*[@id="nr1"]/div)')
    return title, content


def save(title, content):
    """Append *content* to a per-chapter text file named after the title.

    Skips saving (with a warning) when the title list is empty, which
    would otherwise raise IndexError.
    """
    if not title:
        print("标题为空,跳过保存")
        return
    # BUG FIX: original shadowed the builtin `filter` and misspelled the
    # book title ("苏东波传" → "苏东坡传").
    filename = "苏东坡传" + title[0] + ".txt"
    with open(filename, 'a+', encoding='utf-8') as f:
        f.write(content)


def czd(urls, titles, contents):
    """Append one row per chapter (url, title, content-length) to the CSV.

    Writes the header row only when the file does not yet exist or is
    empty, so repeated runs do not repeat the header.
    """
    # BUG FIX: original iterated range(len(urls)-1), silently dropping
    # the last chapter.
    data = [[urls[i], titles[i], contents[i]] for i in range(len(urls))]
    csv_path = "苏东坡传.csv"
    need_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    # 'utf-8-sig' replaces the original Windows-only 'ANSI' alias and is
    # still opened correctly by Excel.
    with open(csv_path, 'a', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        if need_header:
            writer.writerow(["网站", "标题", "文章长度"])
        writer.writerows(data)


if __name__ == "__main__":
    source = get_source()
    urls = get_chapter_urls(source)
    titles = []
    contents = []
    for url in urls:
        article_html = get_source(url)
        title, content = get_article(article_html)
        titles.append(title)
        contents.append(len(content))
        save(title, content)
    # BUG FIX: original called `csv(...)`, invoking the stdlib csv *module*
    # (TypeError); the intended function is czd.
    czd(urls, titles, contents)