"""Download every chapter listed on the index page at ``start_url``.

Each chapter is written to its own UTF-8 text file inside ``OUTPUT_DIR`` and
a CSV summary (URL, title, body length) is written next to them.
"""

import csv
import os
import re
from urllib.parse import urljoin

import requests
from lxml import etree

# Index page that links to every chapter.
start_url = "https://www.xingyueboke.com/sudongpozhuan/"

# Request headers; a browser-like User-Agent is sent with every request.
h = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
}

# Directory that receives the chapter files and the CSV summary.
OUTPUT_DIR = "苏东坡传"

# Characters that are not allowed (or are unsafe) in file names on common
# file systems; they are replaced before a title is used as a file name.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def get_source(url=start_url, timeout=10):
    """Fetch *url* and return its body decoded as UTF-8.

    Args:
        url: Page to download; defaults to the index page.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole crawl.

    Returns:
        The decoded page text, or an empty string when the request fails or
        the server answers with a status other than 200.
    """
    try:
        response = requests.get(url, headers=h, timeout=timeout)
    except requests.RequestException as err:
        # Network-level failures follow the same "report and return empty"
        # path as a bad status code instead of aborting the crawl.
        print("请求失败,错误为{}".format(err))
        return ""
    if response.status_code == 200:
        return response.content.decode('utf-8')
    print("请求失败,状态码为{}".format(response.status_code))
    return ""


def get_chapter_urls(start_source):
    """Extract the chapter links from the index page HTML.

    Args:
        start_source: HTML text of the index page.

    Returns:
        A list of chapter URLs. Relative links are resolved against
        ``start_url``; absolute links are returned unchanged. The list is
        empty when *start_source* is empty.
    """
    if not start_source:
        # A failed download yields an empty string; there is nothing to parse.
        return []
    selector = etree.HTML(start_source)
    hrefs = selector.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href')
    return [urljoin(start_url, href) for href in hrefs]


def get_article(article_html):
    """Extract the title and body text from a chapter page.

    Args:
        article_html: HTML text of one chapter page.

    Returns:
        A ``(title, content)`` tuple. ``title`` is the text of the first
        ``<h1>`` (an empty string when the page has none) and ``content`` is
        the string value of the first ``div`` inside ``div#nr1``.
    """
    selector = etree.HTML(article_html)
    titles = selector.xpath('//h1/text()')
    title = titles[0] if titles else ""
    content = selector.xpath('string(//div[@id="nr1"]/div)')
    return title, content


def _safe_filename(title):
    """Return *title* with file-system-unsafe characters replaced by ``_``."""
    return _UNSAFE_FILENAME_CHARS.sub("_", title).strip()


def save(title, content):
    """Write one chapter to ``OUTPUT_DIR/<title>.txt`` as UTF-8 text.

    Args:
        title: Chapter title; used (after sanitising) as the file name.
        content: Chapter body text.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    filename = os.path.join(OUTPUT_DIR, _safe_filename(title) + ".txt")
    with open(filename, "w", encoding='utf-8') as f:
        f.write(content)


def saveCsv(list):
    """Write the crawl summary to ``OUTPUT_DIR/苏东坡传.csv``.

    Args:
        list: Iterable of ``[url, title, content_length]`` rows. The
            parameter name shadows the builtin only inside this function and
            is kept so existing keyword callers continue to work.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    filename = os.path.join(OUTPUT_DIR, "苏东坡传.csv")
    # newline='' lets the csv module control line endings itself.
    with open(filename, "w", encoding="utf-8", newline='') as f:
        w = csv.writer(f)
        w.writerow(["网页地址", "标题", "正文长度"])
        w.writerows(list)


def main():
    """Crawl the index page, save every chapter, then write the CSV summary."""
    source = get_source()
    urls = get_chapter_urls(source)

    rows = []
    for url in urls:
        article_html = get_source(url)
        if not article_html:
            # get_source has already reported the failure; skip this chapter
            # rather than abort the remaining downloads.
            print("跳过:{}".format(url))
            continue
        title, content = get_article(article_html)
        if not title:
            # Without a title there is no usable file name for the chapter.
            print("未找到标题,跳过:{}".format(url))
            continue
        print(title)
        rows.append([url, title, len(content)])
        save(title, content)
    saveCsv(rows)


if __name__ == '__main__':
    main()