"""Download every chapter of a book from xingyueboke.com into one text file.

The script fetches the book's index page, collects the chapter links,
downloads each chapter, and appends its title and paragraphs to
``OUTPUT_PATH``.
"""
from urllib.parse import urljoin

import requests
from lxml import etree

start_url = "https://www.xingyueboke.com/sudongpozhuan/"
head = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
    )
}

# File that receives the concatenated chapters.
OUTPUT_PATH = "dufu.text"


def get_source(start_url, head, timeout=10):
    """Fetch a page and return its body decoded as UTF-8.

    Args:
        start_url: URL of the page to download.
        head: Mapping of HTTP request headers to send.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx status code.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.get(start_url, headers=head, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were content.
    response.raise_for_status()
    response.encoding = "utf-8"
    return response.text


def get_chapterurl(start_source):
    """Extract the chapter links from the book's index page HTML.

    Args:
        start_source: HTML text of the index page.

    Returns:
        A list with the ``href`` value of every chapter link found in the
        ``book-list clearfix`` container; empty when nothing can be parsed.
    """
    if not start_source or not start_source.strip():
        return []
    tree = etree.HTML(start_source)
    if tree is None:  # no usable tree came back from the parser
        return []
    return tree.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href')


def get_article(article_html):
    """Extract the title and the paragraph texts from a chapter page.

    Args:
        article_html: HTML text of a single chapter page.

    Returns:
        A ``(title, content)`` tuple of two lists of strings: the text of
        the ``post-title`` heading and the text of each paragraph under
        the ``nr1`` container. Both are empty when nothing can be parsed.
    """
    if not article_html or not article_html.strip():
        return [], []
    tree = etree.HTML(article_html)
    if tree is None:  # no usable tree came back from the parser
        return [], []
    title = tree.xpath('//h1[@class="post-title"]/text()')
    content = tree.xpath('//div[@id="nr1"]/div/p/text()')
    return title, content


def save(title, content, path=OUTPUT_PATH, mode="a"):
    """Write one chapter (title line, then its paragraphs) to a text file.

    Args:
        title: Chapter title, either a string or a list of text fragments
            as returned by ``get_article``.
        content: Iterable of paragraph strings.
        path: Destination file.
        mode: File mode. The default appends, so that saving several
            chapters in a row keeps all of them; opening with ``"w"`` on
            every call would leave only the last chapter in the file.
    """
    title_text = title if isinstance(title, str) else "".join(title)
    lines = [title_text.strip()]
    lines.extend(paragraph.strip() for paragraph in content)
    with open(path, mode, encoding="utf-8") as f:
        # One paragraph per line, plus a blank line between chapters.
        f.write("\n".join(lines))
        f.write("\n\n")


def main():
    """Crawl every chapter listed on the index page and save them all."""
    chapter_links = get_chapterurl(get_source(start_url, head))

    # Empty the output once per run (only after the index was fetched
    # successfully); save() then appends chapter after chapter.
    with open(OUTPUT_PATH, "w", encoding="utf-8"):
        pass

    for href in chapter_links:
        # Absolute links pass through unchanged; relative ones are resolved
        # against the index page.
        chapter_url = urljoin(start_url, href)
        try:
            article_html = get_source(chapter_url, head)
        except requests.RequestException as exc:
            # One unreachable chapter should not abort the whole download.
            print("skipping {}: {}".format(chapter_url, exc))
            continue
        title, content = get_article(article_html)
        print(title)
        print(content)
        save(title, content)


if __name__ == "__main__":
    main()