p4r267ofm
7c79d9d08e
|
7 months ago | |
---|---|---|
README.md | 7 months ago |
README.md
sudongpozhuan
# Scraper for "苏东坡传" (Biography of Su Dongpo) from xingyueboke.com:
# downloads every chapter, saves each as a text file, and records
# (url, title, content length) rows in a CSV summary.
import requests
from lxml import etree
import csv

start_url = "https://www.xingyueboke.com/sudongpozhuan/"
h = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"}


def get_source(url=start_url):
    """Fetch *url* and return its HTML decoded as UTF-8.

    Returns an empty string (after printing a notice) on any non-200 status.
    """
    r = requests.get(url, headers=h)
    if r.status_code == 200:
        # BUG FIX: the original had an unreachable `return r.text` after this
        # return and a dangling `else`; decode explicitly as UTF-8 once.
        return r.content.decode('utf-8')
    print("失败")
    return ""


def get_chapter_urls(start_source):
    """Extract the chapter links from the table-of-contents page HTML."""
    se = etree.HTML(start_source)
    # BUG FIX: '//[@id=...]' is invalid XPath (lxml raises XPathEvalError);
    # the element wildcard '*' is required: '//*[@id=...]'.
    urls = se.xpath('//*[@id="content-list"]/div[2]/ul/li/a/@href')
    return urls


def get_article(article_html):
    """Return (title_list, content_string) parsed from one chapter page.

    `title` is the raw xpath result (a list of text nodes); `content` is the
    string value of the article body div.
    """
    selector = etree.HTML(article_html)
    # BUG FIX: '//[@id=...]' -> '//*[@id=...]' (invalid XPath otherwise).
    title = selector.xpath('//*[@id="nr_title"]/text()')
    content = selector.xpath('string(//*[@id="nr1"]/div)')
    return title, content


def save(title, content):
    """Append *content* to a per-chapter UTF-8 text file named after the title."""
    # BUG FIX: the prefix previously read "苏东波传" — a typo of the book title
    # "苏东坡传" used everywhere else in this script.
    fi = "苏东坡传" + title[0] + ".txt"
    with open(fi, 'a+', encoding='utf-8') as f:
        f.write(content)


def czd(urls, titles, contents):
    """Append a header row plus one (网站, 标题, 文章长度) row per chapter to 苏东坡传.csv."""
    data = []
    # BUG FIX: the original iterated range(len(urls) - 1), silently dropping
    # the last chapter from the CSV.
    for i in range(len(urls)):
        data.append([urls[i], titles[i], contents[i]])
    with open("苏东坡传.csv", 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["网站", "标题", "文章长度"])
        writer.writerows(data)
# BUG FIX: the original guard was `if name=="main":`, which raises NameError
# (and could never match); the correct idiom is `if __name__ == "__main__":`.
if __name__ == "__main__":
    source = get_source()                 # table-of-contents page HTML
    urls = get_chapter_urls(source)       # one URL per chapter
    titles = []
    contents = []
    for url in urls:
        article_html = get_source(url)
        title, content = get_article(article_html)
        titles.append(title)
        # Store the length, not the text — the CSV column is "文章长度"
        # (article length).
        contents.append(len(content))
        save(title, content)
    czd(urls, titles, contents)