diff --git a/README.md b/README.md
index 1cd524d..c9c70de 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,61 @@
 # sudongpozhuan
+import requests
+from lxml import etree
+import csv
+import os
+
+start_url = "https://www.xingyueboke.com/sudongpozhuan/"
+headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"}
+
+def get_source(url=start_url):
+    """Fetch a page and return its HTML, or an empty string on failure."""
+    r = requests.get(url, headers=headers)
+    if r.status_code == 200:
+        return r.content.decode('utf-8')
+    print("Request failed:", url)
+    return ""
+
+def get_chapter_urls(start_source):
+    """Extract the chapter links from the table-of-contents page."""
+    selector = etree.HTML(start_source)
+    urls = selector.xpath('//*[@id="content-list"]/div[2]/ul/li/a/@href')
+    return urls
+
+def get_article(article_html):
+    """Extract the title and body text of one chapter page."""
+    selector = etree.HTML(article_html)
+    title = selector.xpath('//*[@id="nr_title"]/text()')
+    content = selector.xpath('string(//*[@id="nr1"]/div)')
+    return title, content
+
+def save(title, content):
+    """Write one chapter to its own text file."""
+    filename = "苏东坡传" + title[0] + ".txt"
+    # To collect the chapters in a separate directory instead:
+    # dirname = "苏东坡传"
+    # if not os.path.exists(dirname):
+    #     os.makedirs(dirname)
+    with open(filename, 'w', encoding='utf-8') as f:
+        f.write(content)
+
+def czd(urls, titles, contents):
+    """Write a CSV index of chapter URL, title and article length."""
+    data = []
+    for i in range(len(urls)):
+        data.append([urls[i], titles[i], contents[i]])
+    with open("苏东坡传.csv", 'w', encoding='utf-8', newline='') as f:
+        writer = csv.writer(f)
+        writer.writerow(["网站", "标题", "文章长度"])
+        writer.writerows(data)
+
+if __name__ == "__main__":
+    source = get_source()            # HTML of the table-of-contents page
+    urls = get_chapter_urls(source)  # chapter URLs
+    titles = []
+    contents = []
+    for url in urls:
+        article_html = get_source(url)
+        title, content = get_article(article_html)
+        titles.append(title[0] if title else "")
+        contents.append(len(content))  # article length for the CSV index
+        save(title, content)
+    czd(urls, titles, contents)