"""Scrape the chapters of "苏东坡传" from xingyueboke.com.

The chapter index page is fetched first; every chapter it links to is then
downloaded, saved as a UTF-8 text file under ``苏东坡传/`` and summarised
(URL, title, body length) in ``苏东坡传.csv``.
"""
import csv
import os
import re
from urllib.parse import urljoin

import requests
from lxml import etree

start_url = "https://www.xingyueboke.com/sudongpozhuan/"
h = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
    ),
}

# Seconds to wait for the server before giving up; without a timeout
# requests.get() may block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

# Directory that receives one text file per chapter.
OUTPUT_DIR = "苏东坡传"

# Characters that are not allowed in file names on common file systems.
_INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def get_source(url=start_url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Returns an empty string, after printing a diagnostic, when the request
    raises a ``requests`` exception or the status code is not 200.
    """
    try:
        response = requests.get(url, headers=h, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print("请求失败,错误为{}".format(exc))
        return ""
    if response.status_code != 200:
        print("请求失败,状态码为{}".format(response.status_code))
        return ""
    response.encoding = "utf-8"
    return response.text


def _parse_html(html):
    """Return the parsed tree for *html*, or ``None`` if there is none.

    ``get_source`` hands back ``""`` on failure, so blank input is treated
    as "no document" instead of being passed to the parser.
    """
    if not html or not html.strip():
        return None
    try:
        return etree.HTML(html)
    except (etree.ParserError, etree.XMLSyntaxError):
        return None


def get_chapter_urls(start_source):
    """Return the chapter link ``href`` values found in the index page HTML.

    An empty list is returned when *start_source* is empty or unparsable.
    """
    selector = _parse_html(start_source)
    if selector is None:
        return []
    return selector.xpath('//*[@id="content-list"]/div[2]/ul/li/a/@href')


def get_article(acticle_html):
    """Extract ``(title, content)`` from a chapter page.

    The title is the text of the first ``<h1>`` with surrounding whitespace
    removed; the content is the string value of ``//div[@id="nr1"]/div``.
    Either value is ``""`` when the corresponding element is missing, and
    both are ``""`` when *acticle_html* is empty or unparsable.
    """
    selector = _parse_html(acticle_html)
    if selector is None:
        return "", ""
    headings = selector.xpath('//h1/text()')
    title = headings[0].strip() if headings else ""
    content = selector.xpath('string(//div[@id="nr1"]/div)')
    return title, content


def save(title, content):
    """Write *content* to ``苏东坡传/<title>.txt`` encoded as UTF-8.

    The output directory is created on demand. Characters that cannot
    appear in a file name are replaced with ``_``. The file is overwritten
    so that running the scraper again does not duplicate the chapter text.
    """
    safe_title = _INVALID_FILENAME_CHARS.sub("_", title).strip()
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    filename = os.path.join(OUTPUT_DIR, safe_title + ".txt")
    with open(filename, "w", encoding="utf-8") as f:
        f.write(content)


def save_to_csv(chapter_data):
    """Write the chapter summary rows to ``苏东坡传.csv``.

    *chapter_data* is an iterable of ``[url, title, content_length]`` rows;
    a header row is written before them.
    """
    with open("苏东坡传.csv", "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["网页地址", "标题", "正文长度"])
        writer.writerows(chapter_data)


def main():
    """Download every chapter, save it, then write the CSV summary."""
    source = get_source(start_url)
    chapter_data = []
    for href in get_chapter_urls(source):
        # Resolve relative links against the index page; absolute links
        # pass through urljoin unchanged.
        url = urljoin(start_url, href)
        article_html = get_source(url)
        title, content = get_article(article_html)
        print(title)
        print(content)
        # Skip chapters whose download or extraction produced nothing.
        if title and content:
            save(title, content)
            chapter_data.append([url, title, len(content)])
    save_to_csv(chapter_data)


if __name__ == '__main__':
    main()