"""Download the chapters of "苏东坡传" from xingyueboke.com.

Every chapter that is fetched successfully is appended to its own text file,
and one CSV row (URL, title, content length) is recorded per chapter.
"""
import csv
import os

import requests
from lxml import etree

start_url = 'https://www.xingyueboke.com/sudongpozhuan/'
h = {
    'User-Agent':
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
}

# Numeric ids of the first and last chapter pages (both inclusive).
FIRST_CHAPTER_ID = 85209
LAST_CHAPTER_ID = 85237

# Characters that cannot appear in a file name on Windows (and '/' anywhere).
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})


def get_source(url=start_url, timeout=10):
    """Fetch *url* and return its body decoded as UTF-8.

    Args:
        url: Page to download; defaults to the book's index page.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole run.

    Returns:
        The decoded page text, or "" when the request fails or the server
        answers with a status other than 200 (a message is printed).
    """
    try:
        response = requests.get(url, headers=h, timeout=timeout)
    except requests.RequestException as exc:
        print("请求失败,错误为{}".format(exc))
        return ""
    if response.status_code == 200:
        return response.content.decode('utf-8')
    print("请求失败,状态码为{}".format(response.status_code))
    return ""


def get_chapter_urls(start_source):
    """Return the chapter page URLs in reading order.

    The URLs are built from the fixed id range FIRST_CHAPTER_ID ..
    LAST_CHAPTER_ID rather than scraped from the index page.

    Args:
        start_source: HTML of the index page. Not used; kept so existing
            callers that pass it keep working.

    Returns:
        A list of absolute chapter URLs.
    """
    return [
        '{}{}.html'.format(start_url, chapter_id)
        for chapter_id in range(FIRST_CHAPTER_ID, LAST_CHAPTER_ID + 1)
    ]


def get_article(article_html):
    """Extract the title and body text from a chapter page.

    Args:
        article_html: HTML source of one chapter page.

    Returns:
        A ``(title, content)`` tuple: *title* is the list of text nodes found
        under the element with id ``nr_title`` (empty when it is missing) and
        *content* is the string value of the ``div`` inside ``nr1``.
    """
    selector = etree.HTML(article_html)
    title = selector.xpath('//*[@id="nr_title"]/text()')
    content = selector.xpath('string(//*[@id="nr1"]/div)')
    return title, content


def _title_text(title):
    """Return *title* as a stripped string; accepts a str or an xpath list."""
    if isinstance(title, str):
        return title.strip()
    return str(title[0]).strip() if title else ""


def save(title, content):
    """Append *content* to the text file named after the chapter title.

    Args:
        title: Chapter title, either a string or the list returned by
            ``get_article`` (its first item is used).
        content: Chapter text to write.
    """
    # Titles come from a web page, so strip anything a file system rejects.
    safe_title = _title_text(title).translate(_UNSAFE_FILENAME_CHARS)
    filename = "苏东坡传" + safe_title + ".txt"
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(content)


def czd(urls, titles, contents):
    """Append one CSV row per chapter to "苏东坡传.csv".

    Args:
        urls: Chapter URLs.
        titles: Chapter titles (strings or xpath result lists), parallel to
            *urls*.
        contents: Values for the "文章长度" column, parallel to *urls*.

    The three sequences are combined with ``zip``, so every complete triple is
    written. The header row is only written when the file is new or empty.
    """
    path = "苏东坡传.csv"
    rows = [
        [url, _title_text(title), length]
        for url, title, length in zip(urls, titles, contents)
    ]
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    # utf-8-sig works on every platform and is recognised by Excel, whereas
    # the "ANSI" codec only exists on Windows.
    with open(path, 'a', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(["网站", "标题", "文章长度"])
        writer.writerows(rows)


def main():
    """Download every chapter, save its text, and write the CSV summary."""
    source = get_source()
    urls = get_chapter_urls(source)
    fetched_urls = []
    titles = []
    lengths = []
    for url in urls:
        article_html = get_source(url)
        if not article_html:
            # get_source already reported the failure; skip this chapter so
            # the three lists stay aligned.
            continue
        title, content = get_article(article_html)
        title_text = _title_text(title)
        if not title_text:
            print("未找到标题,跳过{}".format(url))
            continue
        fetched_urls.append(url)
        titles.append(title_text)
        lengths.append(len(content))
        save(title_text, content)
    czd(fetched_urls, titles, lengths)


if __name__ == "__main__":
    main()