From 88a015d5347642dec3f08a19f42843fdfbba00fa Mon Sep 17 00:00:00 2001 From: ph275ue6c <2370007971@qq.com> Date: Wed, 17 Apr 2024 17:18:01 +0800 Subject: [PATCH] Delete '2.py' --- 2.py | 63 ------------------------------------------------------------ 1 file changed, 63 deletions(-) delete mode 100644 2.py diff --git a/2.py b/2.py deleted file mode 100644 index c49aa69..0000000 --- a/2.py +++ /dev/null @@ -1,63 +0,0 @@ -import requests -import os -import csv -from lxml import etree - -start_url="https://www.xingyueboke.com/sudongpozhuan/" -h={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"} - -def get_source(url=start_url): - response = requests.get(url,headers=h) - if response.status_code== 200: - return response.content.decode('utf-8') - else: - print("请求失败,状态码为{}".format(response.status_code)) - return "" - -source = get_source() - -def get_chapter_urls(start_source): - selector = etree.HTML(start_source) - urls=selector.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href') - right_urls=[] - for url in urls: - right_urls.append(url) - return right_urls - -urls = get_chapter_urls(source) - -def get_article(article_html): - selector = etree.HTML(article_html) - title = selector.xpath('//h1/text()')[0] - content = selector.xpath('string(//div[@id="nr1"]/div)') - return title,content - -def save(title,content): - filename = "苏东坡传/" + title + ".txt" - - if not os.path.exists("苏东坡传"): - os.makedirs("苏东坡传") - with open(filename, "w", encoding='utf-8') as f: - f.write(content) - -def saveCsv(list): - filename = "苏东坡传/苏东坡传.csv" - if not os.path.exists("苏东坡传"): - os.makedirs("苏东坡传") - with open(filename, "w", encoding="utf-8",newline='') as f: - w = csv.writer(f) - w.writerow(["网页地址", "标题", "正文长度"]) - w.writerows(list) -list =[] -for url in urls: - article_html = get_source(url) - title, content = get_article(article_html) - print(title) - list.append([url,title,len(content)]) - save(title, content) - saveCsv(list) - -# if __name__=='__main__': -# source = get_source() -# urls=get_chapter_urls(source) -