parent
cefbd68281
commit
88a015d534
@ -1,63 +0,0 @@
|
|||||||
import requests
|
|
||||||
import os
|
|
||||||
import csv
|
|
||||||
from lxml import etree
|
|
||||||
|
|
||||||
# Index page of the book; used as the default URL of get_source().
start_url = "https://www.xingyueboke.com/sudongpozhuan/"

# HTTP headers passed to requests.get() for every fetch.
h = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
    ),
}
|
|
||||||
|
|
||||||
def get_source(url=start_url):
    """Download a page and return its body decoded as UTF-8.

    Args:
        url: Address to fetch; defaults to ``start_url``.

    Returns:
        The decoded page text, or ``""`` when the request raises a
        ``requests`` error or the server answers with a status other
        than 200. The failure is reported on stdout in both cases.
    """
    try:
        # Without a timeout a stalled connection would block the script forever.
        response = requests.get(url, headers=h, timeout=30)
    except requests.RequestException as exc:
        # Follow the same "print and return empty" convention as a bad status.
        print("请求失败,异常为{}".format(exc))
        return ""
    if response.status_code != 200:
        print("请求失败,状态码为{}".format(response.status_code))
        return ""
    return response.content.decode('utf-8')
|
|
||||||
|
|
||||||
# Fetch the book's index page (get_source defaults to start_url).
source = get_source()
|
|
||||||
|
|
||||||
def get_chapter_urls(start_source):
    """Extract the chapter links from the index page HTML.

    Args:
        start_source: HTML text of the index page, as returned by
            ``get_source``.

    Returns:
        A list with the ``href`` of every ``<a>`` matched under the
        ``book-list clearfix`` div; empty when ``start_source`` is empty.
    """
    # get_source returns "" on failure; there is no document to parse then.
    if not start_source:
        return []
    selector = etree.HTML(start_source)
    # The xpath call already yields a fresh list, so no copy loop is needed.
    return selector.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href')
|
|
||||||
|
|
||||||
# Collect the chapter URLs listed on the index page.
urls = get_chapter_urls(source)
|
|
||||||
|
|
||||||
def get_article(article_html):
    """Pull the title and body text out of one chapter page.

    Args:
        article_html: HTML text of a chapter page.

    Returns:
        A ``(title, content)`` tuple: the first ``//h1/text()`` match and
        the XPath string value of ``//div[@id="nr1"]/div``.

    Raises:
        IndexError: If the page yields no ``<h1>`` text.
    """
    tree = etree.HTML(article_html)
    heading_texts = tree.xpath('//h1/text()')
    body_text = tree.xpath('string(//div[@id="nr1"]/div)')
    return heading_texts[0], body_text
|
|
||||||
|
|
||||||
def save(title, content):
    """Write one chapter to ``苏东坡传/<title>.txt`` as UTF-8 text.

    Args:
        title: Chapter title, used as the file name. Path separators and
            characters not allowed in Windows file names are replaced
            with ``_`` so the title cannot point outside the output
            directory or make ``open`` fail.
        content: Chapter text to write.
    """
    out_dir = "苏东坡传"
    unsafe_chars = '\\/:*?"<>|'
    safe_title = title.translate(str.maketrans({ch: "_" for ch in unsafe_chars}))

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, safe_title + ".txt"), "w", encoding='utf-8') as f:
        f.write(content)
|
|
||||||
|
|
||||||
def saveCsv(list):
    """Write the per-chapter summary table to ``苏东坡传/苏东坡传.csv``.

    The file is overwritten on every call and starts with a header row.

    Args:
        list: Iterable of rows to write after the header. The name
            shadows the builtin but is kept so existing callers that
            pass it by keyword keep working.
    """
    out_dir = "苏东坡传"
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(out_dir, exist_ok=True)
    # newline='' leaves line endings to the csv writer.
    with open(os.path.join(out_dir, "苏东坡传.csv"), "w", encoding="utf-8", newline='') as f:
        w = csv.writer(f)
        w.writerow(["网页地址", "标题", "正文长度"])
        w.writerows(list)
|
|
||||||
# Download every chapter, save it as a text file, and collect one summary
# row (url, title, body length) per chapter for the CSV.
rows = []  # previously named `list`, which shadowed the builtin module-wide
for url in urls:
    article_html = get_source(url)
    if not article_html:
        # get_source already reported the failure and returned "";
        # there is nothing to parse, so skip this chapter.
        continue
    title, content = get_article(article_html)
    print(title)
    rows.append([url, title, len(content)])
    save(title, content)
# Write the summary once, after all chapters have been processed.
saveCsv(rows)
|
|
||||||
|
|
||||||
# if __name__=='__main__':
|
|
||||||
# source = get_source()
|
|
||||||
# urls=get_chapter_urls(source)
|
|
||||||
|
|
Loading…
Reference in new issue