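# Scraper for the online book 苏东坡传 (Biography of Su Dongpo) on xingyueboke.com:
# fetches the table-of-contents page, downloads every chapter, writes each chapter
# to its own .txt file, and records URL/title/length rows in a CSV summary.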
import requests
import csv
from lxml import etree

start_url = 'https://www.xingyueboke.com/sudongpozhuan/'

h = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
}

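# Fetch a page and return its HTML decoded as UTF-8; defaults to the book's
# table-of-contents URL. Returns an empty string on any non-200 response.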
def get_source(url=start_url):
    response = requests.get(url, headers=h)
    if response.status_code == 200:
        return response.content.decode('utf-8')
    else:
        print("Request failed with status code {}".format(response.status_code))
        return ""

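# Build the list of chapter URLs. The XPath query pulls every link from the
# table of contents; the hard-coded ID range below (85209-85237) is what the
# script actually uses, presumably to skip non-chapter links on the page.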
def get_chapter_urls(start_source):
    selector = etree.HTML(start_source)
    # Links scraped from the table of contents (kept for reference, not used below).
    urls = selector.xpath('//*[@id="content-list"]/div[2]/ul/li/a/@href')
    right_urls = []
    for i in range(85209, 85238):
        url = 'https://www.xingyueboke.com/sudongpozhuan/' + str(i) + '.html'
        right_urls.append(url)
    return right_urls

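# Parse one chapter page: extract the chapter title and the full chapter text.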
def get_article(article_html):
    selector = etree.HTML(article_html)
    title = selector.xpath('//*[@id="nr_title"]/text()')
    # xpath() returns a list; take the first match (or "" if nothing matched).
    title = title[0].strip() if title else ""
    content = selector.xpath('string(//*[@id="nr1"]/div)')
    return title, content

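# Append a chapter's text to a per-chapter .txt file named after the book
# and the chapter title.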
def save(title, content):
    filename = "苏东坡传" + title + ".txt"
    with open(filename, 'a+', encoding='utf-8') as f:
        f.write(content)

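# Write a summary CSV with one row per chapter: URL, title, and content length.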
def save_to_csv(urls, titles, contents):
    data = []
    for i in range(len(urls)):  # range(len(urls) - 1) would drop the last chapter
        data.append([urls[i], titles[i], contents[i]])
    # 'ANSI' is not a valid Python codec name; utf-8-sig keeps the Chinese text
    # readable when the file is opened in Excel.
    with open("苏东坡传.csv", 'a', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["URL", "Title", "Content length"])
        writer.writerows(data)

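# Crawl everything: fetch the table of contents, then each chapter; save the
# chapter text files and the CSV summary.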
if __name__ == "__main__":
    source = get_source()
    # print(source)
    urls = get_chapter_urls(source)
    titles = []
    contents = []
    for url in urls:
        article_html = get_source(url)
        title, content = get_article(article_html)
        titles.append(title)
        contents.append(len(content))
        save(title, content)
    save_to_csv(urls, titles, contents)