You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
77 lines
2.3 KiB
77 lines
2.3 KiB
7 months ago
|
import requests
|
||
|
from lxml import etree
|
||
|
import os
|
||
|
import csv
|
||
|
# Index page of the book 苏东坡传 on xingyueboke.com; each chapter is linked from here.
url = "https://www.xingyueboke.com/sudongpozhuan/"

# Desktop Edge User-Agent so the site serves the normal browser page
# instead of blocking the default python-requests UA.
headers={"User-Agent":
         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
         "Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"}
|
def get_source(url):
    """Fetch *url* with the spoofed browser User-Agent and return its HTML text.

    Returns the page text decoded as UTF-8 on HTTP 200, or "" on any other
    status code or on a network error, so callers can simply truth-test
    the result.
    """
    try:
        # timeout added: the original call could hang forever on a stalled
        # connection; a request exception on one chapter no longer aborts
        # the whole crawl.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print("请求失败: {}".format(exc))
        return ""
    if response.status_code == 200:
        # Force UTF-8: the site serves UTF-8 but may not declare it in headers.
        response.encoding = "utf-8"
        return response.text
    # return response.content.decode('utf8')
    print("请求失败,状态码为{}".format(response.status_code))
    return ""
|
def get_page_source(url):
    """Fetch *url* and return its HTML text, or None on failure.

    NOTE(review): unlike get_source() this sends no User-Agent header and
    signals failure with None rather than "" — callers must handle both;
    consider unifying the two helpers.
    """
    try:
        # timeout added so a stalled connection cannot hang the script.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        response.encoding = "utf-8"
        return response.text
    return None
|
|
||
|
chapter_url = "https://www.xingyueboke.com/sudongpozhuan/85210.html"
|
||
|
chapter_source = get_page_source(chapter_url)
|
||
|
print(chapter_source)
|
||
|
source = get_source(url)
|
||
|
print(source)
|
||
|
|
||
|
def get_chapter_urls(start_source):
    """Extract the chapter-page URLs from the book's index-page HTML.

    Parameters:
        start_source: HTML text of the index page; "" or None yields [].

    Returns:
        list[str] of href values from the chapter list, in page order.
    """
    if not start_source:
        # get_source() returns "" on failure; lxml's etree.HTML("") would
        # return None and the xpath call below would raise AttributeError.
        return []
    selector = etree.HTML(start_source)
    if selector is None:
        return []
    # The original copied the xpath result element-by-element into a new
    # list; returning a list() copy is equivalent.
    return list(selector.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href'))
|
|
||
|
url1 = get_chapter_urls(source)
|
||
|
# print(url1)
|
||
|
|
||
|
def get_article(article_html):
    """Extract (title, content) from one chapter page's HTML.

    Parameters:
        article_html: HTML text of a chapter page; "" or None yields ("", "").

    Returns:
        (title, content): title is the first <h1>'s text ("" if absent);
        content is the concatenated text of <div id="nr1">'s inner <div>.
    """
    if not article_html:
        return "", ""
    selectors = etree.HTML(article_html)
    if selectors is None:
        return "", ""
    titles = selectors.xpath('//h1/text()')
    # The original indexed [0] unconditionally and raised IndexError on
    # pages without an <h1>.
    title = titles[0] if titles else ""
    content = selectors.xpath('string(//div[@id="nr1"]/div)')
    return title, content
||
|
def save(title, content):
    """Write one chapter's text to 苏东坡传/<title>.txt (UTF-8).

    Overwrites any existing file for the chapter: the original opened the
    file in "a+" append mode, which duplicated the whole text on every
    re-run of the script.
    """
    path = "苏东坡传/" + title + ".txt"
    directory = os.path.dirname(path)
    if directory:
        # exist_ok avoids the exists()/makedirs() race of the original.
        os.makedirs(directory, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)
||
|
def saveCsv(articles_list):
    """Write the crawl summary (one row per chapter) to 苏东坡传/苏东坡传.csv.

    Each element of *articles_list* is written as one CSV row; the file is
    overwritten on every call.
    """
    out_path = "苏东坡传/苏东坡传.csv"
    if not os.path.exists("苏东坡传"):
        os.makedirs("苏东坡传")
    # newline='' is required so csv.writer controls the line endings itself.
    with open(out_path, "w", encoding="utf-8", newline='') as csv_file:
        csv.writer(csv_file).writerows(articles_list)
|
# Summary rows for the CSV report: [chapter_url, title, content length].
articles_list =[]

# Fetch each chapter, extract its title/body, save the text file, and
# record a summary row.
# NOTE(review): the loop variable shadows the module-level `chapter_url`
# defined earlier in the file.
for chapter_url in url1:
    article_html = get_source(chapter_url)
    if article_html:
        title, content = get_article(article_html)
        print(title)
        # print(content)
        save(title, content)
        articles_list.append([chapter_url, title, len(content)])

# Write the summary once after the crawl finishes.
# NOTE(review): original indentation was lost in this copy — presumed to be
# outside the loop; confirm against the upstream file.
saveCsv(articles_list)