ADD file via upload

main
pqk5c82f3 8 months ago
parent 2137ca26b9
commit 184a725148

@ -0,0 +1,67 @@
import requests
from lxml import etree
import os
import csv
# Table-of-contents page of the book; this is the first page the script fetches.
url = "https://www.xingyueboke.com/sudongpozhuan/"

# HTTP headers passed to requests.get() by get_source() on every request.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/109.0.0.0 Safari/537.36 "
        "SLBrowser/9.0.3.1311 SLBChan/109"
    ),
}
def get_source(url, timeout=10):
    """Download *url* and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page text when the server answers with status 200. An empty
        string when the request fails or any other status is returned; a
        diagnostic message is printed in that case.
    """
    try:
        # Without a timeout requests may block forever on a stalled server.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures follow the same "return empty" contract
        # as a non-200 status so callers need only one check.
        print("请求失败,错误信息为{}".format(exc))
        return ""
    if response.status_code != 200:
        print("请求失败,状态码为{}".format(response.status_code))
        return ""
    # Decode as UTF-8 regardless of the charset requests would pick itself.
    response.encoding = "utf-8"
    return response.text
# Fetch the table-of-contents page; get_source() returns "" when the status is not 200.
source = get_source(url)
# Echo the raw HTML of the index page to stdout.
print(source)
def get_chapter_urls(start_source):
    """Extract the chapter links from the table-of-contents HTML.

    Args:
        start_source: HTML text of the book's index page.

    Returns:
        A list with the ``href`` value of every link found under the
        ``book-list clearfix`` container. An empty list when
        *start_source* is empty or contains only whitespace.
    """
    # get_source() yields "" on failure; there is nothing to parse then,
    # so bail out before handing an empty document to lxml.
    if not start_source or not start_source.strip():
        return []
    selector = etree.HTML(start_source)
    return list(
        selector.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href')
    )
# Pull the chapter links out of the index page HTML fetched above.
url1 = get_chapter_urls(source)
# Echo the collected chapter URLs to stdout.
print(url1)
def get_article(article_html):
    """Pull the heading and body text out of one chapter page.

    Args:
        article_html: HTML text of a chapter page.

    Returns:
        A ``(title, content)`` tuple: the first text node of an ``<h1>``
        and the string value of the ``div`` nested in ``div#nr1``.

    Raises:
        IndexError: If the page has no ``<h1>`` text.
    """
    document = etree.HTML(article_html)
    heading_texts = document.xpath('//h1/text()')
    chapter_title = heading_texts[0]
    # string(...) flattens the element and all its descendants into one str.
    chapter_body = document.xpath('string(//div[@id="nr1"]/div)')
    return chapter_title, chapter_body
def save(title, content):
    """Write one chapter to ``苏东坡传/<title>.txt`` as UTF-8 text.

    Args:
        title: Chapter title, used as the file name (without extension).
        content: Chapter text to store.
    """
    # Same output folder that saveCsv() uses, so chapters and the CSV
    # summary end up side by side.
    filename = os.path.join("苏东坡传", title + ".txt")
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    # "w" rather than append: re-running the crawl must not duplicate the
    # chapter text inside an existing file.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(content)
def saveCsv(articles_list):
    """Write the chapter summary rows to ``苏东坡传/苏东坡传.csv``.

    Args:
        articles_list: Iterable of rows; each row is a sequence of cell
            values written as one CSV line. Any existing file is replaced.
    """
    dirname = "苏东坡传"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dirname, exist_ok=True)
    filename = os.path.join(dirname, "苏东坡传.csv")
    # newline="" lets the csv module control line endings itself.
    with open(filename, "w", encoding="utf-8", newline="") as f:
        csv.writer(f).writerows(articles_list)
# Crawl every chapter: fetch it, parse it, store the text, and collect one
# summary row per chapter for the CSV written at the end.
articles_list = []
for chapter_url in url1:
    article_html = get_source(chapter_url)
    if not article_html:
        # Nothing was downloaded for this chapter; move on to the next one.
        continue
    try:
        title, content = get_article(article_html)
    except IndexError:
        # get_article() indexes the first <h1>; a page without one would
        # otherwise abort the crawl before the CSV summary is written.
        print("解析失败,跳过{}".format(chapter_url))
        continue
    print(title)
    print(content)
    save(title, content)
    # Row layout: chapter URL, chapter title, length of the chapter text.
    articles_list.append([chapter_url, title, len(content)])
saveCsv(articles_list)
Loading…
Cancel
Save