ADD file via upload

2 years ago · 184a725148
parent 2137ca26b9
commit 184a725148
1 changed files with 67 additions and 0 deletions
--- a/xiaozu.py
+++ b/xiaozu.py
@ -0,0 +1,67 @@
 import requests
 from lxml import etree
 import os
 import csv
 # Start page handed to get_source(); its HTML is what get_chapter_urls() parses.
 url = "https://www.xingyueboke.com/sudongpozhuan/"
 # Extra HTTP headers sent with every requests.get() call in get_source():
 # only a desktop-browser user-agent string is set.
 headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 SLBrowser/9.0.3.1311 SLBChan/109"}
 def get_source(url):
     """Fetch *url* and return the response body decoded as UTF-8.

     Args:
         url: Address of the page to download.

     Returns:
         The page text on an HTTP 200 response. An empty string when the
         request cannot be completed (connection error, timeout, ...) or
         the server answers with any other status code, so callers can
         simply test the result for truthiness.
     """
     try:
         # Without a timeout a stalled server would block the crawl forever.
         response = requests.get(url, headers=headers, timeout=10)
     except requests.RequestException as exc:
         # Transport-level failures follow the same "return empty" contract
         # as a bad status code instead of aborting the whole script.
         print("请求失败：{}".format(exc))
         return ""
     if response.status_code != 200:
         print("请求失败，状态码为{}".format(response.status_code))
         return ""
     # Force UTF-8 rather than relying on the encoding guessed by requests.
     response.encoding = "utf-8"
     return response.text
 # Download the start page when the script runs; get_source() yields ""
 # if the request did not return HTTP 200.
 source = get_source(url)
 # Echo the raw HTML to stdout.
 print(source)
 def get_chapter_urls(start_source):
     """Extract the chapter link targets from the start page HTML.

     Args:
         start_source: HTML text of the start page. May be empty, which is
             what get_source() returns when the download failed.

     Returns:
         A list with the ``href`` value of every ``<a>`` found under
         ``div.book-list.clearfix > ul > li``, in document order. An empty
         list when there is no HTML to parse.
     """
     # A failed download arrives here as ""; there is nothing to parse.
     if not start_source:
         return []
     selector = etree.HTML(start_source)
     # NOTE(review): some lxml versions appear to return None instead of
     # raising for content-free input -- treat that as "no chapters" too.
     if selector is None:
         return []
     return list(selector.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href'))
 # href values of the chapter links found on the start page; the download
 # loop further down iterates over this list.
 url1 = get_chapter_urls(source)
 print(url1)
 def get_article(article_html):
     """Pull the heading and body text out of one chapter page.

     Args:
         article_html: HTML text of a chapter page.

     Returns:
         A ``(title, content)`` tuple: the first text node of any ``<h1>``
         and the string value of the first ``<div>`` child of the element
         with ``id="nr1"``.

     Raises:
         IndexError: If the page has no ``<h1>`` text node.
     """
     tree = etree.HTML(article_html)
     h1_texts = tree.xpath('//h1/text()')
     chapter_title = h1_texts[0]
     chapter_body = tree.xpath('string(//div[@id="nr1"]/div)')
     return (chapter_title, chapter_body)
 def save(title,content):
     """Append one chapter's text to ``苏东坡传/<title>.txt``.

     The output directory is created on first use. The file is opened in
     append mode, so calling this twice with the same title adds the
     content a second time.

     Args:
         title: Chapter title, used verbatim as the file name stem.
         content: Chapter text to write.
     """
     # The directory was previously misspelled "苏坡东传", which scattered
     # the chapter files into a different folder than the CSV summary.
     filename = "苏东坡传/" + title + ".txt"
     # exist_ok makes the call safe whether or not the folder is there yet.
     os.makedirs(os.path.dirname(filename), exist_ok=True)
     with open(filename, "a+", encoding="utf-8") as f:
         f.write(content)


 def saveCsv(articles_list):
     """Write the chapter summary rows to ``苏东坡传/苏东坡传.csv``.

     The file is rewritten from scratch on every call.

     Args:
         articles_list: Iterable of rows; each row is an iterable of cell
             values passed to ``csv.writer``.
     """
     filename = "苏东坡传/苏东坡传.csv"
     # Only the directory creation is conditional. The write itself used to
     # sit inside the "directory missing" branch, so no CSV was produced
     # whenever the folder already existed.
     os.makedirs(os.path.dirname(filename), exist_ok=True)
     with open(filename, "w", encoding="utf-8", newline="") as f:
         csv.writer(f).writerows(articles_list)
 # Download every chapter, store its text, and collect one summary row
 # (url, title, character count) per chapter for the CSV.
 articles_list = []
 for chapter_url in url1:
     article_html = get_source(chapter_url)
     # get_source() returns "" on failure; skip chapters that did not load.
     if not article_html:
         continue
     title, content = get_article(article_html)
     print(title, content, sep="\n")
     save(title, content)
     articles_list.append([chapter_url, title, len(content)])
 saveCsv(articles_list)