"""Download every chapter linked from the start page and save the text.

The script fetches the index page at ``start_url``, collects the chapter
links, downloads each chapter, and appends its title and paragraphs to
``OUTPUT_PATH``.
"""
from urllib.parse import urljoin

import requests
from lxml import etree

start_url = "https://www.xingyueboke.com/sudongpozhuan/"
head = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
    )
}

# File that receives the text of all chapters.
OUTPUT_PATH = "dufu.text"
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10


def get_source(start_url, head):
    """Fetch a page and return its body decoded as UTF-8.

    Args:
        start_url: URL of the page to download.
        head: Dictionary of HTTP headers sent with the request.

    Returns:
        The response body as a string.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
    """
    response = requests.get(start_url, headers=head, timeout=REQUEST_TIMEOUT)
    # Fail loudly instead of parsing (and saving) an error page.
    response.raise_for_status()
    response.encoding = "utf-8"
    return response.text


def get_chapterurl(start_source):
    """Extract the chapter link targets from the index page HTML.

    Args:
        start_source: HTML text of the index page.

    Returns:
        A list with the ``href`` value of every link found under
        ``div.book-list.clearfix > ul > li``.
    """
    tree = etree.HTML(start_source)
    return tree.xpath('//div[@class="book-list clearfix"]/ul/li/a/@href')


def get_article(article_html):
    """Extract the title and paragraph texts from a chapter page.

    Args:
        article_html: HTML text of a chapter page.

    Returns:
        A ``(title, content)`` tuple. Both items are lists of strings as
        returned by the XPath queries: the text of ``h1.post-title`` and
        the text of the ``<p>`` elements inside ``div#nr1``.
    """
    tree = etree.HTML(article_html)
    title = tree.xpath('//h1[@class="post-title"]/text()')
    content = tree.xpath('//div[@id="nr1"]/div/p/text()')
    return title, content


def save(title, content, path=OUTPUT_PATH):
    """Append one chapter to the output file.

    The file is opened in append mode so that consecutive calls accumulate
    chapters instead of each call overwriting the previous one.

    Args:
        title: Chapter title, either a string or a list of string pieces.
        content: Iterable of text fragments making up the chapter body.
        path: Destination file; defaults to ``OUTPUT_PATH``.
    """
    with open(path, "a", encoding="utf-8") as f:
        # ``"".join`` accepts both a list of pieces and a plain string.
        f.write("".join(title) + "\n")
        # Fragments are written back to back, without added separators.
        f.writelines(content)
        # Keep the next chapter's title on its own line.
        f.write("\n")


def main():
    """Crawl all chapters and write them to ``OUTPUT_PATH``."""
    # Start from an empty file so a re-run does not pile onto old output.
    with open(OUTPUT_PATH, "w", encoding="utf-8"):
        pass

    chapter_links = get_chapterurl(get_source(start_url, head))
    for href in chapter_links:
        # Resolve relative links; absolute links pass through unchanged.
        chapter_url = urljoin(start_url, href)
        try:
            article_html = get_source(chapter_url, head)
        except requests.RequestException as exc:
            # One unreachable chapter should not abort the whole crawl.
            print(f"skipping {chapter_url}: {exc}")
            continue
        title, content = get_article(article_html)
        print(title)
        print(content)
        save(title, content)


if __name__ == "__main__":
    main()