"""Novel crawler for https://www.bqgui.cc with a small tkinter front end.

Recovered from git patch f483a92a3cca ("ADD file via upload", 2024-06-03);
original file name: 21412030511_王文杰_计科2105_小说爬虫.py
"""
import concurrent.futures
import os
import re
import tkinter as tk

import requests
from lxml import etree
from lxml import html

# Last URL confirmed in the GUI entry (written by window().func).
x = ''

BASE_URL = 'https://www.bqgui.cc'
URL_FILE = 'wz.txt'
REQUEST_TIMEOUT = 10  # seconds

# NOTE(review): the HTML tags inside the three patterns below were stripped
# when this file was extracted (only "(.*?)" survived). They are
# reconstructed from what the surrounding code expects -- one heading per
# page and a list of site-relative chapter hrefs -- and must be confirmed
# against the live page markup.
BOOK_NAME_PATTERN = r'<h1[^>]*>(.*?)</h1>'
CHAPTER_URL_PATTERN = r'<dd>\s*<a\s+href\s*=\s*"(.*?)"'
CHAPTER_TITLE_PATTERN = r'<h1[^>]*>(.*?)</h1>'


def get_response(html_url, timeout=REQUEST_TIMEOUT):
    """Send a GET request to *html_url* and return the response object.

    A desktop browser User-Agent is sent with the request. *timeout* (in
    seconds) keeps a stalled connection from blocking a worker forever.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0'
    }
    return requests.get(url=html_url, headers=headers, timeout=timeout)


def _first_match(pattern, text, what):
    """Return the first capture of *pattern* in *text* or raise ValueError."""
    match = re.search(pattern, text)
    if match is None:
        raise ValueError(f'{what} not found in page')
    return match.group(1)


def get_list_url(html_url):
    """Return ``(book_name, chapter_urls)`` parsed from a book index page.

    Raises:
        ValueError: if the book name cannot be found in the page.
    """
    html_data = get_response(html_url).text
    name = _first_match(BOOK_NAME_PATTERN, html_data, 'book name')
    url_list = re.findall(CHAPTER_URL_PATTERN, html_data)
    return name, url_list


def get_content(html_url):
    """Return ``(title, content)`` for a single chapter page.

    The content is every text node directly under the element with
    ``id="chaptercontent"``, joined with newlines.

    Raises:
        ValueError: if the chapter title cannot be found in the page.
    """
    html_data = get_response(html_url).text
    title = _first_match(CHAPTER_TITLE_PATTERN, html_data, 'chapter title')
    selector = etree.HTML(html_data)
    content = '\n'.join(selector.xpath('//*[@id="chaptercontent"]/text()'))
    return title, content


def save(name, title, content):
    """Append one chapter to ``<name>/<name>.txt``, creating the folder."""
    # exist_ok avoids the exists()/mkdir() race when called from several
    # threads; os.path.join replaces the hard-coded Windows separator.
    os.makedirs(name, exist_ok=True)
    path = os.path.join(name, name + '.txt')
    with open(path, mode='a', encoding='utf-8') as f:
        # One write per chapter so a chapter is never split by another writer.
        f.write(f'{title}\n{content}\n')


def main(home_url, name):
    """Download the chapter at *home_url* and append it to book *name*."""
    title, content = get_content(html_url=home_url)
    save(name, title, content)


def queren():
    """Download every chapter of the book whose URL is stored in wz.txt.

    Chapters are fetched concurrently but written in the order they appear
    in the chapter list. A chapter that fails to download is reported and
    skipped so the rest of the book is still saved.
    """
    try:
        with open(URL_FILE, 'r', encoding='utf8') as f:
            book_url = f.read().strip()
    except FileNotFoundError:
        book_url = ''
    if not book_url:
        print('请先输入网址并点击确认')
        return
    print(book_url)

    name, url_list = get_list_url(book_url)
    with concurrent.futures.ThreadPoolExecutor(max_workers=7) as exe:
        futures = [exe.submit(get_content, BASE_URL + url) for url in url_list]
        # Collect results in submission order, not completion order, so the
        # output file follows the book's chapter sequence.
        for chapter_url, future in zip(url_list, futures):
            try:
                title, content = future.result()
            except Exception as err:  # best-effort: report and keep going
                print(f'{chapter_url}: {err}')
                continue
            save(name, title, content)


def window():
    """Show the download window: a URL entry plus confirm/download buttons."""
    root = tk.Tk()
    root.title("小说爬虫")
    root.geometry("500x500+300+300")

    tk.Label(root, text="请在https://www.bqgui.cc中找到想下载的书并在下方输入其网址").pack()

    e1 = tk.Entry(root, width=60)
    e1.pack()

    def func():
        """Persist the entered URL to wz.txt and clear the entry."""
        global x
        x = e1.get()
        with open(URL_FILE, 'w', encoding='utf8') as f:
            f.write(x)
        # tk.END clears the whole entry; a fixed index of 50 left the tail
        # of longer URLs behind.
        e1.delete(0, tk.END)

    tk.Button(root, text="确认", command=func).pack()
    tk.Button(root, text="下载", command=queren).pack()
    root.mainloop()


def huanyin():
    """Show the welcome window, then open the download window on request."""
    root = tk.Tk()
    root.title("小说爬虫")
    root.geometry("500x500+300+300")
    for text in ("欢迎使用小说爬虫", "班级:计科2105", "姓名:王文杰", "学号:21412030511"):
        tk.Label(root, text=text).pack()

    proceed = False

    def fun2():
        """Close the welcome window and remember to continue."""
        nonlocal proceed
        proceed = True
        root.destroy()

    tk.Button(root, text="下一步", command=fun2).pack()
    root.mainloop()

    # Open the download window only after the welcome window is destroyed
    # and its mainloop has returned, so only one Tk root and one event loop
    # exist at a time.
    if proceed:
        window()


if __name__ == '__main__':
    huanyin()