From f483a92a3cca4c6cc93dbf6a89c8d52497a346a0 Mon Sep 17 00:00:00 2001
From: prxecj98k <2906422449@qq.com>
Date: Mon, 3 Jun 2024 23:03:38 +0800
Subject: [PATCH] ADD file via upload

---
 ...30511_王文杰_计科2105_小说爬虫.py | 106 ++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 21412030511_王文杰_计科2105_小说爬虫.py
diff --git a/21412030511_王文杰_计科2105_小说爬虫.py b/21412030511_王文杰_计科2105_小说爬虫.py
new file mode 100644
index 0000000..72f5b5d
--- /dev/null
+++ b/21412030511_王文杰_计科2105_小说爬虫.py
@@ -0,0 +1,106 @@
+import requests
+from lxml import html
+from lxml import etree
+import re
+import os
+import tkinter as tk
+import concurrent.futures
+x = ''  # most recently confirmed URL; assigned by func() inside window()
+def get_response(html_url):
+    """Send a GET request to html_url with a desktop-browser User-Agent and return the response.
+
+    Raises requests.exceptions.Timeout if connecting or waiting for data takes over 10 seconds.
+    """
+    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0'}
+    return requests.get(url=html_url,headers = headers,timeout=10)
+def get_list_url(html_url):
+    """Return (book name, chapter hrefs) scraped from the book index page at html_url.
+
+    The name is the first <h1> text (IndexError if the page has none); the hrefs are
+    collected from every '<dd><a href ="...">' entry, in page order.
+    """
+    page = get_response(html_url).text
+    return re.findall('<h1>(.*?)</h1>',page)[0], re.findall('<dd><a href ="(.*?)">',page)
+def get_content(html_url):
+    """Return (chapter title, chapter text) parsed from the chapter page at html_url."""
+    page = get_response(html_url).text
+    heading = re.findall('<h1 class="wap_none">(.*?)</h1>',page)[0]
+    text_nodes = etree.HTML(page).xpath('//*[@id="chaptercontent"]/text()')
+    return heading, '\n'.join(text_nodes)
+def save(name,title,content):
+    """Append one chapter (title line, then content) to <name>/<name>.txt, UTF-8 encoded.
+
+    The book folder is created on demand, relative to the current working directory.
+    """
+    # exist_ok avoids FileExistsError when several threads try to create the folder at once;
+    # os.path.join replaces the hard-coded backslash so the file lands in the folder on any OS.
+    os.makedirs(name,exist_ok=True)
+    with open(os.path.join(name,name + '.txt'), mode='a', encoding='utf-8') as f:
+        f.write(f'{title}\n{content}\n')
+def main(home_url,name):
+    """Download the chapter at home_url and append it to the book called name."""
+    # Fetch first, then persist; nothing is written if the download or parsing raises.
+    chapter_title,chapter_text = get_content(html_url=home_url)
+    save(name,chapter_title,chapter_text)
+def queren():
+    """Download the book whose index URL is stored in wz.txt: fetch on 7 threads, save in order."""
+    with open("wz.txt",'r',encoding='utf8') as f:
+        url = f.read().strip()
+    print(url)
+    name,url_list = get_list_url(url)
+    with concurrent.futures.ThreadPoolExecutor(max_workers=7) as exe:
+        jobs = [exe.submit(get_content,'https://www.bqgui.cc'+ href) for href in url_list]
+    for href,job in zip(url_list,jobs):  # submission order == chapter order in the output file
+        if job.exception() is None:
+            save(name,*job.result())
+        else:  # report the failed chapter instead of losing the error silently
+            print(f'skipped {href}: {job.exception()!r}')
+def window():
+    """Show the download window and run its event loop until the window is closed.
+
+    "确认" stores the typed URL in wz.txt (and in the global x); "下载" calls queren().
+    """
+    root = tk.Tk()
+    root.title("小说爬虫")
+    root.geometry("500x500+300+300")
+    text_table = tk.Label(root,text="请在https://www.bqgui.cc中找到想下载的书并在下方输入其网址")
+    text_table.pack()
+
+    e1= tk.Entry(root,width=60)
+    e1.pack()
+    def func():
+        global x
+        x=e1.get()
+        with open("wz.txt",'w',encoding='utf8') as f:
+            f.write(x)  # x is a single string; writelines() would iterate it char by char
+        e1.delete(0,tk.END)  # clear the whole entry, not only its first 50 characters
+    tk.Button(root,text="确认",command=func).pack()
+    tk.Button(root,text="下载",command=queren).pack()
+    root.mainloop()
+def huanyin():
+    """Show the welcome window, then open the download window once "下一步" is clicked.
+
+    Closing the welcome window without clicking the button returns without opening it.
+    """
+    root = tk.Tk()
+    root.title("小说爬虫")
+    root.geometry("500x500+300+300")
+    for line in ("欢迎使用小说爬虫","班级：计科2105","姓名：王文杰","学号：21412030511"):
+        tk.Label(root,text=line).pack()
+    go_next = []
+    def fun2():
+        go_next.append(True)
+        root.destroy()
+    tk.Button(root,text="下一步",command=fun2).pack()
+    root.mainloop()
+    # Open the second window only after this root is gone: window() builds its own tk.Tk() and
+    # runs its own mainloop, so this avoids two live roots and a mainloop nested in a callback.
+    if go_next:
+        window()
+if __name__ == '__main__':  # start the GUI only when run as a script, not on import
+    huanyin()  # entry point: welcome window first
+    
+
+  
+
+