From f483a92a3cca4c6cc93dbf6a89c8d52497a346a0 Mon Sep 17 00:00:00 2001
From: prxecj98k <2906422449@qq.com>
Date: Mon, 3 Jun 2024 23:03:38 +0800
Subject: [PATCH] ADD file via upload
---
...30511_王文杰_计科2105_小说爬虫.py | 106 ++++++++++++++++++
1 file changed, 106 insertions(+)
create mode 100644 21412030511_王文杰_计科2105_小说爬虫.py
diff --git a/21412030511_王文杰_计科2105_小说爬虫.py b/21412030511_王文杰_计科2105_小说爬虫.py
new file mode 100644
index 0000000..72f5b5d
--- /dev/null
+++ b/21412030511_王文杰_计科2105_小说爬虫.py
@@ -0,0 +1,106 @@
+import requests
+from lxml import html
+from lxml import etree
+import re
+import os
+import tkinter as tk
+import concurrent.futures
+x = ''  # module-level holder for the entry text; reassigned by window()'s confirm callback
+def get_response(html_url):#请求函数
+ #伪装自己
+ headers = {
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0'
+ }
+ response = requests.get(url=html_url,headers = headers)
+ return response
+def get_list_url(html_url):# get the chapter URLs and the book name
+ """Fetch the page at ``html_url`` and extract the book name and a URL list.
+
+ Returns a ``(name, url_list)`` tuple: ``name`` is the first match of the
+ first regex (IndexError if nothing matches) and ``url_list`` holds every
+ match of the second regex.
+ """
+ html_data = get_response(html_url).text
+ # NOTE(review): both regex literals below look damaged - the first one is
+ # split across three physical lines inside single quotes and the second
+ # is an empty pattern. Presumably the HTML tag text they contained was
+ # stripped; restore the patterns from the original script - TODO confirm.
+ #book_name
+ name = re.findall('
(.*?)
',html_data)[0]
+ url_list = re.findall('',html_data)
+ # print(name)
+ # print(url_list)
+ return name,url_list
+def get_content(html_url): # get the chapter title and the chapter text
+ """Fetch the page at ``html_url`` and return ``(title, content)``.
+
+ ``title`` is the first regex match in the page source (IndexError if
+ nothing matches). ``content`` is built from the text nodes directly under
+ any element whose id is "chaptercontent", joined with newlines.
+ """
+ html_data = get_response(html_url).text
+ # NOTE(review): the regex literal below is split across two physical lines
+ # inside single quotes; presumably the HTML tag text around the capture
+ # group was stripped. Restore it from the original script - TODO confirm.
+ title = re.findall('(.*?)
',html_data)[0]
+ selector = etree.HTML(html_data)
+ content = '\n'.join(selector.xpath('//*[@id="chaptercontent"]/text()'))
+ return title,content
+def save(name,title,content):#保存小说
+ #创建文件夹
+ file = f'{name}\\'
+ if not os.path.exists(file):
+ os.mkdir(file)
+ with open( file+ name + '.txt', mode='a', encoding='utf-8') as f:
+ f.write(title)
+ f.write('\n')
+ f.write(content)
+ f.write('\n')
+def main(home_url,name):#主函数
+ # index_url = 'https://www.bqgui.cc'+ url
+ # index_url = open("wz.txt",'r',encoding='utf8').readlines
+ title,content = get_content(html_url=home_url)
+ save(name,title,content)
+def queren():
+ f = open("wz.txt",'r',encoding='utf8')
+ url = f.read()
+ print(url)
+
+ # url='https://www.bqgui.cc/book/511/'
+ name,url_list = get_list_url(url)
+ exe = concurrent.futures.ThreadPoolExecutor(max_workers=7)
+ for url in url_list:
+ index_url = 'https://www.bqgui.cc'+ url
+ exe.submit(main,index_url,name)
+ exe.shutdown()
+
+def window():
+ root = tk.Tk()
+ root.title("小说爬虫")
+ root.geometry("500x500+300+300")
+
+ text_table = tk.Label(root,text="请在https://www.bqgui.cc中找到想下载的书并在下方输入其网址")
+ text_table.pack()
+
+ e1= tk.Entry(root,width=60)
+ e1.pack()
+ def func():
+ global x
+ x=e1.get()
+ with open("wz.txt",'w+',encoding='utf8') as f:
+ f.writelines(x)
+ # print(x)
+ e1.delete(0,50)
+
+
+ tk.Button(root,text="确认",command=func).pack()
+ tk.Button(root,text="下载",command=queren).pack()
+ root.mainloop()
+def huanyin():
+ root = tk.Tk()
+ root.title("小说爬虫")
+ root.geometry("500x500+300+300")
+ text1_table = tk.Label(root,text="欢迎使用小说爬虫")
+ text1_table.pack()
+ text2_table = tk.Label(root,text="班级:计科2105")
+ text2_table.pack()
+ text2_table = tk.Label(root,text="姓名:王文杰")
+ text2_table.pack()
+ text3_table = tk.Label(root,text="学号:21412030511")
+ text3_table.pack()
+ def fun2():
+ root.withdraw()
+ window()
+ root.destroy()
+
+ tk.Button(root,text="下一步",command=fun2).pack()
+
+ root.mainloop()
+# Script entry point: open the welcome window when the file is run directly.
+if __name__ == '__main__':
+ huanyin()
+
+
+
+
+