ADD file via upload

main
pewxvf3lf 1 year ago
parent e8617e44de
commit 26fa87b3cd

@ -0,0 +1,75 @@
import requests
import re
import os #生成文件
import parsel #解析数据
from threading import Thread
from fake_useragent import UserAgent
import requests
from time import sleep
from queue import Queue
class MyThread(Thread):
    """Worker thread that downloads chapter pages and saves them as text files.

    Each worker repeatedly takes one chapter URL from the shared queue,
    fetches the page, extracts the chapter title and body text, and appends
    the text to ``<file_novel><title>.txt``. The worker exits as soon as the
    queue has no more URLs.
    """

    def __init__(self, url_queue_no, file_novel):
        """Store the shared work queue and the output path prefix.

        Args:
            url_queue_no: Queue of chapter URLs shared by all workers.
            file_novel: Prefix prepended verbatim to every output file name.
                It is concatenated (not path-joined), so it should end with
                a path separator.
        """
        super().__init__()
        self.url_queue_no = url_queue_no
        self.file_novel = file_novel

    def run(self):
        """Drain the queue, downloading and saving one chapter per URL."""
        from queue import Empty

        while True:
            # Checking empty() and then calling get() is racy: another worker
            # can take the last URL in between, which made get(timeout=1)
            # raise an uncaught queue.Empty. A non-blocking get makes the
            # "is there work?" test and the fetch a single atomic step.
            try:
                url = self.url_queue_no.get_nowait()
            except Empty:
                break

            sleep(2)  # throttle requests to avoid getting the IP banned
            try:
                headers = {
                    'User-Agent': UserAgent().chrome
                }
                response2 = requests.get(url, headers=headers, timeout=5)
                selector_html2 = parsel.Selector(response2.text)

                title = selector_html2.css('#main > h1::text').get()
                if title is None:
                    # No heading found: report which page was skipped instead
                    # of failing inside re.sub with an opaque TypeError.
                    print(url + ' 未找到章节标题,已跳过')
                    continue
                # Windows forbids \ / : * ? " < > | in file names.
                title = re.sub(r'[\\/:*?"<>|]', '', title)

                # getall() returns a list of text fragments.
                book = '\n'.join(selector_html2.css('#content::text').getall())
                with open(self.file_novel + title + '.txt', mode='a', encoding='utf-8') as f:
                    f.write(book)
                print(title + '下载中')
            except Exception as e:
                # Thread boundary: one bad chapter must not stop the worker.
                print(e)
def mkfile_novel(file_name):
    """Create the output directory (and any missing parents) if needed.

    Args:
        file_name: Path of the directory to create.

    Raises:
        FileExistsError: If the path already exists but is not a directory.
        OSError: If the directory cannot be created.
    """
    # exist_ok=True avoids the race between a separate exists() check and
    # makedirs() when another thread or process creates the directory first.
    os.makedirs(file_name, exist_ok=True)
def spider_novel_start(id, url_queue_no, file_novel, thread_count=4):
    """Queue every chapter URL of a book and start the download workers.

    Fetches the book's index page, puts the URL of every chapter link found
    there onto ``url_queue_no``, then starts ``thread_count`` ``MyThread``
    workers that consume the queue. The function returns right after the
    workers are started; it does not wait for them to finish.

    Args:
        id: Book id used to build the index URL (converted with ``str``).
        url_queue_no: Queue that receives the chapter URLs.
        file_novel: Output path prefix handed to each worker.
        thread_count: Number of worker threads to start (default 4).
    """
    from urllib.parse import urljoin

    headers = {
        'User-Agent': UserAgent().chrome
    }
    url = 'https://www.qb5.ch/book_' + str(id) + '/'
    # A timeout keeps an unresponsive server from hanging the caller forever.
    response = requests.get(url, headers=headers, timeout=10)
    selector_html1 = parsel.Selector(response.text)

    for div in selector_html1.css('body > div.zjbox > dl > dd'):
        href = div.css('a::attr(href)').get()
        if href is None:  # skip <dd> entries that carry no link
            continue
        # urljoin gives the same result as concatenation for relative hrefs
        # and also resolves root-relative or absolute hrefs correctly.
        url_queue_no.put(urljoin(url, href))

    for _ in range(thread_count):
        MyThread(url_queue_no, file_novel).start()
# if __name__ == '__main__':
# url_queue_no = Queue()#用队列保证线程安全
# # file_name = input('请输入保存的文件夹名:') + '\\'
# file_novel = 'end\\book\\'
# mkfile_novel(file_novel)
# # id = input('请输入小说的id号:')
# id = '116659'
# spider_novel_start(id,url_queue_no,file_novel)
Loading…
Cancel
Save