import requests
import re
import os  # used to create the save folder
import parsel  # used to parse the HTML responses
from threading import Thread
from fake_useragent import UserAgent
from time import sleep
from queue import Queue, Empty
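# Note (added): requests, parsel and fake_useragent are third-party packages; the
# assumed install command is `pip install requests parsel fake-useragent`.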


class MyThread(Thread):
    """Worker thread: pulls chapter URLs from the queue and saves each chapter to disk."""

    def __init__(self, url_queue_no, file_novel):
        Thread.__init__(self)
        self.url_queue_no = url_queue_no
        self.file_novel = file_novel

    def run(self):
        while not self.url_queue_no.empty():
            # Pitfall: the queue can drain between empty() and get(), so catch the
            # timeout instead of letting the thread die with a traceback.
            try:
                url = self.url_queue_no.get(timeout=1)
            except Empty:
                break
            headers = {
                'User-Agent': UserAgent().chrome
            }
            sleep(2)  # throttle the crawl so the IP does not get banned
            try:
                response2 = requests.get(url, headers=headers, timeout=5)
                selector_html2 = parsel.Selector(response2.text)
                title = selector_html2.css('#main > h1::text').get()
                # Windows file names may not contain \ / : * ? " < > |, so strip them
                title = re.sub(r'[\\/:*?"<>|]', '', title)
                # getall() returns the chapter body as a list of text nodes
                book = selector_html2.css('#content::text').getall()
                # print(book)
                book = '\n'.join(book)
                with open(self.file_novel + title + '.txt', mode='a', encoding='utf-8') as f:
                    f.write(book)
                print(title + ' downloading')
            except Exception as e:
                print(e)


def mkfile_novel(file_name):
    if not os.path.exists(file_name):
        os.makedirs(file_name)


def spider_novel_start(id, url_queue_no, file_novel):
    headers = {
        'User-Agent': UserAgent().chrome
    }
    # id = input('Book id: ')
    url = 'https://www.qb5.ch/book_' + id + '/'

    response = requests.get(url, headers=headers)

    selector_html1 = parsel.Selector(response.text)
    div1 = selector_html1.css('body > div.zjbox > dl > dd')
    # Alternative approach with bs4:
    # bs4html = BeautifulSoup(response.text, 'lxml')
    # div2 = bs4html.select('body > div.listmain > dl > dd:nth-child(n+2) > a')
    for div in div1:
        href = div.css('a::attr(href)').get()
        if href is None:  # skip empty tags so they cannot corrupt the request URL
            continue
        url2 = url + href
        url_queue_no.put(url2)  # hand the chapter URL to the worker threads
    for i in range(4):  # number of threads
        t1 = MyThread(url_queue_no, file_novel)
        t1.start()
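

# Added sketch (not part of the original script): if the caller needs to block until
# every chapter has been written, it can keep the thread objects and join() them
# instead of relying on the fire-and-forget loop above. Only names defined in this
# file are used; the thread count of 4 mirrors spider_novel_start.
def download_and_wait(url_queue_no, file_novel, thread_count=4):
    threads = [MyThread(url_queue_no, file_novel) for _ in range(thread_count)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # returns once the worker has drained the queue and exited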


# if __name__ == '__main__':
#     url_queue_no = Queue()  # a Queue keeps the URL hand-off between threads safe
#     # file_novel = input('Folder to save into: ') + '\\'
#     file_novel = 'end\\book\\'
#     mkfile_novel(file_novel)
#     # id = input('Novel id: ')
#     id = '116659'
#     spider_novel_start(id, url_queue_no, file_novel)