You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

76 lines
2.8 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
import re
import os #生成文件
import parsel #解析数据
from threading import Thread
from fake_useragent import UserAgent
import requests
from time import sleep
from queue import Queue
class MyThread(Thread):
    """Worker thread that downloads chapter pages and saves them as text files.

    Each worker repeatedly takes a chapter URL from the shared queue, fetches
    the page, extracts the chapter title and body text, and appends the text
    to ``<file_novel><title>.txt``.
    """

    def __init__(self, url_queue_no, file_novel):
        """Store the shared work queue and the output path prefix.

        Args:
            url_queue_no: Queue of chapter URLs shared by all workers.
            file_novel: Path prefix prepended verbatim to every file name,
                so it should end with a path separator.
        """
        super().__init__()
        self.url_queue_no = url_queue_no
        self.file_novel = file_novel

    def run(self):
        """Download chapters until the queue has no more URLs."""
        # Imported here because the module only imports ``Queue`` from ``queue``.
        from queue import Empty

        while True:
            # Checking ``empty()`` and then calling ``get()`` is racy: another
            # worker can take the last URL in between, which made ``get``
            # raise ``Empty`` and kill the thread. Take the item in a single
            # non-blocking step and stop cleanly when nothing is left.
            try:
                url = self.url_queue_no.get_nowait()
            except Empty:
                return
            self._download_chapter(url)

    def _download_chapter(self, url):
        """Fetch one chapter page and append its text to the chapter's file.

        Errors raised while fetching, parsing or writing are printed and
        swallowed so that one bad chapter does not stop the worker.
        """
        headers = {
            'User-Agent': UserAgent().chrome
        }
        sleep(2)  # throttle requests to avoid getting the IP banned
        try:
            response = requests.get(url, headers=headers, timeout=5)
            selector = parsel.Selector(response.text)
            title = selector.css('#main > h1::text').get()
            if title is None:
                # Without a title there is no file name to write to.
                print('no chapter title found, skipped: ' + url)
                return
            # Windows does not allow \ / : * ? " < > | in file names.
            title = re.sub(r'[\\/:*?"<>|]', '', title)
            # getall() returns the list of text nodes of the chapter body.
            book = '\n'.join(selector.css('#content::text').getall())
            with open(self.file_novel + title + '.txt', mode='a', encoding='utf-8') as f:
                f.write(book)
            print(title + '下载中')
        except Exception as e:
            print(e)
def mkfile_novel(file_name):
    """Create the directory ``file_name`` (and any missing parents).

    Does nothing when the directory already exists. ``exist_ok=True`` avoids
    the race between a separate existence check and the creation call.

    Args:
        file_name: Path of the directory to create.

    Raises:
        FileExistsError: If ``file_name`` exists but is not a directory.
    """
    os.makedirs(file_name, exist_ok=True)
def spider_novel_start(id, url_queue_no, file_novel):
    """Queue every chapter URL of a book and start the download workers.

    Fetches the book's index page, collects the chapter links found under
    ``body > div.zjbox > dl > dd``, puts their absolute URLs on
    ``url_queue_no`` and then starts four ``MyThread`` workers. The workers
    are started but not joined, so this function returns while downloads are
    still in progress.

    Args:
        id: Book id used to build the index URL
            (``https://www.qb5.ch/book_<id>/``).
        url_queue_no: Queue that receives the chapter URLs and is shared
            with the workers.
        file_novel: Output path prefix handed to every worker.
    """
    # Imported here because the module does not import ``urllib.parse``.
    from urllib.parse import urljoin

    headers = {
        'User-Agent': UserAgent().chrome
    }
    url = 'https://www.qb5.ch/book_' + str(id) + '/'
    # Same timeout as the chapter requests, so a stalled server cannot hang
    # the caller forever.
    response = requests.get(url, headers=headers, timeout=5)
    selector = parsel.Selector(response.text)
    for entry in selector.css('body > div.zjbox > dl > dd'):
        href = entry.css('a::attr(href)').get()
        if href is None:  # <dd> without a link: nothing to download
            continue
        # urljoin resolves relative, root-relative and absolute hrefs alike.
        url_queue_no.put(urljoin(url, href))
    for _ in range(4):  # number of worker threads
        MyThread(url_queue_no, file_novel).start()
# if __name__ == '__main__':
# url_queue_no = Queue()#用队列保证线程安全
# # file_name = input('请输\入保存的文件夹名:') + '\\'
# file_novel = 'end\\book\\'
# mkfile_novel(file_novel)
# # id = input('请输入小说的id号:')
# id = '116659'
# spider_novel_start(id,url_queue_no,file_novel)