"""Multithreaded novel scraper for www.qb5.ch.

Fetches a book's index page, enqueues every chapter URL onto a
thread-safe queue, and runs a small pool of worker threads that each
download chapters and save them as per-chapter .txt files.
"""
import os  # create the output directory
import re
from queue import Queue, Empty
from threading import Thread
from time import sleep

import parsel  # CSS-selector HTML parsing
import requests
from fake_useragent import UserAgent


class MyThread(Thread):
    """Worker thread: pops chapter URLs off the shared queue and saves each chapter."""

    def __init__(self, url_queue_no, file_novel):
        """
        :param url_queue_no: Queue of chapter URLs to consume.
        :param file_novel: directory prefix where chapter files are written.
        """
        Thread.__init__(self)
        self.url_queue_no = url_queue_no
        self.file_novel = file_novel

    def run(self):
        while True:
            # get_nowait() + Empty avoids the empty()/get() race of the
            # original code, which could stall a worker when another
            # thread drained the queue between the check and the get.
            try:
                url = self.url_queue_no.get_nowait()
            except Empty:
                break
            headers = {
                'User-Agent': UserAgent().chrome
            }
            sleep(2)  # throttle the crawl to avoid an IP ban
            try:
                response2 = requests.get(url, headers=headers, timeout=5)
                selector_html2 = parsel.Selector(response2.text)
                title = selector_html2.css('#main > h1::text').get()
                if title is None:
                    # Malformed/blocked page: skip it instead of crashing
                    # the worker on re.sub(None).
                    continue
                # Windows forbids \ / : * ? " < > | in file names, so
                # strip those characters from the chapter title.
                title = re.sub(r'[\\/:*?"<>|]', '', title)
                book = '\n'.join(selector_html2.css('#content::text').getall())
                with open(self.file_novel + title + '.txt', mode='a', encoding='utf-8') as f:
                    f.write(book)
                print(title + '下载中')
            except Exception as e:
                # Best-effort crawl: report the failure and move on to
                # the next chapter rather than killing the thread.
                print(e)


def mkfile_novel(file_name):
    """Create the output directory (including parents) if it does not exist.

    :param file_name: directory path to create.
    """
    if not os.path.exists(file_name):
        os.makedirs(file_name)


def spider_novel_start(id, url_queue_no, file_novel):
    """Enqueue every chapter URL of book *id* and start the worker pool.

    :param id: numeric book id as a string (becomes part of the site URL).
    :param url_queue_no: shared Queue the MyThread workers consume from.
    :param file_novel: directory prefix passed through to the workers.
    """
    headers = {
        'User-Agent': UserAgent().chrome
    }
    url = 'https://www.qb5.ch/book_' + id + '/'
    # timeout prevents the index fetch from hanging forever,
    # consistent with the chapter fetches in MyThread.run.
    response = requests.get(url, headers=headers, timeout=5)
    selector_html1 = parsel.Selector(response.text)
    for div in selector_html1.css('body > div.zjbox > dl > dd'):
        href = div.css('a::attr(href)').get()
        if href is None:
            # Skip empty <dd> entries so we never build a bad URL.
            continue
        url_queue_no.put(url + href)
    for _ in range(4):  # worker-pool size
        MyThread(url_queue_no, file_novel).start()


# if __name__ == '__main__':
#     url_queue_no = Queue()  # a Queue keeps the URL hand-off thread-safe
#     file_novel = 'end\\book\\'
#     mkfile_novel(file_novel)
#     id = '116659'
#     spider_novel_start(id, url_queue_no, file_novel)