"""Download every chapter of a novel from www.qb5.ch using worker threads.

``spider_novel_start`` reads a book's index page, queues each chapter URL and
starts a small pool of ``MyThread`` workers; each worker saves one chapter per
text file.
"""
import os  # directory creation
import re
from queue import Empty, Queue
from threading import Thread
from time import sleep
from urllib.parse import urljoin

import parsel  # HTML parsing
import requests
from fake_useragent import UserAgent

# Seconds to wait for any single HTTP response.
REQUEST_TIMEOUT = 5
# Pause before each chapter request: throttles the crawl to avoid an IP ban.
CRAWL_DELAY = 2
# Number of download threads started by spider_novel_start.
WORKER_COUNT = 4
# Windows rejects these characters in file names, so they are stripped
# from chapter titles before the title is used as a file name.
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


class MyThread(Thread):
    """Worker that drains a queue of chapter URLs and saves each chapter.

    Args:
        url_queue_no: ``queue.Queue`` of chapter page URLs, shared by all
            workers.
        file_novel: Path prefix for the output files. It is concatenated
            directly with the chapter title, so it must already end with a
            path separator when it names a directory.
    """

    def __init__(self, url_queue_no, file_novel):
        super().__init__()
        self.url_queue_no = url_queue_no
        self.file_novel = file_novel

    def run(self):
        """Download chapters until the queue is empty.

        A failure on one chapter is printed and the worker moves on to the
        next URL, so a single bad page does not stop the download.
        """
        # One generator per worker; ``.chrome`` is still read per request.
        user_agent = UserAgent()
        while True:
            # Checking ``empty()`` and then calling ``get()`` is racy when
            # several workers share the queue: another thread can take the
            # last item in between. Ask for an item and treat Empty as "done".
            try:
                url = self.url_queue_no.get_nowait()
            except Empty:
                return
            headers = {'User-Agent': user_agent.chrome}
            sleep(CRAWL_DELAY)
            try:
                response = requests.get(
                    url, headers=headers, timeout=REQUEST_TIMEOUT)
                page = parsel.Selector(response.text)
                title = page.css('#main > h1::text').get()
                if title is None:
                    # No heading found (error page or changed layout): there
                    # is no usable file name, so skip this chapter.
                    print('未找到章节标题,跳过: ' + url)
                    continue
                title = _ILLEGAL_FILENAME_CHARS.sub('', title)
                # getall() returns the list of text nodes under #content.
                book = '\n'.join(page.css('#content::text').getall())
                with open(self.file_novel + title + '.txt',
                          mode='a', encoding='utf-8') as f:
                    f.write(book)
                print(title + '下载中')
            except Exception as e:
                # Thread-level boundary: report the error and keep going.
                print(e)


def mkfile_novel(file_name):
    """Create the directory ``file_name`` (with parents) if it is missing."""
    # exist_ok avoids the exists()/makedirs() race of a manual check.
    os.makedirs(file_name, exist_ok=True)


def spider_novel_start(id, url_queue_no, file_novel):
    """Queue every chapter of a book and start the download workers.

    The parameter name ``id`` shadows the builtin but is kept so existing
    keyword callers continue to work.

    Args:
        id: Book id as a string; it is inserted into the index URL.
        url_queue_no: ``queue.Queue`` that receives the chapter URLs and is
            then shared with the workers.
        file_novel: Output path prefix handed to each ``MyThread``.

    The function returns as soon as the workers are started; it does not
    wait for the downloads to finish.
    """
    headers = {'User-Agent': UserAgent().chrome}
    url = 'https://www.qb5.ch/book_' + id + '/'

    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)

    index_page = parsel.Selector(response.text)
    # NOTE(review): the original carried a commented-out BeautifulSoup
    # alternative using 'body > div.listmain > dl > dd:nth-child(n+2) > a';
    # that selector does not match the one used here and looks stale.
    for entry in index_page.css('body > div.zjbox > dl > dd'):
        href = entry.css('a::attr(href)').get()
        if href is None:
            # <dd> without a link: skip it so no bogus URL is queued.
            continue
        # urljoin gives the same result as plain concatenation for bare
        # relative hrefs and also copes with root-relative/absolute ones.
        url_queue_no.put(urljoin(url, href))

    for _ in range(WORKER_COUNT):
        MyThread(url_queue_no, file_novel).start()


# Example usage (kept disabled, as in the original file):
#
# if __name__ == '__main__':
#     url_queue_no = Queue()  # a Queue keeps the URL hand-off thread-safe
#     file_novel = 'end\\book\\'  # output folder, with trailing separator
#     mkfile_novel(file_novel)
#     id = '116659'  # id of the novel to download
#     spider_novel_start(id, url_queue_no, file_novel)