import json
import os
import random
import re
import threading
import time
from datetime import datetime
from queue import Queue
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup  # the 'lxml' parser used below requires the lxml package


def save(url):
    """Download an image and store it under the local data directory."""
    # e.g. path = "D:/web/爬虫/资料/abc.png"
    root = "D:/web/爬虫/资料"
    path = os.path.join(root, url.split('/')[-1])
    try:
        if not os.path.exists(root):  # create the data directory if it does not exist
            os.makedirs(root)  # makedirs (not mkdir) so missing parent directories are created too
        r = requests.get(url)
        r.raise_for_status()
        with open(path, 'wb') as f:
            f.write(r.content)
        # print("Download succeeded!")
    except Exception:
        print("Download failed!")


class ilyncSpider():
    def __init__(self):
        self.base_url = 'https://www.ilync.cn/org/6818_d_0_0_-1_-1_0_{}'
        self.http_url = 'https://www.ilync.cn/{}'
        self.pages_num = 2
        self.throttle = Throttle(10.0)
        self.header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, compress',
            'Accept-Language': 'en-us;q=0.5,en;q=0.3',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0',
        }
        # Note: the original code put proxy URLs ("http"/"https" ->
        # "http://127.0.0.1:8000") into this header dict, where requests just
        # sends them as literal headers; proxies belong in requests' proxies=
        # argument instead (see the sketch at the bottom of this file).
        self.url_queue = Queue()
        self.html_queue = Queue()
        self.content_queue = Queue()

    def get_url_list(self):
        """Build the list of listing-page URLs to crawl."""
        for i in range(1, self.pages_num + 1):
            self.url_queue.put(self.base_url.format(i))

    def parse_url(self):
        """Fetch each URL and push the raw HTML onto the html queue."""
        while True:
            url = self.url_queue.get()
            response = requests.get(url, headers=self.header)
            self.html_queue.put(response.content.decode("utf-8"))
            self.url_queue.task_done()

    def get_content_list(self):
        """Parse each HTML page and extract the course records."""
        while True:
            content = self.html_queue.get()
            soup = BeautifulSoup(content, 'lxml')
            first_filter = soup.find_all('div', class_='course-list-wrap')[0]
            second_filter = first_filter.find_all('div', class_='grid-cell')
            infos = []
            for one in second_filter:
                temp_dict = {}
                img = one.find('img').attrs['src']
                title = one.find('img').attrs['title']
                price = one.find('div', class_='course-price').text
                price = re.sub(r'\s', '', price)  # strip whitespace (tabs, newlines)
                id_str = one.find('a', class_='course-pic').attrs['href']
                course_id = id_str[id_str.find('_') + 1:id_str.find('?')]  # avoid shadowing builtin id()
                temp_dict['id'] = course_id
                temp_dict['title'] = re.sub(r'\xa0', ' ', title)  # convert non-breaking spaces
                temp_dict['img'] = img
                temp_dict['price'] = price
                temp_dict['url'] = self.http_url.format(id_str)
                infos.append(temp_dict)
            self.content_queue.put(infos)
            self.html_queue.task_done()

    def save_content_list(self):
        """Save the extracted records: download images and write JSON to disk."""
        while True:
            content_list = self.content_queue.get()
            for i in content_list:
                print(i)
                self.throttle.wait_url(i['url'])
                save(i['img'])
                db_dict = {
                    'id': i['id'],
                    'title': i['title'],
                    'icon': i['img'],
                    'price': i['price'],
                    'link': i['url'],
                }
                db_json = json.dumps(db_dict, ensure_ascii=False, indent=2)
                self.db.write(db_json + '\n')  # newline-separate records
            self.content_queue.task_done()

    def run(self):
        """Run the crawl with one URL producer and several worker threads."""
        self.db = open('./课程信息.txt', 'w', encoding='utf-8')  # course-info output file
        thread_list = []
        t_url = threading.Thread(target=self.get_url_list)
        thread_list.append(t_url)
        for i in range(10):
            t_xml = threading.Thread(target=self.parse_url)
            thread_list.append(t_xml)
        t_content = threading.Thread(target=self.get_content_list)
        thread_list.append(t_content)
        t_save = threading.Thread(target=self.save_content_list)
        thread_list.append(t_save)
        for t in thread_list:
            t.daemon = True  # t.setDaemon(True) is deprecated
            t.start()
        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()
        self.db.close()
        print('All data fetched')


class Throttle:
    """Download throttle: enforces a minimum delay between requests to the same domain."""

    def __init__(self, delay):
        self.domains = {}  # domain -> datetime of last access
        self.delay = delay
    def wait_url(self, url_str):
        domain_url = urlparse(url_str).netloc
        last_accessed = self.domains.get(domain_url)
        if self.delay > 0 and last_accessed is not None:
            # use total_seconds(); .seconds alone drops the days component
            sleep_interval = self.delay - (datetime.now() - last_accessed).total_seconds()
            if sleep_interval > 0:
                # add 1-3 s of random jitter so requests are not perfectly periodic
                time.sleep(sleep_interval + round(random.uniform(1, 3), 1))
        self.domains[domain_url] = datetime.now()


if __name__ == '__main__':
    start = time.time()
    obj = ilyncSpider()
    obj.run()
    end = time.time()
    print(end - start)
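
# The original header dict smuggled proxy URLs ("http"/"https" keys pointing
# at 127.0.0.1:8000) into the HTTP headers. A minimal sketch of the intended
# usage, assuming a proxy actually listens on that address (requests accepts
# proxies via its `proxies=` keyword argument, not via headers):
#
#     proxies = {
#         "http": "http://127.0.0.1:8000",
#         "https": "http://127.0.0.1:8000",
#     }
#     response = requests.get(url, headers=self.header, proxies=proxies)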