import json
import os
import random
import re
import threading
import time
from datetime import datetime
from queue import Queue
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup  # HTML is parsed with the 'lxml' backend, which must be installed


def save(url):
    """Download a file and save it under the local download directory."""
    # e.g. path = "D:/web/爬虫/资料/abc.png"
    root = "D:/web/爬虫/资料"  # download root ("爬虫/资料" = "crawler/materials")
    path = os.path.join(root, url.split('/')[-1])
    try:
        if not os.path.exists(root):  # create the root directory if it does not exist
            os.mkdir(root)
        r = requests.get(url)
        r.raise_for_status()
        with open(path, 'wb') as f:
            f.write(r.content)
        # print("Download succeeded!")
    except Exception:
        print("Download failed:", url)
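
# Minimal usage sketch (hypothetical image URL, not taken from the live site):
#   save('https://www.ilync.cn/files/course/abc.png')
#   # -> writes D:/web/爬虫/资料/abc.png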


class ilyncSpider:
    def __init__(self):
        self.base_url = 'https://www.ilync.cn/org/6818_d_0_0_-1_-1_0_{}'  # listing page, {} = page number
        self.http_url = 'https://www.ilync.cn/{}'  # course detail URL template
        self.pages_num = 2  # number of listing pages to crawl
        self.throttle = Throttle(10.0)  # at most one download per domain every 10 s
        self.header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, compress',
            'Accept-Language': 'en-us;q=0.5,en;q=0.3',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0',
        }
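        # The 127.0.0.1:8000 endpoints in the config are proxy addresses, not
        # HTTP headers; requests takes them as a separate `proxies` mapping.
        # A sketch, assuming a proxy really is listening on 127.0.0.1:8000:
        #   self.proxies = {
        #       "http": "http://127.0.0.1:8000",
        #       "https": "https://127.0.0.1:8000",
        #   }
        #   requests.get(url, headers=self.header, proxies=self.proxies)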
        self.url_queue = Queue()
        self.html_queue = Queue()
        self.content_queue = Queue()
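        # Pipeline: get_url_list -> url_queue -> parse_url -> html_queue
        # -> get_content_list -> content_queue -> save_content_list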

    def get_url_list(self):
        """Put every listing-page URL on the queue."""
        for i in range(1, self.pages_num + 1):
            self.url_queue.put(self.base_url.format(i))

    def parse_url(self):
        """Fetch each queued URL and pass the HTML to the parser stage."""
        while True:
            url = self.url_queue.get()
            response = requests.get(url, headers=self.header)
            self.html_queue.put(response.content.decode("utf-8"))
            self.url_queue.task_done()

    def get_content_list(self):
        """Parse each listing page into a list of course records."""
        while True:
            content = self.html_queue.get()
            soup = BeautifulSoup(content, 'lxml')
            first_filter = soup.find_all('div', class_='course-list-wrap')[0]
            second_filter = first_filter.find_all('div', class_='grid-cell')
            infos = []
            for one in second_filter:
                temp_dict = {}
                img = one.find('img').attrs['src']
                title = one.find('img').attrs['title']
                price = one.find('div', class_='course-price').text
                price = re.sub(r'\s', '', price)  # strip spaces, tabs, and newlines
                id_str = one.find('a', class_='course-pic').attrs['href']
                # the course id sits between the first '_' and the '?' in the href
                course_id = id_str[id_str.find('_') + 1:id_str.find('?')]
                temp_dict['id'] = course_id
                temp_dict['title'] = re.sub(r'\xa0', ' ', title)  # replace non-breaking spaces
                temp_dict['img'] = img
                temp_dict['price'] = price
                temp_dict['url'] = self.http_url.format(id_str)
                infos.append(temp_dict)
            self.content_queue.put(infos)
            self.html_queue.task_done()
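
    # Shape of one record on content_queue (values invented for illustration;
    # the real id/href format depends on the live site):
    #   {'id': '12345', 'title': 'Course name', 'img': 'https://.../cover.png',
    #    'price': '¥99', 'url': 'https://www.ilync.cn/course_12345?from=list'}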
"""保存数据"""
|
|
def save_content_list(self):
|
|
while True:
|
|
content_list = self.content_queue.get()
|
|
for i in content_list:
|
|
print(i)
|
|
self.throttle.wait_url(i['url'])
|
|
save(i['img'])
|
|
db_dict = {
|
|
'id':i['id'],
|
|
'标题':i['title'],
|
|
'图标':i['img'],
|
|
'价格': i['price'],
|
|
'链接': i['url']
|
|
}
|
|
db_json = json.dumps(db_dict, ensure_ascii=False, indent=2)
|
|
self.db.write(db_json)
|
|
self.content_queue.task_done()
|
|
|
|
"""使用多线程调用"""
|
|
def run(self):
|
|
self.db = open('./课程信息.txt', 'w', encoding='utf-8')
|
|
thread_list = []
|
|
t_url = threading.Thread(target=self.get_url_list)
|
|
thread_list.append(t_url)
|
|
for i in range(10):
|
|
t_xml = threading.Thread(target=self.parse_url)
|
|
thread_list.append(t_xml)
|
|
t_content = threading.Thread(target=self.get_content_list)
|
|
thread_list.append(t_content)
|
|
t_save = threading.Thread(target=self.save_content_list)
|
|
thread_list.append(t_save)
|
|
|
|
for t in thread_list:
|
|
t.setDaemon(True)
|
|
t.start()
|
|
for q in [self.url_queue, self.html_queue, self.content_queue]:
|
|
q.join()
|
|
self.db.close()
|
|
print('所有数据获取完成')
|
|
|
|


class Throttle:
    """Per-domain download rate limiter."""

    def __init__(self, delay):
        self.domains = {}   # domain -> datetime of the last request
        self.delay = delay  # minimum seconds between requests to one domain

    def wait_url(self, url_str):
        domain_url = urlparse(url_str).netloc
        last_accessed = self.domains.get(domain_url)

        if self.delay > 0 and last_accessed is not None:
            sleep_interval = self.delay - (datetime.now() - last_accessed).total_seconds()
            if sleep_interval > 0:
                # sleep off the remaining delay plus 1-3 s of random jitter
                time.sleep(sleep_interval + round(random.uniform(1, 3), 1))
        self.domains[domain_url] = datetime.now()
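
# Minimal usage sketch (hypothetical URLs; a 5-second delay for illustration):
#   throttle = Throttle(5.0)
#   throttle.wait_url('https://example.com/a')  # first hit on a domain: no sleep
#   throttle.wait_url('https://example.com/b')  # same domain: sleeps ~5 s + jitter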


if __name__ == '__main__':
    start = time.time()
    obj = ilyncSpider()
    obj.run()
    end = time.time()
    print(end - start)  # total elapsed seconds