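"""Multi-threaded scraper for course listings on www.ilync.cn.

Builds a producer/consumer pipeline on three queues: listing-page URLs are
generated, fetched, and parsed with BeautifulSoup; each parsed course record
is then rate-limited, its thumbnail downloaded, and its metadata appended to
a JSON text file.
"""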
import json
import os
import random
import re
import threading
import time
from datetime import datetime
from queue import Queue
from urllib.parse import urlparse

import requests
# lxml must be installed for BeautifulSoup's 'lxml' parser, but it is only
# referenced by name and never imported directly.
from bs4 import BeautifulSoup

def save(url):
    """Download the file at `url` into the local `root` directory."""
    # e.g. path = "D:/web/爬虫/资料/abc.png"
    root = "D:/web/爬虫/资料"
    path = root + '/' + url.split('/')[-1]  # join root dir and file name
    try:
        if not os.path.exists(root):  # create the root directory if missing
            os.mkdir(root)
        r = requests.get(url)
        r.raise_for_status()
        with open(path, 'wb') as f:  # the with-block closes the file itself
            f.write(r.content)
        # print("Download succeeded!")
    except Exception:
        print("Download failed!")

class ilyncSpider:
    def __init__(self):
        self.base_url = 'https://www.ilync.cn/org/6818_d_0_0_-1_-1_0_{}'
        self.http_url = 'https://www.ilync.cn/{}'
        self.pages_num = 2
        self.throttle = Throttle(10.0)
        self.header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, compress',
            'Accept-Language': 'en-us;q=0.5,en;q=0.3',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0',
        }
        # Proxy settings belong in their own dict passed via requests'
        # `proxies=` argument, not in the request headers; only pass this
        # if a local proxy is actually listening on 127.0.0.1:8000.
        self.proxies = {
            "https": "https://{0}".format("127.0.0.1:8000"),
            "http": "http://{0}".format("127.0.0.1:8000"),
        }
        self.url_queue = Queue()      # listing-page URLs to fetch
        self.html_queue = Queue()     # raw HTML waiting to be parsed
        self.content_queue = Queue()  # parsed course records waiting to be saved
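
    # The three queues form a pipeline: get_url_list feeds url_queue,
    # parse_url turns URLs into HTML on html_queue, get_content_list turns
    # HTML into course records on content_queue, and save_content_list
    # persists them. run() gives each stage its own daemon thread(s).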
"""获取url列表"""
def get_url_list(self):
for i in range(1, self.pages_num + 1):
self.url_queue.put(self.base_url.format(i))
"""根据url解析页面内容"""
def parse_url(self):
while True:
url = self.url_queue.get()
response = requests.get(url, headers=self.header)
self.html_queue.put(response.content.decode("utf-8"))
self.url_queue.task_done()

    def get_content_list(self):
        """Parse the course cards out of each HTML page."""
        while True:
            content = self.html_queue.get()
            soup = BeautifulSoup(content, 'lxml')
            first_filter = soup.find_all('div', class_='course-list-wrap')[0]
            second_filter = first_filter.find_all('div', class_='grid-cell')
            infos = []
            for one in second_filter:
                temp_dict = {}
                img = one.find('img').attrs['src']
                title = one.find('img').attrs['title']
                price = one.find('div', class_='course-price').text
                price = re.sub(r'\s', '', price)  # strip tabs and newlines
                id_str = one.find('a', class_='course-pic').attrs['href']
                course_id = id_str[id_str.find('_') + 1:id_str.find('?')]
                temp_dict['id'] = course_id
                temp_dict['title'] = re.sub(r'\xa0', ' ', title)  # replace non-breaking spaces
                temp_dict['img'] = img
                temp_dict['price'] = price
                temp_dict['url'] = self.http_url.format(id_str)
                infos.append(temp_dict)
            self.content_queue.put(infos)
            self.html_queue.task_done()
"""保存数据"""
def save_content_list(self):
while True:
content_list = self.content_queue.get()
for i in content_list:
print(i)
self.throttle.wait_url(i['url'])
save(i['img'])
db_dict = {
'id':i['id'],
'标题':i['title'],
'图标':i['img'],
'价格': i['price'],
'链接': i['url']
}
db_json = json.dumps(db_dict, ensure_ascii=False, indent=2)
self.db.write(db_json)
self.content_queue.task_done()
"""使用多线程调用"""
def run(self):
self.db = open('./课程信息.txt', 'w', encoding='utf-8')
thread_list = []
t_url = threading.Thread(target=self.get_url_list)
thread_list.append(t_url)
for i in range(10):
t_xml = threading.Thread(target=self.parse_url)
thread_list.append(t_xml)
t_content = threading.Thread(target=self.get_content_list)
thread_list.append(t_content)
t_save = threading.Thread(target=self.save_content_list)
thread_list.append(t_save)
for t in thread_list:
t.setDaemon(True)
t.start()
for q in [self.url_queue, self.html_queue, self.content_queue]:
q.join()
self.db.close()
print('所有数据获取完成')
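
    # Note: the worker methods loop forever, so run() marks their threads as
    # daemons; once every queue's join() returns (all items marked done),
    # the main thread falls through and the daemon threads die with it.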

class Throttle:
    """Rate limiter: enforce a minimum delay between requests to the same domain."""
    def __init__(self, delay):
        self.domains = {}   # domain -> time of last access
        self.delay = delay  # minimum seconds between hits on one domain

    def wait_url(self, url_str):
        domain_url = urlparse(url_str).netloc
        last_accessed = self.domains.get(domain_url)
        if self.delay > 0 and last_accessed is not None:
            # total_seconds() rather than .seconds, which ignores whole days
            sleep_interval = self.delay - (datetime.now() - last_accessed).total_seconds()
            if sleep_interval > 0:
                # add 1-3 s of jitter so requests do not land on a fixed beat
                time.sleep(sleep_interval + round(random.uniform(1, 3), 1))
        self.domains[domain_url] = datetime.now()
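
# Usage sketch (hypothetical URLs): a Throttle(10.0) keeps at least ~10 s
# (plus 1-3 s of jitter) between requests to the same host:
#   t = Throttle(10.0)
#   t.wait_url('https://www.ilync.cn/a')  # first hit: no sleep
#   t.wait_url('https://www.ilync.cn/b')  # same host: sleeps ~11-13 s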

if __name__ == '__main__':
    start = time.time()
    obj = ilyncSpider()
    obj.run()
    end = time.time()
    print(end - start)