import json
import os
import random
import re
import threading
import time
from datetime import datetime
from queue import Queue
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup  # HTML is parsed with the 'lxml' backend, which must be installed


def save(url):
    """Download a file and save it under the local download directory."""
    # e.g. path = "D:/web/爬虫/资料/abc.png"
    root = "D:/web/爬虫/资料"  # download root ("爬虫/资料" = "crawler/materials")
    path = os.path.join(root, url.split('/')[-1])
    try:
        if not os.path.exists(root):  # create the root directory if it does not exist
            os.mkdir(root)
        r = requests.get(url)
        r.raise_for_status()
        with open(path, 'wb') as f:
            f.write(r.content)
        # print("Download succeeded!")
    except Exception:
        print("Download failed:", url)
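
# Minimal usage sketch (hypothetical image URL, not taken from the live site):
#   save('https://www.ilync.cn/files/course/abc.png')
#   # -> writes D:/web/爬虫/资料/abc.png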


class ilyncSpider:
    def __init__(self):
        self.base_url = 'https://www.ilync.cn/org/6818_d_0_0_-1_-1_0_{}'  # listing page, {} = page number
        self.http_url = 'https://www.ilync.cn/{}'  # course detail URL template
        self.pages_num = 2  # number of listing pages to crawl
        self.throttle = Throttle(10.0)  # at most one download per domain every 10 s
        self.header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, compress',
            'Accept-Language': 'en-us;q=0.5,en;q=0.3',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0',
        }
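        # The 127.0.0.1:8000 endpoints in the config are proxy addresses, not
        # HTTP headers; requests takes them as a separate `proxies` mapping.
        # A sketch, assuming a proxy really is listening on 127.0.0.1:8000:
        #   self.proxies = {
        #       "http": "http://127.0.0.1:8000",
        #       "https": "https://127.0.0.1:8000",
        #   }
        #   requests.get(url, headers=self.header, proxies=self.proxies)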
        self.url_queue = Queue()
        self.html_queue = Queue()
        self.content_queue = Queue()
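        # Pipeline: get_url_list -> url_queue -> parse_url -> html_queue
        # -> get_content_list -> content_queue -> save_content_list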

    def get_url_list(self):
        """Put every listing-page URL on the queue."""
        for i in range(1, self.pages_num + 1):
            self.url_queue.put(self.base_url.format(i))

    def parse_url(self):
        """Fetch each queued URL and pass the HTML to the parser stage."""
        while True:
            url = self.url_queue.get()
            response = requests.get(url, headers=self.header)
            self.html_queue.put(response.content.decode("utf-8"))
            self.url_queue.task_done()

    def get_content_list(self):
        """Parse each listing page into a list of course records."""
        while True:
            content = self.html_queue.get()
            soup = BeautifulSoup(content, 'lxml')
            first_filter = soup.find_all('div', class_='course-list-wrap')[0]
            second_filter = first_filter.find_all('div', class_='grid-cell')
            infos = []
            for one in second_filter:
                temp_dict = {}
                img = one.find('img').attrs['src']
                title = one.find('img').attrs['title']
                price = one.find('div', class_='course-price').text
                price = re.sub(r'\s', '', price)  # strip spaces, tabs, and newlines
                id_str = one.find('a', class_='course-pic').attrs['href']
                # the course id sits between the first '_' and the '?' in the href
                course_id = id_str[id_str.find('_') + 1:id_str.find('?')]
                temp_dict['id'] = course_id
                temp_dict['title'] = re.sub(r'\xa0', ' ', title)  # replace non-breaking spaces
                temp_dict['img'] = img
                temp_dict['price'] = price
                temp_dict['url'] = self.http_url.format(id_str)
                infos.append(temp_dict)
            self.content_queue.put(infos)
            self.html_queue.task_done()
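
    # Shape of one record on content_queue (values invented for illustration;
    # the real id/href format depends on the live site):
    #   {'id': '12345', 'title': 'Course name', 'img': 'https://.../cover.png',
    #    'price': '¥99', 'url': 'https://www.ilync.cn/course_12345?from=list'}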
"""保存数据"""
|
|
def save_content_list(self):
|
|
while True:
|
|
content_list = self.content_queue.get()
|
|
for i in content_list:
|
|
print(i)
|
|
self.throttle.wait_url(i['url'])
|
|
save(i['img'])
|
|
db_dict = {
|
|
'id':i['id'],
|
|
'标题':i['title'],
|
|
'图标':i['img'],
|
|
'价格': i['price'],
|
|
'链接': i['url']
|
|
}
|
|
db_json = json.dumps(db_dict, ensure_ascii=False, indent=2)
|
|
self.db.write(db_json)
|
|
self.content_queue.task_done()
|
|
|
|
"""使用多线程调用"""
|
|
def run(self):
|
|
self.db = open('./课程信息.txt', 'w', encoding='utf-8')
|
|
thread_list = []
|
|
t_url = threading.Thread(target=self.get_url_list)
|
|
thread_list.append(t_url)
|
|
for i in range(10):
|
|
t_xml = threading.Thread(target=self.parse_url)
|
|
thread_list.append(t_xml)
|
|
t_content = threading.Thread(target=self.get_content_list)
|
|
thread_list.append(t_content)
|
|
t_save = threading.Thread(target=self.save_content_list)
|
|
thread_list.append(t_save)
|
|
|
|
for t in thread_list:
|
|
t.setDaemon(True)
|
|
t.start()
|
|
for q in [self.url_queue, self.html_queue, self.content_queue]:
|
|
q.join()
|
|
self.db.close()
|
|
print('所有数据获取完成')
|
|
|
|


class Throttle:
    """Per-domain download rate limiter."""

    def __init__(self, delay):
        self.domains = {}   # domain -> datetime of the last request
        self.delay = delay  # minimum seconds between requests to one domain

    def wait_url(self, url_str):
        domain_url = urlparse(url_str).netloc
        last_accessed = self.domains.get(domain_url)

        if self.delay > 0 and last_accessed is not None:
            sleep_interval = self.delay - (datetime.now() - last_accessed).total_seconds()
            if sleep_interval > 0:
                # sleep off the remaining delay plus 1-3 s of random jitter
                time.sleep(sleep_interval + round(random.uniform(1, 3), 1))
        self.domains[domain_url] = datetime.now()
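
# Minimal usage sketch (hypothetical URLs; a 5-second delay for illustration):
#   throttle = Throttle(5.0)
#   throttle.wait_url('https://example.com/a')  # first hit on a domain: no sleep
#   throttle.wait_url('https://example.com/b')  # same domain: sleeps ~5 s + jitter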


if __name__ == '__main__':
    start = time.time()
    obj = ilyncSpider()
    obj.run()
    end = time.time()
    print(end - start)  # total elapsed seconds