diff --git a/src/backend/Computer_Configurations/config_list_spider b/src/backend/Computer_Configurations/config_list_spider
new file mode 100644
index 0000000..ea14826
--- /dev/null
+++ b/src/backend/Computer_Configurations/config_list_spider
@@ -0,0 +1,210 @@
+'''
+Description: PC build (configuration list) crawler. Target site:
+  http://zj.zol.com.cn/top_diy.html
+Author: Fishermanykx
+LastEditors: Fishermanykx
+LastEditTime: 2021-06-23 10:32:21
+'''
+import codecs
+import re
+import sys
+import time
+
+import requests
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+
+from pprint import pprint
+
+# Optionally redirect stdout to a file for debugging:
+# f = codecs.open("out.txt", "w+", 'utf-8')
+# sys.stdout = f
+
+
+class GetConfigLists:
+
+    def __init__(self):
+        self.root_url = 'http://zj.zol.com.cn/'
+
+        # Disable image loading to speed up page rendering
+        self.chrome_options = Options()
+        prefs = {"profile.managed_default_content_settings.images": 2}
+        self.chrome_options.add_experimental_option("prefs", prefs)
+        # Run the browser without a visible window; on Linux, startup
+        # fails without this option if the system has no display.
+        # self.chrome_options.add_argument('--headless')
+        self.driver = webdriver.Chrome(options=self.chrome_options)
+
+        self.configs = {}  # parsed configs, grouped by category id
+        # Category names (in site order): budget, home/study, internet-cafe
+        # gaming, business/office, extreme gaming, graphics/AV, enthusiast
+        self.categories = [
+            '经济实惠型', '家用学习型', '网吧游戏型', '商务办公型', '疯狂游戏型', '图形音像型', '豪华发烧型'
+        ]
+        # Category ids used in the site's list-page URLs, same order as above
+        self.url_indexs = [1, 2, 21, 6, 5, 8, 3]
+        self.delay_time = 0.3  # polite delay between detail-page requests
+        for idx in self.url_indexs:
+            self.configs[idx] = []
+
+    def __del__(self):
+        self.driver.quit()
+
+    def testFunc(self):
+        '''
+        description: parse a single detail page and pretty-print the result
+            (for debugging only)
+        '''
+        url = 'http://zj.zol.com.cn/diy/detail/9680358.html'
+        page_resp = requests.get(url, timeout=30)
+        page_resp.raise_for_status()
+        page_resp.encoding = page_resp.apparent_encoding
+        page = page_resp.text
+        # Construct soup object
+        soup = BeautifulSoup(page, features="lxml")
+        cfg_table = soup.find_all('tr', class_='tr1')
+        item_dict = {}
+        # Part types: CPU, motherboard, RAM, HDD, SSD, GPU, case, PSU, cooler
+        keys = ['CPU', '主板', '内存', '硬盘', '固态硬盘', '显卡', '机箱', '电源', '散热器']
+        total_price = 0
+        for item in cfg_table:
+            info = item.find_all('td', limit=4)
+            accessory_type = info[0].text
+            name = info[1].text
+            num = int(info[2].text)
+            # Strip the leading currency symbol before parsing the price
+            unit_price = int(info[3].text[1:])
+            item_dict[accessory_type] = {
+                'name': name,
+                'number': num,
+                'unit_price': unit_price,
+                'percentage': 0
+            }
+            if accessory_type in keys:
+                total_price += num * unit_price
+
+        # Compute each part's share of the total price
+        for key in keys:
+            if key in item_dict:
+                item_dict[key]['percentage'] = (
+                    item_dict[key]['number'] * item_dict[key]['unit_price'] /
+                    total_price)
+            else:
+                item_dict[key] = {
+                    'name': "NULL",
+                    'number': 0,
+                    'unit_price': 0,
+                    'percentage': 0
+                }
+        pprint(item_dict)
+
+    def parseSingleTypeURLS(self, conf_type):
+        '''
+        description: collect the detail-page URLs from the first 50 list
+            pages of one category
+        param {*} conf_type: int, one of url_indexs, identifying one of the
+            seven categories
+        '''
+        sub_page_urls = []  # entry URLs of the detail pages
+        total_page_num = 50
+        for page_num in range(1, total_page_num + 1):
+            cur_url = 'http://zj.zol.com.cn/list_c' + str(
+                conf_type) + '_l1_1_' + str(page_num) + '.html'
+            # Scroll to the bottom with selenium so lazily loaded items render
+            self.driver.get(cur_url)
+            time.sleep(0.5)
+            self.driver.execute_script(
+                "window.scrollTo(0,document.body.scrollHeight)")
+            time.sleep(0.5)
+            start_resp = self.driver.page_source
+            soup = BeautifulSoup(start_resp, "lxml")  # parsed list page
+
+            # Extract the links to all detail pages on this list page
+            cfg_lis = soup.find_all('li', class_='outli')
+            for cfg in cfg_lis:
+                # Parse the price (drop the trailing currency character);
+                # skip implausible builds priced above 200,000 yuan
+                price = int(cfg.find('font').text[:-1])
+                if price > 200000:
+                    print("Skipping config priced over 200000")
+                    continue
+                # Parse the detail-page link
+                sub_page_link = cfg.find('a', class_='link')['href']
+                sub_page_urls.append(self.root_url + sub_page_link)
+        return sub_page_urls
+
+    def getSingleTypeInfo(self, conf_type, urls):
+        '''
+        description: fetch and parse all configs of a single category
+        param {*}
+            conf_type : category id of the config list
+            urls : detail-page URLs collected by parseSingleTypeURLS
+        return {*}
+        '''
+        for url in urls:
+            page_resp = requests.get(url, timeout=30)
+            page_resp.raise_for_status()
+            page_resp.encoding = page_resp.apparent_encoding
+            page = page_resp.text
+            # Construct soup object
+            soup = BeautifulSoup(page, features="lxml")
+            cfg_table = soup.find_all('tr', class_='tr1')
+            item_dict = {}
+            # Part types: CPU, motherboard, RAM, HDD, SSD, GPU, case, PSU, cooler
+            keys = ['CPU', '主板', '内存', '硬盘', '固态硬盘', '显卡', '机箱', '电源', '散热器']
+            total_price = 0
+            for item in cfg_table:
+                info = item.find_all('td', limit=4)
+                accessory_type = info[0].text
+                name = info[1].text
+                num = int(info[2].text)
+                # Extract the numeric part of prices such as "¥1234"
+                pat = r'\d+'
+                try:
+                    unit_price = int(re.search(pat, info[3].text).group())
+                except (AttributeError, ValueError):
+                    print(url)
+                    print(info[3].text)
+                    sys.exit(1)
+                item_dict[accessory_type] = {
+                    'name': name,
+                    'number': num,
+                    'unit_price': unit_price,
+                    'percentage': 0
+                }
+                if accessory_type in keys:
+                    total_price += num * unit_price
+
+            # Compute each part's share of the total price
+            for key in keys:
+                if key in item_dict:
+                    if total_price > 0:  # guard against division by zero
+                        item_dict[key]['percentage'] = (
+                            item_dict[key]['number'] *
+                            item_dict[key]['unit_price'] / total_price)
+                else:
+                    item_dict[key] = {
+                        'name': "NULL",
+                        'number': 0,
+                        'unit_price': 0,
+                        'percentage': 0
+                    }
+            self.configs[conf_type].append(item_dict)
+            time.sleep(self.delay_time)
+
+    def getConfigInfo(self):
+        # Crawl every category, checkpointing to tmp.txt after each one
+        for idx, category in zip(self.url_indexs, self.categories):
+            urls = self.parseSingleTypeURLS(idx)
+            self.getSingleTypeInfo(idx, urls)
+            print("Successfully fetched category: " + category)
+            with codecs.open('tmp.txt', 'w', 'utf-8') as tmp_f:
+                tmp_f.write(str(self.configs))
+        with codecs.open('result.txt', 'w', 'utf-8') as f:
+            f.write(str(self.configs))
+
+
+if __name__ == "__main__":
+    get_config_list = GetConfigLists()
+    # get_config_list.testFunc()
+    # urls = get_config_list.parseSingleTypeURLS(1)
+    # get_config_list.getSingleTypeInfo(1, urls)
+    get_config_list.getConfigInfo()
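
Note (not part of the diff): getConfigInfo persists self.configs with str(), so result.txt holds Python literal syntax rather than JSON. A minimal sketch of a hypothetical follow-up helper, assuming result.txt was produced by getConfigInfo above, that re-serializes the dump as JSON for downstream tools:

# load_configs.py -- hypothetical helper, not part of the diff above.
# Converts the str()-formatted dump written by GetConfigLists.getConfigInfo()
# into JSON, which is easier for other tools to consume.
import ast
import codecs
import json

with codecs.open('result.txt', 'r', 'utf-8') as f:
    # str(dict) output is valid Python literal syntax, so ast.literal_eval
    # can parse it without executing arbitrary code
    configs = ast.literal_eval(f.read())

with codecs.open('result.json', 'w', 'utf-8') as out:
    # ensure_ascii=False keeps the Chinese part names human-readable
    json.dump(configs, out, ensure_ascii=False, indent=2)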