diff --git a/src/backend/Computer_Configurations/config_list_spider b/src/backend/Computer_Configurations/config_list_spider
deleted file mode 100644
index ea14826..0000000
--- a/src/backend/Computer_Configurations/config_list_spider
+++ /dev/null
@@ -1,209 +0,0 @@
-'''
-Description: Spider for PC build (configuration) lists; crawls the site below:
-    http://zj.zol.com.cn/top_diy.html
-Author: Fishermanykx
-LastEditors: Fishermanykx
-LastEditTime: 2021-06-23 10:32:21
-'''
-import codecs
-import re
-import time
-
-import requests
-from bs4 import BeautifulSoup
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-
-from pprint import pprint
-
-# Uncomment to redirect stdout to a UTF-8 log file:
-# import sys
-# f = codecs.open("out.txt", "w+", 'utf-8')
-# sys.stdout = f
-
-
-class GetConfigLists:
-
-    def __init__(self):
-        self.root_url = 'http://zj.zol.com.cn/'
-
-        self.chrome_options = Options()
-        # Disable image loading to speed up page rendering.
-        prefs = {"profile.managed_default_content_settings.images": 2}
-        self.chrome_options.add_experimental_option("prefs", prefs)
-        # Run the browser without a visible window; on Linux, startup fails
-        # without this flag if the system has no display.
-        # self.chrome_options.add_argument('--headless')
-        self.driver = webdriver.Chrome(options=self.chrome_options)
-
-        self.configs = {}  # configurations, keyed by category tag
-        # Category labels as shown on the site: budget, home/study,
-        # internet-cafe gaming, business/office, hardcore gaming,
-        # graphics/multimedia, high-end enthusiast.
-        self.categories = [
-            '经济实惠型', '家用学习型', '网吧游戏型', '商务办公型',
-            '疯狂游戏型', '图形音像型', '豪华发烧型'
-        ]
-        # Numeric ids used in the site's list URLs, one per category above.
-        self.url_indexs = [1, 2, 21, 6, 5, 8, 3]
-        self.delay_time = 0.3
-        for idx in self.url_indexs:
-            self.configs[idx] = []
-
-    def __del__(self):
-        # quit() (unlike close()) also terminates the chromedriver process.
-        self.driver.quit()
-
-    def testFunc(self):
-        url = 'http://zj.zol.com.cn/diy/detail/9680358.html'
-        page_resp = requests.get(url, timeout=30)
-        page_resp.raise_for_status()
-        page_resp.encoding = page_resp.apparent_encoding
-        page = page_resp.text
-        # Construct soup object
-        soup = BeautifulSoup(page, features="lxml")
-        cfg_table = soup.find_all('tr', class_='tr1')
-        item_dict = {}
-        # Component types as they appear in the page table: CPU, motherboard,
-        # RAM, HDD, SSD, GPU, case, PSU, cooler.
-        keys = ['CPU', '主板', '内存', '硬盘', '固态硬盘', '显卡', '机箱', '电源', '散热器']
-        total_price = 0
-        for item in cfg_table:
-            info = item.find_all('td', limit=4)
-            accessory_type = info[0].text
-            name = info[1].text
-            num = int(info[2].text)
-            unit_price = int(info[3].text[1:])  # strip the leading currency sign
-            item_dict[accessory_type] = {
-                'name': name,
-                'number': num,
-                'unit_price': unit_price,
-                'percentage': 0
-            }
-            if accessory_type in keys:
-                total_price += num * unit_price
-
-        # Compute each component's share of the total price.
-        for key in keys:
-            if key in item_dict:
-                item_dict[key]['percentage'] = (item_dict[key]['number'] *
-                                                item_dict[key]['unit_price'] /
-                                                total_price)
-            else:
-                item_dict[key] = {
-                    'name': "NULL",
-                    'number': 0,
-                    'unit_price': 0,
-                    'percentage': 0
-                }
-        pprint(item_dict)
-
-    def parseSingleTypeURLS(self, conf_type):
-        '''
-        description: Collect the detail-page URLs from the first 50 list pages
-                     of one category.
-        param {*} conf_type: int; one of url_indexs, each value mapping to one
-                  of the seven categories.
-        '''
-        sub_page_urls = []  # entry URLs of the detail pages on each list page
-        total_page_num = 50
-        for page_num in range(1, total_page_num + 1):
-            cur_url = ('http://zj.zol.com.cn/list_c' + str(conf_type) +
-                       '_l1_1_' + str(page_num) + '.html')
-            # Scroll to the bottom with selenium first so lazy content loads.
-            self.driver.get(cur_url)
-            time.sleep(0.5)
-            self.driver.execute_script(
-                "window.scrollTo(0,document.body.scrollHeight)")
-            time.sleep(0.5)
-            start_resp = self.driver.page_source
-            soup = BeautifulSoup(start_resp, "lxml")  # the rendered list page
-
-            # Parse the detail-page links on this list page.
-            cfg_lis = soup.find_all('li', class_='outli')
-            for cfg in cfg_lis:
-                # Parse the price.
-                price = int(cfg.find('font').text[:-1])
-                if price > 200000:
-                    print("Skipping listing with implausible price:", price)
-                    continue
-                # Parse the detail-page link.
-                sub_page_link = cfg.find('a', class_='link')['href']
-                sub_page_urls.append(self.root_url + sub_page_link)
-        return sub_page_urls
-
-    def getSingleTypeInfo(self, conf_type, urls):
-        '''
-        description: Fetch and parse all configurations of a single category.
-        param {*}
-            conf_type : category tag of the configuration lists
-            urls      : links collected by parseSingleTypeURLS
-        return {*}
-        '''
-        for url in urls:
-            page_resp = requests.get(url, timeout=30)
-            page_resp.raise_for_status()
-            page_resp.encoding = page_resp.apparent_encoding
-            page = page_resp.text
-            # Construct soup object
-            soup = BeautifulSoup(page, features="lxml")
-            cfg_table = soup.find_all('tr', class_='tr1')
-            item_dict = {}
-            keys = ['CPU', '主板', '内存', '硬盘', '固态硬盘', '显卡', '机箱', '电源', '散热器']
-            total_price = 0
-            for item in cfg_table:
-                info = item.find_all('td', limit=4)
-                accessory_type = info[0].text
-                name = info[1].text
-                num = int(info[2].text)
-                # Extract the digits of the price.
-                match = re.search(r'\d+', info[3].text)
-                if match is None:
-                    print("Unexpected price format at %s: %r" % (url, info[3].text))
-                    continue
-                unit_price = int(match.group())
-                item_dict[accessory_type] = {
-                    'name': name,
-                    'number': num,
-                    'unit_price': unit_price,
-                    'percentage': 0
-                }
-                if accessory_type in keys:
-                    total_price += num * unit_price
-
-            # Compute each component's share of the total price.
-            for key in keys:
-                if key in item_dict:
-                    item_dict[key]['percentage'] = (item_dict[key]['number'] *
-                                                    item_dict[key]['unit_price'] /
-                                                    total_price)
-                else:
-                    item_dict[key] = {
-                        'name': "NULL",
-                        'number': 0,
-                        'unit_price': 0,
-                        'percentage': 0
-                    }
-            self.configs[conf_type].append(item_dict)
-            time.sleep(self.delay_time)
-
-    def getConfigInfo(self):
-        # Crawl all seven categories.
-        for cat in range(1, 8):
-            num = self.url_indexs[cat - 1]
-            urls = self.parseSingleTypeURLS(num)
-            self.getSingleTypeInfo(num, urls)
-            print("Finished category: " + self.categories[cat - 1])
-            # Checkpoint after each category in case a later one fails.
-            with codecs.open('tmp.txt', 'w', 'utf-8') as tmp_f:
-                tmp_f.write(str(self.configs))
-        with codecs.open('result.txt', 'w', 'utf-8') as f:
-            f.write(str(self.configs))
-
-
-if __name__ == "__main__":
-    get_config_list = GetConfigLists()
-    # get_config_list.testFunc()
-    # urls = get_config_list.parseSingleTypeURLS(1)
-    # get_config_list.getSingleTypeInfo(1, urls)
-    get_config_list.getConfigInfo()
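
The deleted file duplicates the same table-parsing block in testFunc and getSingleTypeInfo. If this spider is ever restored, that block could live in one standalone helper. Below is a minimal sketch under the same assumptions the original code makes (tr.tr1 rows whose first four td cells hold component type, name, quantity, and a price string such as "¥1234"); the function name parse_config_page and the zero-total guard are illustrative additions, not part of the original module.

    import re
    import requests
    from bs4 import BeautifulSoup

    def parse_config_page(url, keys):
        """Return {component type: {name, number, unit_price, percentage}} for one detail page."""
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding
        soup = BeautifulSoup(resp.text, features="lxml")

        items, total = {}, 0
        for row in soup.find_all('tr', class_='tr1'):
            cells = row.find_all('td', limit=4)
            kind, name = cells[0].text, cells[1].text
            num = int(cells[2].text)
            m = re.search(r'\d+', cells[3].text)
            if m is None:  # unexpected price format; skip the row
                continue
            price = int(m.group())
            items[kind] = {'name': name, 'number': num,
                           'unit_price': price, 'percentage': 0}
            if kind in keys:
                total += num * price

        # Fill in missing component types and compute price shares.
        for key in keys:
            entry = items.setdefault(key, {'name': "NULL", 'number': 0,
                                           'unit_price': 0, 'percentage': 0})
            if total:  # guard against an empty or unpriced table
                entry['percentage'] = entry['number'] * entry['unit_price'] / total
        return items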