From 247558de8050a786ebbb906fe138e7f7c64d9bcd Mon Sep 17 00:00:00 2001
From: p01254378 <1299268546@qq.com>
Date: Wed, 23 Jun 2021 10:36:09 +0800
Subject: [PATCH] Delete 'src/backend/Computer_Configurations/config_list_spider'

---
 .../config_list_spider | 210 ------------------
 1 file changed, 210 deletions(-)
 delete mode 100644 src/backend/Computer_Configurations/config_list_spider

diff --git a/src/backend/Computer_Configurations/config_list_spider b/src/backend/Computer_Configurations/config_list_spider
deleted file mode 100644
index ea14826..0000000
--- a/src/backend/Computer_Configurations/config_list_spider
+++ /dev/null
@@ -1,210 +0,0 @@
-'''
-Description: Spider for PC build configuration lists, crawling the site
-    http://zj.zol.com.cn/top_diy.html
-Author: Fishermanykx
-LastEditors: Fishermanykx
-LastEditTime: 2021-06-23 10:32:21
-'''
-import codecs
-import sys
-
-import re
-import time
-import lxml
-import json
-import requests
-import pandas
-from bs4 import BeautifulSoup
-
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.common.exceptions import NoSuchElementException
-from selenium.common.exceptions import ElementNotInteractableException
-from selenium.common.exceptions import ElementClickInterceptedException
-
-from pprint import pprint
-
-# f = codecs.open("out.txt", "w+", 'utf-8')
-# sys.stdout = f
-
-
-class GetConfigLists:
-
-  def __init__(self):
-    self.root_url = 'http://zj.zol.com.cn/'
-
-    self.chrome_options = Options()
-    prefs = {"profile.managed_default_content_settings.images": 2}
-    self.chrome_options.add_experimental_option("prefs", prefs)
-    # Run the browser without a visible window. On Linux, startup fails
-    # without this flag if the system has no display.
-    # self.chrome_options.add_argument('--headless')
-    self.driver = webdriver.Chrome(options=self.chrome_options)
-
-    self.configs = {}  # configurations, keyed by category tag
-    # Site category labels: budget, home/study, internet-cafe gaming,
-    # business/office, extreme gaming, graphics/audio-video, enthusiast
-    self.categories = [
-        '经济实惠型', '家用学习型', '网吧游戏型', '商务办公型', '疯狂游戏型', '图形音像型', '豪华发烧型'
-    ]
-    self.url_indexs = [1, 2, 21, 6, 5, 8, 3]
-    self.delay_time = 0.3
-    for i in range(1, 8):
-      self.configs[self.url_indexs[i-1]] = []
-
-  def __del__(self):
-    self.driver.close()
-
-  def testFunc(self):
-    url = 'http://zj.zol.com.cn/diy/detail/9680358.html'
-    page_resp = requests.get(url, timeout=30)
-    page_resp.raise_for_status()
-    page_resp.encoding = page_resp.apparent_encoding
-    page = page_resp.text
-    # Construct soup object
-    soup = BeautifulSoup(page, features="lxml")
-    cfg_table = soup.find_all('tr', class_='tr1')
-    # pprint(cfg_table)
-    item_dict = {}
-    # Component keys (must match the site's Chinese labels):
-    # CPU, motherboard, RAM, HDD, SSD, GPU, case, PSU, cooler
-    keys = ['CPU', '主板', '内存', '硬盘', '固态硬盘', '显卡', '机箱', '电源', '散热器']
-    total_price = 0
-    for item in cfg_table:
-      info = item.find_all('td', limit=4)
-      accessory_type = info[0].text
-      name = info[1].text
-      num = int(info[2].text)
-      unit_price = int(info[3].text[1:])
-      item_dict[accessory_type] = {
-          'name': name,
-          'number': num,
-          'unit_price': unit_price,
-          'percentage': 0
-      }
-      if accessory_type in keys:
-        total_price += num * unit_price
-
-    # Compute each component's share of the total price
-    for key in keys:
-      if key in item_dict:
-        item_dict[key]['percentage'] = item_dict[key]['number'] * item_dict[
-            key]['unit_price'] / total_price
-      else:
-        item_dict[key] = {
-            'name': "NULL",
-            'number': 0,
-            'unit_price': 0,
-            'percentage': 0
-        }
-    pprint(item_dict)
-
-  def parseSingleTypeURLS(self, conf_type):
-    '''
-    description: Collect the detail-page URLs from the first total_page_num
-        listing pages of one category
-    param {*} conf_type: int, one of url_indexs; each value maps to one of
-        the 7 categories
-    '''
-    page_num = 1
-    sub_page_urls = []  # entry URLs of the detail pages on each listing page
-    total_page_num = 50
-    for page_num in range(1, total_page_num + 1):
-      cur_url = 'http://zj.zol.com.cn/list_c' + str(conf_type) + '_l1_1_' + str(
-          page_num) + '.html'
-      # Use selenium to scroll to the bottom of the page first
-      self.driver.get(cur_url)
-      time.sleep(0.5)
-      self.driver.execute_script(
-          "window.scrollTo(0,document.body.scrollHeight)")
-      time.sleep(0.5)
-      start_resp = self.driver.page_source
-      soup = BeautifulSoup(start_resp, "lxml")  # parsed listing page
-      # print(soup)
-      # self.driver.quit()
-      # exit(0)
-
-      # Parse all detail-page links on this listing page
-      cfg_lis = soup.find_all('li', class_='outli')
-      for cfg in cfg_lis:
-        # Parse the price
-        price = int(cfg.find('font').text[:-1])
-        if price > 200000:
-          print("Skipping implausibly priced build: %d" % price)
-          continue
-        # Parse the detail-page link
-        sub_page_link = cfg.find('a', class_='link')['href']
-        sub_page_urls.append(self.root_url + sub_page_link)
-    # pprint(sub_page_urls)
-    return sub_page_urls
-
-  def getSingleTypeInfo(self, conf_type, urls):
-    '''
-    description: Fetch the build details for a single category
-    param {*}
-      conf_type : category tag of the config lists
-      urls : links obtained from parseSingleTypeURLS
-    return {*}
-    '''
-    for url in urls:
-      page_resp = requests.get(url, timeout=30)
-      page_resp.raise_for_status()
-      page_resp.encoding = page_resp.apparent_encoding
-      page = page_resp.text
-      # Construct soup object
-      soup = BeautifulSoup(page, features="lxml")
-      cfg_table = soup.find_all('tr', class_='tr1')
-      # pprint(cfg_table)
-      item_dict = {}
-      keys = ['CPU', '主板', '内存', '硬盘', '固态硬盘', '显卡', '机箱', '电源', '散热器']
-      total_price = 0
-      for item in cfg_table:
-        info = item.find_all('td', limit=4)
-        accessory_type = info[0].text
-        name = info[1].text
-        num = int(info[2].text)
-        # Match the digits in the price cell
-        pat = r'\d+'
-        try:
-          unit_price = int(re.search(pat, info[3].text).group())
-        except AttributeError:
-          print(url)
-          print(info[3].text)
-          exit(1)
-        item_dict[accessory_type] = {
-            'name': name,
-            'number': num,
-            'unit_price': unit_price,
-            'percentage': 0
-        }
-        if accessory_type in keys:
-          total_price += num * unit_price
-
-      # Compute each component's share of the total price
-      for key in keys:
-        if key in item_dict:
-          item_dict[key]['percentage'] = item_dict[key]['number'] * item_dict[
-              key]['unit_price'] / total_price
-        else:
-          item_dict[key] = {
-              'name': "NULL",
-              'number': 0,
-              'unit_price': 0,
-              'percentage': 0
-          }
-      self.configs[conf_type].append(item_dict)
-      time.sleep(self.delay_time)
-    # pprint(self.configs[conf_type])
-
-  def getConfigInfo(self):
-    # Fetch configuration info for every category
-    for cat in range(1, 8):
-      num = self.url_indexs[cat-1]
-      urls = self.parseSingleTypeURLS(num)
-      self.getSingleTypeInfo(num, urls)
-      print("Successfully fetched category: " + self.categories[cat-1])
-      with codecs.open('tmp.txt', 'w', 'utf-8') as tmp_f:
-        tmp_f.write(str(self.configs))
-    with codecs.open('result.txt', 'w', 'utf-8') as f:
-      f.write(str(self.configs))
-
-
-if __name__ == "__main__":
-  get_config_list = GetConfigLists()
-  # get_config_list.testFunc()
-  # urls = get_config_list.parseSingleTypeURLS(1)
-  # get_config_list.getSingleTypeInfo(1, urls)
-  get_config_list.getConfigInfo()
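
For reviewers who want to see what this deletion removes: the spider's first stage rendered each listing page with Selenium and scrolled to the bottom so lazily loaded entries appeared before parsing. Below is a minimal standalone sketch of that pattern, not the deleted module itself; the listing URL, the 'outli'/'link' selectors, and the fixed sleeps are carried over from the deleted code and assume the ZOL listing markup is unchanged.

    # Sketch only: scroll-then-parse pattern from the deleted spider.
    import time
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from bs4 import BeautifulSoup

    options = Options()
    options.add_argument('--headless')
    # Skip image downloads to speed up page loads
    options.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2})
    driver = webdriver.Chrome(options=options)
    try:
        # Category 1, listing page 1 (URL scheme taken from the deleted code)
        driver.get('http://zj.zol.com.cn/list_c1_l1_1_1.html')
        time.sleep(0.5)  # crude wait; an explicit WebDriverWait would be more robust
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(0.5)
        soup = BeautifulSoup(driver.page_source, "lxml")
        links = []
        for li in soup.find_all('li', class_='outli'):
            a = li.find('a', class_='link')
            if a is not None and a.has_attr('href'):
                links.append('http://zj.zol.com.cn/' + a['href'])
        print(links)
    finally:
        driver.quit()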
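
The second stage fetched each detail page with requests and parsed the component table into per-component price shares. A hedged re-sketch of that parser follows, assuming detail pages still render component rows as <tr class="tr1"> with four <td> cells (type, name, quantity, unit price); parse_detail_page is a hypothetical helper name, not part of the deleted class.

    # Sketch only: detail-page parser under the tr.tr1 / 4-cell assumption.
    import re
    import requests
    from bs4 import BeautifulSoup

    def parse_detail_page(url):
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding
        soup = BeautifulSoup(resp.text, "lxml")
        items, total = {}, 0
        for row in soup.find_all('tr', class_='tr1'):
            cells = row.find_all('td', limit=4)
            if len(cells) < 4:
                continue  # skip header or malformed rows
            count_match = re.search(r'\d+', cells[2].text)
            price_match = re.search(r'\d+', cells[3].text)
            if not (count_match and price_match):
                continue  # quantity or price cell without digits
            count, price = int(count_match.group()), int(price_match.group())
            items[cells[0].text.strip()] = {
                'name': cells[1].text.strip(),
                'number': count,
                'unit_price': price,
            }
            total += count * price
        # Each component's share of the total build price
        for entry in items.values():
            entry['percentage'] = (
                entry['number'] * entry['unit_price'] / total if total else 0)
        return items

    # Example, using the detail URL hard-coded in the deleted testFunc:
    # parse_detail_page('http://zj.zol.com.cn/diy/detail/9680358.html')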