Delete 'src/backend/Computer_Configurations/config_list_spider'

4 years ago · 247558de80
parent 5af27c31a1
commit 247558de80
1 changed files with 0 additions and 210 deletions
--- a/src/backend/Computer_Configurations/config_list_spider
+++ b/src/backend/Computer_Configurations/config_list_spider
@ -1,210 +0,0 @@
 '''
 Description: 电脑配置单爬虫，爬取站点如下
  http://zj.zol.com.cn/top_diy.html
 Author: Fishermanykx
 LastEditors: Fishermanykx
 LastEditTime: 2021-06-23 10:32:21
 '''
 import codecs
 import sys
 import re
 import time
 import lxml
 import json
 import requests
 import pandas
 from bs4 import BeautifulSoup
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from selenium.common.exceptions import NoSuchElementException
 from selenium.common.exceptions import ElementNotInteractableException
 from selenium.common.exceptions import ElementClickInterceptedException
 from pprint import pprint
 # f = codecs.open("out.txt", "w+", 'utf-8')
 # sys.stdout = f
 class GetConfigLists:
  def __init__(self):
    self.root_url = 'http://zj.zol.com.cn/'
    self.chrome_options = Options()
    prefs = {"profile.managed_default_content_settings.images": 2}
    self.chrome_options.add_experimental_option("prefs", prefs)
    # 浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
    # self.chrome_options.add_argument('--headless')
    self.driver = webdriver.Chrome(options=self.chrome_options)
    self.configs = {}  # 按 tag 存配置
    self.categories = [
        '经济实惠型', '家用学习型', '网吧游戏型', '商务办公型', '疯狂游戏型', '图形音像型', '豪华发烧型'
    ]
    self.url_indexs = [1, 2, 21, 6, 5, 8, 3]
    self.delay_time = 0.3
    for i in range(1, 8):
      self.configs[self.url_indexs[i-1]] = []
  def __del__(self):
    self.driver.close()
  def testFunc(self):
    url = 'http://zj.zol.com.cn/diy/detail/9680358.html'
    page_resp = requests.get(url, timeout=30)
    page_resp.raise_for_status()
    page_resp.encoding = page_resp.apparent_encoding
    page = page_resp.text
    # Construct soup object
    soup = BeautifulSoup(page, features="lxml")
    cfg_table = soup.find_all('tr', class_='tr1')
    # pprint(cfg_table)
    item_dict = {}
    keys = ['CPU', '主板', '内存', '硬盘', '固态硬盘', '显卡', '机箱', '电源', '散热器']
    total_price = 0
    for item in cfg_table:
      info = item.find_all('td', limit=4)
      accessory_type = info[0].text
      name = info[1].text
      num = int(info[2].text)
      unit_price = int(info[3].text[1:])
      item_dict[accessory_type] = {
          'name': name,
          'number': num,
          'unit_price': unit_price,
          'percentage': 0
      }
      if accessory_type in keys:
        total_price += num * unit_price
    # 统计各配件价格比例
    for key in keys:
      if key in item_dict:
        item_dict[key]['percentage'] = item_dict[key]['number'] * item_dict[
            key]['unit_price'] / total_price
      else:
        item_dict[key] = {
            'name': "NULL",
            'number': 0,
            'unit_price': 0,
            'percentage': 0
        }
    pprint(item_dict)
  def parseSingleTypeURLS(self, conf_type):
    '''
    description: 拿到某个分类的前10页的URL
    param {*} conf_type: int类型，取值范围为 url_indexs， 分别对应7种类型
    '''
    page_num = 1
    sub_page_urls = []  # 每个页面的子页面的入口 url
    total_page_num = 50
    for page_num in range(1, total_page_num + 1):
      cur_url = 'http://zj.zol.com.cn/list_c' + str(conf_type) + '_l1_1_' + str(
          page_num) + '.html'
      # 先用 selenium 滚到底
      self.driver.get(cur_url)
      time.sleep(0.5)
      self.driver.execute_script(
          "window.scrollTo(0,document.body.scrollHeight)")
      time.sleep(0.5)
      start_resp = self.driver.page_source
      soup = BeautifulSoup(start_resp, "lxml")  # 拿到主页面
      # print(soup)
      # self.driver.quit()
      # exit(0)
      # 解析该页面的所有子页面链接
      cfg_lis = soup.find_all('li', class_='outli')
      for cfg in cfg_lis:
        # 解析价格
        price = int(cfg.find('font').text[:-1])
        if price > 200000:
          print("You are thinking peach")
          continue
        # 解析子页面链接
        sub_page_link = cfg.find('a', class_='link')['href']
        sub_page_urls.append(self.root_url + sub_page_link)
    # pprint(sub_page_urls)
    return sub_page_urls
  def getSingleTypeInfo(self, conf_type, urls):
    '''
    description: 获取单个类型的信息
    param {*}
      conf_type : 配置单分类
      urls : 在函数 parseSingleTypeURLS 中获得的链接
    return {*}
    '''
    for url in urls:
      page_resp = requests.get(url, timeout=30)
      page_resp.raise_for_status()
      page_resp.encoding = page_resp.apparent_encoding
      page = page_resp.text
      # Construct soup object
      soup = BeautifulSoup(page, features="lxml")
      cfg_table = soup.find_all('tr', class_='tr1')
      # pprint(cfg_table)
      item_dict = {}
      keys = ['CPU', '主板', '内存', '硬盘', '固态硬盘', '显卡', '机箱', '电源', '散热器']
      total_price = 0
      for item in cfg_table:
        info = item.find_all('td', limit=4)
        accessory_type = info[0].text
        name = info[1].text
        num = int(info[2].text)
        # 匹配数字
        pat = r'\d+'
        try:
          unit_price = int(re.search(pat, info[3].text).group())
        except:
          print(url)
          print(info[3].text)
          exit(1)
        item_dict[accessory_type] = {
            'name': name,
            'number': num,
            'unit_price': unit_price,
            'percentage': 0
        }
        if accessory_type in keys:
          total_price += num * unit_price
      # 统计各配件价格比例
      for key in keys:
        if key in item_dict:
          item_dict[key]['percentage'] = item_dict[key]['number'] * item_dict[
              key]['unit_price'] / total_price
        else:
          item_dict[key] = {
              'name': "NULL",
              'number': 0,
              'unit_price': 0,
              'percentage': 0
          }
      self.configs[conf_type].append(item_dict)
      time.sleep(self.delay_time)
    # pprint(self.configs[conf_type])
  def getConfigInfo(self):
    # 获取配置信息
    for cat in range(1, 8):
      num = self.url_indexs[cat-1]
      urls = self.parseSingleTypeURLS(num)
      self.getSingleTypeInfo(num, urls)
      print("Successfully getting category: " + self.categories[cat-1])
      with codecs.open('tmp.txt', 'w', 'utf-8') as tmp_f:
        tmp_f.write(str(self.configs))
    with codecs.open('result.txt', 'w', 'utf-8') as f:
      f.write(str(self.configs))
 if __name__ == "__main__":
  get_config_list = GetConfigLists()
  # get_config_list.testFunc()
  # urls = get_config_list.parseSingleTypeURLS(1)
  # get_config_list.getSingleTypeInfo(1, urls)
  get_config_list.getConfigInfo()