Get computer configuration info

'''
Description: scraper for PC build configuration lists, crawling the site below
http://zj.zol.com.cn/top_diy.html
Author: Fishermanykx
LastEditors: Fishermanykx
LastEditTime: 2021-06-23 10:32:21
'''
import codecs
import re
import sys
import time

import requests
from bs4 import BeautifulSoup
from pprint import pprint
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# lxml must be installed for BeautifulSoup's "lxml" parser, but it does not
# need to be imported directly.

# Optionally redirect stdout to a UTF-8 file:
# f = codecs.open("out.txt", "w+", 'utf-8')
# sys.stdout = f


class GetConfigLists:
    def __init__(self):
        self.root_url = 'http://zj.zol.com.cn/'
        self.chrome_options = Options()
        # Disable image loading to speed up page fetches
        prefs = {"profile.managed_default_content_settings.images": 2}
        self.chrome_options.add_experimental_option("prefs", prefs)
        # Run the browser without a visible window. On Linux, startup fails
        # without this flag if the system has no display support.
        # self.chrome_options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.configs = {}  # configurations stored per category tag
        # Category names as they appear on the site: budget, home/study,
        # internet-cafe gaming, business/office, hardcore gaming,
        # graphics/audio-video, high-end enthusiast
        self.categories = [
            '经济实惠型', '家用学习型', '网吧游戏型', '商务办公型', '疯狂游戏型', '图形音像型', '豪华发烧型'
        ]
        # List-page ids used in the site's URLs, aligned with self.categories
        self.url_indexs = [1, 2, 21, 6, 5, 8, 3]
        self.delay_time = 0.3
        for i in range(1, 8):
            self.configs[self.url_indexs[i - 1]] = []

    def __del__(self):
        # quit() ends the session and shuts down chromedriver; close() would
        # only close the current window
        self.driver.quit()
    def testFunc(self):
        url = 'http://zj.zol.com.cn/diy/detail/9680358.html'
        page_resp = requests.get(url, timeout=30)
        page_resp.raise_for_status()
        page_resp.encoding = page_resp.apparent_encoding
        page = page_resp.text
        # Construct soup object
        soup = BeautifulSoup(page, features="lxml")
        cfg_table = soup.find_all('tr', class_='tr1')
        # pprint(cfg_table)
        item_dict = {}
        # Component names as they appear on the site: CPU, motherboard, RAM,
        # HDD, SSD, graphics card, case, power supply, cooler
        keys = ['CPU', '主板', '内存', '硬盘', '固态硬盘', '显卡', '机箱', '电源', '散热器']
        total_price = 0
        for item in cfg_table:
            info = item.find_all('td', limit=4)
            accessory_type = info[0].text
            name = info[1].text
            num = int(info[2].text)
            # Strip the leading currency sign from the price
            unit_price = int(info[3].text[1:])
            item_dict[accessory_type] = {
                'name': name,
                'number': num,
                'unit_price': unit_price,
                'percentage': 0
            }
            if accessory_type in keys:
                total_price += num * unit_price
        # Compute each component's share of the total price
        # (assumes total_price > 0 for a valid build page)
        for key in keys:
            if key in item_dict:
                item_dict[key]['percentage'] = (
                    item_dict[key]['number'] * item_dict[key]['unit_price'] /
                    total_price)
            else:
                item_dict[key] = {
                    'name': "NULL",
                    'number': 0,
                    'unit_price': 0,
                    'percentage': 0
                }
        pprint(item_dict)
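
    # Illustrative shape of item_dict (values hypothetical, not scraped):
    #   {'CPU': {'name': 'Intel i5-10400F', 'number': 1,
    #            'unit_price': 1099, 'percentage': 0.25}, ...}
    # Note that 'percentage' is a fraction of total_price in [0, 1], not 0-100.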
    def parseSingleTypeURLS(self, conf_type):
        '''
        description: collect the detail-page URLs from the first 50 list
                     pages of one category
        param {*} conf_type: int, one of the values in url_indexs, each
                             corresponding to one of the 7 categories
        '''
        sub_page_urls = []  # entry URLs of the detail pages on each list page
        total_page_num = 50
        for page_num in range(1, total_page_num + 1):
            cur_url = ('http://zj.zol.com.cn/list_c' + str(conf_type) +
                       '_l1_1_' + str(page_num) + '.html')
            # Use selenium to scroll to the bottom first, so lazily loaded
            # entries are rendered
            self.driver.get(cur_url)
            time.sleep(0.5)
            self.driver.execute_script(
                "window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(0.5)
            start_resp = self.driver.page_source
            soup = BeautifulSoup(start_resp, "lxml")  # the rendered list page
            # print(soup)
            # self.driver.quit()
            # exit(0)
            # Parse the links to all detail pages on this list page
            cfg_lis = soup.find_all('li', class_='outli')
            for cfg in cfg_lis:
                # Parse the price (drop the trailing currency character)
                price = int(cfg.find('font').text[:-1])
                if price > 200000:
                    # Implausibly expensive build; skip it
                    print("Skipping implausible price: " + str(price))
                    continue
                # Parse the detail-page link
                sub_page_link = cfg.find('a', class_='link')['href']
                sub_page_urls.append(self.root_url + sub_page_link)
        # pprint(sub_page_urls)
        return sub_page_urls
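
    # List-page URL pattern produced above (example values, conf_type=1,
    # page 3):
    #   http://zj.zol.com.cn/list_c1_l1_1_3.html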
    def getSingleTypeInfo(self, conf_type, urls):
        '''
        description: fetch the configuration info for a single category
        param {*}
            conf_type : category id of the configuration list
            urls : links collected by parseSingleTypeURLS
        return {*}
        '''
        for url in urls:
            page_resp = requests.get(url, timeout=30)
            page_resp.raise_for_status()
            page_resp.encoding = page_resp.apparent_encoding
            page = page_resp.text
            # Construct soup object
            soup = BeautifulSoup(page, features="lxml")
            cfg_table = soup.find_all('tr', class_='tr1')
            # pprint(cfg_table)
            item_dict = {}
            # Component names as they appear on the site (see testFunc)
            keys = ['CPU', '主板', '内存', '硬盘', '固态硬盘', '显卡', '机箱', '电源', '散热器']
            total_price = 0
            for item in cfg_table:
                info = item.find_all('td', limit=4)
                accessory_type = info[0].text
                name = info[1].text
                num = int(info[2].text)
                # Extract the digits of the price
                pat = r'\d+'
                try:
                    unit_price = int(re.search(pat, info[3].text).group())
                except AttributeError:
                    # re.search returned None: unexpected price format
                    print(url)
                    print(info[3].text)
                    exit(1)
                item_dict[accessory_type] = {
                    'name': name,
                    'number': num,
                    'unit_price': unit_price,
                    'percentage': 0
                }
                if accessory_type in keys:
                    total_price += num * unit_price
            # Compute each component's share of the total price
            for key in keys:
                if key in item_dict:
                    item_dict[key]['percentage'] = (
                        item_dict[key]['number'] *
                        item_dict[key]['unit_price'] / total_price)
                else:
                    item_dict[key] = {
                        'name': "NULL",
                        'number': 0,
                        'unit_price': 0,
                        'percentage': 0
                    }
            self.configs[conf_type].append(item_dict)
            time.sleep(self.delay_time)
        # pprint(self.configs[conf_type])
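
    # After getSingleTypeInfo runs, self.configs maps each category id to a
    # list of item_dicts (shape only; keys taken from the code above):
    #   {1: [{'CPU': {...}, '主板': {...}, ...}, ...], 2: [...], ...}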
    def getConfigInfo(self):
        # Fetch the configuration info for all seven categories
        for cat in range(1, 8):
            num = self.url_indexs[cat - 1]
            urls = self.parseSingleTypeURLS(num)
            self.getSingleTypeInfo(num, urls)
            print("Successfully fetched category: " + self.categories[cat - 1])
            # Checkpoint the results collected so far
            with codecs.open('tmp.txt', 'w', 'utf-8') as tmp_f:
                tmp_f.write(str(self.configs))
        with codecs.open('result.txt', 'w', 'utf-8') as f:
            f.write(str(self.configs))
if __name__ == "__main__":
    get_config_list = GetConfigLists()
    # get_config_list.testFunc()
    # urls = get_config_list.parseSingleTypeURLS(1)
    # get_config_list.getSingleTypeInfo(1, urls)
    get_config_list.getConfigInfo()
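
# Note: result.txt stores the Python repr of self.configs rather than JSON,
# so it can be loaded back with ast.literal_eval (a sketch, assuming the
# file was written by this script):
#
#   import ast
#   with codecs.open('result.txt', 'r', 'utf-8') as f:
#       configs = ast.literal_eval(f.read())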