Get computer configuration info

'''
Description: scraper for PC build configuration lists, crawling the site below
http://zj.zol.com.cn/top_diy.html
Author: Fishermanykx
LastEditors: Fishermanykx
LastEditTime: 2021-06-23 10:32:21
'''
import codecs
import re
import sys
import time

import requests
from bs4 import BeautifulSoup
from pprint import pprint
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# lxml must be installed for BeautifulSoup's "lxml" parser, but it does not
# need to be imported directly.

# Optionally redirect stdout to a UTF-8 file:
# f = codecs.open("out.txt", "w+", 'utf-8')
# sys.stdout = f


class GetConfigLists:
    def __init__(self):
        self.root_url = 'http://zj.zol.com.cn/'
        self.chrome_options = Options()
        # Disable image loading to speed up page fetches
        prefs = {"profile.managed_default_content_settings.images": 2}
        self.chrome_options.add_experimental_option("prefs", prefs)
        # Run the browser without a visible window. On Linux, startup fails
        # without this flag if the system has no display support.
        # self.chrome_options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.configs = {}  # configurations stored per category tag
        # Category names as they appear on the site: budget, home/study,
        # internet-cafe gaming, business/office, hardcore gaming,
        # graphics/audio-video, high-end enthusiast
        self.categories = [
            '经济实惠型', '家用学习型', '网吧游戏型', '商务办公型', '疯狂游戏型', '图形音像型', '豪华发烧型'
        ]
        # List-page ids used in the site's URLs, aligned with self.categories
        self.url_indexs = [1, 2, 21, 6, 5, 8, 3]
        self.delay_time = 0.3
        for i in range(1, 8):
            self.configs[self.url_indexs[i - 1]] = []

    def __del__(self):
        # quit() ends the session and shuts down chromedriver; close() would
        # only close the current window
        self.driver.quit()
    def testFunc(self):
        url = 'http://zj.zol.com.cn/diy/detail/9680358.html'
        page_resp = requests.get(url, timeout=30)
        page_resp.raise_for_status()
        page_resp.encoding = page_resp.apparent_encoding
        page = page_resp.text
        # Construct soup object
        soup = BeautifulSoup(page, features="lxml")
        cfg_table = soup.find_all('tr', class_='tr1')
        # pprint(cfg_table)
        item_dict = {}
        # Component names as they appear on the site: CPU, motherboard, RAM,
        # HDD, SSD, graphics card, case, power supply, cooler
        keys = ['CPU', '主板', '内存', '硬盘', '固态硬盘', '显卡', '机箱', '电源', '散热器']
        total_price = 0
        for item in cfg_table:
            info = item.find_all('td', limit=4)
            accessory_type = info[0].text
            name = info[1].text
            num = int(info[2].text)
            # Strip the leading currency sign from the price
            unit_price = int(info[3].text[1:])
            item_dict[accessory_type] = {
                'name': name,
                'number': num,
                'unit_price': unit_price,
                'percentage': 0
            }
            if accessory_type in keys:
                total_price += num * unit_price
        # Compute each component's share of the total price
        # (assumes total_price > 0 for a valid build page)
        for key in keys:
            if key in item_dict:
                item_dict[key]['percentage'] = (
                    item_dict[key]['number'] * item_dict[key]['unit_price'] /
                    total_price)
            else:
                item_dict[key] = {
                    'name': "NULL",
                    'number': 0,
                    'unit_price': 0,
                    'percentage': 0
                }
        pprint(item_dict)
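
    # Illustrative shape of item_dict (values hypothetical, not scraped):
    #   {'CPU': {'name': 'Intel i5-10400F', 'number': 1,
    #            'unit_price': 1099, 'percentage': 0.25}, ...}
    # Note that 'percentage' is a fraction of total_price in [0, 1], not 0-100.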
    def parseSingleTypeURLS(self, conf_type):
        '''
        description: collect the detail-page URLs from the first 50 list
                     pages of one category
        param {*} conf_type: int, one of the values in url_indexs, each
                             corresponding to one of the 7 categories
        '''
        sub_page_urls = []  # entry URLs of the detail pages on each list page
        total_page_num = 50
        for page_num in range(1, total_page_num + 1):
            cur_url = ('http://zj.zol.com.cn/list_c' + str(conf_type) +
                       '_l1_1_' + str(page_num) + '.html')
            # Use selenium to scroll to the bottom first, so lazily loaded
            # entries are rendered
            self.driver.get(cur_url)
            time.sleep(0.5)
            self.driver.execute_script(
                "window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(0.5)
            start_resp = self.driver.page_source
            soup = BeautifulSoup(start_resp, "lxml")  # the rendered list page
            # print(soup)
            # self.driver.quit()
            # exit(0)
            # Parse the links to all detail pages on this list page
            cfg_lis = soup.find_all('li', class_='outli')
            for cfg in cfg_lis:
                # Parse the price (drop the trailing currency character)
                price = int(cfg.find('font').text[:-1])
                if price > 200000:
                    # Implausibly expensive build; skip it
                    print("Skipping implausible price: " + str(price))
                    continue
                # Parse the detail-page link
                sub_page_link = cfg.find('a', class_='link')['href']
                sub_page_urls.append(self.root_url + sub_page_link)
        # pprint(sub_page_urls)
        return sub_page_urls
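
    # List-page URL pattern produced above (example values, conf_type=1,
    # page 3):
    #   http://zj.zol.com.cn/list_c1_l1_1_3.html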
    def getSingleTypeInfo(self, conf_type, urls):
        '''
        description: fetch the configuration info for a single category
        param {*}
            conf_type : category id of the configuration list
            urls : links collected by parseSingleTypeURLS
        return {*}
        '''
        for url in urls:
            page_resp = requests.get(url, timeout=30)
            page_resp.raise_for_status()
            page_resp.encoding = page_resp.apparent_encoding
            page = page_resp.text
            # Construct soup object
            soup = BeautifulSoup(page, features="lxml")
            cfg_table = soup.find_all('tr', class_='tr1')
            # pprint(cfg_table)
            item_dict = {}
            # Component names as they appear on the site (see testFunc)
            keys = ['CPU', '主板', '内存', '硬盘', '固态硬盘', '显卡', '机箱', '电源', '散热器']
            total_price = 0
            for item in cfg_table:
                info = item.find_all('td', limit=4)
                accessory_type = info[0].text
                name = info[1].text
                num = int(info[2].text)
                # Extract the digits of the price
                pat = r'\d+'
                try:
                    unit_price = int(re.search(pat, info[3].text).group())
                except AttributeError:
                    # re.search returned None: unexpected price format
                    print(url)
                    print(info[3].text)
                    exit(1)
                item_dict[accessory_type] = {
                    'name': name,
                    'number': num,
                    'unit_price': unit_price,
                    'percentage': 0
                }
                if accessory_type in keys:
                    total_price += num * unit_price
            # Compute each component's share of the total price
            for key in keys:
                if key in item_dict:
                    item_dict[key]['percentage'] = (
                        item_dict[key]['number'] *
                        item_dict[key]['unit_price'] / total_price)
                else:
                    item_dict[key] = {
                        'name': "NULL",
                        'number': 0,
                        'unit_price': 0,
                        'percentage': 0
                    }
            self.configs[conf_type].append(item_dict)
            time.sleep(self.delay_time)
        # pprint(self.configs[conf_type])
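
    # After getSingleTypeInfo runs, self.configs maps each category id to a
    # list of item_dicts (shape only; keys taken from the code above):
    #   {1: [{'CPU': {...}, '主板': {...}, ...}, ...], 2: [...], ...}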
    def getConfigInfo(self):
        # Fetch the configuration info for all seven categories
        for cat in range(1, 8):
            num = self.url_indexs[cat - 1]
            urls = self.parseSingleTypeURLS(num)
            self.getSingleTypeInfo(num, urls)
            print("Successfully fetched category: " + self.categories[cat - 1])
            # Checkpoint the results collected so far
            with codecs.open('tmp.txt', 'w', 'utf-8') as tmp_f:
                tmp_f.write(str(self.configs))
        with codecs.open('result.txt', 'w', 'utf-8') as f:
            f.write(str(self.configs))
if __name__ == "__main__":
    get_config_list = GetConfigLists()
    # get_config_list.testFunc()
    # urls = get_config_list.parseSingleTypeURLS(1)
    # get_config_list.getSingleTypeInfo(1, urls)
    get_config_list.getConfigInfo()
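
# Note: result.txt stores the Python repr of self.configs rather than JSON,
# so it can be loaded back with ast.literal_eval (a sketch, assuming the
# file was written by this script):
#
#   import ast
#   with codecs.open('result.txt', 'r', 'utf-8') as f:
#       configs = ast.literal_eval(f.read())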