Fix some bugs in Spider

master
fishermanykx 3 years ago
parent 8de470b5b0
commit 7dc98330bc

@ -0,0 +1,252 @@
-- Make sure the working schema exists, then select it as the default
-- database for every statement that follows in this script.
CREATE DATABASE IF NOT EXISTS `test`;
USE `test`;
-- Table `cpu`: one row per CPU product, keyed by the product `link`.
-- The table is dropped and recreated here, so it always starts empty.
-- No AUTO_INCREMENT table option is set because no column is auto-incremented.
DROP TABLE IF EXISTS `cpu`;
CREATE TABLE `cpu` (
    `id` INT DEFAULT 0,
    `name` VARCHAR(255) DEFAULT NULL,
    `comment_num` INT DEFAULT 100,
    `praise_rate` INT DEFAULT 90,
    `shop_name` VARCHAR(255) DEFAULT NULL,
    `price` INT DEFAULT 0,
    `link` VARCHAR(255) NOT NULL,
    `brand` VARCHAR(255) DEFAULT NULL,
    `tags` VARCHAR(255) DEFAULT NULL,
    `clock_speed` VARCHAR(255) DEFAULT NULL,
    `core_num` VARCHAR(255) DEFAULT NULL,
    -- NOTE(review): the CPU spider appears to store -1 here when the power
    -- figure cannot be parsed -- confirm against the cleaning code.
    `TDP` INT DEFAULT 0,
    `socket` VARCHAR(255) DEFAULT NULL,
    `have_core_graphics_card` VARCHAR(255) DEFAULT NULL,
    `have_cpu_fan` VARCHAR(255) DEFAULT NULL,
    -- NOTE(review): presumably a tier code assigned by the spider rather than
    -- the vendor's generation number -- confirm against the cleaning code.
    `generation` INT DEFAULT 0,
    `introduction` JSON,
    `Ptable_params` JSON,
    `title_name` VARCHAR(255) DEFAULT NULL,
    PRIMARY KEY (`link`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
-- Table `motherboard`: one row per motherboard product, keyed by the product `link`.
-- The table is dropped and recreated here, so it always starts empty.
-- No AUTO_INCREMENT table option is set because no column is auto-incremented.
DROP TABLE IF EXISTS `motherboard`;
CREATE TABLE `motherboard` (
    `id` INT DEFAULT 0,
    `name` VARCHAR(255) DEFAULT NULL,
    -- NOTE(review): the motherboard cleaner seems to treat the default value
    -- 100 as a marker for board+CPU bundle listings -- confirm before
    -- changing this default.
    `comment_num` INT DEFAULT 100,
    `praise_rate` INT DEFAULT 90,
    `shop_name` VARCHAR(255) DEFAULT NULL,
    `price` INT DEFAULT 0,
    `link` VARCHAR(255) NOT NULL,
    `brand` VARCHAR(255) DEFAULT NULL,
    `tags` VARCHAR(255) DEFAULT NULL,
    `form_factor` VARCHAR(255) DEFAULT NULL,
    `platform` VARCHAR(255) DEFAULT NULL,
    `cpu_socket` VARCHAR(255) DEFAULT NULL,
    `m2_num` INT DEFAULT 0,
    `slot_num` INT DEFAULT 0,
    `ddr_gen` VARCHAR(255) DEFAULT NULL,
    `max_memory` INT DEFAULT 0,
    `introduction` JSON,
    `Ptable_params` JSON,
    `title_name` VARCHAR(255) DEFAULT NULL,
    PRIMARY KEY (`link`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
-- Table `graphics_card`: one row per graphics card product, keyed by the product `link`.
-- The table is dropped and recreated here, so it always starts empty.
-- No AUTO_INCREMENT table option is set because no column is auto-incremented.
DROP TABLE IF EXISTS `graphics_card`;
CREATE TABLE `graphics_card` (
    `id` INT DEFAULT 0,
    `name` VARCHAR(255) DEFAULT NULL,
    `comment_num` INT DEFAULT 100,
    `praise_rate` INT DEFAULT 90,
    `shop_name` VARCHAR(255) DEFAULT NULL,
    `price` INT DEFAULT 0,
    `link` VARCHAR(255) NOT NULL,
    `brand` VARCHAR(255) DEFAULT NULL,
    `tags` VARCHAR(255) DEFAULT NULL,
    `card_length` FLOAT DEFAULT 0,
    `rgb` VARCHAR(255) DEFAULT NULL,
    `card_type` VARCHAR(255) DEFAULT NULL,
    `generation` INT DEFAULT 0,
    `introduction` JSON,
    `Ptable_params` JSON,
    `title_name` VARCHAR(255) DEFAULT NULL,
    PRIMARY KEY (`link`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
-- Table `memory`: one row per memory (RAM) product, keyed by the product `link`.
-- The table is dropped and recreated here, so it always starts empty.
-- No AUTO_INCREMENT table option is set because no column is auto-incremented.
DROP TABLE IF EXISTS `memory`;
CREATE TABLE `memory` (
    `id` INT DEFAULT 0,
    `name` VARCHAR(255) DEFAULT NULL,
    `comment_num` INT DEFAULT 100,
    `praise_rate` INT DEFAULT 90,
    `shop_name` VARCHAR(255) DEFAULT NULL,
    `price` INT DEFAULT 0,
    `link` VARCHAR(255) NOT NULL,
    `brand` VARCHAR(255) DEFAULT NULL,
    `frequency` VARCHAR(255) DEFAULT NULL,
    `total_capacity` INT DEFAULT 0,
    `memory_num` VARCHAR(255) DEFAULT NULL,
    `appearance` VARCHAR(255) DEFAULT NULL,
    `ddr_gen` VARCHAR(255) DEFAULT NULL,
    `introduction` JSON,
    `Ptable_params` JSON,
    `title_name` VARCHAR(255) DEFAULT NULL,
    PRIMARY KEY (`link`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
-- Table `cpu_radiator`: one row per CPU cooler product, keyed by the product `link`.
-- The table is dropped and recreated here, so it always starts empty.
-- No AUTO_INCREMENT table option is set because no column is auto-incremented.
DROP TABLE IF EXISTS `cpu_radiator`;
CREATE TABLE `cpu_radiator` (
    `id` INT DEFAULT 0,
    `name` VARCHAR(255) DEFAULT NULL,
    `comment_num` INT DEFAULT 100,
    `praise_rate` INT DEFAULT 90,
    `shop_name` VARCHAR(255) DEFAULT NULL,
    `price` INT DEFAULT 0,
    `link` VARCHAR(255) NOT NULL,
    `brand` VARCHAR(255) DEFAULT NULL,
    `height` INT DEFAULT 0,
    `socket` VARCHAR(255) DEFAULT NULL,
    `radiator_size` INT DEFAULT 0,
    `rgb` VARCHAR(255) DEFAULT NULL,
    `introduction` JSON,
    `Ptable_params` JSON,
    `title_name` VARCHAR(255) DEFAULT NULL,
    PRIMARY KEY (`link`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
-- Table `ssd`: one row per solid-state drive product, keyed by the product `link`.
-- The table is dropped and recreated here, so it always starts empty.
-- No AUTO_INCREMENT table option is set because no column is auto-incremented.
DROP TABLE IF EXISTS `ssd`;
CREATE TABLE `ssd` (
    `id` INT DEFAULT 0,
    `name` VARCHAR(255) DEFAULT NULL,
    `comment_num` INT DEFAULT 100,
    `praise_rate` INT DEFAULT 90,
    `shop_name` VARCHAR(255) DEFAULT NULL,
    `price` INT DEFAULT 0,
    `link` VARCHAR(255) NOT NULL,
    `brand` VARCHAR(255) DEFAULT NULL,
    `interface` VARCHAR(255) DEFAULT NULL,
    `total_capacity` FLOAT DEFAULT 0,
    `introduction` JSON,
    `Ptable_params` JSON,
    `title_name` VARCHAR(255) DEFAULT NULL,
    PRIMARY KEY (`link`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
-- Table `hdd`: one row per hard-disk drive product, keyed by the product `link`.
-- The table is dropped and recreated here, so it always starts empty.
-- No AUTO_INCREMENT table option is set because no column is auto-incremented.
DROP TABLE IF EXISTS `hdd`;
CREATE TABLE `hdd` (
    `id` INT DEFAULT 0,
    `name` VARCHAR(255) DEFAULT NULL,
    `comment_num` INT DEFAULT 100,
    `praise_rate` INT DEFAULT 90,
    `shop_name` VARCHAR(255) DEFAULT NULL,
    `price` INT DEFAULT 0,
    `link` VARCHAR(255) NOT NULL,
    `brand` VARCHAR(255) DEFAULT NULL,
    `size` VARCHAR(255) DEFAULT NULL,
    `rotating_speed` VARCHAR(255) DEFAULT NULL,
    `total_capacity` FLOAT DEFAULT 0,
    `introduction` JSON,
    `Ptable_params` JSON,
    `title_name` VARCHAR(255) DEFAULT NULL,
    PRIMARY KEY (`link`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
-- Table `power_supply`: one row per power supply product, keyed by the product `link`.
-- The table is dropped and recreated here, so it always starts empty.
-- No AUTO_INCREMENT table option is set because no column is auto-incremented.
DROP TABLE IF EXISTS `power_supply`;
CREATE TABLE `power_supply` (
    `id` INT DEFAULT 0,
    `name` VARCHAR(255) DEFAULT NULL,
    `comment_num` INT DEFAULT 100,
    `praise_rate` INT DEFAULT 90,
    `shop_name` VARCHAR(255) DEFAULT NULL,
    `price` INT DEFAULT 0,
    `link` VARCHAR(255) NOT NULL,
    `brand` VARCHAR(255) DEFAULT NULL,
    `tags` VARCHAR(255) DEFAULT NULL,
    `power` INT DEFAULT 0,
    `size` VARCHAR(255) DEFAULT NULL,
    `modularization` VARCHAR(255) DEFAULT NULL,
    `transfer_efficiency` VARCHAR(255) DEFAULT NULL,
    `introduction` JSON,
    `Ptable_params` JSON,
    `title_name` VARCHAR(255) DEFAULT NULL,
    PRIMARY KEY (`link`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
-- Table `computer_case`: one row per computer case product, keyed by the product `link`.
-- The table is dropped and recreated here, so it always starts empty.
-- No AUTO_INCREMENT table option is set because no column is auto-incremented.
DROP TABLE IF EXISTS `computer_case`;
CREATE TABLE `computer_case` (
    `id` INT DEFAULT 0,
    `name` VARCHAR(255) DEFAULT NULL,
    `comment_num` INT DEFAULT 100,
    `praise_rate` INT DEFAULT 90,
    `shop_name` VARCHAR(255) DEFAULT NULL,
    `price` INT DEFAULT 0,
    `link` VARCHAR(255) NOT NULL,
    `brand` VARCHAR(255) DEFAULT NULL,
    `max_form_factor` VARCHAR(255) DEFAULT NULL,
    `max_card_len` INT DEFAULT 0,
    `max_radiator_height` INT DEFAULT 0,
    `supported_radiator` VARCHAR(255) DEFAULT NULL,
    -- NOTE(review): presumably a 0/1 flag; kept as INT so existing writers
    -- are unaffected -- confirm the value range against the spider.
    `has_transparent_side_panel` INT DEFAULT 0,
    `introduction` JSON,
    `Ptable_params` JSON,
    `title_name` VARCHAR(255) DEFAULT NULL,
    PRIMARY KEY (`link`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
-- Table `board_u_suit`: one row per listing with a `board` and a `cpu` value,
-- keyed by the product `link`.
-- NOTE(review): presumably motherboard + CPU bundle listings -- confirm.
-- The table is dropped and recreated here, so it always starts empty.
-- No AUTO_INCREMENT table option is set because no column is auto-incremented.
DROP TABLE IF EXISTS `board_u_suit`;
CREATE TABLE `board_u_suit` (
    `id` INT DEFAULT 0,
    `name` VARCHAR(255) DEFAULT NULL,
    `board` VARCHAR(255) DEFAULT NULL,
    `cpu` VARCHAR(255) DEFAULT NULL,
    `shop_name` VARCHAR(255) DEFAULT NULL,
    `price` INT DEFAULT 0,
    `link` VARCHAR(255) NOT NULL,
    `introduction` JSON,
    `Ptable_params` JSON,
    `title_name` VARCHAR(255) DEFAULT NULL,
    PRIMARY KEY (`link`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
-- Table `trendings`: one row per complete build, keyed by a string `id`.
-- Each component slot stores a name, a price and a link; `totalPrice` holds
-- the overall price of the build.
-- The table is dropped and recreated here, so it always starts empty.
-- No AUTO_INCREMENT table option is set because no column is auto-incremented.
DROP TABLE IF EXISTS `trendings`;
CREATE TABLE `trendings` (
    `id` VARCHAR(255) NOT NULL,
    `frequency` INT DEFAULT 0,

    `cpu` VARCHAR(255) NOT NULL,
    `cpu_price` INT DEFAULT 0,
    `cpu_link` VARCHAR(255) NOT NULL,

    `motherboard` VARCHAR(255) NOT NULL,
    `motherboard_price` INT DEFAULT 0,
    `motherboard_link` VARCHAR(255) NOT NULL,

    `GPU` VARCHAR(255) NOT NULL,
    `GPU_price` INT DEFAULT 0,
    `GPU_link` VARCHAR(255) NOT NULL,

    `memory` VARCHAR(255) NOT NULL,
    `memory_price` INT DEFAULT 0,
    `memory_link` VARCHAR(255) NOT NULL,

    `CPURadiator` VARCHAR(255) NOT NULL,
    `CPURadiator_price` INT DEFAULT 0,
    `CPURadiator_link` VARCHAR(255) NOT NULL,

    `ssd` VARCHAR(255) NOT NULL,
    `ssd_price` INT DEFAULT 0,
    `ssd_link` VARCHAR(255) NOT NULL,

    `hdd` VARCHAR(255) NOT NULL,
    `hdd_price` INT DEFAULT 0,
    `hdd_link` VARCHAR(255) NOT NULL,

    `powerSupply` VARCHAR(255) NOT NULL,
    `powerSupply_price` INT DEFAULT 0,
    `powerSupply_link` VARCHAR(255) NOT NULL,

    -- CASE is a reserved word in MySQL: this column must always be
    -- backtick-quoted in queries.
    `case` VARCHAR(255) NOT NULL,
    `case_price` INT DEFAULT 0,
    `case_link` VARCHAR(255) NOT NULL,

    `totalPrice` INT DEFAULT 0,
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;

@ -331,7 +331,7 @@ class DataCleaning:
" modify card_length float default 0.0"
row = cursor.execute(instruct) # 返回被影响的行数
# 重新写入
sql_insert = "INSERT INTO video_card (id, name, comment_num, praise_rate, shop_name, price, link,"\
sql_insert = "INSERT INTO graphics_card (id, name, comment_num, praise_rate, shop_name, price, link,"\
"brand, tags, card_length, rgb, card_type, introduction, Ptable_params) VALUES (%(id)s, %(name)s, %(comment_num)s,"\
"%(praise_rate)s, %(shop_name)s, %(price)s, %(link)s, %(brand)s, %(tags)s, %(card_length)s, "\
"%(rgb)s, %(card_type)s, %(introduction)s, %(Ptable_params)s)"
@ -1029,7 +1029,7 @@ class DataCleaning:
self.cleanCPU()
elif t == 'motherboard':
self.cleanMotherboard()
elif t == 'video_card':
elif t == 'graphics_card':
self.cleanGraphicsCard()
elif t == 'cpu_radiator':
self.cleanCPURadiator()
@ -1050,6 +1050,6 @@ class DataCleaning:
if __name__ == "__main__":
# washer = DataCleaning('power_supply')
washer = DataCleaning('video_card')
washer = DataCleaning('graphics_card')
# washer = DataCleaning('motherboard')
washer.main()

@ -10,6 +10,7 @@ import re
import json
import time
import pymysql
from pymysql.converters import escape_string
from selenium.common.exceptions import ElementNotInteractableException
from selenium.common.exceptions import ElementClickInterceptedException
@ -553,13 +554,19 @@ class CPUSpider(JDSpider):
# 时钟频率
record['clock_speed'] = p_table['规格']['主频']
# 内核数
record['core_num'] = introd['核心数量']
try:
record['core_num'] = introd['核心数量']
except:
record['core_num'] = introd['核心数']
# 是否支持核显
record['have_core_graphics_card'] = introd.get('是否支持核显', '不支持核显')
# 是否自带风扇
record['have_cpu_fan'] = introd.get('是否自带风扇', '不带风扇')
# TDP
record['TDP'] = int(p_table['规格']['功率'][:-1])
try:
record['TDP'] = int(p_table['规格']['功率'][:-1])
except:
record['TDP'] = -1
# 处理接口
socket = introd["接口"]
if "1151" in socket:
@ -569,7 +576,8 @@ class CPUSpider(JDSpider):
record["socket"] = socket
# CPU 代数
# 分类标准锐龙5000系和酷睿10代为最新代锐龙3000和酷睿9代为上代其余为上古版本
# 分类标准:
# 鉴于 Intel 11 代倒吸牙膏认定锐龙5000系和酷睿10代为最新代锐龙3000和酷睿11代或9代为上代其余为上古版本
# 提取代数
name = record['name']
pat = r"\d+\w"
@ -578,6 +586,8 @@ class CPUSpider(JDSpider):
if record['brand'] == 'INTEL':
if res[:2] == "10":
record['generation'] = 3
elif res[:2] == '11':
record['generation'] = 2
elif res[0] == '9':
record['generation'] = 2
else:
@ -647,9 +657,9 @@ class CPUSpider(JDSpider):
cpu_link = "https://list.jd.com/list.html?cat=670%2C677%2C678&page="
cpu_link = "https://list.jd.com/list.html?cat=670%2C677%2C678&psort=3&psort=3&page="
page_num = 3
page_num = 1
start_page = 1
self.productSpider(cpu_link, page_num, start_page)
# self.productSpider(cpu_link, page_num, start_page)
self.cleanCPU()
print("Successfully get CPU data!")
@ -757,7 +767,10 @@ class MotherboardSpider(JDSpider):
print(introd)
exit(1)
# 加入 m2 接口数 列
record['m2_num'] = int(introd.get('M.2接口数量', 0))
try:
record['m2_num'] = int(introd.get('M.2接口数量', 0))
except:
record['m2_num'] = int(introd.get('M.2接口数量', "00")[:-1])
# 插槽数目
slot_num = p_table['内存'].get("内存插槽", 0)
try:
@ -812,8 +825,12 @@ class MotherboardSpider(JDSpider):
record = data[i]
if record['comment_num'] == 100: # 抓到板-U套装了
continue
if '套装' in record['title_name']:
continue
try:
if '套装' in record['title_name']:
continue
except:
if '套装' in record['name']:
continue
# 清洗数据
new_data.append(self.handleSingleMotherboardRecord(record))
new_data[index]['id'] = cnt
@ -833,12 +850,12 @@ class MotherboardSpider(JDSpider):
connection.commit()
def main(self):
motherboard_link = "https://list.jd.com/list.html?cat=670%2C677%2C681&psort=3&psort=3&page="
motherboard_link = "https://list.jd.com/list.html?cat=670%2C677%2C681&psort=3&psort=3&pvid=e726d0ee460448b6a16cf24950c1dabb&page="
# 爬取数据
page_num = 26 # 一共爬了26页
# page_num = 1 # for testing
# page_num = 26 # 一共爬了26页
page_num = 1 # for testing
start_page = 1
self.productSpider(motherboard_link, page_num, start_page)
# self.productSpider(motherboard_link, page_num, start_page)
# 清洗数据
self.cleanMotherboard()
print("Successfully get Motherboard data!")
@ -997,9 +1014,10 @@ class GraphicsCardSpider(JDSpider):
connection.commit()
def main(self):
graphics_card_link = "https://list.jd.com/list.html?cat=670%2C677%2C679&psort=3&psort=3&page="
graphics_card_link = "https://list.jd.com/list.html?cat=670%2C677%2C679&psort=3&psort=3&pvid=e726d0ee460448b6a16cf24950c1dabb&page="
# 爬取数据
page_num = 30
# page_num = 30
page_num = 1
start_page = 1
# self.productSpider(graphics_card_link, page_num, start_page)
# 清洗数据
@ -1169,17 +1187,18 @@ class MemorySpider(JDSpider):
"Ptable_params, title_name) VALUES (%(id)s, %(name)s, %(comment_num)s, %(praise_rate)s, %(shop_name)s, %(price)s"\
", %(link)s, %(brand)s, %(frequency)s, %(total_capacity)s, %(memory_num)s, %(appearance)s, "\
"%(ddr_gen)s, %(introduction)s, %(Ptable_params)s, %(title_name)s)"
sql_insert = pymysql.escape_string(sql_insert)
cursor.executemany(sql_insert, new_data)
# for i in range(len(new_data)):
# cursor.execute(sql_insert, new_data[i])
# sql_insert = escape_string(sql_insert)
# cursor.executemany(sql_insert, new_data)
for i in range(len(new_data)):
cursor.execute(sql_insert, new_data[i])
connection.commit()
def main(self):
memory_link = "https://list.jd.com/list.html?cat=670%2C677%2C680&psort=3&ev=210_1558%5E&psort=3&page="
# 爬取数据
page_num = 40
# page_num = 40
page_num = 1
start_page = 1
# self.productSpider(memory_link, page_num, start_page)
# 清洗数据
@ -1366,9 +1385,10 @@ class CPURadiatorSpider(JDSpider):
def main(self):
radiator_link = "https://list.jd.com/list.html?cat=670%2C677%2C682&psort=3&ev=3680_97402%7C%7C97403%7C%7C106254%7C%7C106255%5E&psort=3&page="
# 爬取数据
page_num = 27
# page_num = 27
page_num = 1
start_page = 1
# self.productSpider(radiator_link, page_num, start_page)
self.productSpider(radiator_link, page_num, start_page)
# 清洗数据
self.cleanCPURadiator()
print("Successfully get CPU Radiator data!")
@ -1538,7 +1558,8 @@ class SSDSpider(JDSpider):
def main(self):
ssd_link = "https://list.jd.com/list.html?cat=670%2C677%2C11303&psort=3&psort=3&page="
page_num = 36 # 一共爬了36页
# page_num = 36 # 一共爬了36页
page_num = 1 # 一共爬了36页
start_page = 1
# self.productSpider(ssd_link, page_num, start_page)
@ -1715,7 +1736,8 @@ class HDDSpider(JDSpider):
def main(self):
hdd_link = "https://list.jd.com/list.html?cat=670%2C677%2C683&psort=3&psort=3&page="
page_num = 11
# page_num = 11
page_num = 1
start_page = 1
# self.productSpider(hdd_link, page_num, start_page)
@ -1871,7 +1893,8 @@ class PowerSupplySpider(JDSpider):
def main(self):
power_supply_link = "https://list.jd.com/list.html?cat=670%2C677%2C691&psort=3&psort=3&page="
page_num = 25 # 抓25页
# page_num = 25 # 抓25页
page_num = 1
start_page = 1
# self.productSpider(power_supply_link, page_num, start_page)
@ -2056,9 +2079,10 @@ class CaseSpider(JDSpider):
def main(self):
case_link = "https://list.jd.com/list.html?cat=670%2C677%2C687&psort=3&psort=3&page="
page_num = 36
# page_num = 36
page_num = 1
start_page = 1
self.productSpider(case_link, page_num, start_page)
# self.productSpider(case_link, page_num, start_page)
self.cleanCase()
print("Successfully get Computer Case data!")
@ -2066,6 +2090,7 @@ class CaseSpider(JDSpider):
if __name__ == "__main__":
accessory_type = 'all'
accessory_type = 'motherboard'
if accessory_type == 'cpu':
cpu_spider = CPUSpider('cpu')
cpu_spider.main()

Loading…
Cancel
Save