diff --git a/src/backend/Data_Cleansing/createTestDB.sql b/src/backend/Data_Cleansing/createTestDB.sql new file mode 100644 index 0000000..0208b50 --- /dev/null +++ b/src/backend/Data_Cleansing/createTestDB.sql @@ -0,0 +1,252 @@ +CREATE DATABASE IF NOT EXISTS test; +USE test; + +DROP TABLE IF EXISTS `cpu`; +CREATE TABLE `cpu` ( + `id` int(11) DEFAULT 0, + `name` varchar(255) DEFAULT NULL, + `comment_num` INT DEFAULT 100, + `praise_rate` INT DEFAULT 90, + `shop_name` varchar(255) DEFAULT NULL, + `price` INT DEFAULT 0, + `link` varchar(255) NOT NULL, + `brand` varchar(255) DEFAULT NULL, + `tags` varchar(255) DEFAULT NULL, + `clock_speed` varchar(255) DEFAULT NULL, + `core_num` varchar(255) DEFAULT NULL, + `TDP` INT DEFAULT 0, + `socket` varchar(255) DEFAULT NULL, + `have_core_graphics_card` varchar(255) DEFAULT NULL, + `have_cpu_fan` varchar(255) DEFAULT NULL, + `generation` INT DEFAULT 0, + `introduction` JSON, + `Ptable_params` JSON, + `title_name` varchar(255) DEFAULT NULL, + PRIMARY KEY (`link`) +) ENGINE=InnoDB AUTO_INCREMENT=38 DEFAULT CHARSET=utf8mb4, COLLATE=utf8mb4_general_ci; +truncate cpu; + +DROP TABLE IF EXISTS `motherboard`; +CREATE TABLE `motherboard` ( + `id` int(11) DEFAULT 0, + `name` varchar(255) DEFAULT NULL, + `comment_num` INT DEFAULT 100, + `praise_rate` INT DEFAULT 90, + `shop_name` varchar(255) DEFAULT NULL, + `price` INT DEFAULT 0, + `link` varchar(255) NOT NULL, + `brand` varchar(255) DEFAULT NULL, + `tags` varchar(255) DEFAULT NULL, + `form_factor` varchar(255) DEFAULT NULL, + `platform` varchar(255) DEFAULT NULL, + `cpu_socket` varchar(255) DEFAULT NULL, + `m2_num` INT DEFAULT 0, + `slot_num` INT DEFAULT 0, + `ddr_gen` varchar(255) DEFAULT NULL, + `max_memory` INT DEFAULT 0, + `introduction` JSON, + `Ptable_params` JSON, + `title_name` varchar(255) DEFAULT NULL, + PRIMARY KEY (`link`) +) ENGINE=InnoDB AUTO_INCREMENT=38 DEFAULT CHARSET=utf8mb4, COLLATE=utf8mb4_general_ci; +truncate motherboard; + +DROP TABLE IF EXISTS `graphics_card`; +CREATE TABLE `graphics_card` ( + `id` int(11) DEFAULT 0, + `name` varchar(255) DEFAULT NULL, + `comment_num` INT DEFAULT 100, + `praise_rate` INT DEFAULT 90, + `shop_name` varchar(255) DEFAULT NULL, + `price` INT DEFAULT 0, + `link` varchar(255) NOT NULL, + `brand` varchar(255) DEFAULT NULL, + `tags` varchar(255) DEFAULT NULL, + `card_length` float DEFAULT 0, + `rgb` varchar(255) DEFAULT NULL, + `card_type` varchar(255) DEFAULT NULL, + `generation` INT DEFAULT 0, + `introduction` JSON, + `Ptable_params` JSON, + `title_name` varchar(255) DEFAULT NULL, + PRIMARY KEY (`link`) +) ENGINE=InnoDB AUTO_INCREMENT=38 DEFAULT CHARSET=utf8mb4, COLLATE=utf8mb4_general_ci; +truncate graphics_card; + +DROP TABLE IF EXISTS `memory`; +CREATE TABLE `memory` ( + `id` int(11) DEFAULT 0, + `name` varchar(255) DEFAULT NULL, + `comment_num` INT DEFAULT 100, + `praise_rate` INT DEFAULT 90, + `shop_name` varchar(255) DEFAULT NULL, + `price` INT DEFAULT 0, + `link` varchar(255) NOT NULL, + `brand` varchar(255) DEFAULT NULL, + `frequency` varchar(255) DEFAULT NULL, + `total_capacity` INT DEFAULT 0, + `memory_num` varchar(255) DEFAULT NULL, + `appearance` varchar(255) DEFAULT NULL, + `ddr_gen` varchar(255) DEFAULT NULL, + `introduction` JSON, + `Ptable_params` JSON, + `title_name` varchar(255) DEFAULT NULL, + PRIMARY KEY (`link`) +) ENGINE=InnoDB AUTO_INCREMENT=38 DEFAULT CHARSET=utf8mb4, COLLATE=utf8mb4_general_ci; +truncate memory; + +DROP TABLE IF EXISTS `cpu_radiator`; +CREATE TABLE `cpu_radiator` ( + `id` int(11) DEFAULT 0, + `name` varchar(255) DEFAULT NULL, + `comment_num` INT DEFAULT 100, + `praise_rate` INT DEFAULT 90, + `shop_name` varchar(255) DEFAULT NULL, + `price` INT DEFAULT 0, + `link` varchar(255) NOT NULL, + `brand` varchar(255) DEFAULT NULL, + `height` INT DEFAULT 0, + `socket` varchar(255) DEFAULT NULL, + `radiator_size` INT DEFAULT 0, + `rgb` varchar(255) DEFAULT NULL, + `introduction` JSON, + `Ptable_params` JSON, + `title_name` varchar(255) DEFAULT NULL, + PRIMARY KEY (`link`) +) ENGINE=InnoDB AUTO_INCREMENT=38 DEFAULT CHARSET=utf8mb4, COLLATE=utf8mb4_general_ci; +truncate cpu_radiator; + +DROP TABLE IF EXISTS `ssd`; +CREATE TABLE `ssd` ( + `id` int(11) DEFAULT 0, + `name` varchar(255) DEFAULT NULL, + `comment_num` INT DEFAULT 100, + `praise_rate` INT DEFAULT 90, + `shop_name` varchar(255) DEFAULT NULL, + `price` INT DEFAULT 0, + `link` varchar(255) NOT NULL, + `brand` varchar(255) DEFAULT NULL, + `interface` varchar(255) DEFAULT NULL, + `total_capacity` FLOAT DEFAULT 0, + `introduction` JSON, + `Ptable_params` JSON, + `title_name` varchar(255) DEFAULT NULL, + PRIMARY KEY (`link`) +) ENGINE=InnoDB AUTO_INCREMENT=38 DEFAULT CHARSET=utf8mb4, COLLATE=utf8mb4_general_ci; +truncate ssd; + +DROP TABLE IF EXISTS `hdd`; +CREATE TABLE `hdd` ( + `id` int(11) DEFAULT 0, + `name` varchar(255) DEFAULT NULL, + `comment_num` INT DEFAULT 100, + `praise_rate` INT DEFAULT 90, + `shop_name` varchar(255) DEFAULT NULL, + `price` INT DEFAULT 0, + `link` varchar(255) NOT NULL, + `brand` varchar(255) DEFAULT NULL, + `size` varchar(255) DEFAULT NULL, + `rotating_speed` varchar(255) DEFAULT NULL, + `total_capacity` FLOAT DEFAULT 0, + `introduction` JSON, + `Ptable_params` JSON, + `title_name` varchar(255) DEFAULT NULL, + PRIMARY KEY (`link`) +) ENGINE=InnoDB AUTO_INCREMENT=38 DEFAULT CHARSET=utf8mb4, COLLATE=utf8mb4_general_ci; +truncate hdd; + +DROP TABLE IF EXISTS `power_supply`; +CREATE TABLE `power_supply` ( + `id` int(11) DEFAULT 0, + `name` varchar(255) DEFAULT NULL, + `comment_num` INT DEFAULT 100, + `praise_rate` INT DEFAULT 90, + `shop_name` varchar(255) DEFAULT NULL, + `price` INT DEFAULT 0, + `link` varchar(255) NOT NULL, + `brand` varchar(255) DEFAULT NULL, + `tags` varchar(255) DEFAULT NULL, + `power` INT DEFAULT 0, + `size` varchar(255) DEFAULT NULL, + `modularization` varchar(255) DEFAULT NULL, + `transfer_efficiency` varchar(255) DEFAULT NULL, + `introduction` JSON, + `Ptable_params` JSON, + `title_name` varchar(255) DEFAULT NULL, + PRIMARY KEY (`link`) +) ENGINE=InnoDB AUTO_INCREMENT=38 DEFAULT CHARSET=utf8mb4, COLLATE=utf8mb4_general_ci; +truncate power_supply; + +DROP TABLE IF EXISTS `computer_case`; +CREATE TABLE `computer_case` ( + `id` int(11) DEFAULT 0, + `name` varchar(255) DEFAULT NULL, + `comment_num` INT DEFAULT 100, + `praise_rate` INT DEFAULT 90, + `shop_name` varchar(255) DEFAULT NULL, + `price` INT DEFAULT 0, + `link` varchar(255) NOT NULL, + `brand` varchar(255) DEFAULT NULL, + `max_form_factor` varchar(255) DEFAULT NULL, + `max_card_len` INT DEFAULT 0, + `max_radiator_height` INT DEFAULT 0, + `supported_radiator` varchar(255) DEFAULT NULL, + `has_transparent_side_panel` INT DEFAULT 0, + `introduction` JSON, + `Ptable_params` JSON, + `title_name` varchar(255) DEFAULT NULL, + PRIMARY KEY (`link`) +) ENGINE=InnoDB AUTO_INCREMENT=38 DEFAULT CHARSET=utf8mb4, COLLATE=utf8mb4_general_ci; +truncate computer_case; + +DROP TABLE IF EXISTS `board_u_suit`; +CREATE TABLE `board_u_suit` ( + `id` int(11) DEFAULT 0, + `name` varchar(255) DEFAULT NULL, + `board` varchar(255) DEFAULT NULL, + `cpu` varchar(255) DEFAULT NULL, + `shop_name` varchar(255) DEFAULT NULL, + `price` INT DEFAULT 0, + `link` varchar(255) NOT NULL, + `introduction` JSON, + `Ptable_params` JSON, + `title_name` varchar(255) DEFAULT NULL, + PRIMARY KEY (`link`) +) ENGINE=InnoDB AUTO_INCREMENT=38 DEFAULT CHARSET=utf8mb4, COLLATE=utf8mb4_general_ci; +truncate board_u_suit; + +DROP TABLE IF EXISTS `trendings`; +CREATE TABLE `trendings` ( + `id` varchar(255) NOT NULL, + `frequency` int(11) DEFAULT 0, + `cpu` varchar(255) NOT NULL, + `cpu_price` int(11) DEFAULT 0, + `cpu_link` varchar(255) NOT NULL, + `motherboard` varchar(255) NOT NULL, + `motherboard_price` int(11) DEFAULT 0, + `motherboard_link` varchar(255) NOT NULL, + `GPU` varchar(255) NOT NULL, + `GPU_price` int(11) DEFAULT 0, + `GPU_link` varchar(255) NOT NULL, + `memory` varchar(255) NOT NULL, + `memory_price` int(11) DEFAULT 0, + `memory_link` varchar(255) NOT NULL, + `CPURadiator` varchar(255) NOT NULL, + `CPURadiator_price` int(11) DEFAULT 0, + `CPURadiator_link` varchar(255) NOT NULL, + `ssd` varchar(255) NOT NULL, + `ssd_price` int(11) DEFAULT 0, + `ssd_link` varchar(255) NOT NULL, + `hdd` varchar(255) NOT NULL, + `hdd_price` int(11) DEFAULT 0, + `hdd_link` varchar(255) NOT NULL, + `powerSupply` varchar(255) NOT NULL, + `powerSupply_price` int(11) DEFAULT 0, + `powerSupply_link` varchar(255) NOT NULL, + `case` varchar(255) NOT NULL, + `case_price` int(11) DEFAULT 0, + `case_link` varchar(255) NOT NULL, + `totalPrice` int(11) DEFAULT 0, + PRIMARY KEY (`id`) +) ENGINE=InnoDB AUTO_INCREMENT=38 DEFAULT CHARSET=utf8mb4, COLLATE=utf8mb4_general_ci; +truncate trendings; \ No newline at end of file diff --git a/src/backend/Data_Cleansing/data_cleansing.py b/src/backend/Data_Cleansing/data_cleansing.py index 9074beb..b7ac932 100644 --- a/src/backend/Data_Cleansing/data_cleansing.py +++ b/src/backend/Data_Cleansing/data_cleansing.py @@ -331,7 +331,7 @@ class DataCleaning: " modify card_length float default 0.0" row = cursor.execute(instruct) # 返回被影响的行数 # 重新写入 - sql_insert = "INSERT INTO video_card (id, name, comment_num, praise_rate, shop_name, price, link,"\ + sql_insert = "INSERT INTO graphics_card (id, name, comment_num, praise_rate, shop_name, price, link,"\ "brand, tags, card_length, rgb, card_type, introduction, Ptable_params) VALUES (%(id)s, %(name)s, %(comment_num)s,"\ "%(praise_rate)s, %(shop_name)s, %(price)s, %(link)s, %(brand)s, %(tags)s, %(card_length)s, "\ "%(rgb)s, %(card_type)s, %(introduction)s, %(Ptable_params)s)" @@ -1029,7 +1029,7 @@ class DataCleaning: self.cleanCPU() elif t == 'motherboard': self.cleanMotherboard() - elif t == 'video_card': + elif t == 'graphics_card': self.cleanGraphicsCard() elif t == 'cpu_radiator': self.cleanCPURadiator() @@ -1050,6 +1050,6 @@ class DataCleaning: if __name__ == "__main__": # washer = DataCleaning('power_supply') - washer = DataCleaning('video_card') + washer = DataCleaning('graphics_card') # washer = DataCleaning('motherboard') washer.main() diff --git a/src/backend/JDSpiders/Spider/JDSpider.py b/src/backend/JDSpiders/Spider/JDSpider.py index bc1015b..b67bd4e 100644 --- a/src/backend/JDSpiders/Spider/JDSpider.py +++ b/src/backend/JDSpiders/Spider/JDSpider.py @@ -10,6 +10,7 @@ import re import json import time import pymysql +from pymysql.converters import escape_string from selenium.common.exceptions import ElementNotInteractableException from selenium.common.exceptions import ElementClickInterceptedException @@ -553,13 +554,19 @@ class CPUSpider(JDSpider): # 时钟频率 record['clock_speed'] = p_table['规格']['主频'] # 内核数 - record['core_num'] = introd['核心数量'] + try: + record['core_num'] = introd['核心数量'] + except: + record['core_num'] = introd['核心数'] # 是否支持核显 record['have_core_graphics_card'] = introd.get('是否支持核显', '不支持核显') # 是否自带风扇 record['have_cpu_fan'] = introd.get('是否自带风扇', '不带风扇') # TDP - record['TDP'] = int(p_table['规格']['功率'][:-1]) + try: + record['TDP'] = int(p_table['规格']['功率'][:-1]) + except: + record['TDP'] = -1 # 处理接口 socket = introd["接口"] if "1151" in socket: @@ -569,7 +576,8 @@ class CPUSpider(JDSpider): record["socket"] = socket # CPU 代数 - # 分类标准:锐龙5000系和酷睿10代为最新代;锐龙3000和酷睿9代为上代;其余为上古版本 + # 分类标准: + # 鉴于 Intel 11 代倒吸牙膏,认定锐龙5000系和酷睿10代为最新代;锐龙3000和酷睿11代或9代为上代;其余为上古版本 # 提取代数 name = record['name'] pat = r"\d+\w" @@ -578,6 +586,8 @@ class CPUSpider(JDSpider): if record['brand'] == 'INTEL': if res[:2] == "10": record['generation'] = 3 + elif res[:2] == '11': + record['generation'] = 2 elif res[0] == '9': record['generation'] = 2 else: @@ -647,9 +657,9 @@ class CPUSpider(JDSpider): cpu_link = "https://list.jd.com/list.html?cat=670%2C677%2C678&page=" cpu_link = "https://list.jd.com/list.html?cat=670%2C677%2C678&psort=3&psort=3&page=" - page_num = 3 + page_num = 1 start_page = 1 - self.productSpider(cpu_link, page_num, start_page) + # self.productSpider(cpu_link, page_num, start_page) self.cleanCPU() print("Successfully get CPU data!") @@ -757,7 +767,10 @@ class MotherboardSpider(JDSpider): print(introd) exit(1) # 加入 m2 接口数 列 - record['m2_num'] = int(introd.get('M.2接口数量', 0)) + try: + record['m2_num'] = int(introd.get('M.2接口数量', 0)) + except: + record['m2_num'] = int(introd.get('M.2接口数量', "00")[:-1]) # 插槽数目 slot_num = p_table['内存'].get("内存插槽", 0) try: @@ -811,9 +824,13 @@ class MotherboardSpider(JDSpider): # 判定是否为板-U套装 record = data[i] if record['comment_num'] == 100: # 抓到板-U套装了 - continue - if '套装' in record['title_name']: - continue + continue + try: + if '套装' in record['title_name']: + continue + except: + if '套装' in record['name']: + continue # 清洗数据 new_data.append(self.handleSingleMotherboardRecord(record)) new_data[index]['id'] = cnt @@ -833,12 +850,12 @@ class MotherboardSpider(JDSpider): connection.commit() def main(self): - motherboard_link = "https://list.jd.com/list.html?cat=670%2C677%2C681&psort=3&psort=3&page=" + motherboard_link = "https://list.jd.com/list.html?cat=670%2C677%2C681&psort=3&psort=3&pvid=e726d0ee460448b6a16cf24950c1dabb&page=" # 爬取数据 - page_num = 26 # 一共爬了26页 - # page_num = 1 # for testing + # page_num = 26 # 一共爬了26页 + page_num = 1 # for testing start_page = 1 - self.productSpider(motherboard_link, page_num, start_page) + # self.productSpider(motherboard_link, page_num, start_page) # 清洗数据 self.cleanMotherboard() print("Successfully get Motherboard data!") @@ -997,9 +1014,10 @@ class GraphicsCardSpider(JDSpider): connection.commit() def main(self): - graphics_card_link = "https://list.jd.com/list.html?cat=670%2C677%2C679&psort=3&psort=3&page=" + graphics_card_link = "https://list.jd.com/list.html?cat=670%2C677%2C679&psort=3&psort=3&pvid=e726d0ee460448b6a16cf24950c1dabb&page=" # 爬取数据 - page_num = 30 + # page_num = 30 + page_num = 1 start_page = 1 # self.productSpider(graphics_card_link, page_num, start_page) # 清洗数据 @@ -1169,17 +1187,18 @@ class MemorySpider(JDSpider): "Ptable_params, title_name) VALUES (%(id)s, %(name)s, %(comment_num)s, %(praise_rate)s, %(shop_name)s, %(price)s"\ ", %(link)s, %(brand)s, %(frequency)s, %(total_capacity)s, %(memory_num)s, %(appearance)s, "\ "%(ddr_gen)s, %(introduction)s, %(Ptable_params)s, %(title_name)s)" - sql_insert = pymysql.escape_string(sql_insert) - cursor.executemany(sql_insert, new_data) - # for i in range(len(new_data)): - # cursor.execute(sql_insert, new_data[i]) + # sql_insert = escape_string(sql_insert) + # cursor.executemany(sql_insert, new_data) + for i in range(len(new_data)): + cursor.execute(sql_insert, new_data[i]) connection.commit() def main(self): memory_link = "https://list.jd.com/list.html?cat=670%2C677%2C680&psort=3&ev=210_1558%5E&psort=3&page=" # 爬取数据 - page_num = 40 + # page_num = 40 + page_num = 1 start_page = 1 # self.productSpider(memory_link, page_num, start_page) # 清洗数据 @@ -1366,9 +1385,10 @@ class CPURadiatorSpider(JDSpider): def main(self): radiator_link = "https://list.jd.com/list.html?cat=670%2C677%2C682&psort=3&ev=3680_97402%7C%7C97403%7C%7C106254%7C%7C106255%5E&psort=3&page=" # 爬取数据 - page_num = 27 + # page_num = 27 + page_num = 1 start_page = 1 - # self.productSpider(radiator_link, page_num, start_page) + self.productSpider(radiator_link, page_num, start_page) # 清洗数据 self.cleanCPURadiator() print("Successfully get CPU Radiator data!") @@ -1538,7 +1558,8 @@ class SSDSpider(JDSpider): def main(self): ssd_link = "https://list.jd.com/list.html?cat=670%2C677%2C11303&psort=3&psort=3&page=" - page_num = 36 # 一共爬了36页 + # page_num = 36 # 一共爬了36页 + page_num = 1 # 一共爬了36页 start_page = 1 # self.productSpider(ssd_link, page_num, start_page) @@ -1715,7 +1736,8 @@ class HDDSpider(JDSpider): def main(self): hdd_link = "https://list.jd.com/list.html?cat=670%2C677%2C683&psort=3&psort=3&page=" - page_num = 11 + # page_num = 11 + page_num = 1 start_page = 1 # self.productSpider(hdd_link, page_num, start_page) @@ -1871,7 +1893,8 @@ class PowerSupplySpider(JDSpider): def main(self): power_supply_link = "https://list.jd.com/list.html?cat=670%2C677%2C691&psort=3&psort=3&page=" - page_num = 25 # 抓25页 + # page_num = 25 # 抓25页 + page_num = 1 start_page = 1 # self.productSpider(power_supply_link, page_num, start_page) @@ -2056,9 +2079,10 @@ class CaseSpider(JDSpider): def main(self): case_link = "https://list.jd.com/list.html?cat=670%2C677%2C687&psort=3&psort=3&page=" - page_num = 36 + # page_num = 36 + page_num = 1 start_page = 1 - self.productSpider(case_link, page_num, start_page) + # self.productSpider(case_link, page_num, start_page) self.cleanCase() print("Successfully get Computer Case data!") @@ -2066,6 +2090,7 @@ class CaseSpider(JDSpider): if __name__ == "__main__": accessory_type = 'all' + accessory_type = 'motherboard' if accessory_type == 'cpu': cpu_spider = CPUSpider('cpu') cpu_spider.main() diff --git a/src/backend/JDSpiders/Spider/__pycache__/DBConfig.cpython-37.pyc b/src/backend/JDSpiders/Spider/__pycache__/DBConfig.cpython-37.pyc new file mode 100644 index 0000000..d7ad420 Binary files /dev/null and b/src/backend/JDSpiders/Spider/__pycache__/DBConfig.cpython-37.pyc differ diff --git a/src/backend/JDSpiders/Spider/__pycache__/DBConfig.cpython-38.pyc b/src/backend/JDSpiders/Spider/__pycache__/DBConfig.cpython-38.pyc new file mode 100644 index 0000000..159141f Binary files /dev/null and b/src/backend/JDSpiders/Spider/__pycache__/DBConfig.cpython-38.pyc differ diff --git a/src/backend/JDSpiders/Spider/__pycache__/JDSpider.cpython-37-PYTEST.pyc b/src/backend/JDSpiders/Spider/__pycache__/JDSpider.cpython-37-PYTEST.pyc new file mode 100644 index 0000000..eed40b7 Binary files /dev/null and b/src/backend/JDSpiders/Spider/__pycache__/JDSpider.cpython-37-PYTEST.pyc differ diff --git a/src/backend/JDSpiders/Spider/__pycache__/dataCleaning.cpython-37.pyc b/src/backend/JDSpiders/Spider/__pycache__/dataCleaning.cpython-37.pyc new file mode 100644 index 0000000..0ff869f Binary files /dev/null and b/src/backend/JDSpiders/Spider/__pycache__/dataCleaning.cpython-37.pyc differ