'''
Description:
    Compatibility check for a nine-part PC build.

    Input : a questionnaire JSON object with keys l1, ..., l9, each holding
            the product link of one accessory (in the order listed in
            CompatibilityChecking.ACCESSORIES).
    Output: a JSON object, either {"flag": 1} when every rule passes, or
            {"flag": 0, "errorList": ["xxxxxx", "xxxxxx"]} otherwise.
Author: Fishermanykx
LastEditors: Fishermanykx
LastEditTime: 2021-05-28 11:12:58
'''
import json
import re

from pprint import pprint  # kept for ad-hoc debugging of fetched rows

# NOTE(review): database credentials (root + password) are hard-coded in
# source control. They are left unchanged here so existing deployments keep
# working, but they should be moved to configuration / environment.
MYSQL_HOSTS = "127.0.0.1"
MYSQL_USER = "root"
MYSQL_PASSWORD = "08239015"
MYSQL_PORT = 3306
MYSQL_DB = "computer_accessories"

# Motherboard form factors that do NOT fit a case, keyed by the case's
# largest supported form factor. "MINI-ITX" cases are handled separately
# (they accept nothing but MINI-ITX); any other case value accepts everything.
_TOO_LARGE_FOR_CASE = {
    "M-ATX": ("ATX", "E-ATX"),
    "ATX": ("E-ATX",),
}


def _leading_int(value):
    """Return the integer formed by the leading digits of *value*.

    Accepts strings such as "2条" or "16条" as well as plain integers.
    Raises ValueError when *value* does not start with a digit.
    """
    match = re.match(r'\s*(\d+)', str(value))
    if match is None:
        raise ValueError("no leading integer in %r" % (value,))
    return int(match.group(1))


def _largest_radiator(spec):
    """Return the largest radiator size listed in a '~'-separated spec.

    *spec* is the case's ``supported_radiator`` value, e.g. "120~240~360",
    or "0" when liquid cooling is not supported. Entries that are not plain
    digits are ignored; an empty/absent spec yields 0.
    """
    sizes = [int(token) for token in str(spec or '').split('~')
             if token.strip().isdigit()]
    return max(sizes, default=0)


class CompatibilityChecking:
    """Load the accessories named by a questionnaire and check that they fit.

    Typical use: construct, call :meth:`main`, then :meth:`close`.
    """

    # Table queried for questionnaire key l1, l2, ..., l9 (in this order).
    ACCESSORIES = ('cpu', 'motherboard', 'graphics_card', 'memory',
                   'ssd', 'hdd', 'cpu_radiator', 'power_supply',
                   'computer_case')

    def __init__(self, questionnaire_path="questionnaire.json",
                 connection=None):
        """Read the questionnaire and open (or adopt) a database connection.

        Args:
            questionnaire_path: JSON file holding the l1..l9 links.
            connection: optional, already-open DB-API connection whose
                default cursor returns rows as dicts. When omitted, a
                PyMySQL connection is opened with the module constants.
        """
        # 1 = compatible, 0 = at least one error; updated by main().
        self.flag = 1
        with open(questionnaire_path, 'r', encoding='UTF-8') as f:
            self.questionnaire = json.load(f)  # dict: 'l1'..'l9' -> link

        if connection is None:
            # Imported here so the rule engine (check) can be used and
            # tested on machines without the MySQL driver installed.
            import pymysql
            connection = pymysql.connect(
                host=MYSQL_HOSTS,
                port=MYSQL_PORT,
                user=MYSQL_USER,
                password=MYSQL_PASSWORD,
                database=MYSQL_DB,
                charset="utf8",
                cursorclass=pymysql.cursors.DictCursor)
        self.db = connection
        self.cursor = self.db.cursor()

    def close(self):
        """Release the cursor and the database connection."""
        self.cursor.close()
        self.db.close()

    def parseQuestionnaire(self):
        """Fetch the database row of every accessory in the questionnaire.

        On success the rows are stored on ``self`` (``self.cpu``,
        ``self.motherboard``, ...) and ``[True]`` is returned. If a link has
        no matching row, returns ``[False, link]`` for the first such link.
        """
        rows = []
        for position, table in enumerate(self.ACCESSORIES, start=1):
            link = self.questionnaire['l' + str(position)]
            # The table name comes from the fixed tuple above; the link is
            # user-supplied, so it is passed as a bound parameter instead of
            # being spliced into the SQL text.
            self.cursor.execute(
                "select * from " + table + " where link = %s", (link,))
            row = self.cursor.fetchone()
            if not row:
                return [False, link]
            rows.append(row)

        (self.cpu, self.motherboard, self.graphics_card, self.memory,
         self.ssd, self.hdd, self.cpu_radiator, self.power_supply,
         self.computer_case) = rows
        return [True]

    def check(self):
        """Run every compatibility rule on the loaded accessories.

        Returns ``[True]`` when all rules pass, otherwise
        ``[False, errorList]`` with one message per violated rule, in the
        order the rules are evaluated.
        """
        errorList = []

        # CPU socket vs. motherboard socket.
        cpu_socket = self.cpu['socket']
        if cpu_socket != self.motherboard['cpu_socket']:
            errorList.append("主板与CPU接口不匹配")

        # CPU socket vs. the cooler's supported sockets ('~'-joined string;
        # substring match, as before).
        if cpu_socket not in (self.cpu_radiator['socket'] or ''):
            errorList.append("散热器支持接口与CPU接口不匹配")

        # Memory generation vs. motherboard.
        if self.motherboard['ddr_gen'] != self.memory['ddr_gen']:
            errorList.append("主板支持内存的代数与所选内存不匹配")

        # Memory capacity vs. motherboard maximum.
        if self.memory['total_capacity'] > self.motherboard['max_memory']:
            errorList.append("所选内存容量大于主板支持的最大内存容量")

        # Number of sticks vs. motherboard slots. All leading digits are
        # read, so a kit of 16 sticks is not mistaken for 1.
        if _leading_int(self.memory['memory_num']) > self.motherboard['slot_num']:
            errorList.append("所选内存数量大于所选主板上的插槽数量")

        # --- Case-related rules ---

        # Graphics card length.
        if self.graphics_card['card_length'] > self.computer_case['max_card_len']:
            errorList.append("所选显卡长度大于所选机箱允许的最大卡长")

        # Motherboard form factor.
        max_form_factor = self.computer_case['max_form_factor']
        form_factor = self.motherboard['form_factor']
        if max_form_factor == "MINI-ITX":
            board_fits = form_factor == "MINI-ITX"
        else:
            board_fits = form_factor not in _TOO_LARGE_FOR_CASE.get(
                max_form_factor, ())
        if not board_fits:
            errorList.append("所选主板板型大于所选机箱能容纳的最大主板大小")

        # --- Cooler-related rules ---
        radiator_size = self.cpu_radiator['radiator_size']  # falsy => air cooler
        height = self.cpu_radiator['height']
        if not height:
            errorList.append("散热器参数不足,无法判断")
        elif not radiator_size:
            # Air cooler: compare against the case's cooler height limit.
            if height > self.computer_case['max_radiator_height']:
                errorList.append("所选风冷散热器高度大于所选机箱散热器限高")
        else:
            # Liquid cooler: compare against the largest radiator the case
            # lists, regardless of the order the sizes are stored in.
            max_water_cooling = _largest_radiator(
                self.computer_case['supported_radiator'])
            if radiator_size > max_water_cooling:
                errorList.append("所选水冷尺寸大于所选机箱支持的最大尺寸")

        # M.2 SSD needs at least one M.2 slot on the motherboard.
        if (not self.motherboard['m2_num']) and (self.ssd['interface'] == 'M.2'):
            errorList.append("所选主板上没有M.2接口,故不支持M.2接口的固态硬盘")

        # Power supply size: MINI-ITX cases cannot take ATX / server units.
        power_size = self.power_supply['size']
        if max_form_factor == "MINI-ITX" and power_size in ("ATX", "服务器电源"):
            errorList.append("所选机箱无法容纳所选电源")

        if errorList:
            return [False, errorList]
        return [True]

    def main(self, result_path="result.json"):
        """Parse, check, write the verdict to *result_path* and return it.

        The verdict is ``{"flag": 1}`` or
        ``{"flag": 0, "errorList": [...]}``; it is also printed.
        """
        errorList = []
        parse_res = self.parseQuestionnaire()
        if parse_res[0]:
            check_res = self.check()
            if not check_res[0]:
                errorList = check_res[1]
        else:
            errorList.append(
                "数据库中不存在链接 " + str(parse_res[1]) + " 指向的商品")

        if errorList:
            self.flag = 0
            res = {"flag": self.flag, "errorList": errorList}
        else:
            # Reset explicitly so a successful run after a failed one does
            # not keep reporting flag 0.
            self.flag = 1
            res = {"flag": self.flag}

        with open(result_path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(res))

        # for debugging
        print(res)
        return res


if __name__ == "__main__":
    check = CompatibilityChecking()
    try:
        check.main()
    finally:
        check.close()
"https://item.jd.com/100002404996.html" +} \ No newline at end of file diff --git a/src/backend/Compatibility_Checking/questionnaire_bak.json b/src/backend/Compatibility_Checking/questionnaire_bak.json new file mode 100644 index 0000000..598de4c --- /dev/null +++ b/src/backend/Compatibility_Checking/questionnaire_bak.json @@ -0,0 +1,11 @@ +{ + "l1": "https://item.jd.com/100006391078.html", // CPU + "l2": "https://item.jd.com/100003809901.html", // 主板 + "l3": "https://item.jd.com/100009115115.html", // 显卡 + "l4": "https://item.jd.com/100005116786.html", // 内存 + "l5": "https://item.jd.com/100005926991.html", // 固态硬盘 + "l6": "https://item.jd.com/675971.html", // 机械硬盘 + "l7": "https://item.jd.com/100011674030.html", // CPU散热器 + "l8": "https://item.jd.com/6828141.html", // 电源 + "l9": "https://item.jd.com/100002404996.html" // 机箱 +} \ No newline at end of file diff --git a/src/backend/Compatibility_Checking/questionnaires.json b/src/backend/Compatibility_Checking/questionnaires.json new file mode 100644 index 0000000..c856474 --- /dev/null +++ b/src/backend/Compatibility_Checking/questionnaires.json @@ -0,0 +1,46 @@ +[ +// 正确用例 +{ + "l1": "https://item.jd.com/100006391078.html", "l2": "https://item.jd.com/100003809901.html", "l3": "https://item.jd.com/100009115115.html", "l4": "https://item.jd.com/100005116786.html", "l5": "https://item.jd.com/100005926991.html", "l6": "https://item.jd.com/675971.html", "l7": "https://item.jd.com/100011674030.html", "l8": "https://item.jd.com/6828141.html", "l9": "https://item.jd.com/100002404996.html" +}, +// 错误用例1:商品链接错误 +{ + "l1": "https://item.jd.com/100006391078111.html", "l2": "https://item.jd.com/100003809901.html", "l3": "https://item.jd.com/100009115115.html", "l4": "https://item.jd.com/100005116786.html", "l5": "https://item.jd.com/100005926991.html", "l6": "https://item.jd.com/675971.html", "l7": "https://item.jd.com/100011674030.html", "l8": "https://item.jd.com/6828141.html", "l9": "https://item.jd.com/100002404996.html" +}, 
+// 错误用例2:CPU和主板不匹配 +{ + "l1": "https://item.jd.com/100008667315.html", "l2": "https://item.jd.com/100003809901.html", "l3": "https://item.jd.com/100009115115.html", "l4": "https://item.jd.com/100005116786.html", "l5": "https://item.jd.com/100005926991.html", "l6": "https://item.jd.com/675971.html", "l7": "https://item.jd.com/100011674030.html", "l8": "https://item.jd.com/6828141.html", "l9": "https://item.jd.com/100002404996.html" +}, +// 错误用例3:内存条和主板不匹配 +{ + "l1": "https://item.jd.com/100006391078.html", "l2": "https://item.jd.com/100003809901.html", "l3": "https://item.jd.com/100009115115.html", "l4": "https://item.jd.com/664483.html", "l5": "https://item.jd.com/100005926991.html", "l6": "https://item.jd.com/675971.html", "l7": "https://item.jd.com/100011674030.html", "l8": "https://item.jd.com/6828141.html", "l9": "https://item.jd.com/100002404996.html" +}, +// 错误用例4:显卡太长 +{ + "l1": "https://item.jd.com/100006391078.html", "l2": "https://item.jd.com/100007523963.html", "l3": "https://item.jd.com/100016672370.html", "l4": "https://item.jd.com/100005116786.html", "l5": "https://item.jd.com/100005926991.html", "l6": "https://item.jd.com/675971.html", "l7": "https://item.jd.com/3762171.html", "l8": "https://item.jd.com/100009390674.html", "l9": "https://item.jd.com/100007000176.html" +}, +// 错误用例5:主板太大 +{ + "l1": "https://item.jd.com/100006391078.html", "l2": "https://item.jd.com/100003809901.html", "l3": "https://item.jd.com/100009115115.html", "l4": "https://item.jd.com/100005116786.html", "l5": "https://item.jd.com/100005926991.html", "l6": "https://item.jd.com/675971.html", "l7": "https://item.jd.com/3762171.html", "l8": "https://item.jd.com/6828141.html", "l9": "https://item.jd.com/4293268.html" +}, +// 错误用例6:电源太大 +{ + "l1": "https://item.jd.com/100006391078.html", "l2": "https://item.jd.com/100007523963.html", "l3": "https://item.jd.com/4211287.html", "l4": "https://item.jd.com/100005116786.html", "l5": "https://item.jd.com/100005926991.html", "l6": 
"https://item.jd.com/675971.html", "l7": "https://item.jd.com/3762171.html", "l8": "https://item.jd.com/6828141.html", "l9": "https://item.jd.com/100007000176.html" +}, +// 错误用例7:水冷太大 +{ + "l1": "https://item.jd.com/100006391078.html", "l2": "https://item.jd.com/100007523963.html", "l3": "https://item.jd.com/4211287.html", "l4": "https://item.jd.com/100005116786.html", "l5": "https://item.jd.com/100005926991.html", "l6": "https://item.jd.com/675971.html", "l7": "https://item.jd.com/100011674030.html", "l8": "https://item.jd.com/100009390674.html", "l9": "https://item.jd.com/100007000176.html" +}, +// 错误用例7:风冷太高 +{ + "l1": "https://item.jd.com/100006391078.html", "l2": "https://item.jd.com/100007523963.html", "l3": "https://item.jd.com/4211287.html", "l4": "https://item.jd.com/100005116786.html", "l5": "https://item.jd.com/100005926991.html", "l6": "https://item.jd.com/675971.html", "l7": "https://item.jd.com/598827.html", "l8": "https://item.jd.com/100009390674.html", "l9": "https://item.jd.com/100007000176.html" +}, +// 错误用例8:主板上没有 M2 接口 +{ + "l1": "https://item.jd.com/100006391078.html", "l2": "https://item.jd.com/69193529317.html", "l3": "https://item.jd.com/4211287.html", "l4": "https://item.jd.com/100005116786.html", "l5": "https://item.jd.com/100005926991.html", "l6": "https://item.jd.com/675971.html", "l7": "https://item.jd.com/3762171.html", "l8": "https://item.jd.com/100009390674.html", "l9": "https://item.jd.com/100007000176.html" +}, +// 错误用例9:内存容量大于主板支持总容量,内存条数大于主板插槽数(没错我说的就是256G皇家戟) +{ + "l1": "https://item.jd.com/100006391078.html", "l2": "https://item.jd.com/100003809901.html", "l3": "https://item.jd.com/100009115115.html", "l4": "https://item.jd.com/100012759442.html", "l5": "https://item.jd.com/100005926991.html", "l6": "https://item.jd.com/675971.html", "l7": "https://item.jd.com/100011674030.html", "l8": "https://item.jd.com/6828141.html", "l9": "https://item.jd.com/100002404996.html" +} +] \ No newline at end of file diff --git 
a/src/backend/Compatibility_Checking/result.json b/src/backend/Compatibility_Checking/result.json new file mode 100644 index 0000000..5d1f8ab --- /dev/null +++ b/src/backend/Compatibility_Checking/result.json @@ -0,0 +1 @@ +{"flag": 0, "errorList": ["\u6240\u9009\u5185\u5b58\u5bb9\u91cf\u5927\u4e8e\u4e3b\u677f\u652f\u6301\u7684\u6700\u5927\u5185\u5b58\u5bb9\u91cf", "\u6240\u9009\u5185\u5b58\u6570\u91cf\u5927\u4e8e\u6240\u9009\u4e3b\u677f\u4e0a\u7684\u63d2\u69fd\u6570\u91cf"]} \ No newline at end of file diff --git a/src/backend/Data_Cleansing/data_cleansing.py b/src/backend/Data_Cleansing/data_cleansing.py new file mode 100644 index 0000000..9074beb --- /dev/null +++ b/src/backend/Data_Cleansing/data_cleansing.py @@ -0,0 +1,1055 @@ +''' +Description: +Author: Fishermanykx +LastEditors: Fishermanykx +LastEditTime: 2021-05-28 11:13:27 +''' +import re +import pymysql + +MYSQL_HOSTS = "127.0.0.1" +MYSQL_USER = "root" +MYSQL_PASSWORD = "08239015" +MYSQL_PORT = 3306 +MYSQL_DB = "test" +# MYSQL_DB = "computer_accessories" + + +class DataCleaning: + def __init__(self, accessory_type): + self.accessory_type = accessory_type + + def washComments(self, comment_num, praise_rate): + # 清洗好评率 + praise_rate = int(praise_rate[:-1]) + # 清洗评论数 + pat = r'\d*(\.)?\d*' + base = 1 + try: + if '万' in comment_num: + base *= 10**4 + res = re.match(pat, comment_num) + comment_num = eval(res.group()) * base + except: + comment_num = 100 + return int(comment_num), praise_rate + + def handleSingleCPURecrod(self, record): + record['price'] = int(record['price']) + introd = eval(record['introduction']) + p_table = eval(record['Ptable_params']) + + # TDP + record['TDP'] = int(p_table['规格']['功率'][:-1]) + # 处理接口 + socket = introd["接口"] + if "1151" in socket: + socket = "INTEL LGA1151" + elif "其他" in socket: + socket = "INTEL LGA1151" + record["socket"] = socket + # 清洗评论和好评率 + record['comment_num'], record['praise_rate'] = self.washComments( + record['comment_num'], record['praise_rate']) + # 清洗 
brand + record['brand'] = self.washBrand(record['brand']) + + return record + + def cleanCPU(self): + ''' + description: 清洗 CPU 数据 + ''' + connection = pymysql.connect( + host=MYSQL_HOSTS, + port=MYSQL_PORT, + user=MYSQL_USER, + passwd=MYSQL_PASSWORD, + db=MYSQL_DB, + charset="utf8") + # 获取游标 + cursor = connection.cursor(cursor=pymysql.cursors.DictCursor) + + cursor.execute("drop table " + self.accessory_type) + cursor.execute( + "create table test."+self.accessory_type+" as select * from computer_accessories."+self.accessory_type) + + instruct = "alter table " + self.accessory_type + " modify price int default NULL" + row = cursor.execute(instruct) # 返回被影响的行数 + cursor.execute("select * from " + self.accessory_type) + data = cursor.fetchall() # 以字典列表的形式读出表中所有数据 + + # 增加列 + add_col = "alter table cpu add column TDP int NOT NULL default 0 AFTER core_num " + cursor.execute(add_col) + add_col = "alter table cpu add column socket VARCHAR(255) default NULL AFTER TDP" + cursor.execute(add_col) + + # 数据清洗 + data_len = len(data) + # print(data_len) + for i in range(data_len): + # 逐条数据处理 + data[i] = self.handleSingleCPURecrod(data[i]) + + # 清空表 + cursor.execute("truncate table " + self.accessory_type) + # 将评论数和好评率修改为 int 类型 + instruct = "alter table "+self.accessory_type+" modify comment_num int default 0" + row = cursor.execute(instruct) # 返回被影响的行数 + instruct = "alter table "+self.accessory_type+" modify praise_rate int default 0" + row = cursor.execute(instruct) # 返回被影响的行数 + # 重新写入 + sql_insert = "INSERT INTO cpu (id, name, comment_num, praise_rate, shop_name, price, link,"\ + "brand, tags, clock_speed, core_num, TDP, socket, have_core_graphics_card, have_cpu_fan, introduction, "\ + "Ptable_params) VALUES (%(id)s, %(name)s, %(comment_num)s, %(praise_rate)s, %(shop_name)s, %(price)s, "\ + "%(link)s, %(brand)s, %(tags)s, %(clock_speed)s, %(core_num)s, %(TDP)s, %(socket)s, "\ + "%(have_core_graphics_card)s, %(have_cpu_fan)s, %(introduction)s, %(Ptable_params)s)" + 
cursor.executemany(sql_insert, data) + + connection.commit() + + def washBrand(self, brand): + if "Intel" in brand: + return "INTEL" + elif 'AMD' in brand: + return "AMD" + + index = brand.find('(') + if brand == 'Thermaltake(Tt)': + return "TT" + + if index != -1: + brand = brand[:index] + else: + # Special cases + if brand == 'Crucial': + brand = '英睿达' + elif brand == 'HP': + brand = '惠普' + elif brand == 'uFound': + brand = '方正' + elif brand == 'pioneer': + brand = '先锋' + elif brand == 'HIKVISION': + brand = '海康威视' + elif brand == 'dahua': + brand = '大华' + elif brand == 'SEASONIC': + brand = '海韵' + elif brand == 'SUPER FLOWER': + brand = '振华' + elif brand == 'INWIN': + brand = '迎广' + elif brand == 'NZXT': + brand = '恩杰' + elif brand == 'be quiet': + brand = '德商必酷' + elif brand == 'XPG': + brand = '威刚' + elif brand == 'LIANLI': + brand = '联立' + elif brand == 'BitFenix': + brand = '火鸟' + elif brand == 'Fractal Design': + brand = '分形工艺' + elif brand == 'METALLICGEAR': + brand = '普力魔' + elif brand == 'Thermalright': + brand = '利民' + elif brand == 'noctua': + brand = '猫头鹰' + elif brand == 'PHANTEKS': + brand = '追风者' + + # 注意:没被处理的有:EVGA, zero zone(这TM还是家国内公司。。。), ID_COOLING + + return brand + + def handleSingleMotherboardRecord(self, record): + record['price'] = int(record['price']) + introd = eval(record['introduction']) + p_table = eval(record['Ptable_params']) + # form_factor 去中文 + s = record['form_factor'] + record['form_factor'] = s[:s.find('(')] + # platform + record['platform'] = self.washBrand(record['platform'][:-2]) + # 加入 CPU 接口列 + try: + ss = introd['适用CPU接口'] + index = ss.find('(') + if (index != -1): + ss = ss[:index] + ss = ss.split(",")[0] + if ss == 'INTEL1151': + ss = 'INTEL LGA1151' + record['cpu_socket'] = ss + except: + print(introd) + exit(1) + # 加入 m2 接口数 列 + record['m2_num'] = int(introd.get('M.2接口数量', 0)) + # 清洗品牌 + record['brand'] = self.washBrand(record['brand']) + # 清洗评论和好评率 + record['comment_num'], record['praise_rate'] = 
self.washComments( + record['comment_num'], record['praise_rate']) + + return record + + def cleanMotherboard(self): + connection = pymysql.connect( + host=MYSQL_HOSTS, + port=MYSQL_PORT, + user=MYSQL_USER, + passwd=MYSQL_PASSWORD, + db=MYSQL_DB, + charset="utf8") + # 获取游标 + cursor = connection.cursor(cursor=pymysql.cursors.DictCursor) + + cursor.execute("drop table " + self.accessory_type) + cursor.execute( + "create table test."+self.accessory_type+" as select * from computer_accessories."+self.accessory_type) + + instruct = "alter table " + self.accessory_type + " modify price int default NULL" + row = cursor.execute(instruct) # 返回被影响的行数 + cursor.execute("select * from " + self.accessory_type) + data = cursor.fetchall() # 以字典列表的形式读出表中所有数据 + + # 插入 CPU 接口列和 m.2 接口数 + add_col = "alter table " + self.accessory_type + \ + " add column cpu_socket VARCHAR(255) default NULL AFTER platform " + cursor.execute(add_col) + add_col = "alter table " + self.accessory_type + \ + " add column m2_num int NOT NULL default 0 AFTER cpu_socket" + cursor.execute(add_col) + + # 数据清洗 + data_len = len(data) + # print(data_len) + new_data = [] + for i in range(data_len): + # 逐条数据处理 + # 判定是否为板-U套装 + record = data[i] + if record['comment_num'] == '100': # 抓到板-U套装了 + continue + if '板U套装' in record['name']: + continue + # 清洗数据 + new_data.append(self.handleSingleMotherboardRecord(record)) + + # 清空表 + cursor.execute("truncate table " + self.accessory_type) + # 将评论数和好评率修改为 int 类型 + instruct = "alter table motherboard modify comment_num int default 0" + row = cursor.execute(instruct) # 返回被影响的行数 + instruct = "alter table motherboard modify praise_rate int default 0" + row = cursor.execute(instruct) # 返回被影响的行数 + # 重新写入 + sql_insert = "INSERT INTO motherboard (id, name, comment_num, praise_rate, shop_name, price, link,"\ + "brand, tags, form_factor, platform, cpu_socket, m2_num, introduction, Ptable_params) VALUES (%(id)s, %(name)s,"\ + " %(comment_num)s, %(praise_rate)s, %(shop_name)s, %(price)s, 
%(link)s, %(brand)s, %(tags)s, %(form_factor)s, "\ + "%(platform)s, %(cpu_socket)s, %(m2_num)s, %(introduction)s, %(Ptable_params)s)" + cursor.executemany(sql_insert, new_data) + + connection.commit() + + def handleSingleGraphicsCard(self, record): + record['price'] = int(record['price']) + introd = eval(record['introduction']) + p_table = eval(record['Ptable_params']) + + # 显卡类别 + amd = introd.get('AMD芯片', "其他") + nv = introd.get('NVIDIA芯片', "其他") + if nv == '其他': + record['card_type'] = 'AMD' + else: + record['card_type'] = 'NVIDIA' + # 修正卡长 + record['card_length'] = eval(record['card_length']) + # 修正 RGB 列 + rgb = record['rgb'] + if rgb == '单色': + record['rgb'] = '支持RGB' + # 清洗评论和好评率 + record['comment_num'], record['praise_rate'] = self.washComments( + record['comment_num'], record['praise_rate']) + # 清洗 brand + record['brand'] = self.washBrand(record['brand']) + + # if record['brand'] == '磐镭': + # record['card_length'] /= 10 + + return record + + def cleanGraphicsCard(self): + connection = pymysql.connect( + host=MYSQL_HOSTS, + port=MYSQL_PORT, + user=MYSQL_USER, + passwd=MYSQL_PASSWORD, + db=MYSQL_DB, + charset="utf8") + # 获取游标 + cursor = connection.cursor(cursor=pymysql.cursors.DictCursor) + + cursor.execute("drop table " + self.accessory_type) + cursor.execute( + "create table test."+self.accessory_type+" as select * from computer_accessories."+self.accessory_type) + + instruct = "alter table " + self.accessory_type + " modify price int default NULL" + row = cursor.execute(instruct) # 返回被影响的行数 + cursor.execute("select * from " + self.accessory_type) + data = cursor.fetchall() # 以字典列表的形式读出表中所有数据 + + # 插入 显卡类别列 (AMD/NVIDIA) + add_col = "alter table " + self.accessory_type + \ + " add column card_type VARCHAR(255) default NULL AFTER rgb " + cursor.execute(add_col) + + # 数据清洗 + data_len = len(data) + # print(data_len) + for i in range(data_len): + # 逐条数据处理 + record = data[i] + # 清洗数据 + data[i] = self.handleSingleGraphicsCard(record) + + # 清空表 + 
cursor.execute("truncate table " + self.accessory_type) + # 将评论数和好评率修改为 int 类型 + instruct = "alter table "+self.accessory_type+" modify comment_num int default 0" + row = cursor.execute(instruct) # 返回被影响的行数 + instruct = "alter table "+self.accessory_type+" modify praise_rate int default 0" + row = cursor.execute(instruct) # 返回被影响的行数 + instruct = "alter table "+self.accessory_type + \ + " modify card_length float default 0.0" + row = cursor.execute(instruct) # 返回被影响的行数 + # 重新写入 + sql_insert = "INSERT INTO video_card (id, name, comment_num, praise_rate, shop_name, price, link,"\ + "brand, tags, card_length, rgb, card_type, introduction, Ptable_params) VALUES (%(id)s, %(name)s, %(comment_num)s,"\ + "%(praise_rate)s, %(shop_name)s, %(price)s, %(link)s, %(brand)s, %(tags)s, %(card_length)s, "\ + "%(rgb)s, %(card_type)s, %(introduction)s, %(Ptable_params)s)" + cursor.executemany(sql_insert, data) + + connection.commit() + + def handleSingleCPURadiator(self, record): + record['price'] = int(record['price']) + introd = eval(record['introduction']) + p_table = eval(record['Ptable_params']) + + # 散热器高度 + try: + h = p_table['规格']['散热器高度'] + h = int(h[:-2]) + except: + h = 0 + record['height'] = h + + # 冷排大小 + cooling_size = introd.get('水冷类型', '0风冷') + cooling_size = cooling_size[:-2] + try: + record['radiator_size'] = int(cooling_size) + except: + record['radiator_size'] = 0 + + # RGB + rgb = introd.get('发光类型', '无') + if '无' in rgb: + rgb = '无' + elif 'RGB' not in rgb: + rgb = 'RGB' + record['rgb'] = rgb + + # 兼容接口 + try: + socket_str = introd.get('兼容接口', "") + socket_str = socket_str.split(',') + res_socket = "" + for item in socket_str: + index = item.find('(') + if index != -1: + item = item[:index] + item = item.strip() + if item == "INTEL1151": + item = 'INTEL LGA1151' + res_socket += item + res_socket += '~' + res_socket = res_socket[:-1] + except: + print(record) + exit(1) + record['socket'] = res_socket + + # 清洗评论和好评率 + record['comment_num'], record['praise_rate'] = 
self.washComments( + record['comment_num'], record['praise_rate']) + # 清洗 brand + record['brand'] = self.washBrand(record['brand']) + + return record + + def cleanCPURadiator(self): + connection = pymysql.connect( + host=MYSQL_HOSTS, + port=MYSQL_PORT, + user=MYSQL_USER, + passwd=MYSQL_PASSWORD, + db=MYSQL_DB, + charset="utf8") + # 获取游标 + cursor = connection.cursor(cursor=pymysql.cursors.DictCursor) + + cursor.execute("drop table " + self.accessory_type) + cursor.execute( + "create table test."+self.accessory_type+" as select * from computer_accessories."+self.accessory_type) + + instruct = "alter table " + self.accessory_type + " modify price int default NULL" + row = cursor.execute(instruct) # 返回被影响的行数 + cursor.execute("select * from " + self.accessory_type) + data = cursor.fetchall() # 以字典列表的形式读出表中所有数据 + + # 插入 散热器高度 (mm) + add_col = "alter table " + self.accessory_type + \ + " add column height int default NULL AFTER brand " + cursor.execute(add_col) + # 插入 兼容接口 列 + add_col = "alter table " + self.accessory_type + \ + " add column socket VARCHAR(255) default NULL AFTER height " + cursor.execute(add_col) + # 插入 冷排 列 + add_col = "alter table " + self.accessory_type + \ + " add column radiator_size int default NULL AFTER socket " + cursor.execute(add_col) + # 插入 RGB 列 + add_col = "alter table " + self.accessory_type + \ + " add column rgb VARCHAR(255) default NULL AFTER radiator_size " + cursor.execute(add_col) + + # 数据清洗 + data_len = len(data) + # print(data_len) + for i in range(data_len): + # 逐条数据处理 + record = data[i] + # 清洗数据 + data[i] = self.handleSingleCPURadiator(record) + + # 清空表 + cursor.execute("truncate table " + self.accessory_type) + # 将评论数和好评率修改为 int 类型 + instruct = "alter table "+self.accessory_type+" modify comment_num int default 0" + row = cursor.execute(instruct) # 返回被影响的行数 + instruct = "alter table "+self.accessory_type+" modify praise_rate int default 0" + row = cursor.execute(instruct) # 返回被影响的行数 + + # 重新写入 + sql_insert = "INSERT INTO 
cpu_radiator (id, name, comment_num, praise_rate, shop_name, price, link,"\ + "brand, height, socket, radiator_size, rgb, introduction, Ptable_params) VALUES (%(id)s, %(name)s, %(comment_num)s,"\ + "%(praise_rate)s, %(shop_name)s, %(price)s, %(link)s, %(brand)s, %(height)s, %(socket)s, "\ + "%(radiator_size)s, %(rgb)s, %(introduction)s, %(Ptable_params)s)" + cursor.executemany(sql_insert, data) + + connection.commit() + + def handleSingleCase(self, record): + record['price'] = int(record['price']) + introd = eval(record['introduction']) + p_table = eval(record['Ptable_params']) + + # 最大板型 + boards = introd.get('支持主板', '') + if boards: + boards = boards.split(',')[0] + boards = boards[:boards.find('(')] + record['max_form_factor'] = boards + + # 最大卡长 + max_l = p_table['规格'].get('显卡(连供电头总长度)限长', 0) + if max_l: + max_l = int(max_l[1:-2]) + record['max_card_len'] = max_l + + # 散热器限高 + max_h = p_table['规格'].get('CPU散热器限高', 0) + if max_h: + pat = r'\d*' + max_h = re.match(pat, max_h).group() + record['max_radiator_height'] = int(max_h) + + # 支持的冷排 + sup_rad = introd.get('支持水冷', '0') + res = sup_rad + + if sup_rad != '0': + sup_rad = sup_rad.split(',') + res = "" + for i in range(len(sup_rad)): + sup_rad[i] = sup_rad[i][:-2] + res = res + sup_rad[i] + '~' + res = res[:-1] + if res == '不': + res = '0' + record["supported_radiator"] = res + + # 清洗评论和好评率 + record['comment_num'], record['praise_rate'] = self.washComments( + record['comment_num'], record['praise_rate']) + # 清洗 brand + record['brand'] = self.washBrand(record['brand']) + + return record + + def cleanCase(self): + connection = pymysql.connect( + host=MYSQL_HOSTS, + port=MYSQL_PORT, + user=MYSQL_USER, + passwd=MYSQL_PASSWORD, + db=MYSQL_DB, + charset="utf8") + # 获取游标 + cursor = connection.cursor(cursor=pymysql.cursors.DictCursor) + + cursor.execute("drop table " + self.accessory_type) + cursor.execute( + "create table test."+self.accessory_type+" as select * from computer_accessories."+self.accessory_type) + + 
instruct = "alter table " + self.accessory_type + " modify price int default NULL" + row = cursor.execute(instruct) # 返回被影响的行数 + cursor.execute("select * from " + self.accessory_type) + data = cursor.fetchall() # 以字典列表的形式读出表中所有数据 + + # 插入 最大板型 + add_col = "alter table " + self.accessory_type + \ + " add column max_form_factor VARCHAR(255) default NULL AFTER brand " + cursor.execute(add_col) + # 插入 最大显卡长度 列 + add_col = "alter table " + self.accessory_type + \ + " add column max_card_len int default NULL AFTER max_form_factor " + cursor.execute(add_col) + # 插入 散热器限高 列 + add_col = "alter table " + self.accessory_type + \ + " add column max_radiator_height int default NULL AFTER max_card_len " + cursor.execute(add_col) + # 插入 散热器规格支持 列 + add_col = "alter table " + self.accessory_type + \ + " add column supported_radiator VARCHAR(255) default NULL AFTER max_radiator_height " + cursor.execute(add_col) + + # 数据清洗 + data_len = len(data) + # print(data_len) + new_data = [] + for i in range(data_len): + # 逐条数据处理 + record = data[i] + p_table = eval(record['Ptable_params']) + max_l = p_table['规格'].get('显卡(连供电头总长度)限长', 0) + max_h = p_table['规格'].get('CPU散热器限高', 0) + name = record['name'] + if '套装' in name: + continue + # 清洗数据 + if max_h and max_l: + new_data.append(self.handleSingleCase(record)) + + # 清空表 + cursor.execute("truncate table " + self.accessory_type) + # 将评论数和好评率修改为 int 类型 + instruct = "alter table "+self.accessory_type+" modify comment_num int default 0" + row = cursor.execute(instruct) # 返回被影响的行数 + instruct = "alter table "+self.accessory_type+" modify praise_rate int default 0" + row = cursor.execute(instruct) # 返回被影响的行数 + + # 重新写入 + sql_insert = "INSERT INTO computer_case (id, name, comment_num, praise_rate, shop_name, price, link,"\ + "brand, max_form_factor, max_card_len, max_radiator_height, supported_radiator, introduction, Ptable_params)"\ + " VALUES (%(id)s, %(name)s, %(comment_num)s, %(praise_rate)s, %(shop_name)s, %(price)s, %(link)s, %(brand)s, "\ + 
"%(max_form_factor)s, %(max_card_len)s, %(max_radiator_height)s, %(supported_radiator)s, %(introduction)s, %(Ptable_params)s)" + + cursor.executemany(sql_insert, new_data) + + connection.commit() + + def handleSingleHDD(self, record): + record['price'] = int(record['price']) + introd = eval(record['introduction']) + p_table = eval(record['Ptable_params']) + + # 清洗容量 (以TB为单位的0.25的倍数的浮点数) + capacity = record['total_capacity'] + dig = capacity[:-2] + if dig != '12' and dig.isdigit(): + capacity = dig + else: + name = record['name'] + pat = r'\d+T|\d+G|\d*\.\d*T' + res = re.search(pat, name) + if res: + res = res.group() + else: + res = '0T' + if res and res[-1] == 'T': + capacity = round(eval(res[:-1])) + elif res and res[-1] == 'G': + capacity = eval(res[:-1]) // 250 * 0.25 + else: + print("Error in converting capacity") + print(res) + print(record) + exit(1) + record['total_capacity'] = capacity + + # 清洗 尺寸 + try: + size = p_table['特性'].get('产品尺寸(mm)', '0x0x0') + except: + size = '0x0x0' + if '×' in size: + size = size.replace('×', '*') + if 'x' in size: + size = size.replace('x', '*') + if 'X' in size: + size = size.replace('X', '*') + new_size = "" + if len(size) > 9: + lis = size.split('*') + pat = r'\d*\.\d*|\d*' + for item in lis: + res = re.findall(pat, item) + res = list(filter(None, res))[0] + new_size = new_size + (res + '*') + new_size = new_size[:-1] + else: + pat = r'\d*\.\d*|\d*' + res = re.search(pat, size.strip()).group() + new_size = res + ' inches' + record['size'] = new_size + + # 清洗评论和好评率 + record['comment_num'], record['praise_rate'] = self.washComments( + record['comment_num'], record['praise_rate']) + # 清洗 brand + record['brand'] = self.washBrand(record['brand']) + + return record + + def cleanHDD(self): + connection = pymysql.connect( + host=MYSQL_HOSTS, + port=MYSQL_PORT, + user=MYSQL_USER, + passwd=MYSQL_PASSWORD, + db=MYSQL_DB, + charset="utf8") + # 获取游标 + cursor = connection.cursor(cursor=pymysql.cursors.DictCursor) + + 
def handleSingleHDD(self, record):
    """Normalise one raw ``hdd`` row in place and return it.

    Rewrites ``price`` (int), ``total_capacity`` (number of TB), ``size``
    (``"a*b*c"`` dimensions or ``"<n> inches"``), ``comment_num``,
    ``praise_rate`` and ``brand``.

    Args:
        record: dict for one row; must contain ``price``, ``name``,
            ``total_capacity``, ``Ptable_params``, ``comment_num``,
            ``praise_rate`` and ``brand``.

    Returns:
        The same ``record`` object, mutated.

    Raises:
        ValueError / SyntaxError: if ``Ptable_params`` is not a Python literal.
    """
    # Local import: the module's import block is outside this block.
    import ast

    record['price'] = int(record['price'])
    # literal_eval only accepts plain literals, so stored text can never run
    # code. The unused eval of record['introduction'] was dropped.
    p_table = ast.literal_eval(record['Ptable_params'])

    # Capacity in TB. The stored value is "<digits>" plus a two-character
    # suffix; "12" is deliberately re-derived from the product name.
    # NOTE(review): presumably "12" is a catch-all bucket label - confirm.
    capacity = record['total_capacity']
    digits = capacity[:-2]
    if digits != '12' and digits.isdigit():
        capacity = int(digits)
    else:
        match = re.search(r'\d+T|\d+G|\d*\.\d*T', record['name'])
        # The pattern only yields tokens ending in T or G, and the fallback
        # ends in T, so no third "error" case exists.
        token = match.group() if match else '0T'
        number, unit = token[:-1], token[-1]
        if unit == 'T':
            try:
                capacity = round(float(number))
            except ValueError:
                # The pattern can match a bare ".T"; treat it as unknown.
                capacity = 0
        else:
            # Round down to a multiple of 0.25 TB.
            capacity = int(number) // 250 * 0.25
    record['total_capacity'] = capacity

    # Physical size; fall back to the placeholder when the section is missing.
    try:
        size = p_table['特性'].get('产品尺寸(mm)', '0x0x0')
    except (KeyError, TypeError, AttributeError):
        size = '0x0x0'
    size = size.replace('×', '*').replace('x', '*').replace('X', '*')

    number_pat = r'\d*\.\d*|\d*'
    if len(size) > 9:
        # Long values are treated as "a*b*c" dimensions: keep the first
        # number of every piece.
        pieces = []
        for piece in size.split('*'):
            found = [tok for tok in re.findall(number_pat, piece) if tok]
            # A piece without digits used to raise IndexError.
            pieces.append(found[0] if found else '0')
        new_size = '*'.join(pieces)
    else:
        # Short values are treated as a form factor.
        new_size = re.search(number_pat, size.strip()).group() + ' inches'
    record['size'] = new_size

    record['comment_num'], record['praise_rate'] = self.washComments(
        record['comment_num'], record['praise_rate'])
    record['brand'] = self.washBrand(record['brand'])

    return record


def cleanHDD(self):
    """Rebuild the ``self.accessory_type`` table with cleaned HDD rows.

    Drops the table, recreates it as ``test.<table>`` from
    ``computer_accessories.<table>``, adds a ``size`` column, cleans every row
    with ``handleSingleHDD`` (skipping SSD and laptop items), truncates the
    table and re-inserts the cleaned rows.

    Raises:
        SystemExit: with code 1 if an insert fails, after printing the
            offending row and the database error.
    """
    # The table name is concatenated into SQL (identifiers cannot be bound as
    # parameters), so accessory_type must never hold untrusted text.
    table = self.accessory_type
    connection = pymysql.connect(
        host=MYSQL_HOSTS,
        port=MYSQL_PORT,
        user=MYSQL_USER,
        passwd=MYSQL_PASSWORD,
        db=MYSQL_DB,
        charset="utf8")
    try:
        cursor = connection.cursor(cursor=pymysql.cursors.DictCursor)

        cursor.execute("drop table " + table)
        cursor.execute(
            "create table test." + table +
            " as select * from computer_accessories." + table)
        cursor.execute(
            "alter table " + table + " modify price int default NULL")
        cursor.execute("select * from " + table)
        data = cursor.fetchall()

        cursor.execute(
            "alter table " + table +
            " add column size VARCHAR(255) default NULL AFTER brand ")

        new_data = []
        for record in data:
            name = record['name']
            # SSDs and laptop drives do not belong in this table.
            if 'SSD' in name or '固态' in name or '笔记本' in name:
                continue
            new_data.append(self.handleSingleHDD(record))

        cursor.execute("truncate table " + table)
        # The cleaned values are numeric, so widen the column types first.
        cursor.execute(
            "alter table " + table + " modify comment_num int default 0")
        cursor.execute(
            "alter table " + table + " modify praise_rate int default 0")
        cursor.execute(
            "alter table " + table + " modify total_capacity float default 0")

        sql_insert = (
            "INSERT INTO hdd (id, name, comment_num, praise_rate, shop_name, price, link,"
            "brand, size,rotating_speed, total_capacity, introduction, Ptable_params) VALUES (%(id)s, %(name)s, %(comment_num)s,"
            "%(praise_rate)s, %(shop_name)s, %(price)s, %(link)s, %(brand)s, %(size)s, %(rotating_speed)s, "
            "%(total_capacity)s, %(introduction)s, %(Ptable_params)s)")
        # Row-by-row insert so a failing row can be reported.
        for record in new_data:
            try:
                cursor.execute(sql_insert, record)
            except Exception as exc:
                print(record)
                print(exc)
                raise SystemExit(1) from exc

        connection.commit()
    finally:
        connection.close()
def handleSingleSSD(self, record):
    """Normalise one raw ``ssd`` row in place and return it.

    Rewrites ``price`` (int), ``interface`` (text before ``接口``),
    ``total_capacity`` (number of TB), ``comment_num``, ``praise_rate`` and
    ``brand``.

    Args:
        record: dict for one row; must contain ``price``, ``name``,
            ``interface``, ``total_capacity``, ``comment_num``,
            ``praise_rate`` and ``brand``.

    Returns:
        The same ``record`` object, mutated.
    """
    record['price'] = int(record['price'])
    # The old code eval()'d 'introduction' and 'Ptable_params' here and never
    # used the results; both calls were removed.

    # Keep only the text before '接口'. partition() returns the whole string
    # when the marker is absent, whereas s[:s.find(...)] cut the last
    # character off because find() returns -1.
    record['interface'] = record['interface'].partition('接口')[0]

    # Capacity in TB. Known bucket labels map to fixed values.
    # NOTE(review): the open-ended '2TB及以上' bucket always becomes 2, as
    # before (the old regex ran on the label itself) - confirm that the
    # product name should not be consulted instead.
    buckets = {
        '960GB-1TB': 1.0,
        '(480-512)GB': 0.5,
        '(240-256)GB': 0.25,
        '(120-128)GB': 0.125,
        '2TB及以上': 2,
    }
    capacity = record['total_capacity']
    if capacity in buckets:
        capacity = buckets[capacity]
    else:
        match = re.search(r'\dT|\d+G|\d\.\d*T', record['name'])
        # Every possible token ends in T or G, so no error branch is needed.
        token = match.group() if match else '0T'
        if token[-1] == 'T':
            capacity = round(float(token[:-1]))
        else:
            # Round down to a multiple of 0.25 TB.
            capacity = int(token[:-1]) // 250 * 0.25
    record['total_capacity'] = capacity

    record['comment_num'], record['praise_rate'] = self.washComments(
        record['comment_num'], record['praise_rate'])
    record['brand'] = self.washBrand(record['brand'])

    return record


def cleanSSD(self):
    """Rebuild the ``self.accessory_type`` table with cleaned SSD rows.

    Drops the table, recreates it as ``test.<table>`` from
    ``computer_accessories.<table>``, cleans every row with
    ``handleSingleSSD``, truncates the table and re-inserts the cleaned rows.
    """
    # The table name is concatenated into SQL (identifiers cannot be bound as
    # parameters), so accessory_type must never hold untrusted text.
    table = self.accessory_type
    connection = pymysql.connect(
        host=MYSQL_HOSTS,
        port=MYSQL_PORT,
        user=MYSQL_USER,
        passwd=MYSQL_PASSWORD,
        db=MYSQL_DB,
        charset="utf8")
    try:
        cursor = connection.cursor(cursor=pymysql.cursors.DictCursor)

        cursor.execute("drop table " + table)
        cursor.execute(
            "create table test." + table +
            " as select * from computer_accessories." + table)
        cursor.execute(
            "alter table " + table + " modify price int default NULL")
        cursor.execute("select * from " + table)
        data = cursor.fetchall()

        cleaned = [self.handleSingleSSD(record) for record in data]

        cursor.execute("truncate table " + table)
        # The cleaned values are numeric, so widen the column types first.
        cursor.execute(
            "alter table " + table + " modify comment_num int default 0")
        cursor.execute(
            "alter table " + table + " modify praise_rate int default 0")
        cursor.execute(
            "alter table " + table + " modify total_capacity float default 0")

        sql_insert = (
            "INSERT INTO ssd (id, name, comment_num, praise_rate, shop_name, price, link,"
            "brand, interface, total_capacity, introduction, Ptable_params) VALUES (%(id)s, %(name)s, %(comment_num)s,"
            "%(praise_rate)s, %(shop_name)s, %(price)s, %(link)s, %(brand)s, %(interface)s, "
            "%(total_capacity)s, %(introduction)s, %(Ptable_params)s)")
        cursor.executemany(sql_insert, cleaned)

        connection.commit()
    finally:
        connection.close()
def handleSingleMemory(self, record):
    """Normalise one raw ``memory`` row in place and return it.

    Rewrites ``price`` (int), ``total_capacity`` (int, in GB),
    ``comment_num``, ``praise_rate`` and ``brand``.

    Args:
        record: dict for one row; must contain ``price``, ``name``,
            ``total_capacity``, ``comment_num``, ``praise_rate`` and
            ``brand``.

    Returns:
        The same ``record`` object, mutated.

    Raises:
        SystemExit: with code 1 when the capacity is a bucket label and the
            product name contains no ``<digits>G`` token.
    """
    record['price'] = int(record['price'])
    # The old code eval()'d 'introduction' and 'Ptable_params' here and never
    # used the results; both calls were removed.

    # Capacity in GB. Plain values are "<n>" plus a two-character suffix.
    # Labels containing '及' or '没' are resolved from the product name.
    capacity = record['total_capacity']
    if '及' not in capacity and '没' not in capacity:
        capacity = int(capacity[:-2])
    else:
        match = re.search(r'\d+G', record['name'])
        if match is None:
            print("Error in converting capacity")
            print(None)
            print(record)
            # Same effect as the site-provided exit(1), without depending on
            # the site module.
            raise SystemExit(1)
        # int() instead of eval(): no code execution, accepts leading zeros.
        capacity = int(match.group()[:-1])
    record['total_capacity'] = capacity

    record['comment_num'], record['praise_rate'] = self.washComments(
        record['comment_num'], record['praise_rate'])
    record['brand'] = self.washBrand(record['brand'])

    return record


def cleanMemory(self):
    """Rebuild the ``self.accessory_type`` table with cleaned memory rows.

    Drops the table, recreates it as ``test.<table>`` from
    ``computer_accessories.<table>``, cleans every row with
    ``handleSingleMemory`` (skipping bundle items whose name contains
    ``套装``), truncates the table and re-inserts the cleaned rows.
    """
    # The table name is concatenated into SQL (identifiers cannot be bound as
    # parameters), so accessory_type must never hold untrusted text.
    table = self.accessory_type
    connection = pymysql.connect(
        host=MYSQL_HOSTS,
        port=MYSQL_PORT,
        user=MYSQL_USER,
        passwd=MYSQL_PASSWORD,
        db=MYSQL_DB,
        charset="utf8")
    try:
        cursor = connection.cursor(cursor=pymysql.cursors.DictCursor)

        cursor.execute("drop table " + table)
        cursor.execute(
            "create table test." + table +
            " as select * from computer_accessories." + table)
        cursor.execute(
            "alter table " + table + " modify price int default NULL")
        cursor.execute("select * from " + table)
        data = cursor.fetchall()

        new_data = [
            self.handleSingleMemory(record)
            for record in data
            if '套装' not in record['name']
        ]

        cursor.execute("truncate table " + table)
        # The cleaned values are numeric, so widen the column types first.
        cursor.execute(
            "alter table " + table + " modify comment_num int default 0")
        cursor.execute(
            "alter table " + table + " modify praise_rate int default 0")
        cursor.execute(
            "alter table " + table + " modify total_capacity int default 0")

        sql_insert = (
            "INSERT INTO memory (id, name, comment_num, praise_rate, shop_name, price, link,"
            "brand, frequency, total_capacity, memory_num, appearance, ddr_gen, introduction, "
            "Ptable_params) VALUES (%(id)s, %(name)s, %(comment_num)s, %(praise_rate)s, %(shop_name)s, %(price)s"
            ", %(link)s, %(brand)s, %(frequency)s, %(total_capacity)s, %(memory_num)s, %(appearance)s, "
            "%(ddr_gen)s, %(introduction)s, %(Ptable_params)s)")
        cursor.executemany(sql_insert, new_data)

        connection.commit()
    finally:
        connection.close()
def handleSinglePow(self, record):
    """Normalise one raw ``power_supply`` row in place and return it.

    Rewrites ``price`` (int), ``modularization`` (from the ``接线类型`` entry
    of ``introduction``), ``size`` (first run of capital letters), ``power``
    (int, from the ``额定功率`` entry of ``Ptable_params``), ``comment_num``,
    ``praise_rate`` and ``brand``.

    Args:
        record: dict for one row; must contain ``price``, ``size``,
            ``introduction``, ``Ptable_params``, ``comment_num``,
            ``praise_rate`` and ``brand``.

    Returns:
        The same ``record`` object, mutated.

    Raises:
        ValueError / SyntaxError: if ``introduction`` or ``Ptable_params`` is
            not a Python literal.
    """
    # Local import: the module's import block is outside this block.
    import ast

    record['price'] = int(record['price'])
    # literal_eval only accepts plain literals, so stored text can never run
    # code (eval() would execute whatever the column contains).
    introd = ast.literal_eval(record['introduction'])
    p_table = ast.literal_eval(record['Ptable_params'])

    # Cabling type; None when absent (the column is created with default NULL).
    record['modularization'] = introd.get('接线类型')

    # Keep only the first run of capital letters of the size text.
    size = record['size']
    match = re.search(r'[A-Z]+', size)
    if match:
        size = match.group()
    else:
        print("No size info")
    record['size'] = size

    # Rated power. Take the first number instead of blindly chopping the last
    # character, so a value without a unit suffix is not truncated.
    rated = p_table.get('规格', {}).get('额定功率', 0)
    power = 0
    if rated:
        number = re.search(r'\d+', str(rated))
        if number:
            power = int(number.group())
    record['power'] = power

    record['comment_num'], record['praise_rate'] = self.washComments(
        record['comment_num'], record['praise_rate'])
    record['brand'] = self.washBrand(record['brand'])

    return record


def cleanPowerSupply(self):
    """Rebuild the ``self.accessory_type`` table with cleaned PSU rows.

    Drops the table, recreates it as ``test.<table>`` from
    ``computer_accessories.<table>``, adds a ``modularization`` column, cleans
    every row with ``handleSinglePow`` (skipping rows whose size is ``无`` or
    whose name contains ``套装``), truncates the table and re-inserts the
    cleaned rows.

    Raises:
        SystemExit: with code 1 if an insert fails, after printing the
            offending row and the database error.
    """
    # The table name is concatenated into SQL (identifiers cannot be bound as
    # parameters), so accessory_type must never hold untrusted text.
    table = self.accessory_type
    connection = pymysql.connect(
        host=MYSQL_HOSTS,
        port=MYSQL_PORT,
        user=MYSQL_USER,
        passwd=MYSQL_PASSWORD,
        db=MYSQL_DB,
        charset="utf8")
    try:
        cursor = connection.cursor(cursor=pymysql.cursors.DictCursor)

        cursor.execute("drop table " + table)
        cursor.execute(
            "create table test." + table +
            " as select * from computer_accessories." + table)
        cursor.execute(
            "alter table " + table +
            " add column modularization VARCHAR(255) default NULL AFTER size ")
        cursor.execute(
            "alter table " + table + " modify price int default NULL")
        cursor.execute("select * from " + table)
        data = cursor.fetchall()

        new_data = []
        for record in data:
            # Rows without a size and bundle items are dropped.
            if record['size'] == '无' or '套装' in record['name']:
                continue
            new_data.append(self.handleSinglePow(record))

        cursor.execute("truncate table " + table)
        # The cleaned values are numeric, so widen the column types first.
        cursor.execute(
            "alter table " + table + " modify comment_num int default 0")
        cursor.execute(
            "alter table " + table + " modify praise_rate int default 0")
        cursor.execute(
            "alter table " + table + " modify power int default 0")

        sql_insert = (
            "INSERT INTO power_supply (id, name, comment_num, praise_rate, shop_name, price, link,"
            "brand, tags, power, size, modularization, transfer_efficiency, introduction, Ptable_params) VALUES (%(id)s, %(name)s, %(comment_num)s,"
            "%(praise_rate)s, %(shop_name)s, %(price)s, %(link)s, %(brand)s, %(tags)s, %(power)s, "
            "%(size)s, %(modularization)s, %(transfer_efficiency)s, %(introduction)s, %(Ptable_params)s)")
        # Row-by-row insert so a failing row can be reported.
        for record in new_data:
            try:
                cursor.execute(sql_insert, record)
            except Exception as exc:
                print(record)
                print(exc)
                raise SystemExit(1) from exc

        connection.commit()
    finally:
        connection.close()
def main(self):
    """Run the cleaner selected by ``self.accessory_type``.

    ``self.accessory_type`` is either one table name (``cpu``,
    ``motherboard``, ``video_card``, ``cpu_radiator``, ``computer_case``,
    ``hdd``, ``ssd``, ``memory``, ``power_supply``) or ``'all'`` to run every
    cleaner in that order. Any other value is printed together with
    ``"Wrong name!"`` and nothing is cleaned.
    """
    # Table name -> cleaner method name, in the order used for 'all'.
    # Methods are looked up by name at call time.
    cleaners = {
        'cpu': 'cleanCPU',
        'motherboard': 'cleanMotherboard',
        'video_card': 'cleanGraphicsCard',
        'cpu_radiator': 'cleanCPURadiator',
        'computer_case': 'cleanCase',
        'hdd': 'cleanHDD',
        'ssd': 'cleanSSD',
        'memory': 'cleanMemory',
        'power_supply': 'cleanPowerSupply',
    }
    requested = self.accessory_type

    if requested == 'all':
        # The cleaners build their SQL from self.accessory_type, so it has to
        # name the real table while each one runs. Leaving it at 'all' made
        # them operate on a table literally called "all".
        try:
            for table, method_name in cleaners.items():
                self.accessory_type = table
                getattr(self, method_name)()
        finally:
            self.accessory_type = requested
    elif requested in cleaners:
        getattr(self, cleaners[requested])()
    else:
        print(self.accessory_type)
        print("Wrong name!")
if __name__ == "__main__":
    import sys

    # Table to clean: first command-line argument if given, otherwise the
    # previously hard-coded 'video_card'. Pass 'all' to run every cleaner.
    target = sys.argv[1] if len(sys.argv) > 1 else 'video_card'
    washer = DataCleaning(target)
    washer.main()