完成 data_cleansing 模块的集成

master
Fisher 3 years ago
parent 758027eb55
commit 8954439fed

@ -6,9 +6,8 @@ Description:
shop_name 店铺名
评论数和好评率不爬因为都是0
Author: Fishermanykx
Date: 2020-12-11 14:25:04
LastEditors: Fishermanykx
LastEditTime: 2021-05-28 11:12:07
LastEditTime: 2021-07-12 16:26:41
'''
import re
import json

@ -3,9 +3,8 @@ Description:
introduction 商品介绍的 .json
Ptable_params 规格与包装的 .json
Author: Fishermanykx
Date: 2020-12-08 20:45:47
LastEditors: Fishermanykx
LastEditTime: 2021-05-28 11:12:00
LastEditTime: 2021-07-12 16:14:17
'''
import json

@ -1,7 +1,6 @@
'''
Description: Data Base Configuration
Author: Fishermanykx
Date: 2020-12-30 15:33:37
LastEditors: Fishermanykx
LastEditTime: 2021-05-28 11:11:42
'''

@ -1,16 +1,15 @@
'''
Description:
Author: Fishermanykx
Date: 2020-12-29 08:21:41
LastEditors: Fishermanykx
LastEditTime: 2021-05-28 11:11:28
LastEditTime: 2021-07-13 09:17:32
'''
import re
import json
import time
import pymysql
from pymysql.converters import escape_string
# from pymysql.converters import escape_string
from selenium.common.exceptions import ElementNotInteractableException
from selenium.common.exceptions import ElementClickInterceptedException
@ -659,7 +658,7 @@ class CPUSpider(JDSpider):
page_num = 1
start_page = 1
# self.productSpider(cpu_link, page_num, start_page)
self.productSpider(cpu_link, page_num, start_page)
self.cleanCPU()
print("Successfully get CPU data!")
@ -824,7 +823,7 @@ class MotherboardSpider(JDSpider):
# 判定是否为板-U套装
record = data[i]
if record['comment_num'] == 100: # 抓到板-U套装了
continue
continue
try:
if '套装' in record['title_name']:
continue
@ -959,7 +958,7 @@ class GraphicsCardSpider(JDSpider):
else:
generation = 1
else:
if ('6900' in name) or ('6800' in name):
if ('6900' in name) or ('6800' in name) or ('6700' in name):
generation = 3
elif ('5700' in name) or ('5600' in name) or ('5500' in name):
generation = 2
@ -1187,7 +1186,7 @@ class MemorySpider(JDSpider):
"Ptable_params, title_name) VALUES (%(id)s, %(name)s, %(comment_num)s, %(praise_rate)s, %(shop_name)s, %(price)s"\
", %(link)s, %(brand)s, %(frequency)s, %(total_capacity)s, %(memory_num)s, %(appearance)s, "\
"%(ddr_gen)s, %(introduction)s, %(Ptable_params)s, %(title_name)s)"
# sql_insert = escape_string(sql_insert)
# sql_insert = pymysql.escape_string(sql_insert)
# cursor.executemany(sql_insert, new_data)
for i in range(len(new_data)):
cursor.execute(sql_insert, new_data[i])
@ -1894,7 +1893,7 @@ class PowerSupplySpider(JDSpider):
def main(self):
power_supply_link = "https://list.jd.com/list.html?cat=670%2C677%2C691&psort=3&psort=3&page="
# page_num = 25 # 抓25页
page_num = 1
page_num = 1
start_page = 1
# self.productSpider(power_supply_link, page_num, start_page)
@ -2090,7 +2089,7 @@ class CaseSpider(JDSpider):
if __name__ == "__main__":
accessory_type = 'all'
accessory_type = 'motherboard'
accessory_type = 'graphics_card'
if accessory_type == 'cpu':
cpu_spider = CPUSpider('cpu')
cpu_spider.main()

Loading…
Cancel
Save