Update spider

master
Fisher 3 years ago
parent 828a4320c7
commit df795b62d9

@ -2,7 +2,7 @@
Description: Description:
Author: Fishermanykx Author: Fishermanykx
LastEditors: Fishermanykx LastEditors: Fishermanykx
LastEditTime: 2021-07-13 09:17:32 LastEditTime: 2021-07-13 10:04:13
''' '''
import re import re
@ -851,10 +851,10 @@ class MotherboardSpider(JDSpider):
def main(self): def main(self):
motherboard_link = "https://list.jd.com/list.html?cat=670%2C677%2C681&psort=3&psort=3&pvid=e726d0ee460448b6a16cf24950c1dabb&page=" motherboard_link = "https://list.jd.com/list.html?cat=670%2C677%2C681&psort=3&psort=3&pvid=e726d0ee460448b6a16cf24950c1dabb&page="
# 爬取数据 # 爬取数据
# page_num = 26 # 一共爬了26页 page_num = 26 # 一共爬了26页
page_num = 1 # for testing # page_num = 1 # for testing
start_page = 1 start_page = 1
# self.productSpider(motherboard_link, page_num, start_page) self.productSpider(motherboard_link, page_num, start_page)
# 清洗数据 # 清洗数据
self.cleanMotherboard() self.cleanMotherboard()
print("Successfully get Motherboard data!") print("Successfully get Motherboard data!")
@ -1015,10 +1015,10 @@ class GraphicsCardSpider(JDSpider):
def main(self): def main(self):
graphics_card_link = "https://list.jd.com/list.html?cat=670%2C677%2C679&psort=3&psort=3&pvid=e726d0ee460448b6a16cf24950c1dabb&page=" graphics_card_link = "https://list.jd.com/list.html?cat=670%2C677%2C679&psort=3&psort=3&pvid=e726d0ee460448b6a16cf24950c1dabb&page="
# 爬取数据 # 爬取数据
# page_num = 30 page_num = 30
page_num = 1 # page_num = 1
start_page = 1 start_page = 1
# self.productSpider(graphics_card_link, page_num, start_page) self.productSpider(graphics_card_link, page_num, start_page)
# 清洗数据 # 清洗数据
self.cleanGraphicsCard() self.cleanGraphicsCard()
print("Successfully get Graphics Card data!") print("Successfully get Graphics Card data!")
@ -1196,10 +1196,10 @@ class MemorySpider(JDSpider):
def main(self): def main(self):
memory_link = "https://list.jd.com/list.html?cat=670%2C677%2C680&psort=3&ev=210_1558%5E&psort=3&page=" memory_link = "https://list.jd.com/list.html?cat=670%2C677%2C680&psort=3&ev=210_1558%5E&psort=3&page="
# 爬取数据 # 爬取数据
# page_num = 40 page_num = 40
page_num = 1 # page_num = 1
start_page = 1 start_page = 1
# self.productSpider(memory_link, page_num, start_page) self.productSpider(memory_link, page_num, start_page)
# 清洗数据 # 清洗数据
self.cleanMemory() self.cleanMemory()
print("Successfully get Memory data!") print("Successfully get Memory data!")
@ -1384,8 +1384,8 @@ class CPURadiatorSpider(JDSpider):
def main(self): def main(self):
radiator_link = "https://list.jd.com/list.html?cat=670%2C677%2C682&psort=3&ev=3680_97402%7C%7C97403%7C%7C106254%7C%7C106255%5E&psort=3&page=" radiator_link = "https://list.jd.com/list.html?cat=670%2C677%2C682&psort=3&ev=3680_97402%7C%7C97403%7C%7C106254%7C%7C106255%5E&psort=3&page="
# 爬取数据 # 爬取数据
# page_num = 27 page_num = 27
page_num = 1 # page_num = 1
start_page = 1 start_page = 1
self.productSpider(radiator_link, page_num, start_page) self.productSpider(radiator_link, page_num, start_page)
# 清洗数据 # 清洗数据
@ -1557,10 +1557,10 @@ class SSDSpider(JDSpider):
def main(self): def main(self):
ssd_link = "https://list.jd.com/list.html?cat=670%2C677%2C11303&psort=3&psort=3&page=" ssd_link = "https://list.jd.com/list.html?cat=670%2C677%2C11303&psort=3&psort=3&page="
# page_num = 36 # 一共爬了36页 page_num = 36 # 一共爬了36页
page_num = 1 # 一共爬了36页 # page_num = 1 # 一共爬了36页
start_page = 1 start_page = 1
# self.productSpider(ssd_link, page_num, start_page) self.productSpider(ssd_link, page_num, start_page)
self.cleanSSD() self.cleanSSD()
print("Successfully get SSD data!") print("Successfully get SSD data!")
@ -1735,10 +1735,10 @@ class HDDSpider(JDSpider):
def main(self): def main(self):
hdd_link = "https://list.jd.com/list.html?cat=670%2C677%2C683&psort=3&psort=3&page=" hdd_link = "https://list.jd.com/list.html?cat=670%2C677%2C683&psort=3&psort=3&page="
# page_num = 11 page_num = 11
page_num = 1 # page_num = 1
start_page = 1 start_page = 1
# self.productSpider(hdd_link, page_num, start_page) self.productSpider(hdd_link, page_num, start_page)
self.cleanHDD() self.cleanHDD()
print("Successfully get HDD data!") print("Successfully get HDD data!")
@ -1892,10 +1892,10 @@ class PowerSupplySpider(JDSpider):
def main(self): def main(self):
power_supply_link = "https://list.jd.com/list.html?cat=670%2C677%2C691&psort=3&psort=3&page=" power_supply_link = "https://list.jd.com/list.html?cat=670%2C677%2C691&psort=3&psort=3&page="
# page_num = 25 # 抓25页 page_num = 25 # 抓25页
page_num = 1 # page_num = 1
start_page = 1 start_page = 1
# self.productSpider(power_supply_link, page_num, start_page) self.productSpider(power_supply_link, page_num, start_page)
self.cleanPowerSupply() self.cleanPowerSupply()
print("Successfully get Power Supply data!") print("Successfully get Power Supply data!")
@ -2078,10 +2078,10 @@ class CaseSpider(JDSpider):
def main(self): def main(self):
case_link = "https://list.jd.com/list.html?cat=670%2C677%2C687&psort=3&psort=3&page=" case_link = "https://list.jd.com/list.html?cat=670%2C677%2C687&psort=3&psort=3&page="
# page_num = 36 page_num = 36
page_num = 1 # page_num = 1
start_page = 1 start_page = 1
# self.productSpider(case_link, page_num, start_page) self.productSpider(case_link, page_num, start_page)
self.cleanCase() self.cleanCase()
print("Successfully get Computer Case data!") print("Successfully get Computer Case data!")
@ -2089,7 +2089,7 @@ class CaseSpider(JDSpider):
if __name__ == "__main__": if __name__ == "__main__":
accessory_type = 'all' accessory_type = 'all'
accessory_type = 'graphics_card' # accessory_type = 'graphics_card'
if accessory_type == 'cpu': if accessory_type == 'cpu':
cpu_spider = CPUSpider('cpu') cpu_spider = CPUSpider('cpu')
cpu_spider.main() cpu_spider.main()

Loading…
Cancel
Save