You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
115 lines
4.3 KiB
115 lines
4.3 KiB
from time import sleep
|
|
import yaml
|
|
import redis
|
|
from selenium import webdriver
|
|
from selenium.webdriver.common.by import By
|
|
from pymongo import MongoClient
|
|
|
|
|
|
def scroll_by_xpath(webDriver, xpath):
    """
    Scroll the page loaded in webDriver until the element at xpath is in view.

    :param webDriver: web driver used to operate the launched browser
    :param xpath: XPath of the element to scroll to, str
    """
    # Look the element up and hand it straight to the in-page script as
    # arguments[0].
    webDriver.execute_script(
        "arguments[0].scrollIntoView();",
        webDriver.find_element(By.XPATH, xpath),
    )
|
|
|
|
|
|
# XPath of the pagination link the script scrolls to and clicks to advance.
NEXT_PAGE_XPATH = '//*[@id="J_bottomPage"]/span[1]/a[9]'
# XPath of the element whose text is the total number of result pages.
MAX_PAGE_XPATH = '//*[@id="J_bottomPage"]/span[2]/em[1]/b'
# XPath matching every product card on the search result page.
GOODS_XPATH = '//*[@id="J_goodsList"]/ul/li'
# How many extra times a page is re-read before giving up on it.
MAX_RELOAD_RETRIES = 3
_ASCII_DIGITS = "0123456789"


def extract_price(lines):
    """Return the price text found among the text lines of one product card.

    The first line whose second and third characters are both ASCII digits
    is taken as the price line; its first and last characters are dropped
    (same slicing as before, so stored values keep their format).

    :param lines: the card's text split into lines, list of str
    :return: the price as str, or 0 when no line looks like a price
    """
    for line in lines:
        # Short/empty lines cannot match; without the length guard they
        # raised IndexError.
        if len(line) >= 3 and line[1] in _ASCII_DIGITS and line[2] in _ASCII_DIGITS:
            return line[1:-1]
    return 0


def is_stale_read(current, previous):
    """Tell whether ``current`` still shows the previously processed page.

    Two reads are considered the same page when their first entries are
    equal. Empty lists never count as stale.
    """
    return bool(current) and bool(previous) and current[0] == previous[0]


def page_is_ready(current, previous, is_last_page):
    """Tell whether a freshly read page can be processed.

    :param current: [{sku: price}, ...] just read from the browser
    :param previous: the same structure for the previously processed page
    :param is_last_page: True when this is the final result page, which may
        legitimately hold fewer items than the one before it
    :return: False when the read is empty, stale, or shorter than the
        previous page (taken as "not fully loaded yet"); True otherwise
    """
    if not current or is_stale_read(current, previous):
        return False
    if not is_last_page and len(current) < len(previous):
        return False
    return True


def read_sku_prices(driver):
    """Read the product cards currently rendered in the browser.

    :param driver: web driver showing a search result page
    :return: list of single-entry dicts ``{sku_id: price}``
    """
    sku_price = []
    for li in driver.find_elements(By.XPATH, GOODS_XPATH):
        sku = li.get_attribute("data-sku")
        if not sku:
            # A card without a sku id cannot be turned into an item URL.
            continue
        sku_price.append({sku: extract_price(li.text.split("\n"))})
    return sku_price


def main():
    """Crawl the search result pages for the configured keyword.

    For every result page the item URLs are pushed to the Redis list named in
    the settings and each sku/price pair is inserted into MongoDB if that sku
    is not stored yet.
    """
    # Load the configuration file (explicit encoding so non-ASCII keywords
    # are read the same way on every platform).
    yaml_file = "./settings.yaml"
    with open(yaml_file, 'r', encoding='utf-8') as f:
        settings = yaml.safe_load(f)

    # Open the Redis connection. Only `encoding` is passed: `charset` is a
    # deprecated alias for the same setting.
    redis_connection = redis.Redis(
        host=settings['redis']['host'],
        port=settings['redis']['port'],
        password=settings['redis']['password'],
        db=settings['redis']['db'],
        encoding='UTF-8')

    mongo_connection = MongoClient(
        'mongodb://{}:{}'.format(settings['mongodb']['host'], settings['mongodb']['port']),
        username=settings['mongodb']['username'],
        password=settings['mongodb']['password'])
    mongo_collection = mongo_connection['admin']['a']

    keyword = settings['search']['keyword']
    pause = settings['search']['sleep']
    rpush_key = settings['result']['rpush_key']

    # Open the JD product search page.
    driver = webdriver.Chrome()
    try:
        driver.get('https://search.jd.com/Search?keyword={}&enc=utf-8&wq={}'.format(keyword, keyword))
        # Scroll down to the pagination bar.
        scroll_by_xpath(driver, NEXT_PAGE_XPATH)
        # Sleep before reading: the search page only loads completely after
        # scrolling, and scrolling is asynchronous, so reading right away
        # would miss elements.
        sleep(pause)
        # Highest result page for the current keyword.
        max_page = int(driver.find_element(By.XPATH, MAX_PAGE_XPATH).text)

        skuid_prev = []
        page = 0
        while page < max_page:
            is_last_page = page == max_page - 1

            # Scroll, wait and read; re-read a bounded number of times while
            # the browser still shows the previous page or has not finished
            # loading this one.
            sku_price = []
            for _ in range(MAX_RELOAD_RETRIES + 1):
                scroll_by_xpath(driver, NEXT_PAGE_XPATH)
                sleep(pause)
                sku_price = read_sku_prices(driver)
                if page_is_ready(sku_price, skuid_prev, is_last_page):
                    break

            if not sku_price or is_stale_read(sku_price, skuid_prev):
                # The page never advanced (or never rendered): stop instead
                # of looping forever or storing the same page twice.
                print("page {} did not load after {} retries, stopping".format(page, MAX_RELOAD_RETRIES))
                break
            # A page that is merely shorter than the previous one is still
            # processed with whatever was loaded.

            # Build the item URLs and push them to the Redis start_url list.
            skuid_url = []
            for dic in sku_price:
                for sku in dic:
                    skuid_url.append('https://item.jd.com/{}.html'.format(sku))
            redis_connection.rpush(rpush_key, *skuid_url)

            # Store sku and price in MongoDB: insert only when the sku is
            # not in the collection yet ($setOnInsert needs upsert=True).
            for dic in sku_price:
                for sku, price in dic.items():
                    mongo_collection.update_one(
                        {'sku': sku},
                        {'$setOnInsert': {'sku': sku, 'price': price}},
                        upsert=True
                    )

            print(str(page) + " " + str(len(skuid_url)) + " ", end="")
            print(sku_price)
            print(skuid_url)
            sleep(pause)

            skuid_prev = sku_price
            page += 1
            # Go to the next page; there is nothing to click after the last.
            if page < max_page:
                driver.find_element(By.XPATH, NEXT_PAGE_XPATH).click()
    finally:
        # Always release the browser and the MongoDB client, even on error.
        driver.quit()
        mongo_connection.close()


if __name__ == '__main__':
    main()