# NOTE(review): removed web-page extraction residue that preceded the code
# (repository topic hint, "115 lines", "4.3 KiB") — it was not part of the
# Python source and made the file unparseable.
from time import sleep
import yaml
import redis
from selenium import webdriver
from selenium.webdriver.common.by import By
from pymongo import MongoClient
def scroll_by_xpath(webDriver, xpath):
    """Scroll the page in *webDriver* until the element at *xpath* is in view.

    :param webDriver: Selenium web driver controlling the browser.
    :param xpath: XPath locator of the element to scroll to, str.
    """
    # Locate the element first, then ask the browser to bring it into view.
    anchor = webDriver.find_element(By.XPATH, xpath)
    webDriver.execute_script("arguments[0].scrollIntoView();", anchor)
if __name__ == '__main__':
# 获取配置文件
yaml_file = "./settings.yaml"
with open(yaml_file, 'r') as f:
settings = yaml.safe_load(f)
# 获取redis连接
redis_connection = redis.Redis(
host=settings['redis']['host'],
port=settings['redis']['port'],
password=settings['redis']['password'],
db=settings['redis']['db'],
charset='UTF-8',
encoding='UTF-8')
mongo_connection = MongoClient('mongodb://{}:{}'.format(settings['mongodb']['host'], settings['mongodb']['port']),
username = settings['mongodb']['username'],
password = settings['mongodb']['password'])
# 打开京东商品搜索页
driver = webdriver.Chrome()
driver.get('https://search.jd.com/Search?keyword={}&enc=utf-8&wq={}'.format(settings['search']['keyword'], settings['search']['keyword']))
# 下拉到翻页导航的位置
scroll_by_xpath(driver, '//*[@id="J_bottomPage"]/span[1]/a[9]')
# 强制休眠5s再读取元素, 京东的搜索页需要下拉才加载全, 而下拉操作是异步的, 不匹配下拉时间读取不全元素
sleep(settings['search']['sleep'])
# 获取当前搜索的关键词的最大商品页
max_page = int(driver.find_element(By.XPATH, '//*[@id="J_bottomPage"]/span[2]/em[1]/b').text)
skuid_prev = []
skuid_url = []
page = 0
while page < max_page:
sku_price = []
# 下拉
scroll_by_xpath(driver, '//*[@id="J_bottomPage"]/span[1]/a[9]')
sleep(settings['search']['sleep'])
# 获取搜索页的商品的skuid和对应的价格
li_list = driver.find_elements(By.XPATH, '//*[@id="J_goodsList"]/ul/li')
for li in li_list:
li_text = li.text.split("\n")
result = 0
for text in li_text:
if text[1] in [str(i) for i in range(10)] and text[2] in [str(i) for i in range(10)]:
result = text[1:-1]
break
price = result
sku_price.append({li.get_attribute("data-sku"): price})
# 重复读取时, 重来
if page != 0 and sku_price[0] == skuid_prev[0]:
page -= 1
continue
# 防止未加载完就读取元素, 重新读取
while len(skuid_prev) < len(skuid_prev):
li_list = driver.find_elements(By.XPATH, '//*[@id="J_goodsList"]/ul/li')
for li in li_list:
price = li.find_element(By.XPATH, "//*[@class='gl-i-wrap']/*[@class='p-price']//strong/i")
sku_price.append({li.get_attribute("data-sku"): price})
#拼接成url, 入redis的start_url
for dic in sku_price:
for key in dic.keys():
skuid_url.append('https://item.jd.com/{}.html'.format(key))
for url in skuid_url:
redis_connection.rpush(settings['result']['rpush_key'], url)
#将sku和price放入mongodb
mongoDB = mongo_connection['admin']
mongoCollection = mongoDB['a']
for dic in sku_price:
document = {}
for key in dic.keys():
document['sku'] = key
document['price'] = dic[key]
# 更新MongoDB:
# 当数据库里没有, 插入
mongoCollection.update_one(
{'sku': document['sku']},
{
'$setOnInsert': document #生效需要upsert=True
},
upsert=True
)
print(str(page) + " " + str(len(skuid_url)) + " ", end="")
print(sku_price)
print(skuid_url)
sleep(settings['search']['sleep'])
skuid_prev = sku_price.copy()
skuid_url = []
page += 1
# 翻页
driver.find_element(By.XPATH, '//*[@id="J_bottomPage"]/span[1]/a[9]').click()