|
|
|
|
@ -0,0 +1,92 @@
|
|
|
|
|
import bs4
|
|
|
|
|
import requests
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
import csv
|
|
|
|
|
import time
|
|
|
|
|
import random
|
|
|
|
|
""""
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def get_lists(product_lists, csv_path='联想笔记本.csv'):
    """Collect product details from a search-result list and save them as CSV.

    Args:
        product_lists: Iterable of parsed child nodes of the product ``<ul>``
            (the value returned by ``soup.find``). ``None`` is treated as an
            empty list.
        csv_path: Destination CSV file. It is overwritten on every call.

    Returns:
        A list of ``(title, url, price)`` tuples, one per scraped product.
    """
    item_lists = []
    if product_lists is None:
        # soup.find() returns None when the page contains no product list.
        product_lists = ()

    for li in product_lists:
        # Debugging showed two kinds of child nodes arrive here:
        #   <class 'bs4.element.NavigableString'>  (empty content)
        #   <class 'bs4.element.Tag'>
        # Only Tag nodes carry product data, e.g.
        #   <li class="product-item" pid="9140197013" skuid="1130806223">
        if not isinstance(li, bs4.element.Tag):
            continue

        sku_id = li.get('skuid')
        pid = li.get('pid')

        # The anchor holds the product title and a protocol-relative link, e.g.
        #   <a class="item-link" rel="nofollow"
        #      href="//item.gome.com.cn/9140254919-1130962345.html?search_id=..."
        #      target="_blank" title="..."><img ...></a>
        pro_item = li.find('a', class_='item-link')
        if pro_item is None or pid is None or sku_id is None:
            # Without the anchor or the ids there is nothing to record and no
            # way to query the price, so skip this entry instead of crashing.
            continue

        product_title = pro_item.get('title')
        product_url = pro_item.get('href')
        if product_url is None:
            continue

        # Each price lookup is a separate HTTP request made by get_price().
        product_price = get_price(product_id=pid, skuid=sku_id)

        product_info = (product_title, 'http:' + product_url, product_price)
        item_lists.append(product_info)
        print(product_info)  # progress output: the product just scraped

        # Sleep a random 1-2 seconds between products so the site is less
        # likely to block this IP.
        time.sleep(random.uniform(1, 2))

    # utf-8-sig puts a BOM at the start of the file.
    with open(csv_path, 'w', newline='', encoding='utf-8-sig') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['商品名称', '商品链接', '商品价格'])
        writer.writerows(item_lists)

    return item_lists
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_info(url, project_name=None):
    """Fetch one search-result page, scrape its products and print a summary.

    Args:
        url: Search URL to request.
        project_name: Label shown in the summary line. When omitted, the
            module-level ``project`` variable is used if it exists, otherwise
            the URL itself.

    Returns:
        None. Any failure is printed instead of raised.
    """
    if project_name is None:
        # Earlier callers set a module-level `project` before calling; keep
        # honouring it, but do not fail with NameError when it is absent.
        project_name = globals().get('project', url)

    # A browser sends many headers; User-Agent and Host are the usual ones.
    # If User-Agent alone gets past the anti-scraping check nothing else is
    # needed; otherwise more of the browser's headers have to be supplied.
    headers = {
        "Host": "search.gome.com.cn",
        "Referer": "http://search.gome.com.cn/search?question=%E7%AC%94%E8%AE%B0%E6%9C%AC&searchType=goods&search_mode=normal&reWrite=true&instock=1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
        'X-Requested-With': 'XMLHttpRequest'
    }
    try:
        # A timeout keeps a stalled connection from hanging the script.
        r = requests.post(url, headers=headers, timeout=10)
        r.raise_for_status()

        # Parse the page with the lxml parser.
        soup = BeautifulSoup(r.text, 'lxml')

        # Inspecting the page shows every product is an <li> inside the <ul>
        # whose class is "product-lists clearfix".
        product_list = soup.find('ul', class_='product-lists clearfix')
        if product_list is None:
            print('未找到商品列表')
            return

        i_lists = get_lists(product_list)
        print(r"爬取的项目是{},本页爬取数据{}条".format(project_name, len(i_lists)))
    except Exception as e:
        # Top-level boundary of the scrape: report the problem and carry on.
        print(e)
        print('链接失败')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_price(product_id, skuid):
    """Look up the price of one product from the site's price endpoint.

    Args:
        product_id: Product id (the ``pid`` attribute of the product ``<li>``).
        skuid: SKU id (the ``skuid`` attribute of the product ``<li>``).

    Returns:
        The ``price`` field of the JSON response, or the placeholder
        ``"价格未知"`` when the request fails, the status is not 200, or the
        response does not have the expected shape.
    """
    unknown_price = "价格未知"

    # The product page loads its price from this endpoint, for example:
    # http://ss.gome.com.cn/search/v1/price/single/null/null/9140257462/1130961371/null/flag/item
    price_url = (
        'http://ss.gome.com.cn/search/v1/price/single/null/null/'
        '{}/{}/null/flag/item'.format(product_id, skuid)
    )
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
    }
    try:
        # The browser issues this request with the GET method.
        response = requests.get(price_url, headers=headers, timeout=10)
        if response.status_code != 200:
            return unknown_price
        # Sample payload:
        # result: {price: "4799.0", priceType: "RUSHBUYPRICE",
        #          productId: "9140251991", skuId: "1130975332"}
        return response.json()['result']['price']
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Network failure, a non-JSON body or an unexpected payload: fall back
        # to the placeholder so one bad lookup does not abort the whole scrape.
        return unknown_price
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Maps each search keyword to an identifier. Only the keyword is used here:
    # it is both the search term and the label shown in the summary line.
    projects = {"联想笔记本": 'lenovo_laptop'}
    for project in projects:
        # `project` is deliberately a module-level name: get_info() reads it
        # when printing how many items were scraped.
        get_info(r'http://search.gome.com.cn/search?question={}'.format(project))
|