import bs4
import requests
from bs4 import BeautifulSoup
import csv
import time
import random


def get_lists(product_lists):
    """Collect (title, url, price) for every product <li> in *product_lists*.

    *product_lists* is the <ul class="product-lists clearfix"> Tag from the
    search-result page. The collected rows are written to '联想笔记本.csv'
    and also returned as a list of 3-tuples.
    """
    item_lists = []
    for li in product_lists:
        # Iterating a Tag yields child Tags AND whitespace NavigableStrings;
        # only real <li> tags carry product data (they have skuid/pid attrs).
        if not isinstance(li, bs4.element.Tag):
            continue
        sku_id = li.get('skuid')
        pid = li.get('pid')
        pro_item = li.find('a', class_='item-link')
        if pro_item is None:
            # Robustness: skip entries missing the expected anchor instead
            # of crashing on .get() of None.
            continue
        product_title = pro_item.get('title')
        product_url = pro_item.get('href')
        product_price = get_price(product_id=pid, skuid=sku_id)
        product_info = (product_title, 'http:' + product_url, product_price)
        item_lists.append(product_info)
        print(product_info)  # progress feedback while crawling
        time.sleep(random.uniform(1, 2))  # random 1-2s pause to avoid an IP ban

    # utf-8-sig so Excel recognises the BOM and renders the Chinese headers.
    with open('联想笔记本.csv', 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)  # separate name; don't shadow the file handle
        writer.writerow(['商品名称', '商品链接', '商品价格'])
        writer.writerows(item_lists)
    return item_lists


def get_info(url, project=''):
    """Fetch one search-result page, parse its product list and crawl it.

    *project* is used only in the progress message (previously read from a
    fragile module-level global; now an explicit, optional parameter).
    Errors are reported, not raised, so one failed page does not abort a run.
    """
    # User-Agent alone is usually enough to pass the anti-bot check; the
    # remaining keys mimic the browser's AJAX request more closely.
    headers = {
        "Host": "search.gome.com.cn",
        "Referer": "http://search.gome.com.cn/search?question=%E7%AC%94%E8%AE%B0%E6%9C%AC&searchType=goods&search_mode=normal&reWrite=true&instock=1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
        'X-Requested-With': 'XMLHttpRequest',
    }
    try:
        # timeout added for consistency with get_price: never hang forever.
        r = requests.post(url, headers=headers, timeout=10)
        soup = BeautifulSoup(r.text, 'lxml')
        # Every product sits in an <li> under this <ul>.
        product_list = soup.find('ul', class_='product-lists clearfix')
        if product_list is None:
            # Page layout changed or the request was blocked; fail clearly
            # instead of crashing with an opaque TypeError below.
            raise ValueError('product list not found in page')
        i_lists = get_lists(product_list)
        print(r"爬取的项目是{},本页爬取数据{}条".format(project, len(i_lists)))
    except Exception as e:
        print(e)
        print('链接失败')


def get_price(product_id, skuid):
    """Return the price string for one product, or "价格未知" on failure.

    The price service exposes a JSON endpoint per (product_id, skuId) pair,
    e.g. http://ss.gome.com.cn/search/v1/price/single/null/null/9140257462/1130961371/null/flag/item
    """
    price_url = ('http://ss.gome.com.cn/search/v1/price/single/null/null/'
                 + product_id + '/' + skuid + '/null/flag/item')
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
    }
    try:
        response = requests.get(price_url, headers=headers, timeout=10)
    except requests.RequestException:
        # Robustness: one item's network hiccup should not abort the page.
        return "价格未知"
    if response.status_code == 200:
        # Payload shape: {"result": {"price": "4799.0", "skuId": ..., ...}}
        return response.json()['result']['price']
    return "价格未知"


if __name__ == '__main__':
    # Map of display name -> internal code; renamed from `object`, which
    # shadowed the builtin.
    projects = {"联想笔记本": 'lenovo_laptop'}
    for project_name in projects:
        get_info(
            r'http://search.gome.com.cn/search?question={}'.format(project_name),
            project=project_name,
        )