爬虫核心代码

master
Qshfmlgn5 5 years ago
parent 9b0243e5e4
commit beee44d616

@ -0,0 +1,92 @@
import bs4
import requests
from bs4 import BeautifulSoup
import csv
import time
import random
""""
"""
def get_lists(product_lists, csv_path='联想笔记本.csv'):
    """Extract product info from a search-result <ul> tag and save it to CSV.

    Parameters
    ----------
    product_lists : bs4.element.Tag
        The <ul class="product-lists clearfix"> tag whose children are the
        product <li> elements (interleaved with whitespace NavigableStrings).
    csv_path : str
        Output CSV path; defaults to the original hard-coded filename so
        existing callers are unaffected.

    Returns
    -------
    list[tuple]
        One (title, url, price) tuple per product found.
    """
    item_lists = []
    for li in product_lists:
        # Iterating the <ul> yields both bs4.element.NavigableString
        # (whitespace between tags) and bs4.element.Tag nodes; only real
        # <li> Tags carry the product attributes, e.g.:
        #   <li class="product-item" pid="9140197013" skuid="1130806223">
        if not isinstance(li, bs4.element.Tag):
            continue
        sku_id = li.get('skuid')
        pid = li.get('pid')
        # The anchor holds title and href:
        #   <a class="item-link" href="//item.gome.com.cn/..." title="...">
        pro_item = li.find('a', class_='item-link')
        if pro_item is None:
            # Defensive: skip malformed <li> entries instead of crashing.
            continue
        product_title = pro_item.get('title')
        product_url = pro_item.get('href')
        # Price lives behind a separate JSON endpoint keyed by pid/skuid.
        product_price = get_price(product_id=pid, skuid=sku_id)
        # href is protocol-relative ("//item.gome.com.cn/..."), so prefix it.
        product_info = (product_title, 'http:' + product_url, product_price)
        item_lists.append(product_info)
        print(product_info)  # progress log: one line per scraped product
        # Random 1-2 s sleep between items to avoid an IP ban.
        time.sleep(random.uniform(1, 2))
    # utf-8-sig adds a BOM so Excel renders the Chinese headers correctly.
    with open(csv_path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)  # distinct name; don't shadow the file handle
        writer.writerow(['商品名称', '商品链接', '商品价格'])
        writer.writerows(item_lists)
    return item_lists
def get_info(url):
    """Fetch one Gome search-result page and scrape every product on it.

    Parameters
    ----------
    url : str
        Full search URL for one keyword.

    Side effects: prints progress and writes a CSV via get_lists().
    NOTE(review): reads the module-level global ``project`` (set in
    ``__main__``) for its log line — confirm callers always set it first.
    """
    # A realistic User-Agent is usually enough to pass the anti-bot check;
    # Host/Referer/X-Requested-With mimic a real browser XHR more closely.
    headers = {
        "Host": "search.gome.com.cn",
        "Referer": "http://search.gome.com.cn/search?question=%E7%AC%94%E8%AE%B0%E6%9C%AC&searchType=goods&search_mode=normal&reWrite=true&instock=1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
        'X-Requested-With': 'XMLHttpRequest'
    }
    try:
        # timeout keeps a stalled connection from hanging the whole run
        # (consistent with get_price's timeout).
        r = requests.post(url, headers=headers, timeout=10)
        # Parse with lxml; product <li> items all live under
        # <ul class="product-lists clearfix">.
        soup = BeautifulSoup(r.text, 'lxml')
        product_list = soup.find('ul', class_='product-lists clearfix')
        i_lists = get_lists(product_list)
        print(r"爬取的项目是{},本页爬取数据{}条".format(project, len(i_lists)))
    except Exception as e:
        # Broad catch is deliberate: any failure on one page should log and
        # move on rather than kill the crawl.
        print(e)
        print('链接失败')
def get_price(product_id, skuid):
    """Query Gome's price API for a single SKU.

    Example endpoint:
    http://ss.gome.com.cn/search/v1/price/single/null/null/<pid>/<skuid>/null/flag/item

    Parameters
    ----------
    product_id : str
        Value of the <li> 'pid' attribute.
    skuid : str
        Value of the <li> 'skuid' attribute.

    Returns
    -------
    str
        Price string such as "4799.0", or the sentinel "价格未知" on any
        failure (non-200 status, network error, or unexpected JSON shape).
    """
    price_url = ('http://ss.gome.com.cn/search/v1/price/single/null/null/'
                 + product_id + '/' + skuid + '/null/flag/item')
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
    }
    try:
        response = requests.get(price_url, headers=headers, timeout=10)
        if response.status_code == 200:
            # Payload shape: {"result": {"price": "4799.0", "priceType": ...,
            # "productId": ..., "skuId": ...}}
            return response.json()['result']['price']
    except (requests.RequestException, KeyError, ValueError, TypeError):
        # Network failure or unexpected JSON: degrade to the same sentinel
        # as the non-200 path instead of aborting the whole scrape.
        pass
    return "价格未知"
if __name__ == '__main__':
    # Mapping of display name -> english slug (slug currently unused).
    # Renamed from `object`, which shadowed the builtin.
    projects = {"联想笔记本": 'lenovo_laptop'}
    for name in projects:  # only the key is used; iterate keys directly
        # `project` is a module-level global that get_info() reads for its
        # per-page log line.
        project = name
        get_info('http://search.gome.com.cn/search?question={}'.format(name))
Loading…
Cancel
Save