爬虫核心代码

master
Qshfmlgn5 5 years ago
parent 9b0243e5e4
commit beee44d616

@ -0,0 +1,92 @@
import bs4
import requests
from bs4 import BeautifulSoup
import csv
import time
import random
""""
"""
def get_lists(product_lists, csv_path='联想笔记本.csv'):
    """Extract product info from a search-result <ul> tag and save it to CSV.

    Parameters
    ----------
    product_lists : bs4.element.Tag
        The <ul class="product-lists clearfix"> tag whose children are the
        product <li> elements (interleaved with whitespace NavigableStrings).
    csv_path : str
        Output CSV path; defaults to the original hard-coded filename so
        existing callers are unaffected.

    Returns
    -------
    list[tuple]
        One (title, url, price) tuple per product found.
    """
    item_lists = []
    for li in product_lists:
        # Iterating the <ul> yields both bs4.element.NavigableString
        # (whitespace between tags) and bs4.element.Tag nodes; only real
        # <li> Tags carry the product attributes, e.g.:
        #   <li class="product-item" pid="9140197013" skuid="1130806223">
        if not isinstance(li, bs4.element.Tag):
            continue
        sku_id = li.get('skuid')
        pid = li.get('pid')
        # The anchor holds title and href:
        #   <a class="item-link" href="//item.gome.com.cn/..." title="...">
        pro_item = li.find('a', class_='item-link')
        if pro_item is None:
            # Defensive: skip malformed <li> entries instead of crashing.
            continue
        product_title = pro_item.get('title')
        product_url = pro_item.get('href')
        # Price lives behind a separate JSON endpoint keyed by pid/skuid.
        product_price = get_price(product_id=pid, skuid=sku_id)
        # href is protocol-relative ("//item.gome.com.cn/..."), so prefix it.
        product_info = (product_title, 'http:' + product_url, product_price)
        item_lists.append(product_info)
        print(product_info)  # progress log: one line per scraped product
        # Random 1-2 s sleep between items to avoid an IP ban.
        time.sleep(random.uniform(1, 2))
    # utf-8-sig adds a BOM so Excel renders the Chinese headers correctly.
    with open(csv_path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)  # distinct name; don't shadow the file handle
        writer.writerow(['商品名称', '商品链接', '商品价格'])
        writer.writerows(item_lists)
    return item_lists
def get_info(url):
    """Fetch one Gome search-result page and scrape every product on it.

    Parameters
    ----------
    url : str
        Full search URL for one keyword.

    Side effects: prints progress and writes a CSV via get_lists().
    NOTE(review): reads the module-level global ``project`` (set in
    ``__main__``) for its log line — confirm callers always set it first.
    """
    # A realistic User-Agent is usually enough to pass the anti-bot check;
    # Host/Referer/X-Requested-With mimic a real browser XHR more closely.
    headers = {
        "Host": "search.gome.com.cn",
        "Referer": "http://search.gome.com.cn/search?question=%E7%AC%94%E8%AE%B0%E6%9C%AC&searchType=goods&search_mode=normal&reWrite=true&instock=1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
        'X-Requested-With': 'XMLHttpRequest'
    }
    try:
        # timeout keeps a stalled connection from hanging the whole run
        # (consistent with get_price's timeout).
        r = requests.post(url, headers=headers, timeout=10)
        # Parse with lxml; product <li> items all live under
        # <ul class="product-lists clearfix">.
        soup = BeautifulSoup(r.text, 'lxml')
        product_list = soup.find('ul', class_='product-lists clearfix')
        i_lists = get_lists(product_list)
        print(r"爬取的项目是{},本页爬取数据{}条".format(project, len(i_lists)))
    except Exception as e:
        # Broad catch is deliberate: any failure on one page should log and
        # move on rather than kill the crawl.
        print(e)
        print('链接失败')
def get_price(product_id, skuid):
    """Query Gome's price API for a single SKU.

    Example endpoint:
    http://ss.gome.com.cn/search/v1/price/single/null/null/<pid>/<skuid>/null/flag/item

    Parameters
    ----------
    product_id : str
        Value of the <li> 'pid' attribute.
    skuid : str
        Value of the <li> 'skuid' attribute.

    Returns
    -------
    str
        Price string such as "4799.0", or the sentinel "价格未知" on any
        failure (non-200 status, network error, or unexpected JSON shape).
    """
    price_url = ('http://ss.gome.com.cn/search/v1/price/single/null/null/'
                 + product_id + '/' + skuid + '/null/flag/item')
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
    }
    try:
        response = requests.get(price_url, headers=headers, timeout=10)
        if response.status_code == 200:
            # Payload shape: {"result": {"price": "4799.0", "priceType": ...,
            # "productId": ..., "skuId": ...}}
            return response.json()['result']['price']
    except (requests.RequestException, KeyError, ValueError, TypeError):
        # Network failure or unexpected JSON: degrade to the same sentinel
        # as the non-200 path instead of aborting the whole scrape.
        pass
    return "价格未知"
if __name__ == '__main__':
    # Mapping of display name -> english slug (slug currently unused).
    # Renamed from `object`, which shadowed the builtin.
    projects = {"联想笔记本": 'lenovo_laptop'}
    for name in projects:  # only the key is used; iterate keys directly
        # `project` is a module-level global that get_info() reads for its
        # per-page log line.
        project = name
        get_info('http://search.gome.com.cn/search?question={}'.format(name))
Loading…
Cancel
Save