You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
131 lines
6.7 KiB
131 lines
6.7 KiB
# -*- coding: utf-8 -*-
|
|
import csv
import re
import time
import urllib
import urllib.parse

import requests
import xlwt
from bs4 import BeautifulSoup
|
|
|
|
|
|
class SNProcess():
    """Scrape phone listings, prices and customer reviews from suning.com.

    Instantiating the class starts the crawl immediately: ``__init__`` calls
    ``run``, which walks the search-result pages and, for every product
    found, appends one CSV row per review to ``congtent.csv``.
    """

    # Captures the path between "com/" and ".html" in a product link.
    _PRODUCT_PATH_RE = re.compile(r"com/(.*?)\.html")

    # Keys read from the review-count response, in no particular order.
    _COUNT_KEYS = (
        "oneStarCount",
        "twoStarCount",
        "threeStarCount",
        "fourStarCount",
        "fiveStarCount",
        "picFlagCount",
        "totalCount",
        "againCount",
    )

    def __init__(self):
        # Simulate a browser by sending a desktop Chrome User-Agent.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
        self.run()

    @staticmethod
    def _product_keys(goods_src):
        """Return the (first, last) path segments of a product link.

        Raises:
            IndexError: if ``goods_src`` contains no ``com/<path>.html`` part.
        """
        path = SNProcess._PRODUCT_PATH_RE.findall(goods_src)[0]
        segments = path.split("/")
        return segments[0], segments[-1]

    def get_html(self, url, timeout=10):
        """Fetch ``url`` with the browser headers and return the body text.

        Args:
            url: absolute URL to request.
            timeout: seconds to wait before giving up, so that one stalled
                connection cannot hang the whole crawl.

        Raises:
            requests.RequestException: on network failure or timeout.
        """
        res = requests.get(url, headers=self.headers, timeout=timeout)
        return res.text

    def write_data(self, data):
        """Append ``data`` (a sequence of fields) as one row to congtent.csv."""
        with open("congtent.csv", "a", encoding="UTF-8", errors='ignore', newline="") as f:
            csv.writer(f).writerow(data)

    def get_comment_num(self, clsid, goods_src):
        """Fetch the review statistics of one product.

        Args:
            clsid: cluster id of the product; when falsy the "general"
                review-count endpoint is used instead of the cluster one.
            goods_src: product link containing ``com/<...>/<id>.html``.

        Returns:
            A tuple ``(totalCount, picFlagCount, five_plus_four_star,
            two_plus_three_star, oneStarCount, againCount)``. The two sums
            are ints; the other values are the raw strings captured from the
            response.

        Raises:
            IndexError: if the link or any expected counter cannot be found.
        """
        _, key1 = self._product_keys(goods_src)
        if clsid:
            url = ("https://review.suning.com/ajax/review_count/cluster-" + str(clsid)
                   + "-0000000" + str(key1) + "-0000000000-----satisfy.htm?callback=satisfy")
        else:
            url = ("http://review.suning.com/ajax/review_count/general--0000000" + str(key1)
                   + "-0000000000-----satisfy.htm?callback=satisfy")
        html = self.get_html(url)

        # Each counter is looked up under its own key. The previous code read
        # "fourStarCount" into the three-star variable and vice versa, which
        # corrupted both sums below.
        counts = {}
        for key in self._COUNT_KEYS:
            counts[key] = re.findall('"' + key + '":(.*?),', html)[0]

        return (
            counts["totalCount"],
            counts["picFlagCount"],
            int(counts["fiveStarCount"]) + int(counts["fourStarCount"]),
            int(counts["twoStarCount"]) + int(counts["threeStarCount"]),
            counts["oneStarCount"],
            counts["againCount"],
        )

    def get_goods_title(self, url):
        """Fetch a product page and extract its cluster id, title and specs.

        The cluster id matters because both the review and the review-count
        URLs are built from it.

        Args:
            url: protocol-relative product link ("//host/path.html").

        Returns:
            ``(clusterId, title, args0, args1, args2)`` where the three
            ``args`` are the texts of the first three ``dd.r-info`` elements,
            or "无参数" for all three when fewer than three exist.

        Raises:
            IndexError: if the page has no <title> or no clusterId.
        """
        html = self.get_html("https:" + url)
        soup = BeautifulSoup(html, 'lxml')
        title = soup.find_all('title')[0].get_text()
        cluster_ids = re.findall(r'"clusterId":"(.*?)"', html, re.S)

        specs = soup.find_all("dd", attrs={"class": "r-info"})
        try:
            args0, args1, args2 = [specs[i].get_text() for i in range(3)]
        except IndexError:
            args0, args1, args2 = ["无参数"] * 3
        return cluster_ids[0], title, args0, args1, args2

    def get_price_html(self, goods_src):
        """Return the net price of a product, or -1 when it is unavailable.

        The price endpoint URL is assembled by hand from the two keys found
        in the product link.

        Args:
            goods_src: product link containing ``com/<key0>/<key1>.html``.

        Returns:
            The "netPrice" string from the response, or ``-1`` if the link
            cannot be parsed, the request fails, or no price is present.
        """
        try:
            key0, key1 = self._product_keys(goods_src)
            price_src = "https://pas.suning.com/nspcsale_0_0000000" + key1 + "_0000000" + key1 + "_" + key0 + "_250_029_0290199_20089_1000257_9254_12006_Z001___R1901001_0.5_0___000060864___.html?callback=pcData&_=1581050220963"
            html = self.get_html(price_src)
            return re.findall(r'"netPrice":"(.*?)"', html, re.S)[0]
        except IndexError:
            # Unparseable link or a response without a price.
            return -1
        except requests.RequestException:
            return -1

    def get_comment_data(self, goods_src, clsid, num, redata, delay=1):
        """Download the reviews of one product and write them to the CSV.

        Args:
            goods_src: product link containing ``com/<...>/<id>.html``.
            clsid: cluster id of the product.
            num: total number of reviews; determines how many pages of ten
                reviews are requested.
            redata: list of product-level fields prepended to every row.
            delay: seconds to pause after each page.

        Fetching stops early at the first page that yields no review text.
        Fields are paired with ``zip``, so a page is truncated to the
        shortest list of matches.
        """
        _, key1 = self._product_keys(goods_src)

        # Round up so the final, partially filled page is fetched too. The
        # former range(1, int(num / 10)) dropped it and requested nothing at
        # all for products with fewer than 20 reviews.
        page_total = (int(num) + 9) // 10

        for page in range(1, page_total + 1):
            url = "http://review.suning.com/ajax/cluster_review_lists/cluster-"+str(clsid)+"-0000000"+str(key1)+"-0000000000-total-"+str(page)+"-default-10-----reviewList.htm?callback=reviewList"
            html = self.get_html(url)

            content = re.findall(r'"content":"(.*?)","publishTime', html)
            if not content:
                break
            publishTime = re.findall(r'"publishTime":"(.*?)",.*?sourceSystem"', html)
            username = re.findall(r'"nickName":"(.*?)"', html)
            color = re.findall(r'"charaterDesc1":"(.*?)"', html)
            edition = re.findall(r'charaterDesc2":"(.*?)"', html)
            labelnames = re.findall(r'"labelNames":(.*?),"score"', html)

            for val in zip(content, publishTime, username, color, edition, labelnames):
                result = redata + list(val)
                print(len(result), result)
                # Persist the row to the CSV file.
                self.write_data(result)
            time.sleep(delay)

    def get_phone_data(self, html):
        """Process one search-result page: the key step of the crawl.

        For every product in the result list this collects the listing
        fields, price, title/specs and review statistics, then downloads and
        stores the product's reviews. A product that fails at any step is
        reported and skipped.

        Args:
            html: markup of a search-result page.
        """
        soup = BeautifulSoup(html, 'lxml')
        result_lists = soup.find_all('ul', attrs={'class': 'general clearfix'})
        if not result_lists:
            # A page without a product list must not abort the whole crawl.
            print("no product list found on this page")
            return

        for item in result_lists[0].find_all("li"):
            try:
                src = item.find_all("a", attrs={"target": "_blank"})[0].get("href")
                print(src)
                comment_num = item.find_all("div", attrs={"class": "info-evaluate"})[0].find_all("a")[0].get_text()
                is_self_support = item.find_all("div", attrs={"class": "store-stock"})[0].find_all("a")[0].get_text()
                price = self.get_price_html(src)
                clusterId, title, args0, args1, args2 = self.get_goods_title(src)
                totalCount, picFlagCount, five_fourStarCount, two_threeStarCount, oneStarCount, againCount = self.get_comment_num(clusterId, src)
                ret_data = [title, comment_num, price, is_self_support, args0, args1, args2, totalCount, picFlagCount, five_fourStarCount, two_threeStarCount, oneStarCount, againCount]
                self.get_comment_data(src, clusterId, int(totalCount), ret_data)
            except Exception as exc:
                # Best effort per product. Unlike a bare ``except`` this lets
                # KeyboardInterrupt/SystemExit through so the crawl can be
                # stopped, and it reports what went wrong.
                print("数据异常", exc)
                continue

    def run(self, keyword="华为手机", pages=50):
        """Crawl the search-result pages for ``keyword``.

        Args:
            keyword: search term; it is percent-encoded into the URL.
            pages: number of result pages to visit, numbered from 0.
        """
        a = urllib.parse.quote(keyword)
        for i in range(pages):
            print("第%s页" % i)
            url = "https://search.suning.com/"+a+"/&iy=0&isNoResult=0&cp=" + str(i)
            html = self.get_html(url)
            self.get_phone_data(html)
|
|
# Script entry point: constructing SNProcess starts the crawl, because its
# __init__ calls run().
if __name__ == "__main__":
    SNProcess()
|