# -*- coding: utf-8 -*-
"""Scraper for Suning (suning.com) search results.

Crawls search pages for phone listings, then for each product collects the
price, aggregate review statistics and individual review texts, appending
one CSV row per review to ``congtent.csv`` (filename kept as in the
original data set; sic).
"""
import csv
import re
import time
import urllib.parse

import requests
from bs4 import BeautifulSoup


class SNProcess:
    """Crawl Suning search pages for phone data and dump rows to CSV.

    NOTE: constructing the object immediately starts the crawl (kept for
    backward compatibility — callers just do ``SNProcess()``).
    """

    def __init__(self):
        # Impersonate a desktop browser so the site serves normal pages.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
        self.run()

    def get_html(self, url):
        """GET *url* with the browser headers and return the response body text."""
        res = requests.get(url, headers=self.headers)
        # res.encoding is left to requests' auto-detection, as in the original.
        return res.text

    def write_data(self, data):
        """Append one row (an iterable of cells) to the output CSV file."""
        with open("congtent.csv", "a", encoding="UTF-8", errors='ignore', newline="") as f:
            csv.writer(f).writerow(data)

    def get_comment_num(self, clsid, goods_src):
        """Fetch aggregate review statistics for one product.

        The counts live in a JSONP payload; each value is pulled out with a
        regex. Returns a 6-tuple: (totalCount, picFlagCount,
        five+four star count, two+three star count, oneStarCount, againCount).
        """
        src_args = re.findall(r"com/(.*?).html", goods_src)[0]
        key1 = src_args.split("/")[-1]
        if clsid:
            # Cluster products (variants grouped under one clusterId) use a
            # different statistics endpoint than standalone products.
            url = ("https://review.suning.com/ajax/review_count/cluster-" + str(clsid) +
                   "-0000000" + str(key1) + "-0000000000-----satisfy.htm?callback=satisfy")
        else:
            url = ("http://review.suning.com/ajax/review_count/general--0000000" +
                   str(key1) + "-0000000000-----satisfy.htm?callback=satisfy")
        html = self.get_html(url)

        def field(name):
            # First occurrence of `"name":value,` in the JSONP body.
            return re.findall(r'"%s":(.*?),' % name, html)[0]

        oneStarCount = field("oneStarCount")
        twoStarCount = field("twoStarCount")
        # BUG FIX: the original swapped the patterns here, extracting the
        # four-star count into threeStarCount and vice versa, which corrupted
        # both aggregate buckets returned below.
        threeStarCount = field("threeStarCount")
        fourStarCount = field("fourStarCount")
        fiveStarCount = field("fiveStarCount")
        picFlagCount = field("picFlagCount")
        totalCount = field("totalCount")
        againCount = field("againCount")
        return (totalCount, picFlagCount,
                int(fiveStarCount) + int(fourStarCount),
                int(twoStarCount) + int(threeStarCount),
                oneStarCount, againCount)

    def get_goods_title(self, url):
        """Fetch a product page and extract (clusterId, title, 3 spec strings).

        clusterId is essential: the review-list and review-count URLs are both
        built from it.
        """
        html = self.get_html("https:" + url)
        soup = BeautifulSoup(html, 'lxml')
        title = soup.find_all('title')[0].get_text()
        clusterId_ret = re.compile(r'"clusterId":"(.*?)"', re.S).findall(html)
        try:
            args = soup.find_all("dd", attrs={"class": "r-info"})
            args0 = args[0].get_text()
            args1 = args[1].get_text()
            args2 = args[2].get_text()
        except IndexError:
            # Page has fewer than three spec entries — placeholder values.
            args0, args1, args2 = ["无参数"] * 3
        return clusterId_ret[0], title, args0, args1, args2

    def get_price_html(self, goods_src):
        """Return the product's net price, or -1 when it cannot be determined.

        The price endpoint URL is assembled by hand from the two path keys in
        the product link (shop id and product id).
        """
        try:
            src_args = re.findall(r"com/(.*?).html", goods_src)[0]
            key0 = src_args.split("/")[0]
            key1 = src_args.split("/")[-1]
            price_src = "https://pas.suning.com/nspcsale_0_0000000" + key1 + "_0000000" + key1 + "_" + key0 + "_250_029_0290199_20089_1000257_9254_12006_Z001___R1901001_0.5_0___000060864___.html?callback=pcData&_=1581050220963"
            html = self.get_html(price_src)
            price_ret = re.compile(r'"netPrice":"(.*?)"', re.S).findall(html)
            return price_ret[0]
        except (IndexError, requests.RequestException):
            # Keep the original best-effort contract: -1 signals "no price".
            return -1

    def get_comment_data(self, goods_src, clsid, num, redata):
        """Page through the review list (10 reviews/page) and write CSV rows.

        *redata* is the per-product prefix (title, price, stats, ...) that is
        prepended to every review row.
        """
        src_args = re.findall(r"com/(.*?).html", goods_src)[0]
        key1 = src_args.split("/")[-1]
        # BUG FIX: the original iterated range(1, num//10), which skipped the
        # last page and fetched nothing at all for products with < 20 reviews.
        last_page = (num + 9) // 10
        for page in range(1, last_page + 1):
            url = "http://review.suning.com/ajax/cluster_review_lists/cluster-" + str(clsid) + "-0000000" + str(key1) + "-0000000000-total-" + str(page) + "-default-10-----reviewList.htm?callback=reviewList"
            html = self.get_html(url)
            print(html)
            content = re.findall(r'"content":"(.*?)","publishTime', html)
            publishTime = re.findall(r'"publishTime":"(.*?)",.*?sourceSystem"', html)
            username = re.findall(r'"nickName":"(.*?)"', html)
            color = re.findall(r'"charaterDesc1":"(.*?)"', html)
            edition = re.findall(r'charaterDesc2":"(.*?)"', html)
            labelnames = re.findall(r'"labelNames":(.*?),"score"', html)
            if not content:
                # Server returned an empty page — no more reviews.
                break
            for val in zip(content, publishTime, username, color, edition, labelnames):
                result = redata + list(val)
                print(len(result), result)
                self.write_data(result)
            time.sleep(1)  # be polite to the server between pages

    def get_phone_data(self, html):
        """Extract every product from one search-result page and process it."""
        soup = BeautifulSoup(html, 'lxml')
        li = soup.find_all('ul', attrs={'class': 'general clearfix'})[0].find_all("li")
        for item in li:
            try:
                src = item.find_all("a", attrs={"target": "_blank"})[0].get("href")
                print(src)
                comment_num = item.find_all("div", attrs={"class": "info-evaluate"})[0].find_all("a")[0].get_text()
                is_self_support = item.find_all("div", attrs={"class": "store-stock"})[0].find_all("a")[0].get_text()
                price = self.get_price_html(src)
                clusterId, title, args0, args1, args2 = self.get_goods_title(src)
                totalCount, picFlagCount, five_fourStarCount, two_threeStarCount, oneStarCount, againCount = self.get_comment_num(clusterId, src)
                ret_data = [title, comment_num, price, is_self_support, args0, args1, args2, totalCount, picFlagCount, five_fourStarCount, two_threeStarCount, oneStarCount, againCount]
                self.get_comment_data(src, clusterId, int(totalCount), ret_data)
            except Exception:
                # Per-item boundary: a malformed listing must not kill the
                # whole crawl (narrowed from the original bare except, which
                # also swallowed KeyboardInterrupt/SystemExit).
                print("数据异常")
                continue

    def run(self):
        """Crawl 50 search-result pages for the query 华为手机 (Huawei phones)."""
        query = urllib.parse.quote("华为手机")
        for page in range(50):
            print("第%s页" % page)
            url = "https://search.suning.com/" + query + "/&iy=0&isNoResult=0&cp=" + str(page)
            html = self.get_html(url)
            self.get_phone_data(html)


if __name__ == "__main__":
    SNProcess()