You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
131 lines
6.7 KiB
131 lines
6.7 KiB
6 months ago
|
# -*- coding: utf-8 -*-
|
||
|
import csv
import re
import time
import urllib
import urllib.parse
import xlwt

import requests
from bs4 import BeautifulSoup
|
||
|
|
||
|
|
||
|
class SNProcess():
    """Crawl Suning search results for phones and dump review data to CSV.

    For each product found on the search pages this scraper collects the
    title, price, spec lines, aggregate review statistics and every
    individual review, appending one CSV row per review to ``congtent.csv``.

    NOTE(review): instantiating the class immediately starts the crawl
    (``__init__`` calls ``run()``).
    """

    def __init__(self):
        # Spoof a desktop Chrome UA so the site serves regular pages.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
        self.run()

    def get_html(self, url):
        """GET *url* with the spoofed headers and return the body as text."""
        res = requests.get(url, headers=self.headers)
        # res.encoding = 'utf-8'
        return res.text

    def write_data(self, data):
        """Append one row to the output CSV.

        The filename typo ("congtent.csv") is preserved deliberately so
        existing output files keep accumulating.
        """
        with open("congtent.csv", "a", encoding="UTF-8", errors='ignore', newline="") as f:
            f_csv = csv.writer(f)
            f_csv.writerow(data)

    # Fetch the aggregate review counts for one product.
    def get_comment_num(self, clsid, goods_src):
        """Return review statistics for the product at *goods_src*.

        Parameters
        ----------
        clsid : str
            Product clusterId; when falsy the "general" endpoint is used.
        goods_src : str
            Product detail-page URL; the numeric key is parsed out of it.

        Returns
        -------
        tuple
            (totalCount, picFlagCount, fiveStar+fourStar,
             twoStar+threeStar, oneStarCount, againCount)
        """
        src_args = re.findall(r"com/(.*?).html", goods_src)[0]
        key1 = src_args.split("/")[-1]
        if clsid:
            url = "https://review.suning.com/ajax/review_count/cluster-"+str(clsid)+\
                "-0000000"+str(key1)+"-0000000000-----satisfy.htm?callback=satisfy"
            #https://list.suning.com/0-20006-0-0-0-0-0-0-0-0-11635.html?safp=d488778a.phone2018.103327226421.5&safc=cate.0.0&safpn=10003.00006
        else:
            url = "http://review.suning.com/ajax/review_count/general--0000000"+str(key1)+"-0000000000-----satisfy.htm?callback=satisfy"
            #https://list.suning.com/0-20006-0-0-0-0-0-0-0-0-11635.html?safp=d488778a.phone2018.103327226421.5&safc=cate.0.0&safpn=10003.00006#second-filter
        html = self.get_html(url)
        # print(html)
        oneStarCount = re.findall(r'"oneStarCount":(.*?),', html)[0]
        twoStarCount = re.findall(r'"twoStarCount":(.*?),', html)[0]
        # BUG FIX: the original swapped the three- and four-star regexes,
        # so the five+four and two+three aggregates below were wrong.
        threeStarCount = re.findall(r'"threeStarCount":(.*?),', html)[0]
        fourStarCount = re.findall(r'"fourStarCount":(.*?),', html)[0]
        fiveStarCount = re.findall(r'"fiveStarCount":(.*?),', html)[0]
        picFlagCount = re.findall(r'"picFlagCount":(.*?),', html)[0]
        totalCount = re.findall(r'"totalCount":(.*?),', html)[0]
        againCount = re.findall(r'"againCount":(.*?),', html)[0]
        return (totalCount, picFlagCount,
                int(fiveStarCount) + int(fourStarCount),
                int(twoStarCount) + int(threeStarCount),
                oneStarCount, againCount)

    # Fetch the product page; the clusterId extracted here is the key
    # parameter for the review and review-count URLs built elsewhere.
    def get_goods_title(self, url):
        """Return (clusterId, title, spec0, spec1, spec2) for a product page."""
        html = self.get_html("https:" + url)
        soup = BeautifulSoup(html, 'lxml')
        # print(html)
        title = soup.find_all('title')[0].get_text()
        clusterId = re.compile(r'"clusterId":"(.*?)"', re.S)
        clusterId_ret = clusterId.findall(html)
        infos = soup.find_all("dd", attrs={"class": "r-info"})
        try:
            args0 = infos[0].get_text()
            args1 = infos[1].get_text()
            args2 = infos[2].get_text()
        except IndexError:
            # Fewer than three spec entries on the page.
            args0, args1, args2 = ["无参数"] * 3
        return clusterId_ret[0], title, args0, args1, args2

    # The price lives on a separate endpoint whose URL must be assembled
    # from pieces of the product URL.
    def get_price_html(self, goods_src):
        """Return the product's net price as a string, or -1 on any failure."""
        try:
            src_args = re.findall(r"com/(.*?).html", goods_src)[0]
            key0 = src_args.split("/")[0]
            key1 = src_args.split("/")[-1]
            price_src = "https://pas.suning.com/nspcsale_0_0000000" + key1 + "_0000000" + key1 + "_" + key0 + "_250_029_0290199_20089_1000257_9254_12006_Z001___R1901001_0.5_0___000060864___.html?callback=pcData&_=1581050220963"
            html = self.get_html(price_src)
            price = re.compile(r'"netPrice":"(.*?)"', re.S)
            price_ret = price.findall(html)
            return price_ret[0]
        except (IndexError, requests.RequestException):
            # URL didn't match the expected shape, price field missing,
            # or the request itself failed.
            return -1

    # Download the individual reviews, page by page.
    def get_comment_data(self, goods_src, clsid, num, redata):
        """Fetch all review pages (10 reviews each) and write rows to CSV.

        Parameters
        ----------
        goods_src : str
            Product URL (the numeric key is parsed from it).
        clsid : str
            Product clusterId used in the review-list URL.
        num : int
            Total review count, used to bound pagination.
        redata : list
            Per-product prefix; each written row is
            redata + [content, publishTime, nick, color, edition, labels].
        """
        src_args = re.findall(r"com/(.*?).html", goods_src)[0]
        key1 = src_args.split("/")[-1]
        # BUG FIX: the original range(1, num // 10) skipped the final
        # partial page and fetched nothing at all for num < 20.  +2 covers
        # ceil(num / 10) pages; the empty-content break stops us early.
        for i in range(1, num // 10 + 2):
            url = "http://review.suning.com/ajax/cluster_review_lists/cluster-"+str(clsid)+"-0000000"+str(key1)+"-0000000000-total-"+str(i)+"-default-10-----reviewList.htm?callback=reviewList"
            html = self.get_html(url)
            print(html)
            content = re.findall(r'"content":"(.*?)","publishTime', html)
            publishTime = re.findall(r'"publishTime":"(.*?)",.*?sourceSystem"', html)
            username = re.findall(r'"nickName":"(.*?)"', html)
            color = re.findall(r'"charaterDesc1":"(.*?)"', html)
            edition = re.findall(r'charaterDesc2":"(.*?)"', html)
            labelnames = re.findall(r'"labelNames":(.*?),"score"', html)
            if len(content) == 0:
                break
            for val in zip(content, publishTime, username, color, edition, labelnames):
                result = redata + list(val)
                print(len(result), result)
                # Persist the row to the CSV file.
                self.write_data(result)
            time.sleep(1)  # throttle so we do not hammer the server

    # Per-search-results-page driver: the key function of the crawl.
    def get_phone_data(self, html):
        """Parse one search-result page and fully process every product on it."""
        soup = BeautifulSoup(html, 'lxml')
        li = soup.find_all('ul', attrs={'class': 'general clearfix'})[0].find_all("li")
        # print(li[1])
        for item in li:
            try:
                src = item.find_all("a", attrs={"target": "_blank"})[0].get("href")
                print(src)
                comment_num = item.find_all("div", attrs={"class": "info-evaluate"})[0].find_all("a")[0].get_text()
                # print(comment_num)
                is_self_support = item.find_all("div", attrs={"class": "store-stock"})[0].find_all("a")[0].get_text()
                # print(is_self_support)
                price = self.get_price_html(src)
                # print(price)
                clusterId, title, args0, args1, args2 = self.get_goods_title(src)
                # print(title)
                totalCount, picFlagCount, five_fourStarCount, two_threeStarCount, oneStarCount, againCount = self.get_comment_num(clusterId, src)
                ret_data = [title, comment_num, price, is_self_support, args0, args1, args2, totalCount, picFlagCount, five_fourStarCount, two_threeStarCount, oneStarCount, againCount]
                # print(ret_data)
                self.get_comment_data(src, clusterId, int(totalCount), ret_data)
            except Exception:
                # One broken listing must not abort the whole page; log and
                # move on (was a bare except in the original).
                print("数据异常")
                continue

    def run(self):
        """Crawl 50 search-result pages for the hard-coded query."""
        a = urllib.parse.quote("华为手机")
        for i in range(50):
            print("第%s页" % i)
            url = "https://search.suning.com/"+a+"/&iy=0&isNoResult=0&cp=" + str(i)
            html = self.get_html(url)
            self.get_phone_data(html)
|
||
|
if __name__ == "__main__":
    # Constructing the scraper starts the crawl (see SNProcess.__init__).
    SNProcess()
|