# milkSpider/pipelines.py

# -*- coding: utf-8 -*-
from lxml import etree
import csv
import os

def gethtml(response, gethtml_mode = "url"):  # parse the fetched object with etree
    """Parse *response* with lxml and return the parsed document.

    Args:
        response: Either an object that ``isElementTree`` already accepts
            (returned unchanged), markup text (``"url"`` mode), or a
            source that ``etree.parse`` can read (``"cache"`` mode).
        gethtml_mode: ``"url"`` parses with ``etree.HTML``; ``"cache"``
            parses with ``etree.parse`` and a UTF-8 ``HTMLParser``.

    Returns:
        The parsed lxml object, or *response* itself when it is already
        parsed.

    Raises:
        SystemExit: with status 1 after printing a diagnostic, when the
            mode is unknown or parsing fails.
    """
    if isElementTree(response):
        return response

    # Validate the mode outside the try block so the deliberate exit is not
    # caught below and reported a second time with a misleading message.
    if gethtml_mode not in ("cache", "url"):
        print("sth wrong with parameters in pipelines.gethtml(gethtml_mode)")
        raise SystemExit(1)

    try:
        if gethtml_mode == "cache":
            return etree.parse(response, etree.HTMLParser(encoding='utf-8'))
        return etree.HTML(response)
    except Exception as e:
        # Exception (not BaseException) lets KeyboardInterrupt and
        # SystemExit pass through untouched.
        print(e)
        print("sth wrong in pipelines.gethtml, see your settings in settings.py")
        raise SystemExit(1)

def getidlist(response) -> list:    # collect the product ids
    """Return the ``data-sku`` attribute values of all ``<li>`` elements.

    *response* is used directly when ``isElementTree`` accepts it;
    otherwise it is first parsed with ``gethtml`` in its default mode.
    """
    sku_xpath = r"//li/@data-sku"
    tree = response if isElementTree(response) else gethtml(response)
    return tree.xpath(sku_xpath)

def myreplace(name) -> str: # minimal clean-up of output text
    """Return *name* with leading and trailing whitespace removed."""
    return name.strip()

def isElementTree(response) -> bool:    # tells whether the object is already an etree document
    """Return True when *response* is an lxml ``_ElementTree`` instance.

    Uses ``isinstance`` rather than comparing the textual form of the
    type, so the check does not depend on how the class is rendered as
    a string.
    """
    return isinstance(response, etree._ElementTree)

class item:
    """One product entry, located by its ``data-sku`` inside a parsed page."""

    def __init__(self, id, response):
        # id: the product's data-sku value (converted with str() when used).
        self.id = id
        # response: parsed lxml object; only its ``xpath`` method is used.
        self.response = response

    def getitem(self) -> list:
        """Collect this product's fields as a list of six strings.

        Returns:
            ``[id, name, price, attributes, promotions, url]``. Name and
            price are empty strings when the entry has no matching node;
            attributes and promotions are the matched texts joined by a
            single space and stripped.

        Raises:
            IndexError: the response holds no ``<li>`` whose ``data-sku``
                equals this item's id.
        """
        sku = str(self.id)
        # Pass the id as an XPath variable so a quote character inside it
        # cannot break (or alter) the expression.
        matches = self.response.xpath("//li[@data-sku=$sku]", sku=sku)
        if not matches:
            raise IndexError("no <li> with data-sku=" + repr(sku))
        node = matches[0]

        # Relative paths (".//") query the matched <li> directly, instead of
        # serialising it and re-parsing the markup once per field.
        def first_text(path) -> str:
            found = node.xpath(path)
            return str(found[0]) if found else ""

        def joined_text(path) -> str:
            return myreplace(" ".join(node.xpath(path)))

        name = myreplace(first_text(r".//div[@class='p-name p-name-type-3']/a/em/text()"))
        price = first_text(r".//i[@data-price]/text()")
        attribute = joined_text(r".//span[@class='attr']/b/text()")
        sales = joined_text(r".//div[@class='p-icons']/i/text()")
        url = r"https://item.jd.com/" + sku + r".html"

        return [sku, name, price, attribute, sales, url]

def print2console(response):    # print to the command line
    """Print every product found in *response* to standard output.

    Ids whose integer value is below 1000 are skipped. Any failure while
    processing is reported on stdout and stops the remaining output; the
    function itself does not raise for ordinary errors.

    Args:
        response: Raw markup or an already-parsed tree, as accepted by
            ``getidlist`` and ``gethtml``.
    """
    labels = ("商品id：", "商品名称：", "价格：￥", "关键词：", "促销活动：", "商品链接：")

    def output(itemlist):
        # One labelled line per field, followed by a blank separator line.
        for label, value in zip(labels, itemlist):
            print(label + value)
        print("")

    try:
        # Parse once and share the tree between items, rather than
        # re-parsing the whole response for every id.
        html = gethtml(response)
        for sku in getidlist(response):
            if int(sku) < 1000:
                continue
            output(item(sku, html).getitem())

    # SystemExit is still contained here because gethtml exits on a parse
    # failure and this function has always reported that and carried on;
    # KeyboardInterrupt is no longer swallowed.
    except (Exception, SystemExit) as e:
        print(e)
        print("pipelines.py didn't work properly")

    print("pipelines.print2console is done")

def write2csv(response, filename_csv):    # write to a csv file
    """Append every product found in *response* to a CSV file.

    The file is created with a header row when it does not exist yet;
    otherwise rows are appended to the end. Ids whose integer value is
    below 1000 are skipped. Failures are reported on stdout instead of
    being raised.

    Args:
        response: Raw markup or an already-parsed tree, as accepted by
            ``getidlist`` and ``gethtml``.
        filename_csv: Path of the CSV file to create or append to.
    """
    headers = ['id', '商品名称', '价格(人民币)', '标签', '促销策略', 'url']

    try:
        # Decide about the header before opening: append mode creates the
        # file, so the check must happen first.
        is_new_file = not os.path.exists(filename_csv)

        # The with-block closes the file; no explicit close() is needed.
        with open(filename_csv, 'a', encoding = 'utf-8-sig', newline = '') as fd:
            writer = csv.writer(fd)
            if is_new_file:
                writer.writerow(headers)

            # Parse once and share the tree between items, rather than
            # re-parsing the whole response for every id.
            html = gethtml(response)
            for sku in getidlist(response):
                if int(sku) < 1000:
                    continue
                writer.writerow(item(sku, html).getitem())

    # SystemExit is still contained here because gethtml exits on a parse
    # failure and this function has always reported that and returned;
    # KeyboardInterrupt is no longer swallowed.
    except (Exception, SystemExit) as e:
        print(e)
        print("sth wrong in pipelines.write2csv")

if __name__ == "__main__":
    # NOTE(review): write2csv is defined in this file with two required
    # positional parameters (response, filename_csv). This call passes
    # neither, so running the module as a script raises TypeError.
    # The intended arguments are not visible here - supply them.
    write2csv()