import requests
import  os
from pyquery import PyQuery as pq

# Default output directory (kept from the original script; override via ``root``).
_DEFAULT_ROOT = "C://Users//86138//Desktop//个人信息//csdn//"

# Browser-like User-Agent sent with every request.
_HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

# Seconds to wait for the server before giving up instead of hanging forever.
_REQUEST_TIMEOUT = 10

# (position of the <dl> inside "aside .data-info", label written to the file),
# listed in the order the lines appear in the profile file.
_PROFILE_FIELDS = (
    (0, "原创:"),
    (6, "粉丝:"),
    (3, "访问:"),
    (8, "评论:"),
    (9, "收藏:"),
)


def _fetch_page(url):
    """Return the body text of *url*, fetched with the shared headers."""
    return requests.get(url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT).text


def _write_profile(doc, path):
    """Write the profile counters found in *doc* to *path*, one per line."""
    titles = [item.attr("title") for item in doc("aside .data-info dl").items()]
    lines = []
    for index, label in _PROFILE_FIELDS:
        # A missing <dl> or a missing title attribute yields an empty value
        # instead of an UnboundLocalError / TypeError.
        value = titles[index] if index < len(titles) else None
        lines.append(label + (value or ""))
    with open(path, 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(lines))


def _parse_articles(doc):
    """Return one ``[date, title, count, ...]`` list per article row in *doc*."""
    rows = []
    for index, item in enumerate(doc(".article-list > div").items()):
        if index == 0:
            # The first child div is skipped, as in the original script
            # (presumably a non-article placeholder -- TODO confirm).
            continue
        # The first two characters of the link text are dropped
        # (presumably the article-type badge -- TODO confirm).
        title = item("h4 > a").text()[2:]
        date = item("p > .date").text()
        counts = [node.text() for node in item("p > .read-num").items()]
        rows.append([date, title] + counts)
    return rows


def csdn(account=None, root=_DEFAULT_ROOT):
    """Scrape a CSDN blog's profile counters and article list into text files.

    Two files are written inside *root*: ``个人信息.txt`` with the profile
    counters and ``博客.txt`` with one line per article, grouped by list page.
    Each article line is also printed to stdout.

    Args:
        account: CSDN user id. When ``None`` the id is read from stdin.
        root: Output directory; created (including parents) if missing.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched
            (including a timeout).
        OSError: if the output directory or files cannot be written.
    """
    if account is None:
        account = input('print csdn id:').strip()

    # Blog home page; list pages live under /article/list/<page number>.
    base_url = 'http://blog.csdn.net/' + account
    os.makedirs(root, exist_ok=True)

    first_page = _fetch_page(base_url + '/article/list/1')
    _write_profile(pq(first_page), os.path.join(root, '个人信息.txt'))

    with open(os.path.join(root, '博客.txt'), 'w', encoding='utf-8') as f:
        page_num = 1
        previous_length = None
        while True:
            # Page 1 was already downloaded for the profile; reuse it.
            if page_num == 1:
                page_html = first_page
            else:
                page_html = _fetch_page(base_url + '/article/list/' + str(page_num))

            # Original stop rule: a page exactly as long as the previous one
            # is treated as "past the last page".
            if len(page_html) == previous_length:
                break

            rows = _parse_articles(pq(page_html))
            if not rows:
                # No article rows: stop here so the loop cannot run forever
                # when past-the-end pages differ in length.
                break

            f.write('-----------------------------第 %d 页---------------------------------\n' % (page_num,))
            for row in rows:
                # Only rows with date, title and exactly two counters are emitted.
                if len(row) == 4:
                    f.write("%s %s %s %s\n" % (row[0], row[1], row[2], row[3]))
                    print("%s %s %s %s\n" % (row[0], row[1], row[2], row[3]))

            previous_length = len(page_html)
            page_num += 1

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    csdn()