"""Scrape a CSDN blog's profile counters and article list into text files."""
import os

import requests
from pyquery import PyQuery as pq

# Output directory the script has always written to; kept as the default.
_DEFAULT_ROOT = "C://Users//86138//Desktop//个人信息//csdn//"

# Seconds to wait for the server before giving up instead of hanging forever.
_REQUEST_TIMEOUT = 10

_HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

# (label, position of the matching <dl> inside "aside .data-info dl"),
# listed in the order the lines are written to the profile file.
_PROFILE_FIELDS = (
    ("原创:", 0),
    ("粉丝:", 6),
    ("访问:", 3),
    ("评论:", 8),
    ("收藏:", 9),
)

_PAGE_HEADER = '-----------------------------第 %d 页---------------------------------\n'


def _fetch_list_page(account, page_num):
    """Download one article-list page of *account* and return its HTML text."""
    url = 'http://blog.csdn.net/' + account + '/article/list/' + str(page_num)
    return requests.get(url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT).text


def _profile_lines(doc):
    """Build the five "label + value" profile lines from a parsed page.

    A counter whose <dl> is absent, or which has no ``title`` attribute, is
    emitted with an empty value rather than aborting the whole run.
    """
    titles = [item.attr("title") for item in doc("aside .data-info dl").items()]
    lines = []
    for label, index in _PROFILE_FIELDS:
        value = titles[index] if index < len(titles) else None
        lines.append(label + (value or ""))
    return lines


def _parse_articles(doc):
    """Return one ``[date, title, *counters]`` list per article on the page.

    The first child of ``.article-list`` is always skipped, as the script has
    always done (presumably a non-article placeholder - confirm against the
    live markup).
    """
    rows = []
    for index, entry in enumerate(doc(".article-list > div").items()):
        if index == 0:
            continue
        # The first two characters of the link text are dropped (presumably
        # the article-type badge rendered inside the link - confirm).
        title = entry("h4 > a").text()[2:]
        date = entry("p > .date").text()
        counters = [node.text() for node in entry("p > .read-num").items()]
        rows.append([date, title] + counters)
    return rows


def _write_articles(account, first_page_html, path):
    """Walk the article-list pages and append every complete row to *path*.

    Crawling stops at the first page that contains no article entries, or
    when the server returns exactly the same document twice in a row (which
    means the page number is being ignored). The page-1 HTML that was already
    downloaded for the profile is reused instead of being fetched again.
    """
    page_num = 1
    html = first_page_html
    previous_html = None
    # UTF-8 is explicit so titles never depend on the platform's locale codec.
    with open(path, 'w', encoding='utf-8') as out:
        while html != previous_html:
            rows = _parse_articles(pq(html))
            if not rows:
                break
            out.write(_PAGE_HEADER % (page_num,))
            for row in rows:
                # Only rows carrying both counters (read + comment) are kept.
                if len(row) == 4:
                    line = "%s %s %s %s\n" % tuple(row)
                    out.write(line)
                    print(line)
            previous_html = html
            page_num += 1
            html = _fetch_list_page(account, page_num)


def csdn(account=None, root=_DEFAULT_ROOT):
    """Save a CSDN user's profile counters and article list as text files.

    Two files are written inside *root*:

    * ``个人信息.txt`` - the profile counters, one per line.
    * ``博客.txt`` - a header per list page followed by
      ``date title read-count comment-count`` lines.

    Args:
        account: CSDN user id. When omitted, it is read from standard input.
        root: Output directory; created (with any missing parents) if needed.

    Raises:
        requests.RequestException: if a page cannot be downloaded in time.
        OSError: if the output directory or files cannot be written.
    """
    if account is None:
        account = str(input('print csdn id:'))

    # makedirs also creates missing parent directories, which mkdir does not.
    os.makedirs(root, exist_ok=True)

    first_page_html = _fetch_list_page(account, 1)

    profile_path = os.path.join(root, '个人信息.txt')
    with open(profile_path, 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(_profile_lines(pq(first_page_html))))

    # NOTE: an earlier revision also scraped ".grade-box dl" (level, visits,
    # points, rank) into 个人信息2.txt; that code was disabled and is not
    # reproduced here.

    _write_articles(account, first_page_html, os.path.join(root, '博客.txt'))


if __name__ == '__main__':
    csdn()