# Reconstructed from a git format-patch e-mail that adds the new file csdn.py:
#   commit 493200e67e15c88cc97d5098406c7ecbc22af54a
#   date   Fri, 28 May 2021 16:22:37 +0800
#   subject "爬取csdn个人信息" (scrape CSDN profile information)
"""Scrape a CSDN blogger's profile counters and article list into text files.

Requires the third-party packages ``requests`` and ``pyquery``.
"""
import os

import requests
from pyquery import PyQuery as pq

BASE_URL = 'http://blog.csdn.net/'
HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled server blocks forever
DEFAULT_ROOT = "C://Users//86138//Desktop//个人信息//csdn//"

# (position inside `aside .data-info dl`, label written to the file), listed
# in the order the lines are written.
PROFILE_FIELDS = (
    (0, "原创"),
    (6, "粉丝"),
    (3, "访问"),
    (8, "评论"),
    (9, "收藏"),
)


def _list_url(account, page_num):
    """Return the article-list URL of *account* for the 1-based *page_num*."""
    return BASE_URL + account + '/article/list/' + str(page_num)


def _fetch(url):
    """Download *url* with the shared headers and return the response text."""
    return requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT).text


def _profile_lines(doc):
    """Return one ``label:value`` line per entry of ``PROFILE_FIELDS``.

    A counter that is absent from the page, or that has no ``title``
    attribute, yields an empty value instead of raising.
    """
    titles = [item.attr("title") for item in doc("aside .data-info dl").items()]
    lines = []
    for index, label in PROFILE_FIELDS:
        value = titles[index] if index < len(titles) else None
        lines.append(label + ":" + (value or ""))
    return lines


def _parse_articles(doc):
    """Return ``[date, title, *read_num_texts]`` for every listed article."""
    articles = []
    for index, item in enumerate(doc(".article-list > div").items()):
        if index == 0:
            # The first child of the list is always skipped; presumably it is
            # a non-article placeholder -- TODO confirm against live markup.
            continue
        # [2:] drops the first two characters of the link text; presumably
        # the article-type badge -- TODO confirm against live markup.
        title = item("h4 > a").text()[2:]
        date = item("p > .date").text()
        article = [date, title]
        article.extend(num.text() for num in item("p > .read-num").items())
        articles.append(article)
    return articles


def csdn(account=None, root=DEFAULT_ROOT):
    """Export a CSDN user's profile counters and article list.

    Writes two UTF-8 text files into *root* (created if missing, including
    parent directories):

    * ``个人信息.txt`` -- the counters named in ``PROFILE_FIELDS``, one per line.
    * ``博客.txt`` -- for each list page, a page header followed by one
      ``date title read_num comment_num`` line per article. Each article line
      is also printed to stdout.

    Args:
        account: CSDN user id. When ``None`` it is read from stdin.
        root: Output directory.

    Paging stops at the first page that is blank, identical to the previous
    page, or contains no articles.

    NOTE: an earlier, disabled export of the ``.grade-box`` block (level,
    visits, points, rank) that lived here as commented-out code was removed.
    """
    if account is None:
        account = input('print csdn id:')

    # makedirs instead of mkdir: the parent directories may not exist yet.
    os.makedirs(root, exist_ok=True)

    # Page 1 feeds both the profile export and the first round of the
    # article loop, so it is downloaded only once.
    first_page = _fetch(_list_url(account, 1))

    # Explicit encoding: the text is Chinese and must not depend on the
    # platform's default codec.
    with open(os.path.join(root, '个人信息.txt'), 'w', encoding='utf-8') as fp:
        fp.write("\n".join(_profile_lines(pq(first_page))))

    with open(os.path.join(root, '博客.txt'), 'w', encoding='utf-8') as f:
        page_num = 1
        page = first_page
        previous_page = None
        while True:
            # A blank or repeated page means the listing is exhausted.
            if not page.strip() or page == previous_page:
                break
            articles = _parse_articles(pq(page))
            if not articles:
                break

            f.write('-----------------------------第 %d 页---------------------------------\n' % (page_num,))
            for article in articles:
                # Only entries with exactly two read-num fields are emitted.
                if len(article) == 4:
                    f.write("%s %s %s %s\n" % tuple(article))
                    print("%s %s %s %s\n" % tuple(article))

            previous_page = page
            page_num += 1
            page = _fetch(_list_url(account, page_num))


if __name__ == '__main__':
    csdn()