爬取csdn个人信息

master
pcgueqlrn 4 years ago
parent 7902f0f68d
commit 493200e67e

@ -0,0 +1,117 @@
import requests
import os
from pyquery import PyQuery as pq
# Default output directory. Kept identical to the original hard-coded location
# so that calling csdn() with no arguments writes to the same place as before.
_DEFAULT_ROOT = "C://Users//86138//Desktop//个人信息//csdn//"

# Request headers sent with every page download.
_HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

# Seconds to wait for the server before giving up, so that a stalled
# connection cannot hang the script forever.
_REQUEST_TIMEOUT = 10

# (label, index) pairs: which "aside .data-info dl" element supplies each line
# of the profile summary, listed in the order the lines are written.
_PROFILE_FIELDS = (
    ("原创:", 0),
    ("粉丝:", 6),
    ("访问:", 3),
    ("评论:", 8),
    ("收藏:", 9),
)


def _fetch_doc(base_url, page_num):
    """Download one article-list page and return it parsed by pyquery."""
    url = base_url + '/article/list/' + str(page_num)
    html = requests.get(url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT).text
    return pq(html)


def _profile_lines(doc):
    """Build the profile summary lines from the sidebar of a list page.

    Each value is the ``title`` attribute of one ``aside .data-info dl``
    element. A missing element or a missing ``title`` yields an empty value
    instead of raising, so a markup change degrades the output rather than
    crashing the run.
    """
    titles = [dl.attr("title") for dl in doc("aside .data-info dl").items()]
    lines = []
    for label, index in _PROFILE_FIELDS:
        value = titles[index] if index < len(titles) else None
        lines.append(label + (value or ""))
    return lines


def _parse_articles(doc):
    """Return one ``[date, title, count, ...]`` list per article entry."""
    rows = []
    for index, entry in enumerate(doc(".article-list > div").items()):
        if index == 0:
            # The first child div is never treated as an article
            # (presumably a non-article placeholder -- TODO confirm against
            # the current page markup).
            continue
        # The first two characters of the link text are dropped (presumably
        # the article-type tag shown before the title -- TODO confirm).
        title = entry("h4 > a").text()[2:]
        date = entry("p > .date").text()
        counts = [span.text() for span in entry("p > .read-num").items()]
        rows.append([date, title] + counts)
    return rows


def csdn(account=None, root=_DEFAULT_ROOT):
    """Scrape a CSDN blog's profile summary and article list into text files.

    Two UTF-8 files are written inside ``root``:

    * ``个人信息.txt`` -- five profile lines taken from the sidebar.
    * ``博客.txt`` -- for every list page a header line, followed by one
      ``date title read-count comment-count`` line per article. Each article
      line is also printed to stdout.

    Pages are requested in increasing order, and scraping stops at the first
    page that contains no article entries.

    Args:
        account: CSDN user id. When ``None`` the id is read from stdin.
        root: Output directory. It is created, including parents, when it
            does not exist yet.

    Raises:
        requests.exceptions.RequestException: if a page cannot be downloaded
            (including a timeout).
        OSError: if the output directory or files cannot be created.
    """
    if account is None:
        account = input('print csdn id:')
    base_url = 'http://blog.csdn.net/' + account

    # Page 1 supplies both the profile sidebar and the first batch of
    # articles, so it is downloaded only once.
    page_num = 1
    doc = _fetch_doc(base_url, page_num)

    if root:
        os.makedirs(root, exist_ok=True)

    # An explicit encoding keeps the Chinese text (and any unusual characters
    # in article titles) writable regardless of the platform's default codec.
    with open(os.path.join(root, '个人信息.txt'), 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(_profile_lines(doc)))

    with open(os.path.join(root, '博客.txt'), 'w', encoding='utf-8') as f:
        while True:
            f.write('-----------------------------第 %d 页---------------------------------\n' % (page_num,))
            rows = _parse_articles(doc)
            for row in rows:
                # Only complete entries (date, title, reads, comments) are
                # reported; anything else is skipped.
                if len(row) == 4:
                    line = "%s %s %s %s\n" % tuple(row)
                    f.write(line)
                    print(line)
            if not rows:
                # A page without article entries marks the end of the blog.
                break
            page_num += 1
            doc = _fetch_doc(base_url, page_num)
# Start the scraper only when this file is run as a script, not when imported.
if __name__ == "__main__":
    csdn()
Loading…
Cancel
Save