parent 7902f0f68d
commit 493200e67e

@@ -0,0 +1,117 @@
import requests
import os
from pyquery import PyQuery as pq
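
# Scrapes a CSDN user's profile statistics and the article list of every
# blog page with requests + pyquery, saving the results to local text files.
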
def csdn():
    page_num = 1

    account = input('Enter CSDN id: ')
    # account = "ygdxt"
    # Home page of the user's blog
    baseUrl = 'http://blog.csdn.net/' + account
    # Append the page number to form the URL of the page to crawl
    myUrl = baseUrl + '/article/list/' + str(page_num)

    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

    # Build the request and fetch the page
    myPage = requests.get(myUrl, headers=headers).text

    doc = pq(myPage)

    # Profile statistics from the page sidebar
    data_info = doc("aside .data-info dl").items()
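    # Note: the indices tested below (0, 3, 6, 8, 9) assume the field order
    # of CSDN's profile sidebar at the time of writing; re-check the mapping
    # if the page layout changes.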

    root = "C://Users//86138//Desktop//个人信息//csdn//"
    path = root + '个人信息.txt'

    if not os.path.exists(root):  # create the root directory if it does not exist
        os.makedirs(root)  # makedirs also creates missing parent directories
    with open(path, 'w', encoding='utf-8') as fp:
        # Pre-set the fields so the summary below never hits an unbound name
        str1 = str2 = str3 = str4 = str5 = ''
        for i, item in enumerate(data_info):
            if i == 0:
                str1 = "原创:" + item.attr("title")  # original posts
            if i == 6:
                str2 = "粉丝:" + item.attr("title")  # followers
            if i == 3:
                str3 = "访问:" + item.attr("title")  # page views
            if i == 8:
                str4 = "评论:" + item.attr("title")  # comments
            if i == 9:
                str5 = "收藏:" + item.attr("title")  # favorites
        str5 = str1 + '\n' + str2 + '\n' + str3 + '\n' + str4 + '\n' + str5
        fp.write(str5)
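
    # A layout-independent alternative (sketch only, not used above) would
    # match on each <dt> label instead of a positional index, assuming the
    # usual <dl><dt>label</dt><dd title="value">…</dd></dl> structure:
    #     for item in doc("aside .data-info dl").items():
    #         label = item("dt").text()
    #         if "原创" in label:
    #             str1 = "原创:" + item.attr("title")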

    # Disabled variant that read the level/points box (.grade-box) instead:
    # root = "C://Users//86138//Desktop//个人信息//csdn//"
    # path = root + '个人信息2.txt'
    #
    # if not os.path.exists(root):  # create the root directory if it does not exist
    #     os.mkdir(root)
    # grade_box = doc(".grade-box dl").items()
    # with open(path, 'w') as f:
    #     for i, item in enumerate(grade_box):
    #         if i == 0:
    #             childitem = item("dd > a")
    #             str1 = "等级:" + childitem.attr("title")[0:2]  # level
    #         if i == 1:
    #             childitem = item("dd")
    #             str2 = "访问:" + childitem.attr("title")  # page views
    #         if i == 2:
    #             childitem = item("dd")
    #             str3 = "积分:" + childitem.attr("title")  # points
    #         if i == 3:
    #             str4 = "排名:" + item.attr("title")  # rank
    #     str5 = str1 + str2 + str3 + str4
    #     f.write(str5)

    # Collect the article information from every list page
    root = "C://Users//86138//Desktop//个人信息//csdn//"
    path = root + '博客.txt'

    if not os.path.exists(root):  # create the root directory if it does not exist
        os.makedirs(root)
    with open(path, 'w', encoding='utf-8') as fp:
        fp.write('')  # truncate any output left over from a previous run

    prev_page_len = 0  # length of the previously fetched page
    with open(path, 'a', encoding='utf-8') as f:
        while True:
            # Home page of the user's blog
            baseUrl = 'http://blog.csdn.net/' + account
            # Append the page number to form the URL of the page to crawl
            myUrl = baseUrl + '/article/list/' + str(page_num)
            # Build the request and fetch the page
            myPage = requests.get(myUrl, headers=headers).text
            # Stop once the page length stops changing: pages past the last
            # one render the same, so an unchanged length marks the end
            if len(myPage) == prev_page_len:
                break
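
            # A sturdier stop condition (sketch only) would parse first and
            # break once a page yields no articles:
            #     if not pq(myPage)(".article-list > div"):
            #         break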

            # Page separator ("第 %d 页" = "Page %d")
            f.write('-----------------------------第 %d 页---------------------------------\n' % (page_num,))

            doc = pq(myPage)
            articles = doc(".article-list > div").items()
            articleList = []
            for i, item in enumerate(articles):
                if i == 0:  # skip the list's header element
                    continue
                title = item("h4 > a").text()[2:]  # strip the leading 2-char tag (e.g. "原创")
                date = item("p > .date").text()
                num_item = item("p > .read-num").items()
                article = [date, title]
                for j, jitem in enumerate(num_item):
                    if j == 0:
                        read_num = jitem.text()  # first .read-num is the view count
                        article.append(read_num)
                    else:
                        comment_num = jitem.text()  # second is the comment count
                        article.append(comment_num)
                articleList.append(article)
            for item in articleList:
                if len(item) == 4:
                    f.write("%s %s %s %s\n" % (item[0], item[1], item[2], item[3]))
                    print("%s %s %s %s\n" % (item[0], item[1], item[2], item[3]))
            page_num = page_num + 1
            prev_page_len = len(myPage)

if __name__ == '__main__':
    csdn()
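
# Example session (sketch; the file name is assumed, and "ygdxt" is the
# sample id already present above):
#   $ python csdn.py
#   Enter CSDN id: ygdxt
# Results are written to 个人信息.txt and 博客.txt under the hard-coded root.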