forked from pcgueqlrn/InfoSpider
parent
7902f0f68d
commit
493200e67e
@ -0,0 +1,117 @@
|
||||
import os

import requests
from pyquery import PyQuery as pq
||||
def csdn(account=None, root="C://Users//86138//Desktop//个人信息//csdn//"):
    """Scrape a CSDN user's profile statistics and blog article list.

    Writes two files under *root*:
      - 个人信息.txt: originals / fans / views / comments / favourites
        counters taken from the profile sidebar.
      - 博客.txt: one line per article (date, title, read count,
        comment count), grouped by listing page.

    Args:
        account: CSDN user id. When None (the default, matching the old
            behaviour) the id is read interactively from stdin.
        root: output directory; created (with parents) if missing.
    """
    if account is None:
        account = str(input('print csdn id:'))

    base_url = 'http://blog.csdn.net/' + account
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

    # makedirs(..., exist_ok=True) also creates missing parents; the original
    # os.mkdir raised when the parent path did not exist.
    os.makedirs(root, exist_ok=True)

    # ---- profile statistics (sidebar of the first listing page) ----------
    first_page = requests.get(base_url + '/article/list/1',
                              headers=headers).text
    doc = pq(first_page)

    # The sidebar <dl> entries carry their counts in the title attribute and
    # are identified purely by position.  Pre-fill with '?' placeholders so a
    # shorter-than-expected sidebar no longer raises NameError (the original
    # crashed when any of str1..str5 was left unassigned).
    labels = {0: '原创:', 6: '粉丝:', 3: '访问:', 8: '评论:', 9: '收藏:'}
    stats = {idx: label + '?' for idx, label in labels.items()}
    for i, item in enumerate(doc("aside .data-info dl").items()):
        if i in labels and item.attr("title") is not None:
            stats[i] = labels[i] + item.attr("title")

    # encoding='utf-8' so Chinese text is writable regardless of the
    # platform's default codec.
    with open(root + '个人信息.txt', 'w', encoding='utf-8') as fp:
        # Original output order: 原创, 粉丝, 访问, 评论, 收藏.
        fp.write('\n'.join(stats[idx] for idx in (0, 6, 3, 8, 9)))

    # ---- blog article list, page by page ---------------------------------
    # A single 'w' open replaces the original truncate-then-reopen-append.
    with open(root + '博客.txt', 'w', encoding='utf-8') as f:
        page_num = 1
        prev_page_len = None  # length of the previous page's HTML
        while True:
            my_url = base_url + '/article/list/' + str(page_num)
            my_page = requests.get(my_url, headers=headers).text
            # CSDN serves an identical page past the last one; stop when the
            # response size stops changing.  (The original compared against
            # a leftover loop index on the first iteration — a latent
            # NameError when the sidebar was empty.)
            if len(my_page) == prev_page_len:
                break

            f.write('-----------------------------第 %d 页---------------------------------\n' % (page_num,))

            doc = pq(my_page)
            for i, item in enumerate(doc(".article-list > div").items()):
                if i == 0:
                    # First div of the listing is a header, not an article.
                    continue
                title = item("h4 > a").text()[2:]
                date = item("p > .date").text()
                article = [date, title]
                # Two .read-num spans are expected: read count then
                # comment count.
                for num_item in item("p > .read-num").items():
                    article.append(num_item.text())
                # Only complete records (date, title, reads, comments) are
                # persisted, matching the original len == 4 filter.
                if len(article) == 4:
                    f.write("%s %s %s %s\n" % tuple(article))
                    print("%s %s %s %s\n" % tuple(article))

            page_num = page_num + 1
            prev_page_len = len(my_page)
# Script entry point: run the scraper when executed directly.
if __name__ == "__main__":
    csdn()
Loading…
Reference in new issue