|
|
|
@ -0,0 +1,100 @@
|
|
|
|
|
import urllib.request
|
|
|
|
|
import os
|
|
|
|
|
import random
|
|
|
|
|
import requests
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
def url_open(url):
    """Fetch ``url`` and return the raw response body.

    A proxy address is picked at random from a fixed list and registered for
    the ``http`` scheme on a new opener, which is then installed as the
    default ``urllib.request`` opener before the request is made.  Only the
    ``http`` scheme is mapped to the proxy.

    Args:
        url: Address to fetch.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
    """
    # NOTE: no custom User-Agent header is attached to the request.
    req = urllib.request.Request(url)

    proxies = ['60.195.206.86:80', '124.205.155.156:9090', '124.192.219.1:80']
    proxy = random.choice(proxies)

    proxy_support = urllib.request.ProxyHandler({'http': proxy})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)

    # Open the prepared Request (it was previously built and then ignored)
    # and make sure the response is closed once the body has been read.
    with urllib.request.urlopen(req) as response:
        return response.read()
|
|
|
|
|
|
|
|
|
|
def find_picture(url):
    """Return the ``.jpg`` link targets found in the page at ``url``.

    The page is downloaded with ``url_open``, decoded as UTF-8 and searched
    for ``href=`` attributes whose value contains ``.jpg``.  Each result is
    cut off right after the first ``.jpg``, so anything following the
    extension (for example a query string) is dropped.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of link strings in document order, duplicates included.  The
        list is empty when no ``href`` value contains ``.jpg``.
    """
    html = url_open(url).decode('utf-8')

    # The value is matched inside a single attribute only: quotes, angle
    # brackets and whitespace end it.  This keeps a ".jpg" that belongs to a
    # later tag from being glued onto an unrelated href, and the optional
    # opening quote means unquoted values do not lose their first character.
    link_pattern = r'href=["\']?([^"\'<>\s]*?\.jpg)'
    return re.findall(link_pattern, html)
|
|
|
|
|
|
|
|
|
|
def save_picture(folder, img_picture,
                 root="C://Users//86138//Desktop//个人信息//bilibili//"):
    """Download images and store them as ``头像.jpg`` inside ``root``.

    Every entry of ``img_picture`` is handled in order.  All entries share
    the same destination file, so once that file exists the remaining
    entries only report that it is already there and nothing is downloaded
    for them.

    Args:
        folder: Not used by this function; kept so existing calls keep
            working.
        img_picture: Iterable of image URLs.
        root: Directory that receives the file.  It is created, including
            missing parent directories, when it does not exist yet.

    Returns:
        None.  Outcomes are reported with ``print``.
    """
    path = os.path.join(root, "头像.jpg")

    for url in img_picture:
        try:
            # makedirs also creates missing parents; mkdir would fail there.
            os.makedirs(root, exist_ok=True)

            if os.path.exists(path):
                print("文件已存在")
                continue

            # Single request per image, issued only when the file is needed.
            response = requests.get(url)
            # Do not store an HTTP error page under an image file name.
            response.raise_for_status()

            with open(path, 'wb') as f:
                f.write(response.content)
        except (requests.RequestException, OSError):
            # Best effort: report the failure and move on to the next URL.
            print("爬取失败")
|
|
|
|
|
|
|
|
|
|
def find_name(url):
    """Return the name shown in the title of the page at ``url``.

    The page is downloaded with ``url_open`` and decoded as UTF-8.  The name
    is the text between the opening ``<title>`` tag and the marker
    ``的个人空间``; the marker is only looked for within 255 characters of
    the start of the tag.

    Args:
        url: Address of the page to read.

    Returns:
        The extracted name, or an empty string when the tag or the marker
        cannot be found.
    """
    html = url_open(url).decode('utf-8')

    title_tag = '<title>'
    start = html.find(title_tag)
    if start == -1:
        # Without the tag the slice below would start at an arbitrary offset.
        return ''

    end = html.find('的个人空间', start, start + 255)
    if end == -1:
        # A -1 end index would silently return almost the whole document.
        return ''

    return html[start + len(title_tag):end]
|
|
|
|
|
|
|
|
|
|
def save_name(name, root="C://Users//86138//Desktop//个人信息//bilibili//"):
    """Write ``name`` to ``姓名.txt`` inside ``root`` unless it already exists.

    Args:
        name: Text to store.
        root: Directory that receives the file.  It is created, including
            missing parent directories, when it does not exist yet.

    Returns:
        None.  Outcomes are reported with ``print``.
    """
    path = os.path.join(root, "姓名.txt")

    try:
        # makedirs also creates missing parents; mkdir would fail there.
        os.makedirs(root, exist_ok=True)

        if os.path.exists(path):
            # An existing file is never overwritten.
            print("文件已存在")
            return

        # Explicit UTF-8 so names outside the platform's default code page
        # can still be written.
        with open(path, 'w', encoding='utf-8') as f:
            f.write(name)
    except OSError:
        print("爬取失败")
|
|
|
|
|
|
|
|
|
|
def downlodabilibili(folder='bilibili'):
    """Ask for a user id, then save the pictures and name from that page.

    The id read from standard input is appended to
    ``https://space.bilibili.com/``.  The ``.jpg`` links found on that page
    are passed to ``save_picture`` and the name taken from the page title is
    passed to ``save_name``.

    Args:
        folder: Forwarded unchanged to ``save_picture``.

    Returns:
        None.
    """
    # Strip stray whitespace so it does not end up inside the URL; the local
    # is not called ``id`` to avoid shadowing the builtin.
    user_id = input('请输入个人id号:\n').strip()
    url = "https://space.bilibili.com/" + user_id

    img_picture = find_picture(url)
    save_picture(folder, img_picture)

    img_name = find_name(url)
    save_name(img_name)
|
|
|
|
|
|
|
|
|
|
# Start the interactive download only when the file is run as a script,
# not when it is imported.
if __name__ == "__main__":
    downlodabilibili()
|