import os

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

from matplotlib import pyplot as plt
import numpy as np


def can_fetch(urlrobots, url):
    """Check whether the target site's robots.txt allows fetching url."""
    # robots.txt lives at the site root, so build its URL from scheme + host
    # instead of appending "/robots.txt" to the full page URL.
    parsed = urlparse(urlrobots)
    rp = RobotFileParser()
    rp.set_url(parsed.scheme + '://' + parsed.netloc + '/robots.txt')
    rp.read()
    return rp.can_fetch('*', url)


def check_robots(url):
    """Return True if robots.txt allows crawling url and the page responds with 200."""
    if can_fetch(url, url):
        response = requests.get(url)
        if response.status_code == 200:
            print('robots.txt allows crawling this site')
            return True
        return False  # allowed by robots.txt, but the page did not return 200
    else:
        print('robots.txt does not allow crawling this site')
        return False


def get_pictures(url, path):
    """Download one image and save it under the img/ directory."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0'}
    resp = requests.get(url, headers=headers)
    print(resp.status_code)  # 200 means the request succeeded
    os.makedirs('img', exist_ok=True)  # make sure the output directory exists
    with open('img/' + path, 'wb') as f:  # 'wb': write the raw image bytes to disk
        f.write(resp.content)

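# Example of the call shape (the URL and filename here are hypothetical,
# for illustration only):
#   get_pictures('https://example.com/poster.jpg', 'sample.jpg')
# would save the downloaded image as img/sample.jpg.
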
def get_pictures_urls(text):
    """Scan the raw HTML for img src="..." attributes and return up to 9 image URLs."""
    st = 'img src="'
    m = len(st)
    i = 0
    n = len(text)
    urls = []  # collected image URLs
    while i < n:
        if len(urls) == 9:  # only the top 9 entries are needed
            break
        if text[i:i+m] == st:
            url = ''
            for j in range(i+m, n):
                if text[j] == '"':  # the closing quote ends the URL
                    i = j
                    if url[20:25] == 'image':  # keep only links whose path reads 'image' at this offset
                        urls.append(url)
                    break
                url += text[j]
        i += 1
    return urls


def get_pictures_names(text):
    """Scan the raw HTML for title="..." attributes and return up to 9 titles."""
    st = 'title="'
    m = len(st)
    i = 0
    n = len(text)
    names = []  # collected titles
    while i < n:
        if len(names) == 9:  # only the top 9 entries are needed
            break
        if text[i:i+m] == st:
            name = ''
            for j in range(i+m, n):
                if text[j] == '"':  # the closing quote ends the title
                    i = j
                    names.append(name)
                    break
                name += text[j]
        i += 1
    return names

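# A minimal alternative sketch, not used below: if each ranking entry exposed its
# poster URL and title on the same <img> tag, BeautifulSoup could replace the manual
# string scanning in the two helpers above. The helper name and the assumption that
# src and title sit on the same tag are illustrative only; on the page scraped here,
# titles actually live in separate 'rvi__tit1' nodes.
def get_urls_and_names_bs(html, limit=9):
    soup = BeautifulSoup(html, 'lxml')
    urls, names = [], []
    for img in soup.find_all('img'):
        src = img.get('src', '')      # poster link (protocol-relative, matching the 'https:' prefix added below)
        title = img.get('title', '')  # show title, if present on the tag
        if 'image' in src and title:
            urls.append(src)
            names.append(title)
        if len(urls) == limit:
            break
    return urls, names

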
url = 'https://www.iqiyi.com/ranks1PCW/home?v=12.3.15170&deviceId=76f0b3e%E2%80%A6'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0'}

if check_robots(url):
    resp = requests.get(url, headers=headers)
    resp.encoding = "utf-8"
    urls = get_pictures_urls(resp.text)  # image URLs found on the current page

    # print(resp.text)

    soup = BeautifulSoup(resp.text, 'lxml')  # 'html.parser' also works if lxml is unavailable
    all_top = soup.find_all(attrs={'class': 'rvi__index__num'})
    all_top1 = str(soup.find_all(attrs={'class': 'rvi__tit1'}))
    names = get_pictures_names(all_top1)

    for i in range(len(urls)):  # download the ranked images in a batch
        url = 'https:' + urls[i]  # the scraped links are protocol-relative
        path = 'rank' + str(i + 1) + names[i] + '.jpg'
        get_pictures(url, path)


# Plot settings
plt.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei font so Chinese labels render
plt.rcParams['axes.unicode_minus'] = False    # render the minus sign correctly with this font
plt.rcParams['figure.dpi'] = 200              # resolution: 200 dpi
plt.rcParams['figure.figsize'] = (5, 3)       # figure size: 5 x 3 inches

# Collect the rank names and popularity values
rankname = []
hotdegree = []

for i in names:
    rankname.append(i)

for i in all_top:
    hotdegree.append(int(i.string))
    if len(hotdegree) == 9:  # keep only the top 9, matching the number of names
        break

# Convert the x positions to numeric values
x = np.arange(len(rankname)) * 8
width = 4

# Starting x coordinate of each bar
rank_x = x

# Draw the bar chart
plt.bar(rank_x, hotdegree, width=width, color="red", label="Popularity")

# Label the x axis ticks with the show names
plt.xticks(x, labels=rankname, fontsize=4)

# Print each bar's value above it
for i in range(len(rankname)):
    plt.text(rank_x[i], hotdegree[i], hotdegree[i], va="bottom", ha="center", fontsize=8)

# Show the legend and save the chart
plt.legend(loc="best")
plt.savefig('img/' + 'Top9.png')