You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

130 lines
3.9 KiB

import os
from urllib.parse import urlsplit
from urllib.robotparser import RobotFileParser

import numpy as np
import requests
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt
def can_fetch(urlrobots, url):
    """Return whether the site's robots.txt allows any user agent to fetch *url*.

    Args:
        urlrobots: A URL on the site whose robots.txt should be consulted.
            Only its scheme and host are used, so a full page URL (with
            path and query string) is accepted as well as a bare origin.
        url: The URL whose fetch permission is checked for user agent '*'.

    Returns:
        True if fetching *url* is permitted, False otherwise.
    """
    parts = urlsplit(urlrobots)
    if parts.scheme and parts.netloc:
        # robots.txt lives at the root of the host, never below a page path
        # or after a query string.
        robots_url = parts.scheme + "://" + parts.netloc + "/robots.txt"
    else:
        # No recognizable origin: fall back to plain concatenation.
        robots_url = urlrobots + "/robots.txt"

    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch('*', url)
def check_robots(url):
    """Check that *url* may be crawled and currently responds with HTTP 200.

    Prints a message describing the robots.txt outcome.

    Args:
        url: The page URL to check.

    Returns:
        True when robots.txt allows the fetch and a GET of *url* returns
        status 200; False in every other case.
    """
    if not can_fetch(url, url):
        print("Robots协议不允许访问该网站")
        return False

    response = requests.get(url)
    if response.status_code != 200:
        # Allowed by robots.txt but the page is not available; report
        # failure explicitly instead of falling through and returning None.
        return False

    print('Robots协议允许访问该网站')
    return True
def get_pictures(url,path):
    """Download the image at *url* and save it as ``img/<path>``.

    Prints the HTTP status code of the download request.

    Args:
        url: Absolute URL of the image to download.
        path: File name (relative to the ``img/`` directory) to write to.
    """
    headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0'}
    response = requests.get(url, headers=headers)
    print(response.status_code)  # 200 means the request succeeded
    if response.status_code != 200:
        # Do not store an error page under an image file name.
        return

    # Make sure the output directory exists before opening the file.
    os.makedirs('img', exist_ok=True)
    with open('img/'+path, 'wb') as f:  # 'wb': image data is binary
        f.write(response.content)
def get_pictures_urls(text, limit=9):
    """Extract image URLs from ``img src="..."`` attributes in *text*.

    Only values whose characters 20-24 spell ``image`` are kept; every
    other ``src`` value is skipped.

    Args:
        text: The HTML source to scan.
        limit: Maximum number of URLs to return (default 9).

    Returns:
        A list of matching ``src`` values in document order, at most
        *limit* long. Values are returned exactly as written in *text*.
    """
    marker = 'img src="'
    urls = []
    pos = 0
    while len(urls) < limit:
        start = text.find(marker, pos)
        if start == -1:
            break
        start += len(marker)
        end = text.find('"', start)
        if end == -1:
            # Unterminated attribute: nothing further can be extracted.
            break
        candidate = text[start:end]
        if candidate[20:25] == 'image':
            urls.append(candidate)
        # Resume scanning just past the closing quote.
        pos = end + 1
    return urls
def get_pictures_names(text, limit=9):
    """Extract the values of ``title="..."`` attributes from *text*.

    Args:
        text: The HTML source to scan.
        limit: Maximum number of titles to return (default 9).

    Returns:
        A list of title values in document order, at most *limit* long.
        Values are returned exactly as written in *text*.
    """
    marker = 'title="'
    names = []
    pos = 0
    while len(names) < limit:
        start = text.find(marker, pos)
        if start == -1:
            break
        start += len(marker)
        end = text.find('"', start)
        if end == -1:
            # Unterminated attribute: nothing further can be extracted.
            break
        names.append(text[start:end])
        # Resume scanning just past the closing quote.
        pos = end + 1
    return names
# Ranking page to scrape and the browser-like User-Agent sent with the page request.
url='https://www.iqiyi.com/ranks1PCW/home?v=12.3.15170&deviceId=76f0b3e%E2%80%A6'
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0'}
if check_robots(url):
    response = requests.get(url, headers=headers)
    response.encoding = "utf-8"
    # Image URLs found in the raw page source.
    urls = get_pictures_urls(response.text)
    soup = BeautifulSoup(response.text, 'lxml')  # alternative parser: html.parser
    all_top = soup.findAll(attrs={'class': 'rvi__index__num'})
    all_top1 = str(soup.findAll(attrs={'class': 'rvi__tit1'}))
    names = get_pictures_names(all_top1)

    # Both the downloaded images and the chart are written below img/.
    os.makedirs('img', exist_ok=True)

    # Download the images in bulk. zip() pairs each image with its title and
    # stops at the shorter list, so a missing title cannot raise IndexError.
    for rank, (image_src, name) in enumerate(zip(urls, names), start=1):
        get_pictures('https:' + image_src, '榜' + str(rank) + name + '.jpg')

    # Plot settings
    plt.rcParams['font.sans-serif'] = ['SimHei']  # SimHei font so Chinese labels render
    plt.rcParams['axes.unicode_minus'] = False  # do not use the Unicode minus sign
    plt.rcParams['figure.dpi'] = 200  # resolution of 200 dpi
    plt.rcParams['figure.figsize'] = (5, 3)  # figure size (5, 3)

    # Bar labels (titles) and bar heights (first nine 'rvi__index__num' values).
    rankname = list(names)
    hotdegree = [int(tag.string) for tag in all_top[:9]]
    # plt.bar needs equally long sequences; keep only the entries both have.
    count = min(len(rankname), len(hotdegree))
    rankname = rankname[:count]
    hotdegree = hotdegree[:count]

    # Numeric x positions, one bar every 8 units
    x = np.arange(count) * 8
    width = 4
    # Position of each bar
    rank_x = x
    # Draw the bars
    plt.bar(rank_x, hotdegree, width=width, color="red", label="热度值")
    # Label the x axis with the titles
    plt.xticks(x, labels=rankname, fontsize=4)
    # Print each bar's value on top of it
    for bar_x, value in zip(rank_x, hotdegree):
        plt.text(bar_x, value, str(value), va="bottom", ha="center", fontsize=8)
    # Show the legend
    plt.legend(loc="best")
    plt.savefig('img/'+'Top9.png')