# Climb_the_hot_list/新建文本文档.txt

import os
from urllib.parse import urlsplit
from urllib.robotparser import RobotFileParser

import numpy as np
import requests
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt

def can_fetch(urlrobots, url):
    """Return whether the site's robots.txt lets any crawler ('*') fetch ``url``.

    Args:
        urlrobots: Any URL on the target site. Only its scheme and host are
            used, because robots.txt is served from the site root; a path or
            query string on this argument is ignored.
        url: The URL whose access permission is being checked.

    Returns:
        The result of ``RobotFileParser.can_fetch('*', url)`` after
        downloading and parsing the site's robots.txt.
    """
    parts = urlsplit(urlrobots)
    if parts.scheme and parts.netloc:
        # Appending "/robots.txt" to a full page URL (path + query) would
        # request a non-existent resource, and the resulting 404 makes
        # RobotFileParser treat everything as allowed.
        robots_url = parts.scheme + "://" + parts.netloc + "/robots.txt"
    else:
        # Not an absolute URL: keep the original concatenation behaviour.
        robots_url = urlrobots.rstrip("/") + "/robots.txt"

    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch('*', url)

def check_robots(url):
    """Check that ``url`` may be crawled and is currently reachable.

    The robots.txt of the site hosting ``url`` is consulted first; if it
    permits access, a plain GET request confirms that the page answers with
    HTTP 200.

    Args:
        url: Absolute URL of the page to crawl.

    Returns:
        True when robots.txt allows access and the page returns status 200,
        False otherwise (a message explaining which check failed is printed).
    """
    # robots.txt lives at the site root, so strip the path and query string
    # before handing the base to can_fetch.
    parts = urlsplit(url)
    if parts.scheme and parts.netloc:
        site_root = parts.scheme + '://' + parts.netloc
    else:
        site_root = url

    if not can_fetch(site_root, url):
        print("Robots协议不允许访问该网站")
        return False

    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url, timeout=10)
    if response.status_code == 200:
        print('Robots协议允许访问该网站')
        return True

    # Previously this case fell through and returned None without any output.
    print('网站访问失败，状态码:', response.status_code)
    return False

def get_pictures(url,path):
    """Download one image and store it under the ``img/`` directory.

    Args:
        url: Absolute URL of the image.
        path: File name (relative to ``img/``) to write the image to.

    The HTTP status code is printed. Nothing is written unless the server
    answers with status 200, so error pages are never saved as images.
    """
    headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0'}
    response = requests.get(url, headers=headers, timeout=10)
    print(response.status_code)  # 200 means the request succeeded
    if response.status_code != 200:
        return

    target = 'img/' + path
    # Create the output directory on first use instead of failing in open().
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with open(target, 'wb') as f:  # 'wb': image data is written as raw bytes
        f.write(response.content)
        
def get_pictures_urls(text, limit=9):
    """Extract image URLs from ``img src="..."`` attributes in raw HTML text.

    The text is scanned left to right for the literal ``img src="``; the
    characters up to the next double quote form a candidate URL. A candidate
    is kept only when its characters 20..24 spell ``image``.
    NOTE(review): that slice presumably targets the ``/image/`` path segment
    of the poster CDN URLs (e.g. ``//pic0.iqiyipic.com/image/...``) -- confirm
    against the live page.

    Args:
        text: HTML source to scan.
        limit: Maximum number of URLs to return (default 9, the original
            fixed cap). ``None`` removes the cap.

    Returns:
        List of matching URL strings in document order.
    """
    marker = 'img src="'
    urls = []
    pos = 0
    while limit is None or len(urls) < limit:
        start = text.find(marker, pos)
        if start == -1:
            break
        start += len(marker)
        end = text.find('"', start)
        if end == -1:
            # No closing quote anywhere after this point, so no later
            # attribute can be complete either (the marker itself ends in '"').
            break
        candidate = text[start:end]
        if candidate[20:25] == 'image':
            urls.append(candidate)
        # Resume right after the closing quote.
        pos = end + 1
    return urls

def get_pictures_names(text, limit=9):
    """Extract the values of ``title="..."`` attributes from raw HTML text.

    The text is scanned left to right for the literal ``title="``; the
    characters up to the next double quote are taken as one title. Empty
    titles are kept.

    Args:
        text: HTML source to scan.
        limit: Maximum number of titles to return (default 9, the original
            fixed cap). ``None`` removes the cap.

    Returns:
        List of title strings in document order.
    """
    marker = 'title="'
    names = []
    pos = 0
    while limit is None or len(names) < limit:
        start = text.find(marker, pos)
        if start == -1:
            break
        start += len(marker)
        end = text.find('"', start)
        if end == -1:
            # No closing quote anywhere after this point, so no later
            # attribute can be complete either (the marker itself ends in '"').
            break
        names.append(text[start:end])
        # Resume right after the closing quote.
        pos = end + 1
    return names

# Page to scrape and the browser-like User-Agent sent with the request.
url='https://www.iqiyi.com/ranks1PCW/home?v=12.3.15170&deviceId=76f0b3e%E2%80%A6'
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0'}

if check_robots(url):
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = "utf-8"
    urls = get_pictures_urls(response.text)  # image URLs found on the page

    soup = BeautifulSoup(response.text, 'lxml')  # 'html.parser' also works
    all_top = soup.find_all(attrs={'class': 'rvi__index__num'})
    all_top1 = str(soup.find_all(attrs={'class': 'rvi__tit1'}))
    names = get_pictures_names(all_top1)

    # Download the images. zip() stops at the shorter list, so a page with
    # fewer titles than images no longer raises IndexError.
    for rank, (img_url, name) in enumerate(zip(urls, names), start=1):
        get_pictures('https:' + img_url, '榜' + str(rank) + name + '.jpg')

    # Plot settings
    plt.rcParams['font.sans-serif'] = ['SimHei']    # font able to render Chinese labels
    plt.rcParams['axes.unicode_minus'] = False      # render the minus sign correctly with this font
    plt.rcParams['figure.dpi'] = 200                # resolution: 200 dpi
    plt.rcParams['figure.figsize'] = (5,3)          # figure size (5, 3)

    # Names and numeric values for the chart (at most the first 9 entries).
    rankname = list(names)
    hotdegree = [int(tag.get_text(strip=True)) for tag in all_top[:9]]

    # plt.bar requires equally long sequences; keep only the entries that
    # have both a name and a value.
    count = min(len(rankname), len(hotdegree))
    rankname = rankname[:count]
    hotdegree = hotdegree[:count]

    # Numeric x positions for the bars
    x = np.arange(count)*8
    width = 4
    rank_x = x

    # Draw the bars
    plt.bar(rank_x,hotdegree,width=width,color="red",label="热度值")

    # Label each bar position with its title
    plt.xticks(x,labels=rankname,fontsize=4)

    # Print each bar's value above it
    for bar_x, value in zip(rank_x, hotdegree):
        plt.text(bar_x, value, str(value), va="bottom", ha="center", fontsize=8)

    # Show the legend and save the chart
    plt.legend(loc="best")
    os.makedirs('img', exist_ok=True)  # savefig fails if the directory is missing
    plt.savefig('img/'+'Top9.png')