You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
a/网络爬虫爬取热门动画电影.py

43 lines
1.6 KiB

import re
import requests
import pandas as pd
import matplotlib.pyplot as mp
# Scrape the animated-movie listing pages of manmankan.com, collect
# (title, director, region, release date) for each movie, and draw a pie
# chart of how the movies are distributed across regions.

# Suffixes appended to "index" to form the URL of each listing page.
PAGE_SUFFIXES = ('', '_2', '_3', '_4')
URL_TEMPLATE = 'http://www.manmankan.com/dy2013/dianying/donghua/index{}.shtml'
# A browser-like User-Agent is sent with every request.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62'}

# Column names of the resulting table: title, director, region, release date.
COLUMNS = ['电影名称', '导演', '地区', '上映时间']
# Regions that get their own pie slice; every other region is pooled together.
MAJOR_REGIONS = ['大陆', '美国', '日本']
OTHER_LABEL = '其他'

# Compiled once at import time instead of once per downloaded page.
# re.S lets ".*?" span line breaks inside one movie entry.
MOVIE_PATTERN = re.compile(
    '<dd style="display:none.*?<div class="title">(?P<name>.*?)</di'
    '.*?演:<span class="name">(?P<director>.*?)</span></d'
    '.*?电影">(?P<country>.*?)</a><a'
    '.*?上映:<span class="year">(?P<time>.*?)</spa',
    re.S,
)


def parse_movies(html):
    """Extract movie records from the HTML of one listing page.

    Args:
        html: Page source as text.

    Returns:
        A list of ``[name, director, country, time]`` lists, one per entry
        matched by ``MOVIE_PATTERN``, in document order. Empty if nothing
        matches.
    """
    return [
        [m.group('name'), m.group('director'), m.group('country'), m.group('time')]
        for m in MOVIE_PATTERN.finditer(html)
    ]


def fetch_movies(timeout=10):
    """Download every listing page and return all parsed movie records.

    Args:
        timeout: Seconds to wait for each HTTP request; without a timeout
            ``requests.get`` may block indefinitely on a stalled connection.

    Returns:
        The concatenated ``parse_movies`` results of all pages.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    rows = []
    for suffix in PAGE_SUFFIXES:
        response = requests.get(URL_TEMPLATE.format(suffix), headers=HEADERS, timeout=timeout)
        # Force UTF-8 decoding of the body rather than relying on detection.
        response.encoding = 'utf-8'
        rows.extend(parse_movies(response.text))
    return rows


def build_frame(rows) -> pd.DataFrame:
    """Build the movie table from a list of 4-item records.

    The frame is built in a single constructor call: ``DataFrame.append``
    was removed in pandas 2.0 and copied the whole frame on every row.
    """
    return pd.DataFrame(rows, columns=COLUMNS)


def count_by_region(df: pd.DataFrame):
    """Count movies per region.

    Returns:
        A list of four ints: one count for each entry of ``MAJOR_REGIONS``
        (in that order) followed by the count of all remaining regions.
    """
    region = df['地区']
    counts = [int((region == name).sum()) for name in MAJOR_REGIONS]
    counts.append(int((~region.isin(MAJOR_REGIONS)).sum()))
    return counts


def plot_region_share(counts):
    """Show a pie chart of the region counts from ``count_by_region``."""
    # SimHei is selected so the Chinese title and labels can be rendered.
    mp.rcParams['font.sans-serif'] = ['SimHei']
    # NOTE(review): figsize is in inches, so (100, 100) is a very large
    # canvas; kept as in the original script - confirm it is intended.
    mp.figure('fig', figsize=(100, 100))
    mp.title('热门动画电影中各地区电影占比')
    mp.pie(
        counts,
        labels=MAJOR_REGIONS + [OTHER_LABEL],
        explode=[0, 0, 0, 0],
        colors=['red', 'green', 'purple', 'orange'],
        autopct='%.1f%%',
    )
    mp.show()


def main():
    """Scrape the listing pages and plot the share of movies per region."""
    df = build_frame(fetch_movies())
    if df.empty:
        # All-zero counts cannot be turned into meaningful pie slices.
        raise SystemExit('No movies were scraped; the page layout may have changed.')
    plot_region_share(count_by_region(df))


if __name__ == '__main__':
    main()