"""Plot the ten most frequently credited actors in a Douban movie export.

The script reads a CSV export, counts how many films each actor in the
lead-actor column appears in, draws a donut chart of the ten most frequent
actors and prints the same ranking to stdout.
"""

import numpy as np
import pandas as pd

CSV_PATH = '/mnt/豆瓣电影_20250510_173909.csv'
ACTOR_COLUMN = '主演'
ACTOR_SEPARATOR = ' / '
TOP_N = 10


def count_actor_appearances(df, column=ACTOR_COLUMN):
    """Return a DataFrame with one row per actor and their film count.

    Args:
        df: DataFrame holding a column of separator-joined actor names.
        column: Name of the column that lists the actors of each film.

    Returns:
        DataFrame with columns ``actor`` and ``count``, ordered by actor name
        (the order produced by ``np.unique``). The input frame is not
        modified.
    """
    # Missing cells become '' so the string accessor can split every row.
    names = df[column].fillna('').str.split(ACTOR_SEPARATOR).explode().str.strip()
    # Films without a cast would otherwise contribute a blank "actor" that
    # gets counted and can end up in the chart.
    names = names[names != '']

    # Count the distinct actors with NumPy.
    unique_actors, counts = np.unique(names.to_numpy(), return_counts=True)
    return pd.DataFrame({'actor': unique_actors, 'count': counts})


def top_actors(actor_df, n=TOP_N):
    """Return the ``n`` rows of ``actor_df`` with the highest ``count``.

    Ties are broken by actor name so the selection is reproducible.
    """
    ranked = actor_df.sort_values(['count', 'actor'], ascending=[False, True])
    return ranked.head(n)


def plot_top_actors(top):
    """Draw a donut chart showing each actor's share among the given rows."""
    # Imported here so the counting helpers stay importable on machines
    # without a plotting backend.
    import matplotlib.pyplot as plt

    # Render at high resolution.
    plt.rcParams['figure.dpi'] = 300
    # Use a font that contains CJK glyphs for the labels and the title.
    plt.rcParams['font.sans-serif'] = ['WenQuanYi Zen Hei']

    plt.figure(figsize=(10, 8))
    plt.pie(
        top['count'],
        labels=top['actor'],
        autopct='%1.1f%%',
        startangle=90,
        pctdistance=0.85,
    )

    # A white disc over the centre turns the pie into a ring.
    centre_circle = plt.Circle((0, 0), 0.70, fc='white')
    plt.gcf().gca().add_artist(centre_circle)

    plt.title('演员参演数量前十名占比', fontsize=14)
    # Equal aspect ratio keeps the pie circular.
    plt.axis('equal')
    plt.tight_layout()
    plt.show()


def main():
    """Load the export, chart the top actors and print the ranking."""
    df = pd.read_csv(CSV_PATH)
    top_ten = top_actors(count_actor_appearances(df))

    if top_ten.empty:
        # Nothing to chart; a pie of zero wedges is not meaningful.
        print('没有可统计的演员数据')
        return

    plot_top_actors(top_ten)

    print('演员参演数量前十名:')
    for actor, count in zip(top_ten['actor'], top_ten['count']):
        print(f"{actor}: {count}部")


if __name__ == "__main__":
    main()