"""Count actor appearances in a Douban movie CSV export and chart the top ten.

The script reads the CSV, splits the slash-separated cast column ('主演') into
individual actor names, counts how often each name occurs, draws a donut chart
of the ten most frequent actors, and prints the same ranking to stdout.
"""
import numpy as np
import pandas as pd

# Location of the exported Douban movie data read by main().
CSV_PATH = '/mnt/豆瓣电影_20250510_173909.csv'


def count_top_actors(cast_column, top_n=10):
    """Return the most frequently appearing actors in a cast column.

    Args:
        cast_column: pandas Series whose entries are slash-separated actor
            names (e.g. ``"A / B / C"``). Missing entries are ignored.
        top_n: Maximum number of actors to return.

    Returns:
        DataFrame with columns ``'actor'`` and ``'count'``, sorted by count
        in descending order and limited to ``top_n`` rows. Actors with equal
        counts keep alphabetical order.
    """
    # Drop missing casts up front instead of turning them into '' entries,
    # which would otherwise be counted as an actor with an empty name.
    names = cast_column.dropna().astype(str).str.split('/').explode().str.strip()

    # Splitting "A / " or an empty cell leaves blank fragments; discard them.
    names = names[(names != '').to_numpy()]

    # np.unique returns the distinct names (sorted) and how often each occurs.
    unique_actors, counts = np.unique(names.to_numpy(dtype=object), return_counts=True)

    actor_df = pd.DataFrame({'actor': unique_actors, 'count': counts})
    # A stable sort keeps ties in alphabetical order, so the cut-off at top_n
    # is deterministic between runs.
    ranked = actor_df.sort_values('count', ascending=False, kind='mergesort')
    return ranked.head(top_n).reset_index(drop=True)


def plot_top_actors(top_ten):
    """Show a donut chart of each actor's share within the given ranking.

    Args:
        top_ten: DataFrame with ``'actor'`` and ``'count'`` columns, as
            returned by ``count_top_actors``.
    """
    if top_ten.empty:
        # Nothing to draw; avoid opening an empty figure window.
        return

    # Imported here so the counting helper stays usable where no plotting
    # backend is installed.
    import matplotlib.pyplot as plt

    # Raise the figure resolution.
    plt.rcParams['figure.dpi'] = 300
    # Use a font that contains CJK glyphs so labels and title render.
    plt.rcParams['font.sans-serif'] = ['WenQuanYi Zen Hei']

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.pie(
        top_ten['count'],
        labels=top_ten['actor'],
        autopct='%1.1f%%',
        startangle=90,
        pctdistance=0.85,
    )

    # A white disc over the centre turns the pie into a donut.
    ax.add_artist(plt.Circle((0, 0), 0.70, fc='white'))

    ax.set_title('演员参演数量前十名占比', fontsize=14)

    # Equal axis scaling keeps the pie circular.
    ax.axis('equal')
    fig.tight_layout()
    plt.show()


def main():
    """Load the CSV, chart the ten most frequent actors and print them."""
    df = pd.read_csv(CSV_PATH)
    top_ten = count_top_actors(df['主演'])

    plot_top_actors(top_ten)

    print('演员参演数量前十名:')
    for actor, count in zip(top_ten['actor'], top_ten['count']):
        print(f"{actor}: {count}部")


if __name__ == '__main__':
    main()