|
|
2 years ago | |
|---|---|---|
| README.md | 2 years ago | |
README.md
doubanmovie
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# 1. Download the HTML of all 10 list pages.
# The list is paged through the "start" query parameter in steps of 25.
page_indexs = range(0, 250, 25)
list(page_indexs)


def download_all_htmls(timeout=10, delay=1.0):
    """Download the HTML of every list page for later parsing.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang the crawl forever.
        delay: Seconds to pause between two consecutive requests. Use 0 to
            disable the pause.

    Returns:
        A list with the HTML text of each page, in the order of ``page_indexs``.

    Raises:
        requests.HTTPError: If a page answers with an error status. Failing
            here keeps an error page from being handed to the parser.
        requests.RequestException: On connection problems or timeouts.
    """
    # The headers never change, so build them once instead of once per page.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"}
    htmls = []
    for position, idx in enumerate(page_indexs):
        # Pause between requests (not before the first one) to go easy on the server.
        if position and delay > 0:
            time.sleep(delay)
        url = f"https://movie.douban.com/top250?start={idx}&filter="
        print("craw html:", url)
        r = requests.get(url, headers=headers, timeout=timeout)  # fetch one list page
        r.raise_for_status()
        htmls.append(r.text)  # collect the page HTML
    return htmls
# Run the crawl and keep the HTML of every downloaded page.
htmls = download_all_htmls()
# 2. Parse the HTML to extract the data.
import time


def parse_single_html(html):
    """Parse one list page into a list of movie records.

    Parsing works on HTML that is already in memory, so it does not sleep:
    pausing here only slows the run down without throttling any request.

    Args:
        html: HTML text of a single list page.

    Returns:
        A list of dicts, one per ``div.item`` found on the page, with string
        values under the keys "rank", "title", "rating_star", "rating_num"
        and "comments".

    Raises:
        AttributeError: If an expected element is missing from the page.
        IndexError: If the star block holds fewer ``span`` elements than expected.
    """
    soup = BeautifulSoup(html, 'html.parser')
    article_items = (
        soup.find("div", class_="article")
        .find("ol", class_="grid_view")
        .find_all("div", class_="item")
    )
    datas = []
    for article_item in article_items:
        rank = article_item.find("div", class_="pic").find("em").get_text()
        info = article_item.find("div", class_="info")
        title = info.find("div", class_="hd").find("span", class_="title").get_text()
        stars = (
            info.find("div", class_="bd")
            .find("div", class_="star")
            .find_all("span")
        )
        # The spans are read by position: 0 carries the star CSS class,
        # 1 the numeric rating and 3 the comment count text.
        rating_star = stars[0]["class"][0]
        rating_num = stars[1].get_text()
        comments = stars[3].get_text()
        datas.append({
            "rank": rank,
            "title": title,
            # Strip the "rating" prefix and "-t" suffix from the CSS class name.
            "rating_star": rating_star.replace("rating", "").replace("-t", ""),
            "rating_num": rating_num,
            "comments": comments.replace("人评价", ""),
        })
    return datas
# Look at the first downloaded page and at the records parsed from it.
htmls[0]
import pprint

first_page_records = parse_single_html(htmls[0])
pprint.pprint(first_page_records)

# Parse every page and pool the records into one list (this may take a while).
all_datas = []
for html in htmls:
    all_datas += parse_single_html(html)
all_datas
len(all_datas)
# 3. Save the results to Excel (and to CSV as well).
df = pd.DataFrame(all_datas)
df
df.to_excel("豆瓣电影TOP250.xlsx")
df.to_csv('./top250.csv', encoding='utf_8_sig')

# Keep only the rows whose star level is '5' and show them.
is_five_star = df['rating_star'] == '5'
all_datas = df[is_five_star]
display(all_datas)
# 4. Box plot of "rating_num".
# A box plot shows how the data is distributed: median, quartiles, maximum,
# minimum and outliers. Drawing one from the ratings shows how they are
# spread and whether any outliers exist.
import seaborn as sns
import matplotlib.pyplot as plt

# Convert the ratings to a numeric type.
numeric_ratings = df['rating_num'].astype(float)
df['rating_num'] = numeric_ratings

# Draw the box plot.
box_fig, box_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(y='rating_num', data=df, color='skyblue', ax=box_ax)
box_ax.set_ylabel('Rating')
box_ax.set_title('Boxplot of Ratings for Top 250 Movies on Douban')
plt.show()
# 5. Line chart of the "comments" counts.
import matplotlib.pyplot as plt
import matplotlib

# Use "SimHei" as the global font family for Chinese text.
matplotlib.rcParams['font.family'] = 'SimHei'

# Convert the comment counts to integers.
# NOTE(review): a fractional value such as "1.2万" would not survive this
# replacement-based conversion - confirm the scraped text never has that shape.
df['comments_num'] = df['comments'].str.replace('万', '0000').astype(int)

# Sort by rank. The scraped ranks are text, and sorting text orders them as
# "1", "10", "100", ... - so compare them as integers instead.
df_sorted = df.sort_values(by='rank', key=lambda ranks: ranks.astype(int))

plt.figure(figsize=(40, 20))  # large canvas; zoom in manually to read it
plt.plot(df_sorted['title'], df_sorted['comments_num'], marker='o', color='skyblue', linestyle='-')
plt.xlabel('Movie Title')
plt.ylabel('Comments Count')
plt.title('Comments Count of Top 250 Movies on Douban')
plt.xticks(rotation=90)  # rotate the x labels so they do not overlap
plt.grid(axis='y', linestyle='--', alpha=0.7)  # horizontal grid lines
plt.tight_layout()  # fit everything inside the figure
plt.show()
# 6. Horizontal bar chart of "rating_num".
import matplotlib.pyplot as plt
import matplotlib

# Use "SimHei" as the global font family for Chinese text.
matplotlib.rcParams['font.family'] = 'SimHei'

# Convert the ratings to a numeric type.
df['rating_num'] = df['rating_num'].astype(float)

# Order the movies from the highest rating to the lowest.
df_sorted = df.sort_values(by='rating_num', ascending=False)

# Draw the horizontal bar chart on a tall canvas.
bar_fig, bar_ax = plt.subplots(figsize=(15, 40))
bar_ax.barh(df_sorted['title'], df_sorted['rating_num'], color='skyblue')
bar_ax.set_xlabel('Rating')
bar_ax.set_ylabel('Movie Title')
bar_ax.set_title('Rating Distribution of Top 250 Movies on Douban')
bar_ax.invert_yaxis()  # put the highest-rated movies at the top
bar_fig.tight_layout()  # fit everything inside the figure
plt.show()
# 7. Draw several subplots on one figure and save it.
import matplotlib.pyplot as plt
import matplotlib

# Use "SimHei" as the global font family for Chinese text.
matplotlib.rcParams['font.family'] = 'SimHei'

# One canvas holding two stacked subplots.
fig, axs = plt.subplots(2, 1, figsize=(15, 30))
rating_ax = axs[0]
comments_ax = axs[1]

# Subplot 1: horizontal bar chart of the ratings.
rating_ax.barh(df_sorted['title'], df_sorted['rating_num'], color='skyblue')
rating_ax.set(
    xlabel='Rating',
    ylabel='Movie Title',
    title='Rating Distribution of Top 250 Movies on Douban',
)
rating_ax.invert_yaxis()  # put the first rows of df_sorted at the top

# Subplot 2: line chart of the comment counts.
comments_ax.plot(df_sorted['title'], df_sorted['comments_num'], marker='o', color='skyblue', linestyle='-')
comments_ax.set(
    xlabel='Movie Title',
    ylabel='Comments Count',
    title='Comments Count of Top 250 Movies on Douban',
)
comments_ax.tick_params(axis='x', rotation=90)  # rotate the x labels so they do not overlap

fig.tight_layout()  # fit everything inside the figure
fig.subplots_adjust(hspace=0.5)  # spacing between the two subplots

# Write the figure to disk, then show it.
fig.savefig('top_movies_analysis.png')
plt.show()
# 8. Pie chart of "rating_num".
import matplotlib.pyplot as plt

# Count how many movies share each rating value.
rating_counts = df['rating_num'].value_counts()

# Draw the pie chart.
pie_fig, pie_ax = plt.subplots(figsize=(10, 10))
pie_ax.pie(
    rating_counts,
    labels=rating_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=plt.cm.tab20.colors,
)
pie_ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
pie_ax.set_title('Rating Distribution of Top 250 Movies on Douban')
plt.show()
# 9. Comment count and rating of the 10 most-commented movies, as two lines.
import matplotlib.pyplot as plt

# Sort by comment count and keep the first 10 rows.
df_sorted_top10 = df.sort_values(by='comments_num', ascending=False).head(10)

# Give each series its own y-axis. On a shared axis the ratings would be drawn
# on the scale of the comment counts and collapse into a flat line.
top10_fig, ax_comments = plt.subplots(figsize=(15, 8))
ax_rating = ax_comments.twinx()

# Draw the two lines.
comment_lines = ax_comments.plot(
    df_sorted_top10['title'], df_sorted_top10['comments_num'],
    marker='o', color='skyblue', label='Comments Count',
)
rating_lines = ax_rating.plot(
    df_sorted_top10['title'], df_sorted_top10['rating_num'],
    marker='o', color='orange', label='Rating',
)

# Labels and title. The second series is a rating, not a count.
ax_comments.set_xlabel('Movie Title')
ax_comments.set_ylabel('Comments Count')
ax_rating.set_ylabel('Rating')
ax_comments.set_title('Top 10 Movies: Comments Count vs Rating')
# Rotate the labels on the primary axes; the twin axes do not draw an x-axis.
ax_comments.tick_params(axis='x', rotation=45)

# One legend covering the lines of both axes.
legend_lines = comment_lines + rating_lines
ax_comments.legend(legend_lines, [line.get_label() for line in legend_lines])
top10_fig.tight_layout()
plt.show()
# Build a word cloud from the movie titles.
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd

# Load the data that was saved earlier.
df = pd.read_excel("豆瓣电影TOP250.xlsx")

# Segment every title into words.
titles = df['title'].tolist()
words = []
for title in titles:
    words += jieba.lcut(title)

# Join the words into one space-separated string.
text = " ".join(words)

# Generate the word cloud.
cloud_builder = WordCloud(
    font_path="simsun.ttc",
    width=800,
    height=400,
    background_color='white',
)
wordcloud = cloud_builder.generate(text)

# Show the word cloud.
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# Save the word cloud as an image.
wordcloud.to_file("movie_title_wordcloud.png")