You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
pk9wo7ype 24cc2f7279
Update README.md
2 years ago
README.md Update README.md 2 years ago

README.md

doubanmovie

import requests
from bs4 import BeautifulSoup
import pandas as pd

# 1. Download the HTML of all 10 list pages.
# The list is paginated through the `start` query parameter in steps of 25:
# start = 0, 25, ..., 225.
page_indexs = range(0, 250, 25)
list(page_indexs)  # notebook-style display of the offsets


def download_all_htmls():
    """Download the HTML of every Top 250 list page for later analysis.

    Iterates over the module-level ``page_indexs`` offsets and fetches one
    page per offset.

    Returns:
        list[str]: The response body of each page, in ``page_indexs`` order.

    Raises:
        requests.exceptions.HTTPError: If a page answers with a 4xx/5xx
            status, so an error page is never handed to the parser.
        requests.exceptions.Timeout: If the server does not answer within
            the timeout.
    """
    # The headers are identical for every request, so build them once.
    # NOTE(review): presumably the browser-like User-Agent is needed because
    # the site rejects the default client UA -- confirm.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
    }
    htmls = []
    for idx in page_indexs:
        url = f"https://movie.douban.com/top250?start={idx}&filter="
        print("craw html:", url)
        # Without a timeout a stalled connection would block forever.
        r = requests.get(url, headers=headers, timeout=30)
        # Fail here with a clear HTTP error instead of later inside the
        # parser with an opaque AttributeError on a missing tag.
        r.raise_for_status()
        htmls.append(r.text)
    return htmls

# Run the crawl and keep the raw HTML of every page.
htmls = download_all_htmls()

# 2. Parse the HTML to extract the data.
import time  # kept for compatibility; the parser itself no longer sleeps


def parse_single_html(html):
    """Parse one Top 250 list page into a list of movie records.

    Args:
        html: The HTML text of a single list page.

    Returns:
        list[dict]: One dict per movie with the string-valued keys
        ``rank``, ``title``, ``rating_star``, ``rating_num`` and
        ``comments``.

    Raises:
        AttributeError: If an expected tag is missing from the page
            (``find`` returned ``None``).
        IndexError: If a movie's star block has fewer than four ``span``
            elements.
    """
    soup = BeautifulSoup(html, 'html.parser')
    article_items = (
        soup.find("div", class_="article")
            .find("ol", class_="grid_view")
            .find_all("div", class_="item")
    )

    # Parsing is purely in-memory, so no delay is needed here; the former
    # per-page and per-movie sleep(5) calls only slowed the run down.
    datas = []
    for article_item in article_items:
        rank = article_item.find("div", class_="pic").find("em").get_text()
        info = article_item.find("div", class_="info")
        title = info.find("div", class_="hd").find("span", class_="title").get_text()
        stars = (
            info.find("div", class_="bd")
                .find("div", class_="star")
                .find_all("span")
        )
        # Span 0 carries the star level in its first CSS class, span 1 the
        # numeric score and span 3 the comment-count text.
        rating_star = stars[0]["class"][0]
        rating_num = stars[1].get_text()
        comments = stars[3].get_text()

        datas.append({
            "rank": rank,
            "title": title,
            # Strip the "rating" prefix and "-t" suffix from the class name.
            "rating_star": rating_star.replace("rating", "").replace("-t", ""),
            "rating_num": rating_num,
            # Drop the trailing "人评价" label, keeping only the count text.
            "comments": comments.replace("人评价", ""),
        })
    return datas

import pprint

# Peek at the raw HTML of the first page (notebook-style display).
htmls[0]

# Parse the first page as a sanity check; this can take a while to finish.
pprint.pprint(parse_single_html(htmls[0]))

# Parse every page and collect all movie records into one flat list.
all_datas = []
for html in htmls:
    all_datas.extend(parse_single_html(html))
all_datas  # notebook-style display of the records
len(all_datas)  # notebook-style display of the record count

# 3. Save the results to Excel and CSV.
df = pd.DataFrame(all_datas)
df  # notebook-style display of the table

df.to_excel("豆瓣电影TOP250.xlsx")
# 'utf_8_sig' prefixes the file with a UTF-8 byte-order mark.
# NOTE(review): presumably so spreadsheet tools detect the encoding -- confirm.
df.to_csv('./top250.csv', encoding='utf_8_sig')

# Keep only the rows whose star level is exactly '5'. Note that this rebinds
# `all_datas` from the list of records to a filtered DataFrame.
all_datas = df[df['rating_star'] == '5']
# `display` is not defined in this file; it is assumed to be the
# notebook-provided helper.
display(all_datas)

# 4. Box plot of "rating_num".
# A box plot shows how the data is distributed: median, quartiles, maximum,
# minimum and outliers. Drawing it for the Top 250 ratings shows how the
# scores are spread and whether any outliers exist.
import seaborn as sns
import matplotlib.pyplot as plt

# Convert the ratings to a numeric type.
df['rating_num'] = df['rating_num'].astype(float)

# Draw the box plot.
plt.figure(figsize=(10, 6))
sns.boxplot(y='rating_num', data=df, color='skyblue')
plt.ylabel('Rating')
plt.title('Boxplot of Ratings for Top 250 Movies on Douban')
plt.show()

# 5. Line chart of the "comments" counts.
import matplotlib.pyplot as plt
import matplotlib

# Use "SimHei" as the global font so Chinese text can be rendered.
matplotlib.rcParams['font.family'] = 'SimHei'

# Convert the comment counts to integers; a '万' (ten-thousand) suffix is
# expanded to four zeros first.
# NOTE(review): a fractional value such as "1.2万" would not survive this
# conversion -- confirm such values never occur.
df['comments_num'] = df['comments'].str.replace('万', '0000').astype(int)

# Sort by rank. The scraped ranks are strings, so compare them as integers;
# a plain string sort would order them "1", "10", "100", "101", ...
df_sorted = df.sort_values(by='rank', key=lambda ranks: ranks.astype(int))

plt.figure(figsize=(40, 20))  # large canvas; zoom in manually to read it
plt.plot(df_sorted['title'], df_sorted['comments_num'], marker='o', color='skyblue', linestyle='-')
plt.xlabel('Movie Title')
plt.ylabel('Comments Count')
plt.title('Comments Count of Top 250 Movies on Douban')
plt.xticks(rotation=90)  # rotate the x tick labels to avoid overlap
plt.grid(axis='y', linestyle='--', alpha=0.7)  # horizontal grid lines
plt.tight_layout()  # adjust the layout automatically to avoid overlap
plt.show()

# 6. Horizontal bar chart of "rating_num".
import matplotlib.pyplot as plt
import matplotlib

# Use "SimHei" as the global font so Chinese text can be rendered.
matplotlib.rcParams['font.family'] = 'SimHei'

# Convert the ratings to a numeric type.
df['rating_num'] = df['rating_num'].astype(float)

# Sort by rating, highest first.
df_sorted = df.sort_values(by='rating_num', ascending=False)

# Draw the horizontal bar chart.
plt.figure(figsize=(15, 40))  # tall canvas: one bar per movie
plt.barh(df_sorted['title'], df_sorted['rating_num'], color='skyblue')
plt.xlabel('Rating')
plt.ylabel('Movie Title')
plt.title('Rating Distribution of Top 250 Movies on Douban')
plt.gca().invert_yaxis()  # flip the y axis so the highest-rated movies are on top
plt.tight_layout()  # adjust the layout automatically to avoid overlap
plt.show()

# 7. Draw several subplots and save the figure.
import matplotlib.pyplot as plt
import matplotlib

# Use "SimHei" as the global font so Chinese text can be rendered.
matplotlib.rcParams['font.family'] = 'SimHei'

# Create a canvas with two stacked subplots.
# NOTE(review): `df_sorted` and its 'comments_num' column are not created in
# this section; they must already exist from the earlier sections.
fig, axs = plt.subplots(2, 1, figsize=(15, 30))

# Subplot 1: horizontal bar chart of the ratings.
axs[0].barh(df_sorted['title'], df_sorted['rating_num'], color='skyblue')
axs[0].set_xlabel('Rating')
axs[0].set_ylabel('Movie Title')
axs[0].set_title('Rating Distribution of Top 250 Movies on Douban')
axs[0].invert_yaxis()  # flip the y axis so the first rows are drawn on top

# Subplot 2: line chart of the comment counts.
axs[1].plot(df_sorted['title'], df_sorted['comments_num'], marker='o', color='skyblue', linestyle='-')
axs[1].set_xlabel('Movie Title')
axs[1].set_ylabel('Comments Count')
axs[1].set_title('Comments Count of Top 250 Movies on Douban')
axs[1].tick_params(axis='x', rotation=90)  # rotate the x tick labels to avoid overlap
plt.tight_layout()  # adjust the layout automatically to avoid overlap
plt.subplots_adjust(hspace=0.5)  # spacing between the two subplots

# Save the figure to a file before showing it.
plt.savefig('top_movies_analysis.png')
plt.show()

# 8. Pie chart of "rating_num".
import matplotlib.pyplot as plt

# Count how many movies share each rating value.
rating_counts = df['rating_num'].value_counts()

# Draw the pie chart.
plt.figure(figsize=(10, 10))
plt.pie(
    rating_counts,
    labels=rating_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=plt.cm.tab20.colors,
)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title('Rating Distribution of Top 250 Movies on Douban')
plt.show()

# 9. Dual line chart of comment count and rating for the 10 most-commented movies.
import matplotlib.pyplot as plt

# Sort by comment count, highest first, and keep the first 10 rows.
df_sorted_top10 = df.sort_values(by='comments_num', ascending=False).head(10)

# Create the canvas. The two series differ by orders of magnitude, so the
# rating gets its own y axis on the right; on a shared axis it would be
# drawn as a flat line near zero.
fig, ax_comments = plt.subplots(figsize=(15, 8))
ax_rating = ax_comments.twinx()

# Draw both curves.
comments_line, = ax_comments.plot(
    df_sorted_top10['title'], df_sorted_top10['comments_num'],
    marker='o', color='skyblue', label='Comments Count',
)
rating_line, = ax_rating.plot(
    df_sorted_top10['title'], df_sorted_top10['rating_num'],
    marker='o', color='orange', label='Rating Count',
)

# Add labels, title and a single legend covering both axes.
ax_comments.set_xlabel('Movie Title')
ax_comments.set_ylabel('Count')
ax_rating.set_ylabel('Rating')
ax_comments.set_title('Top 10 Movies: Comments Count vs Rating Count')
ax_comments.tick_params(axis='x', rotation=45)
ax_comments.legend(handles=[comments_line, rating_line])
fig.tight_layout()
plt.show()

# Generate a word cloud from the Douban Top 250 movie titles.
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd

# Load the previously saved Douban Top 250 data.
df = pd.read_excel("豆瓣电影TOP250.xlsx")

# Segment every title into words. Titles are coerced to str first, because a
# purely numeric title is read back from Excel as a number and cannot be
# segmented.
titles = df['title'].astype(str).tolist()
words = []
for title in titles:
    words.extend(jieba.lcut(title))

# Join the segmented words into one space-separated string.
text = " ".join(words)

# Build the word cloud.
# NOTE(review): "simsun.ttc" is resolved as a font file path; presumably it
# is available on the machine running this -- confirm.
wordcloud = WordCloud(font_path="simsun.ttc", width=800, height=400, background_color='white').generate(text)

# Show the word cloud.
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# Save the word cloud as an image.
wordcloud.to_file("movie_title_wordcloud.png")