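# NOTE: the next lines are page residue from the repository web UI this file
# was copied from; they are kept as comments so the script stays valid Python.
# You can not select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
#
# 60 lines
# 2.2 KiB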

import pandas as pd
from pyecharts.charts import Bar
from pyecharts import options as opts
from pyecharts.globals import ThemeType
def _parse_price(raw: pd.Series) -> pd.Series:
    """Convert price strings (e.g. with a currency sign or thousands commas) to floats.

    Every character that is not a digit or a decimal point is dropped, so the
    exact currency symbol used in the data does not matter. Missing values
    stay NaN; a value with no digits at all raises ValueError in ``astype``.
    """
    return raw.str.replace(r'[^\d.]', '', regex=True).astype(float)


def _parse_comment_count(raw: pd.Series) -> pd.Series:
    """Convert comment-count strings such as '5000+' or '1.5万+' to floats.

    The numeric part is multiplied by 10,000 when it is followed by '万'.
    Missing values and strings without any number become NaN.

    NOTE(review): the previous code replaced an empty-string pattern with
    '0000', which inserts '0000' between every character. The unit character
    '万' is inferred from that '0000' replacement -- confirm against JD.csv.
    """
    parts = raw.str.replace(',', '', regex=False).str.extract(
        r'(?P<num>\d+(?:\.\d+)?)\s*(?P<unit>万)?'
    )
    # Scale by the unit instead of appending zeros, so '1.5万' is 15000, not 1.50000.
    scale = parts['unit'].notna().map({True: 10_000.0, False: 1.0})
    # Counts are whole numbers; round() removes float noise from the scaling.
    return (parts['num'].astype(float) * scale).round()


# Read both columns as text so the .str accessor works even when every value
# in the CSV happens to look numeric.
df = pd.read_csv('./JD.csv', encoding='gbk', dtype={'价格': str, '评论数': str})
df['价格'] = _parse_price(df['价格'])
df['评论数'] = _parse_comment_count(df['评论数'])
# Summary statistics of the numeric columns, printed for a quick sanity check.
describe = df.describe()
print(f"==>> describe: {describe}")
# Price bucket edges and their display labels (one label per adjacent pair of edges).
# The last edge is infinity so that the '1000以上' bucket really is open-ended;
# with a finite top edge, prices above it fall outside every bin, become NaN
# and silently disappear from the counts below.
bins = [0, 100, 300, 500, 800, 1000, float('inf')]
labels = ['100以下', '100-300', '300-500', '500-800', '800-1000', '1000以上']
# Assign every price to one of the six buckets; include_lowest keeps a price of exactly 0.
df['价格区间'] = pd.cut(df['价格'], bins=bins, labels=labels, include_lowest=True)
# Number of products per bucket; sort=False so the result is not reordered by frequency.
df_price_count = df['价格区间'].value_counts(sort=False)
print(f"==>> df_price_count: {df_price_count}")
# Bar chart of the number of products in each price bucket, written to an HTML file.
price_labels = df_price_count.index.tolist()
price_counts = df_price_count.values.tolist()

hist = Bar(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
hist.add_xaxis(price_labels)
hist.add_yaxis('价格区间', price_counts)
hist.set_global_opts(title_opts=opts.TitleOpts(title='价格区间分布直方图'))
hist.render('./价格区间分布直方图.html')
# Rank rows by comment count (highest first), keep one row per distinct count,
# and take the top ten.
ranked = df.sort_values(by='评论数', ascending=False)
df_sorted = ranked.drop_duplicates(subset='评论数').head(10)

# Both lists are fed in reverse (smallest first) -- presumably so that the
# largest count ends up at the top once the axes are swapped.
# To label the bars by shop instead of product, use df_sorted['店铺'] here.
product_names = df_sorted['商品'].tolist()[::-1]
comment_counts = df_sorted['评论数'].values.tolist()[::-1]

chart_size = opts.InitOpts(theme=ThemeType.LIGHT, width='1200px', height='800px')
# Smaller, tilted axis labels for the y axis.
category_axis = opts.AxisOpts(
    axislabel_opts=opts.LabelOpts(font_size=10, rotate=60)
)

# Horizontal bar chart: swap the axes and put each value label to the right of its bar.
bar_reverse = Bar(init_opts=chart_size)
bar_reverse.add_xaxis(product_names)
bar_reverse.add_yaxis('评论数', comment_counts)
bar_reverse.reversal_axis()
bar_reverse.set_series_opts(label_opts=opts.LabelOpts(position='right'))
bar_reverse.set_global_opts(yaxis_opts=category_axis)
bar_reverse.render('./评论数前十条直方图.html')