ADD file via upload

main
hut22412110237 5 months ago
parent af6353bcc1
commit f90a50fd42

@ -0,0 +1,439 @@
# -*- coding: utf-8 -*-
###############################################################################
####################### 任务实现 #######################
###############################################################################
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
# Configure fonts so that CJK text in figure titles and labels can be
# rendered; the list is tried in order.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans']
# Draw the minus sign as an ASCII hyphen, which the fonts above all provide.
plt.rcParams['axes.unicode_minus'] = False

# Directory that contains this script.
current_dir = os.path.dirname(os.path.abspath(__file__))
print(f"当前文件目录: {current_dir}")

# Outputs are written to <project root>/tmp, where the project root is the
# parent of the script's directory.
project_root = os.path.dirname(current_dir)
tmp_dir = os.path.join(project_root, 'tmp')
os.makedirs(tmp_dir, exist_ok=True)
print(f"项目根目录: {project_root}")
print(f"输出目录: {tmp_dir}")

# Report whether the output directory is present.
if os.path.exists(tmp_dir):
    print(f"✓ 输出目录创建成功: {tmp_dir}")
else:
    print(f"✗ 输出目录创建失败")
###############################################################################
####################### 数据预处理 #######################
###############################################################################
print("=" * 60)
print("开始数据预处理...")
print("=" * 60)

# Listing 7-1: load and clean the data.
data_path = os.path.join(project_root, 'data/air_data.csv')
print(f"数据文件路径: {data_path}")

# Stop early with a readable message when the input CSV is missing.
if not os.path.exists(data_path):
    print(f"错误: 数据文件不存在: {data_path}")
    print("请确保air_data.csv文件在data/目录下")
    # Raise SystemExit directly: the bare exit() helper is injected by the
    # `site` module and is not guaranteed to exist in every interpreter.
    raise SystemExit(1)

airline_data = pd.read_csv(data_path, encoding="gb18030")
print('原始数据的形状为:', airline_data.shape)

# Drop records whose fare revenue is missing in either year.
exp1 = airline_data["SUM_YR_1"].notnull()
exp2 = airline_data["SUM_YR_2"].notnull()
exp = exp1 & exp2
airline_notnull = airline_data.loc[exp, :]
print('删除缺失记录后数据的形状为:', airline_notnull.shape)

# Keep records with a non-zero fare in at least one of the two years that
# also have a positive total mileage and a non-zero average discount.
index1 = airline_notnull['SUM_YR_1'] != 0
index2 = airline_notnull['SUM_YR_2'] != 0
index3 = (airline_notnull['SEG_KM_SUM'] > 0) & (airline_notnull['avg_discount'] != 0)
airline = airline_notnull[(index1 | index2) & index3]
print('删除异常记录后数据的形状为:', airline.shape)

# Listing 7-2: feature construction.
print("\n构建LRFMC特征...")
# The column order matters. After the two date columns are replaced by L, the
# feature matrix must read L, R, F, M, C, because the rest of the script
# unpacks cluster centres as `L, R, F, M, C` and labels plot axes in that
# order:
#   R = LAST_TO_END, F = FLIGHT_COUNT, M = SEG_KM_SUM, C = avg_discount
airline_selection = airline[["FFP_DATE", "LOAD_TIME", "LAST_TO_END", "FLIGHT_COUNT", "SEG_KM_SUM", "avg_discount"]]

# L: membership length in (30-day) months, from join date to the observation
# window end. `.dt.days` reads the day count directly instead of parsing the
# timedelta's string representation.
membership_span = pd.to_datetime(airline_selection["LOAD_TIME"]) - pd.to_datetime(airline_selection["FFP_DATE"])
L = membership_span.dt.days / 30

# Assemble the five features; L replaces the two date columns.
airline_features = pd.concat([L, airline_selection.iloc[:, 2:]], axis=1)
# The unnamed L series gets the integer column label 0; make all labels str.
airline_features.columns = airline_features.columns.astype(str)
print('构建的LRFMC特征前5行为\n', airline_features.head())

# Listing 7-3: standardization (zero mean, unit variance per feature).
print("\n进行数据标准化...")
data = StandardScaler().fit_transform(airline_features)

# Persist the standardized matrix; np.savez stores it under the key 'arr_0'.
airline_scale_path = os.path.join(tmp_dir, 'airline_scale.npz')
np.savez(airline_scale_path, data)
print(f'标准化数据保存到: {airline_scale_path}')
print('标准化后LRFMC五个特征为\n', data[:5, :])
###############################################################################
####################### K-Means聚类分析 #######################
###############################################################################
print("\n" + "=" * 60)
print("开始K-Means聚类分析...")
print("=" * 60)

# Reload the standardized matrix. The archive is opened in a `with` block so
# the underlying file handle is closed as soon as the array has been read.
with np.load(airline_scale_path) as scaled_archive:
    airline_scale = scaled_archive['arr_0']

k = 5  # number of cluster centres

# Build and fit the model. n_init is pinned explicitly because its default
# differs between scikit-learn releases; random_state fixes the seeding.
kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=123)
fit_kmeans = kmeans_model.fit(airline_scale)

# Cluster centres, one row per cluster, in standardized feature space.
cluster_centers = kmeans_model.cluster_centers_
print('聚类中心:\n', cluster_centers)

# Number of samples per cluster. value_counts() orders rows by descending
# count and indexes them by cluster label, so look sizes up by label
# (r1[label]) rather than by position.
r1 = pd.Series(kmeans_model.labels_).value_counts()
print('最终每个类别的数目为:\n', r1)
###############################################################################
####################### 客户分类与业务逻辑 #######################
###############################################################################
def classify_customer(L, R, F, M, C):
    """Map one set of LRFMC values to a customer-value category.

    The thresholds are compared against the raw numbers passed in; the
    script calls this with cluster-centre coordinates. Rules are evaluated
    in order and the first match wins.

    Args:
        L: membership length.
        R: recency (time since the last flight; larger means longer ago).
        F: flight frequency.
        M: total mileage.
        C: average discount rate.

    Returns:
        One of "重要保持客户", "重要发展客户", "重要挽留客户",
        "低价值客户" or "一般客户".
    """
    # Key customers to keep: high frequency, high mileage, recently active.
    if F > 1.0 and M > 1.0 and R < 0:
        return "重要保持客户"
    # Key customers to develop: long-standing members with low activity.
    elif L > 0.5 and F < 0 and M < 0:
        return "重要发展客户"
    # Key customers to win back: long inactive, but above average on
    # frequency or mileage.
    elif R > 1.0 and (F > 0 or M > 0):
        return "重要挽留客户"
    # Low-value customers: below average on every dimension.
    elif L < -0.5 and R > 0 and F < -0.5 and M < -0.5 and C < -0.5:
        return "低价值客户"
    else:
        return "一般客户"
def get_business_advice(customer_type):
    """Return the marketing advice text for a customer category.

    Args:
        customer_type: a category name as produced by classify_customer.

    Returns:
        The advice string for that category, or "常规服务" when the category
        is not one of the known names.
    """
    advice_map = {
        "重要保持客户": "提供VIP服务优先保障个性化关怀",
        "重要发展客户": "推送增值服务,推荐会员升级,定向营销",
        "重要挽留客户": "主动联系,提供专属优惠,防止流失",
        "一般客户": "常规服务,适度营销推送",
        "低价值客户": "基础服务,低成本维护"
    }
    return advice_map.get(customer_type, "常规服务")
###############################################################################
####################### 客户价值分析报告 #######################
###############################################################################
print("\n" + "=" * 60)
print(" 航空公司客户价值分析报告")
print("=" * 60)

total_customers = len(airline_scale)
# Running head-counts for each named category; whatever is left over is
# reported as "general" customers further down.
high_value_customers = 0
retention_customers = 0
development_customers = 0
low_value_customers = 0

print(f"\n总体客户数: {total_customers:,}")
print("\n各客户群详细分析:")
print("-" * 50)

customer_types = []
customer_details = []
for i in range(k):
    # NOTE(review): assumes the columns of cluster_centers are ordered
    # L, R, F, M, C - this has to match the column order of the feature
    # matrix built during preprocessing; verify there.
    L, R, F, M, C = cluster_centers[i]
    customer_type = classify_customer(L, R, F, M, C)
    customer_types.append(customer_type)

    # r1 is indexed by cluster label, so r1[i] is the size of cluster i.
    cluster_size = r1[i]
    percentage = cluster_size / total_customers * 100

    # Accumulate the head-count of the matching category.
    if customer_type == "重要保持客户":
        high_value_customers += cluster_size
    elif customer_type == "重要挽留客户":
        retention_customers += cluster_size
    elif customer_type == "重要发展客户":
        development_customers += cluster_size
    elif customer_type == "低价值客户":
        low_value_customers += cluster_size

    # Keep the per-cluster details together for later use.
    customer_details.append({
        'cluster': i,
        'type': customer_type,
        'count': cluster_size,
        'percentage': percentage,
        'features': [L, R, F, M, C],
        'advice': get_business_advice(customer_type)
    })

    print(f"\n{customer_type} - 客户群{i}")
    print(f" 样本数量: {cluster_size:,} ({percentage:.1f}%)")
    print(f" 特征值: L={L:.3f}, R={R:.3f}, F={F:.3f}, M={M:.3f}, C={C:.3f}")

    # Plain-language reading of the centre: a value above +0.5 gets the
    # first tag, a value below -0.5 the second, anything in between none.
    feature_analysis = []
    for value, high_text, low_text in (
        (L, "长期客户", "新客户"),
        (R, "久未消费", "近期活跃"),
        (F, "高频消费", "低频消费"),
        (M, "高里程", "低里程"),
        (C, "高折扣偏好", "低折扣偏好"),
    ):
        if value > 0.5:
            feature_analysis.append(high_text)
        elif value < -0.5:
            feature_analysis.append(low_text)

    if feature_analysis:
        print(f" 特征解读: {', '.join(feature_analysis)}")
    print(f" 营销建议: {get_business_advice(customer_type)}")

# Overall statistics: customers not counted in any named category.
general_customers = total_customers - high_value_customers - retention_customers - development_customers - low_value_customers

print(f"\n" + "=" * 50)
print("关键业务洞察:")
print("=" * 50)
print(f" • 高价值客户占比: {high_value_customers / total_customers * 100:.1f}%")
print(f" • 挽留客户占比: {retention_customers / total_customers * 100:.1f}%")
print(f" • 发展客户占比: {development_customers / total_customers * 100:.1f}%")
print(f" • 低价值客户占比: {low_value_customers / total_customers * 100:.1f}%")
print(f" • 一般客户占比: {general_customers / total_customers * 100:.1f}%")
print(f"\n业务建议摘要:")
print(f" • 重点维护 {high_value_customers:,} 名高价值客户")
print(f" • 积极挽留 {retention_customers:,} 名有流失风险的客户")
print(f" • 重点发展 {development_customers:,} 名潜力客户")
print(f" • 优化 {low_value_customers:,} 名低价值客户的成本结构")
###############################################################################
####################### 可视化分析 #######################
###############################################################################
print("\n生成可视化分析图表...")

# 1. Composite customer-value figure (2 x 3 grid, four panels used).
fig = plt.figure(figsize=(18, 12))

# 1.1 Pie chart of cluster sizes.
plt.subplot(2, 3, 1)
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FECA57']
labels = [f'{i}\n({customer_types[i]})' for i in range(k)]
# r1 comes from value_counts() and is therefore ordered by descending count,
# not by cluster id. Look sizes up by cluster label so that slice i really
# belongs to label i (and to colors[i]).
cluster_sizes = [r1[i] for i in range(k)]
plt.pie(cluster_sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
plt.title('客户群分布', fontsize=14, fontweight='bold')

# 1.2 Grouped bars comparing the centres of the first (up to) three clusters.
plt.subplot(2, 3, 2)
feature_labels = ['L', 'R', 'F', 'M', 'C']
x = np.arange(len(feature_labels))
width = 0.15
for i in range(min(3, k)):
    plt.bar(x + i * width, cluster_centers[i], width, label=f'客户群{i}', alpha=0.8)
plt.xlabel('特征')
plt.ylabel('标准化值')
plt.title('主要客户群特征对比')
# Centre the tick under the middle bar of each three-bar group.
plt.xticks(x + width, feature_labels)
plt.legend()
plt.grid(True, alpha=0.3)

# 1.3 Value matrix: one bubble per cluster, area scaled by cluster size.
# NOTE(review): assumes centre columns are ordered L, R, F, M, C, so that
# index 3 is M and index 2 is F - verify against the feature construction.
plt.subplot(2, 3, 3)
for i in range(k):
    plt.scatter(cluster_centers[i][3], cluster_centers[i][2],
                s=r1[i] / 50, c=colors[i], label=f'{i}({customer_types[i]})',
                alpha=0.7, edgecolors='black', linewidth=0.5)
plt.xlabel('飞行里程(M)')
plt.ylabel('消费频率(F)')
plt.title('客户价值矩阵\n(气泡大小=客户数量)')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, alpha=0.3)

# 1.4 Head-count per customer-value category.
plt.subplot(2, 3, 4)
customer_categories = ['高价值', '需挽留', '可发展', '低价值', '一般']
customer_counts = [high_value_customers, retention_customers, development_customers,
                   low_value_customers, general_customers]
customer_percentages = [count / total_customers * 100 for count in customer_counts]
bars = plt.bar(customer_categories, customer_counts, color=colors[:5], alpha=0.8)
plt.xlabel('客户类别')
plt.ylabel('客户数量')
plt.title('客户价值类别分布')

# Annotate each bar with its count and share, slightly above the bar top.
for bar, count, percentage in zip(bars, customer_counts, customer_percentages):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2., height + total_customers * 0.01,
             f'{count:,}\n({percentage:.1f}%)', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.savefig(os.path.join(tmp_dir, '客户价值综合分析.png'), dpi=300, bbox_inches='tight')
plt.show()
# Release the figure once it has been saved and shown.
plt.close(fig)
# 2. Radar charts: one panel per cluster plus a combined overview.
print("\n生成详细雷达图...")
# NOTE(review): labels assume centre columns are ordered L, R, F, M, C -
# verify against the feature construction.
feature_labels = ['客户关系(L)', '消费间隔(R)', '消费频率(F)', '飞行里程(M)', '折扣率(C)']
n_features = len(feature_labels)
# One spoke per feature; repeat the first angle so each outline closes.
angles = np.linspace(0, 2 * np.pi, n_features, endpoint=False).tolist()
angles += angles[:1]

fig = plt.figure(figsize=(16, 10))

# Panels 1..k: one radar per cluster. The 2 x 3 grid leaves panel 6 free for
# the overview, which fits the k = 5 clusters used by this script.
for i in range(k):
    values = cluster_centers[i].tolist()
    values += values[:1]
    ax = fig.add_subplot(2, 3, i + 1, polar=True)
    ax.plot(angles, values, 'o-', linewidth=2, label=f'客户群{i}', color=colors[i])
    ax.fill(angles, values, alpha=0.25, color=colors[i])
    # First spoke at the top, remaining spokes laid out clockwise.
    ax.set_theta_offset(np.pi / 2)
    ax.set_theta_direction(-1)
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(feature_labels)
    ax.set_title(f'{customer_types[i]}\n(客户群{i}, {r1[i]:,}人)', size=12, pad=20)
    ax.grid(True)

# Panel 6: all clusters overlaid. It uses the same orientation and the same
# per-cluster colours as the individual panels so the two views can be
# compared directly.
ax = fig.add_subplot(2, 3, 6, polar=True)
for i in range(k):
    values = cluster_centers[i].tolist()
    values += values[:1]
    ax.plot(angles, values, 'o-', linewidth=1, label=f'{i}-{customer_types[i]}',
            alpha=0.7, color=colors[i])
ax.set_theta_offset(np.pi / 2)
ax.set_theta_direction(-1)
ax.set_xticks(angles[:-1])
ax.set_xticklabels(feature_labels)
ax.set_title('所有客户群特征对比', size=12, pad=20)
ax.legend(bbox_to_anchor=(1.3, 1.0))
ax.grid(True)

plt.tight_layout()
plt.savefig(os.path.join(tmp_dir, '客户分群雷达图.png'), dpi=300, bbox_inches='tight')
plt.show()
# Release the figure once it has been saved and shown.
plt.close(fig)
###############################################################################
####################### 模型评价与优化 #######################
###############################################################################
print("\n" + "=" * 60)
print("开始模型评价...")
print("=" * 60)

# Internal clustering-quality metrics for the fitted model.
print("评估当前聚类模型...")
sil_score = silhouette_score(airline_scale, kmeans_model.labels_)
ch_score = calinski_harabasz_score(airline_scale, kmeans_model.labels_)
db_score = davies_bouldin_score(airline_scale, kmeans_model.labels_)
print('=' * 50)
print('聚类模型评估结果:')
print(f'轮廓系数: {sil_score:.4f}')
print(f'Calinski-Harabasz指数: {ch_score:.4f}')
print(f'Davies-Bouldin指数: {db_score:.4f}')
print('=' * 50)

# Single radar chart with every cluster centre overlaid.
print("生成模型评价雷达图...")
# NOTE(review): assumes centre columns are ordered L, R, F, M, C - verify
# against the feature construction.
features = ['L', 'R', 'F', 'M', 'C']
# One spoke per feature; repeat the first angle so each outline closes.
angles = np.linspace(0, 2*np.pi, len(features), endpoint=False)
angles = np.concatenate((angles, [angles[0]]))
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, polar=True)
colors = ['red', 'blue', 'green', 'orange', 'purple']
for i, center in enumerate(cluster_centers):
    values = np.concatenate((center, [center[0]]))
    # Prefix the cluster id: several clusters can share one customer type,
    # which would otherwise give indistinguishable legend entries.
    ax.plot(angles, values, 'o-', linewidth=2, label=f'{i}-{customer_types[i]}', color=colors[i])
    ax.fill(angles, values, alpha=0.1, color=colors[i])
ax.set_xticks(angles[:-1])
ax.set_xticklabels(features)
ax.set_ylim(-2, 3)
plt.title('航空公司客户分群雷达图', size=16, y=1.05)
plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
plt.savefig(os.path.join(tmp_dir, '客户分群评价雷达图.png'), dpi=300, bbox_inches='tight')
plt.show()
plt.close()
print("✓ 模型评价雷达图保存完成")
###############################################################################
####################### 最终报告 #######################
###############################################################################
# Closing summary: totals, generated files, metric values and a short recap.
print("\n" + "=" * 70)
print(" 分析完成!")
print("=" * 70)
print(f"✓ 已成功分析 {total_customers:,} 名航空客户")
print(f"✓ 生成 {k} 个客户分群")
print(f"✓ 创建可视化图表保存至 tmp/ 文件夹")
print(f"✓ 识别出 {high_value_customers:,} 名高价值客户需重点维护")
print(f"✓ 发现 {retention_customers:,} 名客户有流失风险需积极挽留")

print(f"\n生成的可视化文件:")
print(" • tmp/客户价值综合分析.png - 客户分布和特征分析")
print(" • tmp/客户分群雷达图.png - 详细特征雷达图")
print(" • tmp/客户分群评价雷达图.png - 模型评价雷达图")

print(f"\n模型评估指标:")
print(f" • 轮廓系数: {sil_score:.4f} (越接近1越好)")
print(f" • Calinski-Harabasz指数: {ch_score:.4f} (越大越好)")
print(f" • Davies-Bouldin指数: {db_score:.4f} (越小越好)")

print(f"\n项目总结:")
print(f" 数据预处理: 原始{airline_data.shape[0]:,}条 → 清洗后{airline.shape[0]:,}")
print(f" 特征工程: 构建LRFMC五个关键特征")
# Report the actual cluster count instead of a hard-coded 5, so this line
# always agrees with the "生成 {k} 个客户分群" line above.
print(f" 聚类分析: 使用K-Means算法将客户分为{k}个群体")
print(f" 业务应用: 为不同客户群体制定精准营销策略")
print("\n" + "=" * 70)
Loading…
Cancel
Save