ADD file via upload

5 months ago · f90a50fd42
parent af6353bcc1
commit f90a50fd42
1 changed files with 439 additions and 0 deletions
--- a/航空公司客户价值分析_完整版.py
+++ b/航空公司客户价值分析_完整版.py
@@ -0,0 +1,439 @@
+# -*- coding: utf-8 -*-
+
+###############################################################################
+#######################           任务实现             #######################
+###############################################################################
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import os
+from sklearn.preprocessing import StandardScaler
+from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
+
+# Font setup: list CJK-capable fonts first and disable the Unicode minus
+# glyph so that negative tick labels still render with those fonts.
+for rc_key, rc_value in (
+    ('font.sans-serif', ['SimHei', 'Microsoft YaHei', 'DejaVu Sans']),
+    ('axes.unicode_minus', False),
+):
+    plt.rcParams[rc_key] = rc_value
+
+# Directory that holds this script.
+current_dir = os.path.dirname(os.path.abspath(__file__))
+print(f"当前文件目录: {current_dir}")
+
+# Outputs are written to a `tmp` directory located in the parent of the
+# script directory (treated as the project root).
+project_root = os.path.dirname(current_dir)
+tmp_dir = os.path.join(project_root, 'tmp')
+os.makedirs(tmp_dir, exist_ok=True)
+print(f"项目根目录: {project_root}")
+print(f"输出目录: {tmp_dir}")
+
+# Report whether the output directory is present after the creation attempt.
+tmp_dir_status = (
+    f"✓ 输出目录创建成功: {tmp_dir}"
+    if os.path.exists(tmp_dir)
+    else "✗ 输出目录创建失败"
+)
+print(tmp_dir_status)
+
+###############################################################################
+#######################       数据预处理              #######################
+###############################################################################
+
+print("=" * 60)
+print("开始数据预处理...")
+print("=" * 60)
+
+# Listing 7-1: load and clean the raw data.
+data_path = os.path.join(project_root, 'data', 'air_data.csv')
+print(f"数据文件路径: {data_path}")
+
+# Abort early with a readable message when the input file is missing.
+if not os.path.exists(data_path):
+    print(f"错误: 数据文件不存在: {data_path}")
+    print("请确保air_data.csv文件在data/目录下")
+    # The bare `exit()` helper is installed by the `site` module for
+    # interactive sessions and may be absent (e.g. `python -S`, frozen
+    # builds); raising SystemExit is the equivalent that always works.
+    raise SystemExit(1)
+
+airline_data = pd.read_csv(data_path, encoding="gb18030")
+print('原始数据的形状为：', airline_data.shape)
+
+# Drop rows in which either yearly fare total is missing.
+exp1 = airline_data["SUM_YR_1"].notnull()
+exp2 = airline_data["SUM_YR_2"].notnull()
+exp = exp1 & exp2
+airline_notnull = airline_data.loc[exp, :]
+print('删除缺失记录后数据的形状为：', airline_notnull.shape)
+
+# Keep a row only when at least one yearly fare total is non-zero AND the
+# total distance is positive AND the average discount is non-zero.
+index1 = airline_notnull['SUM_YR_1'] != 0
+index2 = airline_notnull['SUM_YR_2'] != 0
+index3 = (airline_notnull['SEG_KM_SUM'] > 0) & (airline_notnull['avg_discount'] != 0)
+airline = airline_notnull[(index1 | index2) & index3]
+print('删除异常记录后数据的形状为：', airline.shape)
+
+# Listing 7-2: feature construction.
+print("\n构建LRFMC特征...")
+# Column order matters: the later sections of this script read each row of
+# the scaled matrix / each cluster centre positionally as L, R, F, M, C.
+# The source columns are therefore selected in exactly that order:
+# LAST_TO_END (R), FLIGHT_COUNT (F), SEG_KM_SUM (M), avg_discount (C).
+airline_selection = airline[["FFP_DATE", "LOAD_TIME", "LAST_TO_END", "FLIGHT_COUNT", "SEG_KM_SUM", "avg_discount"]]
+
+# L: time between joining (FFP_DATE) and the observation date (LOAD_TIME),
+# expressed in 30-day units. `.dt.days` reads the whole-day count directly
+# instead of parsing the timedelta's string representation.
+L = pd.to_datetime(airline_selection["LOAD_TIME"]) - pd.to_datetime(airline_selection["FFP_DATE"])
+L = L.dt.days / 30
+
+# Assemble the five features and give them self-describing names.
+airline_features = pd.concat([L, airline_selection.iloc[:, 2:]], axis=1)
+airline_features.columns = ['L', 'R', 'F', 'M', 'C']
+print('构建的LRFMC特征前5行为：\n', airline_features.head())
+
+# Listing 7-3: standardization (zero mean, unit variance per column).
+print("\n进行数据标准化...")
+data = StandardScaler().fit_transform(airline_features)
+
+# Persist the standardized matrix; `np.savez` stores a positional array
+# under the key 'arr_0'.
+airline_scale_path = os.path.join(tmp_dir, 'airline_scale.npz')
+np.savez(airline_scale_path, data)
+print(f'标准化数据保存到: {airline_scale_path}')
+print('标准化后LRFMC五个特征为：\n', data[:5, :])
+
+###############################################################################
+#######################       K-Means聚类分析         #######################
+###############################################################################
+
+print("\n" + "=" * 60)
+print("开始K-Means聚类分析...")
+print("=" * 60)
+
+# Reload the standardized matrix written by the preprocessing step. The
+# archive is opened in a `with` block so its file handle is released as soon
+# as the array has been read into memory.
+with np.load(airline_scale_path) as scaled_archive:
+    airline_scale = scaled_archive['arr_0']
+k = 5  # number of clusters
+
+# Fit the model. `n_init` is pinned because its default differs between
+# scikit-learn releases, which would otherwise change the result for the
+# same `random_state`.
+kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=123)
+fit_kmeans = kmeans_model.fit(airline_scale)
+
+# Cluster centres: one row per cluster, one column per standardized feature.
+cluster_centers = kmeans_model.cluster_centers_
+print('聚类中心：\n', cluster_centers)
+
+# Number of samples per cluster. The Series is indexed by cluster label and
+# ordered by descending count, so look sizes up by label (r1[label]), not by
+# position.
+r1 = pd.Series(kmeans_model.labels_).value_counts()
+print('最终每个类别的数目为：\n', r1)
+
+###############################################################################
+#######################       客户分类与业务逻辑       #######################
+###############################################################################
+
+def classify_customer(L, R, F, M, C):
+    """
+    Map one set of L, R, F, M, C values to a customer-segment label.
+
+    The thresholds are compared against the values as given; the caller in
+    this script passes standardized cluster-centre coordinates. Rules are
+    checked from top to bottom and the first match wins; anything that
+    matches no rule is labelled "一般客户".
+    """
+    # Key customers to keep: high frequency, high mileage, recent activity.
+    if F > 1.0 and M > 1.0 and R < 0:
+        return "重要保持客户"
+
+    # Key customers to develop: long membership, below-average F and M.
+    if L > 0.5 and F < 0 and M < 0:
+        return "重要发展客户"
+
+    # Key customers to win back: long since last flight, but F or M is
+    # above average.
+    if R > 1.0 and (F > 0 or M > 0):
+        return "重要挽留客户"
+
+    # Low-value customers: every feature on the unfavourable side.
+    if L < -0.5 and R > 0 and F < -0.5 and M < -0.5 and C < -0.5:
+        return "低价值客户"
+
+    return "一般客户"
+
+def get_business_advice(customer_type):
+    """Return the marketing recommendation for a customer-segment label.
+
+    Labels without a dedicated entry receive the generic "常规服务" text.
+    """
+    recommendations = {
+        "重要保持客户": "提供VIP服务，优先保障，个性化关怀",
+        "重要发展客户": "推送增值服务，推荐会员升级，定向营销",
+        "重要挽留客户": "主动联系，提供专属优惠，防止流失",
+        "一般客户": "常规服务，适度营销推送",
+        "低价值客户": "基础服务，低成本维护",
+    }
+    try:
+        return recommendations[customer_type]
+    except KeyError:
+        return "常规服务"
+
+###############################################################################
+#######################       客户价值分析报告         #######################
+###############################################################################
+
+print("\n" + "=" * 60)
+print("                   航空公司客户价值分析报告")
+print("=" * 60)
+
+total_customers = len(airline_scale)
+
+# Head-count per named segment, filled in while walking the clusters.
+# Clusters labelled "一般客户" are not tracked here; their total is derived
+# after the loop as the remainder.
+segment_totals = {
+    "重要保持客户": 0,
+    "重要挽留客户": 0,
+    "重要发展客户": 0,
+    "低价值客户": 0,
+}
+
+print(f"\n总体客户数: {total_customers:,}")
+print("\n各客户群详细分析:")
+print("-" * 50)
+
+customer_types = []
+customer_details = []
+
+for i in range(k):
+    # Coordinates of this cluster's centre, read positionally as L, R, F, M, C.
+    L, R, F, M, C = cluster_centers[i]
+    customer_type = classify_customer(L, R, F, M, C)
+    customer_types.append(customer_type)
+
+    cluster_size = r1[i]
+    percentage = cluster_size / total_customers * 100
+    advice = get_business_advice(customer_type)
+
+    # Add this cluster to its segment's running total (if it is a tracked one).
+    if customer_type in segment_totals:
+        segment_totals[customer_type] += cluster_size
+
+    # Keep the per-cluster facts for later use.
+    customer_details.append({
+        'cluster': i,
+        'type': customer_type,
+        'count': cluster_size,
+        'percentage': percentage,
+        'features': [L, R, F, M, C],
+        'advice': advice,
+    })
+
+    print(f"\n【{customer_type} - 客户群{i}】")
+    print(f"  样本数量: {cluster_size:,} ({percentage:.1f}%)")
+    print(f"  特征值: L={L:.3f}, R={R:.3f}, F={F:.3f}, M={M:.3f}, C={C:.3f}")
+
+    # Plain-language tags: for each feature, one tag when the centre lies
+    # above +0.5, another when it lies below -0.5, none in between.
+    tag_table = (
+        (L, "长期客户", "新客户"),
+        (R, "久未消费", "近期活跃"),
+        (F, "高频消费", "低频消费"),
+        (M, "高里程", "低里程"),
+        (C, "高折扣偏好", "低折扣偏好"),
+    )
+    feature_analysis = []
+    for centre_value, above_tag, below_tag in tag_table:
+        if centre_value > 0.5:
+            feature_analysis.append(above_tag)
+        elif centre_value < -0.5:
+            feature_analysis.append(below_tag)
+
+    if feature_analysis:
+        print(f"  特征解读: {', '.join(feature_analysis)}")
+
+    print(f"  营销建议: {advice}")
+
+# Segment totals under the names used by the rest of the script.
+high_value_customers = segment_totals["重要保持客户"]
+retention_customers = segment_totals["重要挽留客户"]
+development_customers = segment_totals["重要发展客户"]
+low_value_customers = segment_totals["低价值客户"]
+
+# Everyone not counted in a named segment is a general customer.
+general_customers = total_customers - high_value_customers - retention_customers - development_customers - low_value_customers
+
+print("\n" + "=" * 50)
+print("关键业务洞察:")
+print("=" * 50)
+print(f"  • 高价值客户占比: {high_value_customers / total_customers * 100:.1f}%")
+print(f"  • 挽留客户占比: {retention_customers / total_customers * 100:.1f}%")
+print(f"  • 发展客户占比: {development_customers / total_customers * 100:.1f}%")
+print(f"  • 低价值客户占比: {low_value_customers / total_customers * 100:.1f}%")
+print(f"  • 一般客户占比: {general_customers / total_customers * 100:.1f}%")
+
+print("\n业务建议摘要:")
+print(f"  • 重点维护 {high_value_customers:,} 名高价值客户")
+print(f"  • 积极挽留 {retention_customers:,} 名有流失风险的客户")
+print(f"  • 重点发展 {development_customers:,} 名潜力客户")
+print(f"  • 优化 {low_value_customers:,} 名低价值客户的成本结构")
+
+###############################################################################
+#######################       可视化分析             #######################
+###############################################################################
+
+print("\n生成可视化分析图表...")
+
+# 1. Combined customer-value figure (2 x 3 grid; four panels are drawn).
+fig = plt.figure(figsize=(18, 12))
+
+# 1.1 Pie chart of cluster sizes.
+plt.subplot(2, 3, 1)
+colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FECA57']
+labels = [f'群{i}\n({customer_types[i]})' for i in range(k)]
+# r1 comes from value_counts(), i.e. it is ordered by descending count and
+# indexed by cluster label. Look each size up by label so that wedge i,
+# labels[i] and colors[i] all describe the same cluster.
+cluster_sizes = [r1[i] for i in range(k)]
+plt.pie(cluster_sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
+plt.title('客户群分布', fontsize=14, fontweight='bold')
+
+# 1.2 Grouped bar chart of the centre coordinates.
+plt.subplot(2, 3, 2)
+feature_labels = ['L', 'R', 'F', 'M', 'C']
+x = np.arange(len(feature_labels))
+width = 0.15
+
+# Only the first three clusters (by label) are drawn in this panel.
+for i in range(min(3, k)):
+    plt.bar(x + i * width, cluster_centers[i], width, label=f'客户群{i}', alpha=0.8)
+
+plt.xlabel('特征')
+plt.ylabel('标准化值')
+plt.title('主要客户群特征对比')
+plt.xticks(x + width, feature_labels)
+plt.legend()
+plt.grid(True, alpha=0.3)
+
+# 1.3 Value matrix: centre column 3 (M) on the x axis, column 2 (F) on the
+# y axis, bubble area proportional to the cluster size.
+plt.subplot(2, 3, 3)
+for i in range(k):
+    plt.scatter(cluster_centers[i][3], cluster_centers[i][2],
+                s=r1[i] / 50, c=colors[i], label=f'群{i}({customer_types[i]})',
+                alpha=0.7, edgecolors='black', linewidth=0.5)
+plt.xlabel('飞行里程(M)')
+plt.ylabel('消费频率(F)')
+plt.title('客户价值矩阵\n(气泡大小=客户数量)')
+plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
+plt.grid(True, alpha=0.3)
+
+# 1.4 Head-count per business segment.
+plt.subplot(2, 3, 4)
+customer_categories = ['高价值', '需挽留', '可发展', '低价值', '一般']
+customer_counts = [high_value_customers, retention_customers, development_customers,
+                   low_value_customers, general_customers]
+customer_percentages = [x / total_customers * 100 for x in customer_counts]
+
+bars = plt.bar(customer_categories, customer_counts, color=colors[:5], alpha=0.8)
+plt.xlabel('客户类别')
+plt.ylabel('客户数量')
+plt.title('客户价值类别分布')
+
+# Annotate each bar with its count and share, placed slightly above the bar.
+for bar, count, percentage in zip(bars, customer_counts, customer_percentages):
+    height = bar.get_height()
+    plt.text(bar.get_x() + bar.get_width() / 2., height + total_customers * 0.01,
+             f'{count:,}\n({percentage:.1f}%)', ha='center', va='bottom', fontsize=9)
+
+plt.tight_layout()
+plt.savefig(os.path.join(tmp_dir, '客户价值综合分析.png'), dpi=300, bbox_inches='tight')
+plt.show()
+
+# 2. Radar charts: one panel per cluster plus a combined panel.
+print("\n生成详细雷达图...")
+feature_labels = ['客户关系(L)', '消费间隔(R)', '消费频率(F)', '飞行里程(M)', '折扣率(C)']
+n_features = len(feature_labels)
+
+# One spoke per feature; the first angle is repeated so each polygon closes.
+angles = np.linspace(0, 2 * np.pi, n_features, endpoint=False).tolist()
+angles += angles[:1]
+
+# 2 x 3 grid: slots 1..k hold the per-cluster panels, slot 6 the summary
+# (this layout fits the k = 5 used by this script).
+fig = plt.figure(figsize=(16, 10))
+
+for i in range(k):
+    values = cluster_centers[i].tolist()
+    values += values[:1]
+
+    ax = fig.add_subplot(2, 3, i + 1, polar=True)
+    ax.plot(angles, values, 'o-', linewidth=2, label=f'客户群{i}', color=colors[i])
+    ax.fill(angles, values, alpha=0.25, color=colors[i])
+
+    # First spoke at the top, remaining spokes laid out clockwise.
+    ax.set_theta_offset(np.pi / 2)
+    ax.set_theta_direction(-1)
+    ax.set_xticks(angles[:-1])
+    ax.set_xticklabels(feature_labels)
+    ax.set_title(f'{customer_types[i]}\n(客户群{i}, {r1[i]:,}人)', size=12, pad=20)
+    ax.grid(True)
+
+# Summary radar with all clusters overlaid. It uses the same orientation and
+# the same per-cluster colours as the individual panels so the shapes can be
+# compared directly.
+ax = fig.add_subplot(2, 3, 6, polar=True)
+for i in range(k):
+    values = cluster_centers[i].tolist()
+    values += values[:1]
+    ax.plot(angles, values, 'o-', linewidth=1, label=f'群{i}-{customer_types[i]}',
+            alpha=0.7, color=colors[i])
+ax.set_theta_offset(np.pi / 2)
+ax.set_theta_direction(-1)
+ax.set_xticks(angles[:-1])
+ax.set_xticklabels(feature_labels)
+ax.set_title('所有客户群特征对比', size=12, pad=20)
+ax.legend(bbox_to_anchor=(1.3, 1.0))
+ax.grid(True)
+
+plt.tight_layout()
+plt.savefig(os.path.join(tmp_dir, '客户分群雷达图.png'), dpi=300, bbox_inches='tight')
+plt.show()
+
+###############################################################################
+#######################       模型评价与优化         #######################
+###############################################################################
+
+print("\n" + "=" * 60)
+print("开始模型评价...")
+print("=" * 60)
+
+# Score the fitted clustering with three internal validity indices, all
+# computed from the scaled data and the labels assigned by the model.
+print("评估当前聚类模型...")
+cluster_labels = kmeans_model.labels_
+sil_score = silhouette_score(airline_scale, cluster_labels)
+ch_score = calinski_harabasz_score(airline_scale, cluster_labels)
+db_score = davies_bouldin_score(airline_scale, cluster_labels)
+
+print('=' * 50)
+print('聚类模型评估结果：')
+for metric_name, metric_value in (
+    ('轮廓系数', sil_score),
+    ('Calinski-Harabasz指数', ch_score),
+    ('Davies-Bouldin指数', db_score),
+):
+    print(f'{metric_name}: {metric_value:.4f}')
+print('=' * 50)
+
+# Single radar chart with every cluster centre overlaid.
+print("生成模型评价雷达图...")
+features = ['L', 'R', 'F', 'M', 'C']
+angles = np.linspace(0, 2*np.pi, len(features), endpoint=False)
+# Repeat the first angle so each polygon closes on itself.
+angles = np.concatenate((angles, angles[:1]))
+
+fig = plt.figure(figsize=(10, 8))
+ax = fig.add_subplot(111, polar=True)
+
+colors = ['red', 'blue', 'green', 'orange', 'purple']
+
+for i, center in enumerate(cluster_centers):
+    # Append the first coordinate again to match the closed angle array.
+    values = np.concatenate((center, center[:1]))
+    ax.plot(angles, values, 'o-', linewidth=2, label=customer_types[i], color=colors[i])
+    ax.fill(angles, values, alpha=0.1, color=colors[i])
+
+ax.set_xticks(angles[:-1])
+ax.set_xticklabels(features)
+ax.set_ylim(-2, 3)  # fixed radial range
+plt.title('航空公司客户分群雷达图', size=16, y=1.05)
+plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
+plt.savefig(os.path.join(tmp_dir, '客户分群评价雷达图.png'), dpi=300, bbox_inches='tight')
+plt.show()
+plt.close()
+
+print("✓ 模型评价雷达图保存完成")
+
+###############################################################################
+#######################       最终报告              #######################
+###############################################################################
+
+# Closing summary: headline numbers, generated files, metrics and a recap.
+print("\n" + "=" * 70)
+print("                     分析完成!")
+print("=" * 70)
+print(f"✓ 已成功分析 {total_customers:,} 名航空客户")
+print(f"✓ 生成 {k} 个客户分群")
+print(f"✓ 创建可视化图表保存至 tmp/ 文件夹")
+print(f"✓ 识别出 {high_value_customers:,} 名高价值客户需重点维护")
+print(f"✓ 发现 {retention_customers:,} 名客户有流失风险需积极挽留")
+
+print(f"\n生成的可视化文件:")
+print("  • tmp/客户价值综合分析.png - 客户分布和特征分析")
+print("  • tmp/客户分群雷达图.png - 详细特征雷达图")
+print("  • tmp/客户分群评价雷达图.png - 模型评价雷达图")
+
+print(f"\n模型评估指标:")
+print(f"  • 轮廓系数: {sil_score:.4f} (越接近1越好)")
+print(f"  • Calinski-Harabasz指数: {ch_score:.4f} (越大越好)")
+print(f"  • Davies-Bouldin指数: {db_score:.4f} (越小越好)")
+
+print(f"\n项目总结:")
+print(f"  数据预处理: 原始{airline_data.shape[0]:,}条 → 清洗后{airline.shape[0]:,}条")
+print(f"  特征工程: 构建LRFMC五个关键特征")
+# Report the actual cluster count instead of a hard-coded 5 so this line
+# cannot drift from the `k` used for the model.
+print(f"  聚类分析: 使用K-Means算法将客户分为{k}个群体")
+print(f"  业务应用: 为不同客户群体制定精准营销策略")
+
+print("\n" + "=" * 70)