# dsj/3_centrality_analysis.py


import pickle
import pandas as pd
import networkx as nx
import numpy as np
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

def load_graph(graph_file):
    """
    Read a previously pickled graph back from disk and report its size.

    Args:
        graph_file (str): Path of the pickle file that holds the graph.

    Returns:
        The unpickled object. It has to provide ``number_of_nodes()`` and
        ``number_of_edges()`` (as networkx graphs do) because both are
        called to build the summary line.

    Raises:
        FileNotFoundError: If ``graph_file`` does not exist.
    """
    print(f"正在加载图结构: {graph_file}")

    # NOTE(review): unpickling can execute code stored in the file, so this
    # should only ever be pointed at files written by a trusted pipeline step.
    with open(graph_file, 'rb') as handle:
        graph = pickle.load(handle)

    node_count = graph.number_of_nodes()
    edge_count = graph.number_of_edges()
    print(f"图加载完成 - 节点数: {node_count}, 边数: {edge_count}")
    return graph

def calculate_degree_centrality(G):
    """
    Compute plain and weight-based degree centrality for every node.

    Args:
        G (nx.Graph): Graph to analyse. Every edge must carry a numeric
            ``'weight'`` attribute.

    Returns:
        tuple[dict, dict]: ``(degree_centrality, weighted_degree_centrality)``.
            The first dict is the result of ``nx.degree_centrality(G)``.
            The second maps each node to the sum of the weights of the edges
            to its neighbours divided by the total weight of all edges in
            the graph (0 when that total is not positive).

    Raises:
        KeyError: If an edge has no ``'weight'`` attribute.
    """
    print("计算度中心性...")

    # Unweighted degree centrality straight from networkx.
    degree_centrality = nx.degree_centrality(G)

    # The normalising denominator is the same for every node, so it is
    # computed once here instead of once per node inside the loop.
    total_edge_weight = sum(data['weight'] for _, _, data in G.edges(data=True))

    weighted_degree_centrality = {}
    for node in G.nodes():
        node_weight = sum(G[node][neighbor]['weight'] for neighbor in G.neighbors(node))
        weighted_degree_centrality[node] = (
            node_weight / total_edge_weight if total_edge_weight > 0 else 0
        )

    print("度中心性计算完成")
    return degree_centrality, weighted_degree_centrality

def calculate_betweenness_centrality(G):
    """
    Compute weighted betweenness centrality for every node of ``G``.

    When the graph is not connected, betweenness is computed only on the
    largest connected component; every node outside that component is
    reported with a value of 0.0.

    Args:
        G (nx.Graph): Undirected graph to analyse.

    Returns:
        dict: Mapping of every node in ``G`` to its betweenness centrality.
            An empty dict is returned for a graph without nodes.
    """
    print("计算介数中心性...")

    # nx.is_connected raises on a graph with no nodes, so an empty graph is
    # answered directly instead of aborting the whole analysis.
    if G.number_of_nodes() == 0:
        print("介数中心性计算完成")
        return {}

    if nx.is_connected(G):
        G_largest = G
    else:
        print("警告: 图不连通，将使用最大连通分量计算介数中心性")
        largest_cc = max(nx.connected_components(G), key=len)
        G_largest = G.subgraph(largest_cc)
        print(f"最大连通分量包含 {len(largest_cc)} 个节点")

    # NOTE(review): networkx uses the `weight` attribute as a path length
    # when finding shortest paths here - confirm that a larger edge weight
    # is really meant to mean "farther apart" in this data set.
    betweenness_centrality = nx.betweenness_centrality(G_largest, weight='weight')

    # Nodes that were left out of the largest component default to 0.0.
    full_betweenness_centrality = {
        node: betweenness_centrality.get(node, 0.0) for node in G.nodes()
    }

    print("介数中心性计算完成")
    return full_betweenness_centrality

def calculate_additional_centralities(G):
    """
    Compute eigenvector, closeness and PageRank centrality for ``G``.

    Each measure is computed independently on a best-effort basis: if one
    of them raises, the failure is printed and that measure is reported as
    0.0 for every node, while the remaining measures are still computed.

    The edge attribute ``'weight'`` is handed to eigenvector centrality and
    PageRank as ``weight`` and to closeness centrality as ``distance``.

    Args:
        G (nx.Graph): Graph to analyse.

    Returns:
        dict: ``{'eigenvector': {...}, 'closeness': {...}, 'pagerank': {...}}``
            where each inner dict maps node -> score.
    """
    print("计算其他中心性指标...")

    results = {}

    # --- Eigenvector centrality ---
    try:
        results['eigenvector'] = nx.eigenvector_centrality(G, max_iter=1000, weight='weight')
        print("特征向量中心性计算完成")
    except Exception as err:
        print(f"特征向量中心性计算失败: {err}")
        results['eigenvector'] = dict.fromkeys(G.nodes(), 0.0)

    # --- Closeness centrality ---
    # A disconnected graph is handled one connected component at a time, so
    # each score is relative to the node's own component.
    try:
        if not nx.is_connected(G):
            per_component = {}
            for members in nx.connected_components(G):
                if len(members) <= 1:
                    # A lone node has nothing to be close to.
                    per_component.update((node, 0.0) for node in members)
                    continue
                component_view = G.subgraph(members)
                per_component.update(
                    nx.closeness_centrality(component_view, distance='weight')
                )
            results['closeness'] = per_component
            print("接近中心性计算完成（分连通分量）")
        else:
            results['closeness'] = nx.closeness_centrality(G, distance='weight')
            print("接近中心性计算完成")
    except Exception as err:
        print(f"接近中心性计算失败: {err}")
        results['closeness'] = dict.fromkeys(G.nodes(), 0.0)

    # --- PageRank ---
    try:
        results['pagerank'] = nx.pagerank(G, weight='weight')
        print("PageRank中心性计算完成")
    except Exception as err:
        print(f"PageRank中心性计算失败: {err}")
        results['pagerank'] = dict.fromkeys(G.nodes(), 0.0)

    return results

def analyze_centrality_results(degree_cent, weighted_degree_cent, betweenness_cent, additional_cent):
    """
    Summarise every centrality measure and pick out its top-ranked nodes.

    For each of the six measures (degree, weighted degree, betweenness,
    eigenvector, closeness, PageRank) the mean, standard deviation, minimum,
    maximum and median are computed and printed, followed by the ten nodes
    with the highest score.

    A measure with no values (for example a key missing from
    ``additional_cent``) is reported with NaN statistics and an empty
    top-node list instead of raising.

    Args:
        degree_cent (dict): node -> degree centrality.
        weighted_degree_cent (dict): node -> weighted degree centrality.
        betweenness_cent (dict): node -> betweenness centrality.
        additional_cent (dict): May contain the keys ``'eigenvector'``,
            ``'closeness'`` and ``'pagerank'``, each mapping node -> score.

    Returns:
        dict: ``{'centrality_stats': {name: {'mean', 'std', 'min', 'max',
            'median'}}, 'top_nodes': {name: [(node, score), ...]}}``.
    """
    print("\n=== 中心性分析结果 ===")

    # Single definition of the measures so both passes stay in sync.
    measures = [
        ('degree', degree_cent),
        ('weighted_degree', weighted_degree_cent),
        ('betweenness', betweenness_cent),
        ('eigenvector', additional_cent.get('eigenvector', {})),
        ('closeness', additional_cent.get('closeness', {})),
        ('pagerank', additional_cent.get('pagerank', {})),
    ]

    # Pass 1: descriptive statistics.
    centrality_stats = {}
    for name, centrality in measures:
        values = list(centrality.values())
        if values:
            stats = {
                'mean': np.mean(values),
                'std': np.std(values),
                'min': np.min(values),
                'max': np.max(values),
                'median': np.median(values),
            }
        else:
            # np.min / np.max raise on an empty sequence; NaN marks the
            # measure as "no data" without aborting the other measures.
            stats = dict.fromkeys(('mean', 'std', 'min', 'max', 'median'), float('nan'))
        centrality_stats[name] = stats

        print(f"\n{name.upper()}中心性统计:")
        print(f"  平均值: {stats['mean']:.6f}")
        print(f"  标准差: {stats['std']:.6f}")
        print(f"  最小值: {stats['min']:.6f}")
        print(f"  最大值: {stats['max']:.6f}")
        print(f"  中位数: {stats['median']:.6f}")

    # Pass 2: the ten highest-scoring nodes per measure.
    top_nodes = {}
    for name, centrality in measures:
        top_nodes[name] = sorted(centrality.items(), key=lambda x: x[1], reverse=True)[:10]
        print(f"\n{name.upper()}中心性前10节点:")
        for i, (node, value) in enumerate(top_nodes[name], 1):
            print(f"  {i:2d}. {node}: {value:.6f}")

    return {
        'centrality_stats': centrality_stats,
        'top_nodes': top_nodes
    }

def visualize_centrality_results(degree_cent, weighted_degree_cent, betweenness_cent, additional_cent, analysis_results, output_dir="output"):
    """
    Render the centrality results as three PNG figures.

    Files written into ``output_dir``:
        * ``centrality_distributions.png`` - one histogram per measure.
        * ``top_nodes_comparison.png`` - bar chart of the top nodes per measure.
        * ``centrality_correlation.png`` - correlation heat map of the measures.

    The output directory is created when it does not exist yet, and node
    labels are converted with ``str()`` so non-string node identifiers can
    be plotted as well.

    Args:
        degree_cent (dict): node -> degree centrality.
        weighted_degree_cent (dict): node -> weighted degree centrality.
        betweenness_cent (dict): node -> betweenness centrality.
        additional_cent (dict): May contain ``'eigenvector'``, ``'closeness'``
            and ``'pagerank'``, each mapping node -> score.
        analysis_results (dict): Must contain ``'top_nodes'``, a mapping of
            measure name -> list of ``(node, score)`` pairs.
        output_dir (str): Directory the figures are saved to.
    """
    print("生成中心性可视化图表...")

    # savefig does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    # Font setup so the Chinese titles/labels render; note that this changes
    # matplotlib's process-wide rcParams.
    plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
    plt.rcParams['axes.unicode_minus'] = False

    centralities = [
        ('degree', degree_cent, '度中心性'),
        ('weighted_degree', weighted_degree_cent, '加权度中心性'),
        ('betweenness', betweenness_cent, '介数中心性'),
        ('eigenvector', additional_cent.get('eigenvector', {}), '特征向量中心性'),
        ('closeness', additional_cent.get('closeness', {}), '接近中心性'),
        ('pagerank', additional_cent.get('pagerank', {}), 'PageRank中心性')
    ]

    # 1. Histogram of every measure on a 2x3 grid.
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    fig.suptitle('各种中心性分布', fontsize=16)

    for i, (_, centrality, title) in enumerate(centralities):
        ax = axes[i // 3, i % 3]
        values = list(centrality.values())

        ax.hist(values, bins=50, alpha=0.7, edgecolor='black')
        ax.set_title(title)
        ax.set_xlabel('中心性值')
        ax.set_ylabel('频次')
        ax.grid(True, alpha=0.3)

        # Mark the mean; skipped when there is nothing to average.
        if values:
            mean_val = np.mean(values)
            ax.axvline(mean_val, color='red', linestyle='--', alpha=0.7, label=f'平均值: {mean_val:.4f}')
            ax.legend()

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'centrality_distributions.png'), dpi=300, bbox_inches='tight')
    plt.close()

    # 2. Bar chart of the top nodes of every measure.
    top_nodes = analysis_results['top_nodes']

    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    fig.suptitle('各种中心性前10节点', fontsize=16)

    for i, (name, _, title) in enumerate(centralities):
        ax = axes[i // 3, i % 3]

        entries = top_nodes.get(name)
        if not entries:
            continue

        # Labels longer than 15 characters are shortened to keep the axis readable.
        labels = []
        for node, _score in entries:
            text = str(node)
            labels.append(text[:15] + '...' if len(text) > 15 else text)
        values = [score for _node, score in entries]

        bars = ax.bar(range(len(labels)), values, alpha=0.7, color='lightblue', edgecolor='black')
        ax.set_title(title)
        ax.set_xlabel('节点')
        ax.set_ylabel('中心性值')
        ax.set_xticks(range(len(labels)))
        ax.set_xticklabels(labels, rotation=45, ha='right')
        ax.grid(True, alpha=0.3)

        # Print each value slightly above its bar.
        label_offset = max(values) * 0.01
        for bar, value in zip(bars, values):
            ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + label_offset,
                    f'{value:.3f}', ha='center', va='bottom', fontsize=8)

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'top_nodes_comparison.png'), dpi=300, bbox_inches='tight')
    plt.close()

    # 3. Correlation heat map between the measures (one column per measure,
    #    one row per node).
    centrality_data = {
        'Degree': degree_cent,
        'Weighted_Degree': weighted_degree_cent,
        'Betweenness': betweenness_cent,
        'Eigenvector': additional_cent.get('eigenvector', {}),
        'Closeness': additional_cent.get('closeness', {}),
        'PageRank': additional_cent.get('pagerank', {})
    }

    df_centrality = pd.DataFrame(centrality_data)
    correlation_matrix = df_centrality.corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
                square=True, fmt='.3f', cbar_kws={'label': '相关系数'})
    plt.title('中心性指标相关性热力图')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'centrality_correlation.png'), dpi=300, bbox_inches='tight')
    plt.close()

    print("中心性可视化图表已保存")

def save_centrality_results(degree_cent, weighted_degree_cent, betweenness_cent, additional_cent, analysis_results, output_dir="output"):
    """
    Persist the centrality results as CSV, plain text and pickle.

    Files written into ``output_dir`` (created if missing):
        * ``centrality_analysis.csv`` - one row per node of ``degree_cent``,
          in the iteration order of that dict, with one column per measure.
          A node missing from another measure gets 0 in that column.
        * ``centrality_statistics.txt`` - the per-measure statistics and
          top-node lists from ``analysis_results``.
        * ``centrality_results.pkl`` - all inputs bundled into one dict.

    Args:
        degree_cent (dict): node -> degree centrality; defines the node set.
        weighted_degree_cent (dict): node -> weighted degree centrality.
        betweenness_cent (dict): node -> betweenness centrality.
        additional_cent (dict): May contain ``'eigenvector'``, ``'closeness'``
            and ``'pagerank'``, each mapping node -> score.
        analysis_results (dict): Must contain ``'centrality_stats'`` and
            ``'top_nodes'``.
        output_dir (str): Directory the files are written to.
    """
    os.makedirs(output_dir, exist_ok=True)

    # 1. Per-node table of every measure.
    centrality_file = os.path.join(output_dir, "centrality_analysis.csv")

    eigenvector = additional_cent.get('eigenvector', {})
    closeness = additional_cent.get('closeness', {})
    pagerank = additional_cent.get('pagerank', {})

    # Iterate the dict itself rather than a set of its keys: set order of
    # string nodes changes from run to run, which made the CSV row order
    # unstable.
    centrality_data = [
        {
            'node': node,
            'degree_centrality': degree_cent.get(node, 0),
            'weighted_degree_centrality': weighted_degree_cent.get(node, 0),
            'betweenness_centrality': betweenness_cent.get(node, 0),
            'eigenvector_centrality': eigenvector.get(node, 0),
            'closeness_centrality': closeness.get(node, 0),
            'pagerank_centrality': pagerank.get(node, 0),
        }
        for node in degree_cent
    ]

    # Explicit columns keep the header row even when there are no nodes.
    columns = [
        'node',
        'degree_centrality',
        'weighted_degree_centrality',
        'betweenness_centrality',
        'eigenvector_centrality',
        'closeness_centrality',
        'pagerank_centrality',
    ]
    df_centrality = pd.DataFrame(centrality_data, columns=columns)
    df_centrality.to_csv(centrality_file, index=False, encoding='utf-8')
    print(f"中心性分析结果已保存到: {centrality_file}")

    # 2. Human-readable statistics report.
    stats_file = os.path.join(output_dir, "centrality_statistics.txt")
    with open(stats_file, 'w', encoding='utf-8') as f:
        f.write("中心性分析统计结果\n")
        f.write("="*60 + "\n")

        centrality_stats = analysis_results['centrality_stats']
        for name, stats in centrality_stats.items():
            f.write(f"\n{name.upper()}中心性统计:\n")
            f.write(f"  平均值: {stats['mean']:.6f}\n")
            f.write(f"  标准差: {stats['std']:.6f}\n")
            f.write(f"  最小值: {stats['min']:.6f}\n")
            f.write(f"  最大值: {stats['max']:.6f}\n")
            f.write(f"  中位数: {stats['median']:.6f}\n")

        f.write("\n关键节点识别:\n")
        top_nodes = analysis_results['top_nodes']
        for name, top_list in top_nodes.items():
            f.write(f"\n{name.upper()}中心性前10节点:\n")
            for i, (node, value) in enumerate(top_list, 1):
                f.write(f"  {i:2d}. {node}: {value:.6f}\n")

    print(f"中心性统计信息已保存到: {stats_file}")

    # 3. Everything bundled for later programmatic use.
    pickle_file = os.path.join(output_dir, "centrality_results.pkl")
    results = {
        'degree_centrality': degree_cent,
        'weighted_degree_centrality': weighted_degree_cent,
        'betweenness_centrality': betweenness_cent,
        'additional_centralities': additional_cent,
        'analysis_results': analysis_results
    }
    with open(pickle_file, 'wb') as f:
        pickle.dump(results, f)
    print(f"完整中心性结果已保存到: {pickle_file}")

def main():
    """
    Run the full centrality pipeline: load the graph, compute all measures,
    analyse, plot and save them.

    Failures are reported on stdout rather than raised. The "graph file not
    found" hint is printed only when loading the graph itself fails; a
    missing path in any later stage is reported through the generic error
    message so it is not mistaken for a missing graph file.
    """
    # Fixed input/output locations for this pipeline step.
    graph_file = "output/network_graph.pkl"
    output_dir = "output"

    try:
        # 1. Load the graph. Only this step maps FileNotFoundError to the
        #    "build the graph first" hint.
        try:
            G = load_graph(graph_file)
        except FileNotFoundError:
            print(f"❌ 错误: 找不到图文件 {graph_file}")
            print("请先运行 1_build_graph.py 构建图结构")
            return

        # 2. Degree centrality (plain and weighted).
        degree_cent, weighted_degree_cent = calculate_degree_centrality(G)

        # 3. Betweenness centrality.
        betweenness_cent = calculate_betweenness_centrality(G)

        # 4. Eigenvector, closeness and PageRank.
        additional_cent = calculate_additional_centralities(G)

        # 5. Statistics and top nodes.
        analysis_results = analyze_centrality_results(degree_cent, weighted_degree_cent, betweenness_cent, additional_cent)

        # 6. Figures.
        visualize_centrality_results(degree_cent, weighted_degree_cent, betweenness_cent, additional_cent, analysis_results, output_dir)

        # 7. CSV / text / pickle output.
        save_centrality_results(degree_cent, weighted_degree_cent, betweenness_cent, additional_cent, analysis_results, output_dir)

        print("\n✅ 中心性分析完成！")

    except Exception as e:
        print(f"❌ 发生错误: {str(e)}")

if __name__ == "__main__":
    main()