"""Cluster MD value triples with DBSCAN and plot them with predicted data points.

For every dataset directory under the outcome folder, this script reads the
MDs from ``mds.txt`` and the positive predictions from ``predictions.csv``,
clusters the MD points and saves a 3-D scatter plot that overlays the data
points on the clustered MDs.
"""
# Cluster the data points together with the MDs
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from draw_md_cluster import DBSCAN
from ml_er.ml_entity_resolver import build_col_pairs_sim_tensor_dict

# Upper bound on the number of MD rows read from a single mds.txt file.
MAX_MD_ROWS = 10000


def plot(md_keys_, data_, data_points_, labels_, output_path_):
    """Draw the clustered MD points and the data points in one 3-D scatter.

    Args:
        md_keys_: sequence whose first three entries label the x, y and z axes.
        data_: 2-D array; columns 0, 1 and 2 are plotted, one colour per label.
        data_points_: 2-D array; columns 0, 1 and 2 are plotted as 'x' markers.
        labels_: cluster label for each row of ``data_`` (array or list).
        output_path_: path the figure is saved to. The part of its last
            backslash-separated component before the first '.' is the title.
    """
    # A plain list compared with ``==`` yields a single bool, not a mask.
    labels_arr = np.asarray(labels_)
    fig = plt.figure()
    scatter_colors = ['black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange', 'brown']
    ax = fig.add_subplot(111, projection='3d')
    # Iterate over the labels that actually occur so that no cluster is
    # skipped, whatever numbering scheme the clustering routine uses.
    for label in np.unique(labels_arr):
        color = scatter_colors[int(label) % len(scatter_colors)]
        members = data_[labels_arr == label]
        ax.scatter(members[:, 0], members[:, 1], members[:, 2], c=color, s=12)
    ax.scatter(data_points_[:, 0], data_points_[:, 1], data_points_[:, 2], c='#66CCFF', s=12, marker='x')
    ax.set_xlabel(md_keys_[0], rotation=0)  # set the label rotation angle
    ax.set_ylabel(md_keys_[1], rotation=-45)
    ax.set_zlabel(md_keys_[2], rotation=0)
    plt.title(output_path_.split('\\')[-1].split('.')[0])
    plt.savefig(output_path_, dpi=500)
    plt.show()
    # Release the figure; otherwise one figure per dataset stays alive.
    plt.close(fig)


def _load_md_rows(md_path, limit=MAX_MD_ROWS):
    """Read at most ``limit`` MD rows from ``md_path``.

    Each non-blank line is split on tabs and its second field is evaluated;
    the first element of the result is used as a mapping, from which the
    keys and values at positions 1-3 are taken.

    Returns:
        A ``(md_keys, rows)`` pair: the keys taken from the last line read and
        the list of value triples, one per line read.
    """
    md_keys = []
    rows = []
    with open(md_path, 'r') as f:
        # Iterate lazily: only the first ``limit`` rows are needed.
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue
            md_metadata = stripped.split('\t')
            # NOTE(review): eval executes whatever the file contains. If the
            # field is always a plain literal, ast.literal_eval would be a
            # safer drop-in - confirm the file format before switching.
            md_tuple = eval(md_metadata[1])
            md_keys = list(md_tuple[0].keys())[1:4]
            rows.append(list(md_tuple[0].values())[1:4])
            if len(rows) == limit:
                break
    return md_keys, rows


def _collect_data_points(sim_tensor_dict, md_keys, row_count):
    """Build one rounded similarity vector per prediction row.

    For each row index below ``row_count`` the entry at that index is read
    from the tensor of every key in ``md_keys`` and rounded to 4 decimals.
    """
    columns = [sim_tensor_dict[key] for key in md_keys]
    return [[round(float(column[row]), 4) for column in columns]
            for row in range(row_count)]


def main(outcome_path=r'E:\Data\Research\Outcome',
         config_dir=r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens+inter-0.5'):
    """Produce one MD/data-point cluster plot per dataset directory.

    Args:
        outcome_path: folder that holds one sub-directory per dataset.
        config_dir: backslash-prefixed sub-path, inside each dataset
            directory, that holds ``mds.txt`` and ``predictions.csv``.
    """
    # NOTE: paths are assembled with backslashes, i.e. Windows-style.
    dataset_name_list = [entry.name for entry in os.scandir(outcome_path) if entry.is_dir()]
    for dataset_name in dataset_name_list:
        dataset_dir = outcome_path + rf'\{dataset_name}' + config_dir
        md_path = dataset_dir + r'\mds.txt'  # path of the MD file
        predictions_path = dataset_dir + r'\predictions.csv'  # path of the predictions

        # Keep only the rows predicted as 1; everything is compared as text.
        pred = pd.read_csv(predictions_path)
        pred = pred.astype(str)
        pred = pred[pred['predicted'] == str(1)]
        # Per the comment on its definition, this builds a dict whose keys are
        # field names and whose values are 1-D tensors recording, for each
        # row of the prediction table, the similarity between the left and
        # right attributes of that field.
        sim_tensor_dict = build_col_pairs_sim_tensor_dict(pred)

        # md_keys holds the three selected fields.
        md_keys, md_rows = _load_md_rows(md_path)
        if not md_rows:
            # Nothing to cluster or plot for this dataset.
            print(f'{dataset_name}: no MD rows found in {md_path}, skipped')
            continue

        data_points = _collect_data_points(sim_tensor_dict, md_keys, len(pred))

        data = np.array(md_rows, dtype=np.float32)
        # The reshape keeps the array 2-D even when there are no data points.
        data_points = np.array(data_points, dtype=np.float32).reshape(-1, len(md_keys))
        # presumably eps=0.5 and min_points=30 - confirm against draw_md_cluster.DBSCAN
        labels = DBSCAN(data, 0.5, 30)
        output_path = outcome_path + rf'\{dataset_name}_MD&data.png'
        plot(md_keys, data, data_points, labels, output_path)


if __name__ == '__main__':
    main()