|
|
# Cluster the MD vectors and plot them together with the predicted data points
|
|
|
import os
|
|
|
import numpy as np
|
|
|
import pandas as pd
|
|
|
from matplotlib import pyplot as plt
|
|
|
|
|
|
from draw_md_cluster import DBSCAN
|
|
|
from ml_er.ml_entity_resolver import build_col_pairs_sim_tensor_dict
|
|
|
|
|
|
|
|
|
def plot(md_keys_, data_, data_points_, labels_, output_path_):
    """Draw clustered MD vectors and data points in one 3-D scatter plot.

    Every distinct value in ``labels_`` gets its own colour (colours repeat
    once the palette is exhausted), and the data points are overlaid as
    light-blue crosses.  The figure is saved to ``output_path_``, shown, and
    then closed.

    Args:
        md_keys_: Sequence of three names used as the x/y/z axis labels.
        data_: 2-D array whose columns 0..2 are plotted, one row per MD.
        data_points_: 2-D array whose columns 0..2 are overlaid; may be empty.
        labels_: One cluster label per row of ``data_`` (array or list).
        output_path_: Path of the image file to write; its base name without
            the extension becomes the plot title.
    """
    # Plain lists would make ``labels_ == label`` a single bool and silently
    # select nothing, so normalise the inputs to arrays first.
    labels = np.asarray(labels_)
    points = np.asarray(data_)
    overlay = np.asarray(data_points_)

    palette = ['black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange', 'brown']

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    # Iterate over the labels that actually occur, so non-contiguous label
    # values are still drawn.  The colour rule (label modulo palette size)
    # is unchanged, e.g. label -1 maps to the last palette entry.
    for label in np.unique(labels):
        color = palette[int(label) % len(palette)]
        members = points[labels == label]
        ax.scatter(members[:, 0], members[:, 1], members[:, 2], c=color, s=12)

    # An empty overlay may arrive as a 1-D array, which cannot be sliced by
    # column; there is nothing to draw in that case anyway.
    if overlay.size:
        ax.scatter(overlay[:, 0], overlay[:, 1], overlay[:, 2],
                   c='#66CCFF', s=12, marker='x')

    ax.set_xlabel(md_keys_[0], rotation=0)  # set the label angle
    ax.set_ylabel(md_keys_[1], rotation=-45)
    ax.set_zlabel(md_keys_[2], rotation=0)

    # Accept both '\\' and '/' separators and strip only the final extension,
    # so names that contain dots are not truncated.
    file_name = os.path.basename(os.fspath(output_path_).replace('\\', '/'))
    ax.set_title(os.path.splitext(file_name)[0])

    fig.savefig(output_path_, dpi=500)
    plt.show()
    # Release the figure; otherwise one figure per call stays alive.
    plt.close(fig)
|
|
|
|
|
|
|
|
|
def _read_md_vectors(md_path, limit=10000):
    """Read up to ``limit`` MD vectors from the tab-separated file ``md_path``.

    The second tab-separated field of each line is evaluated as a Python
    expression; element 0 of the result is treated as a dict.  Entries 1..3
    of that dict (in insertion order) supply the attribute names and values.

    Returns:
        ``(keys, vectors)``: the attribute names of the last MD read (an
        empty list when nothing was read) and one value list per MD.
    """
    keys = []
    vectors = []
    with open(md_path, 'r') as md_file:
        # Stream the file: at most ``limit`` lines are needed.
        for line in md_file:
            fields = line.strip().split('\t')
            if len(fields) < 2:
                continue  # blank or malformed line, nothing to parse
            # NOTE(security): eval executes whatever expression the file
            # contains -- only run this on trusted mds.txt files.
            # ast.literal_eval would be safer if the field is always a plain
            # literal -- TODO confirm against the writer of mds.txt.
            md_dict = eval(fields[1])[0]
            keys = list(md_dict.keys())[1:4]
            vectors.append(list(md_dict.values())[1:4])
            if len(vectors) == limit:
                break
    return keys, vectors


def _build_data_points(sim_tensor_dict, keys, row_count):
    """Return one rounded similarity vector per row, ordered like ``keys``.

    ``sim_tensor_dict[key]`` must be indexable by row position and yield
    values convertible to ``float``; each value is rounded to 4 decimals.
    """
    columns = [sim_tensor_dict[key] for key in keys]
    return [[round(float(column[row]), 4) for column in columns]
            for row in range(row_count)]


def main():
    """Cluster the MDs of every dataset and plot them with its data points.

    For each sub-directory of the outcome directory, the MD file and the
    prediction file of the fixed configuration are loaded, the MD vectors are
    clustered with ``DBSCAN`` and the result is passed to ``plot`` together
    with the similarity vectors of the rows predicted as ``1``.
    """
    outcome_path = r'E:\Data\Research\Outcome'
    config_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens+inter-0.5'

    with os.scandir(outcome_path) as entries:
        dataset_names = [entry.name for entry in entries if entry.is_dir()]

    for dataset_name in dataset_names:
        result_dir = outcome_path + rf'\{dataset_name}' + config_dir
        md_path = result_dir + r'\mds.txt'  # MD file
        predictions_path = result_dir + r'\predictions.csv'  # prediction file

        # Not every directory has results for this configuration; skip those
        # instead of aborting the whole run.
        if not (os.path.isfile(md_path) and os.path.isfile(predictions_path)):
            print(f'Skipping {dataset_name}: mds.txt or predictions.csv not found')
            continue

        pred = pd.read_csv(predictions_path).astype(str)
        pred = pred[pred['predicted'] == '1']
        sim_tensor_dict = build_col_pairs_sim_tensor_dict(pred)

        # The three attributes used as plot axes, and their values per MD.
        md_keys, md_vectors = _read_md_vectors(md_path)
        if len(md_keys) < 3:
            print(f'Skipping {dataset_name}: fewer than three MD attributes available')
            continue

        data = np.array(md_vectors, dtype=np.float32)
        # reshape keeps the array 2-D even when no row was predicted as 1.
        data_points = np.array(
            _build_data_points(sim_tensor_dict, md_keys, len(pred)),
            dtype=np.float32,
        ).reshape(-1, len(md_keys))

        # NOTE(review): 0.5 and 30 are presumably the neighbourhood radius
        # and minimum point count -- confirm against draw_md_cluster.DBSCAN.
        labels = DBSCAN(data, 0.5, 30)

        output_path = outcome_path + rf'\{dataset_name}_MD&data.png'
        plot(md_keys, data, data_points, labels, output_path)


if __name__ == '__main__':
    main()
|