HuangJintao 10 months ago
parent 0835b6df1c
commit 253eb0835f

4
.gitignore vendored

@ -0,0 +1,4 @@
/ml_er/output/*
/md_discovery/output/*
/hpo/output/*
/datasets/*

@ -1,2 +0,0 @@
# matching_dependency_pyJedAI

@ -0,0 +1,93 @@
import pandas as pd
import json
from time import *
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
from ConfigSpace.read_and_write import json as csj
from smac import HyperparameterOptimizationFacade, Scenario
from settings import *
from ml_er.ml_entity_resolver import er_process
from settings import ltable_path, ltable_id
class Classifier:
@property
def configspace(self) -> ConfigurationSpace:
cs = ConfigurationSpace(seed=0)
ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1', sep='|')
selected_attrs = ltable.columns.values.tolist()
block_attr_items = selected_attrs[:]
block_attr_items.remove(ltable_id)
jed_blocker = Categorical("jed_blocker",
["Standard", "QGrams", "ExtendedQG", "SuffixArrays", "ExtendedSA"])
block_attr = Categorical("block_attr", block_attr_items)
# filter_ratio = Float("filter_ratio", (0.0, 1.0), default=0.8)
meta_blocker = Categorical("meta_blocker",
["WEP", "WNP", "CEP", "CNP", "BLAST", "RCNP", "RWNP", "CP"])
weighting_scheme = Categorical("weighting_scheme",
['CN-CBS', 'CBS', 'SN-CBS', 'CNC', 'SNC', 'SND', 'CND', 'CNJ', 'SNJ',
'COSINE', 'DICE', 'ECBS', 'JS', 'EJS', 'X2'])
# todo other hyperparameters
matching_metric = Categorical("matching_metric",
['cosine', 'euclidean'])
matching_tokenizer = Categorical("matching_tokenizer",
['char_tokenizer', 'word_tokenizer', 'white_space_tokenizer'])
matching_vectorizer = Categorical("matching_vectorizer",
['tfidf', 'tf', 'boolean'])
clusteror = Categorical("clusteror_name",
["CCC", "UMC", "EC", "CenterC", "BMC", "MCC", "CC", "CTC", "MCL", "KMAC", "RSRC"])
cs.add_hyperparameters([jed_blocker, block_attr, meta_blocker, weighting_scheme, matching_metric,
matching_tokenizer, matching_vectorizer, clusteror])
return cs
def train(self, config: Configuration, seed: int = 0) -> float:
indicators = er_process(config)
return 1-indicators['performance']
def ml_er_hpo():
classifier = Classifier()
cs = classifier.configspace
str_configspace = csj.write(cs)
dict_configspace = json.loads(str_configspace)
with open(hpo_output_dir + r"\configspace.json", "w") as f:
json.dump(dict_configspace, f, indent=4)
scenario = Scenario(
cs,
deterministic=True,
n_trials=50, # We want to run max 50 trials (combination of config and seed)
n_workers=1
)
initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
smac = HyperparameterOptimizationFacade(
scenario,
classifier.train,
initial_design=initial_design,
overwrite=True, # If the run exists, we overwrite it; alternatively, we can continue from last state
)
incumbent = smac.optimize()
incumbent_cost = smac.validate(incumbent)
default = cs.get_default_configuration()
default_cost = smac.validate(default)
print(f"Default Cost: {default_cost}")
print(f"Incumbent Cost: {incumbent_cost}")
if incumbent_cost > default_cost:
incumbent = default
print(f"Updated Incumbent Cost: {default_cost}")
print(f"Optimized Configuration:{incumbent.values()}")
with open(hpo_output_dir + r"\incumbent.json", "w") as f:
json.dump(dict(incumbent), f, indent=4)
return incumbent
if __name__ == '__main__':
ml_er_hpo()

@ -0,0 +1,143 @@
import math
import operator
import numpy as np
import pandas as pd
import torch
from ConfigSpace import Configuration
from settings import *
import random
from tqdm import tqdm
sample_number = 100000
step_length = 0.01
def md_discover(config: Configuration, source_path, target_path):
mds_list = discover(source_path, target_attr)
if len(mds_list) > 0:
with open(target_path, 'w') as f:
for _ in mds_list:
f.write('Target:'+str(target_attr) + '\t')
f.write(str(_))
f.write('\n')
def discover(path, target_col):
data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
data.fillna("", inplace=True)
data = data.astype(str)
columns = data.columns.values.tolist()
target_index = columns.index(target_col)
cols_but_target = list(set(columns) - {target_col})
length = data.shape[0]
width = data.shape[1]
# 除了目标列外所有列的索引
columns_indices = [_ for _ in range(0, width) if _ != target_index]
sentences = []
for col in range(0, width):
for row in range(0, length):
cell_value = data.values[row, col]
sentences.append(cell_value)
if len(sentences) == 0:
return []
embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
split_embedding = torch.split(embedding, length, dim=0)
table_tensor = torch.stack(split_embedding, dim=0, out=None)
norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
sim_tensor = sim_tensor.float()
sim_tensor = torch.round(sim_tensor, decimals=4)
# 小于6列的可以尝试做笛卡尔积大于6列可能指数爆炸
if width < 6:
# 列出除目标列以外所有列的所有取值做笛卡尔积结果为所有可能MD取值
cartesian = build_cartesian(width, target_index)
# 抽取sample_number / (width - 1)条MD不含-1
if cartesian.shape[0] > sample_number / (width - 1):
index = torch.tensor(random.sample(range(cartesian.shape[0]), math.ceil(sample_number / (width - 1))), device='cuda')
cartesian = torch.index_select(cartesian, 0, index)
else:
# 随机生成sample_number / (width - 1)条MD使用randint先转化为int再除成小数不含-1
cartesian = torch.randint(int(similarity_threshold * 100), 1 * 100,
(math.ceil(sample_number / (width - 1)), width - 1), device='cuda')
cartesian = cartesian / 100
# 生成一列相似度为1的目标列插入目标列所在位置
ones = torch.ones((math.ceil(sample_number / (width - 1)), 1), device='cuda')
cartesian = torch.cat((cartesian[:, 0:target_index], ones, cartesian[:, target_index:]), 1)
cartesian = torch.round(cartesian, decimals=4)
# 此tensor将与其他置为-1的tensor拼接
joint_md_tensor = cartesian.clone()
# 随机将1列2列……置为-1
for i in range(width - 2):
index_list_format = []
for j in range(cartesian.shape[0]):
# 对每条MD随机选择将要置为-1的列索引
index_list_format.append(random.sample(columns_indices, i + 1))
index = torch.tensor(index_list_format, device='cuda')
# 随机调整为-1后的MD集合
modified_cartesian = cartesian.scatter(1, index, -1)
joint_md_tensor = torch.cat((joint_md_tensor, modified_cartesian), 0)
md_list = []
# get_metric_tensor(cartesian, data, sim_tensor, target_index)
for _ in tqdm(range(joint_md_tensor.shape[0])):
s, c = get_metrics(joint_md_tensor[_], data, sim_tensor, target_index)
if s >= support_threshold and c >= confidence_threshold:
md_list_format = [round(i, 4) for i in joint_md_tensor[_].tolist()]
md_dict_format = {}
for k in range(0, width):
md_dict_format[columns[k]] = md_list_format[k]
md_list.append((md_dict_format, s, c))
md_list.sort(key=operator.itemgetter(2), reverse=True)
return md_list
def get_metrics(md_tensor, data, sim_tensor, target_index):
length = data.shape[0]
width = data.shape[1]
# md_tensor = list(current_md.values())
# md_tensor = torch.tensor(md_tensor, device='cuda')
md_tensor_2d = md_tensor.unsqueeze(1)
md_tensor_3d = md_tensor_2d.unsqueeze(2)
md_tensor_3d = md_tensor_3d.repeat(1, length, length)
sim_tensor = torch.round(sim_tensor, decimals=4)
sup_tensor = torch.ge(sim_tensor, md_tensor_3d)
ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
for i in range(0, width):
if i != target_index:
sup_tensor_slice = sup_tensor[i]
ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
sup_tensor_int = ini_slice.int()
support_Naumann = torch.count_nonzero(sup_tensor_int).item()
support_Naumann = (support_Naumann - length) / 2
conf_tensor = torch.logical_and(ini_slice, sup_tensor[target_index])
conf_tensor_int = conf_tensor.int()
support_Fan = torch.count_nonzero(conf_tensor_int).item()
support_Fan = (support_Fan - length) / 2
confidence = support_Fan / support_Naumann if support_Naumann > 0 else 0
return support_Fan, confidence
def build_cartesian(width, target_index):
all_values_array = np.linspace(start=similarity_threshold, stop=1, endpoint=True,
num=math.ceil((1-similarity_threshold)/step_length) + 1)
all_values_array = np.round(all_values_array, 4)
all_values_tensor = torch.tensor(all_values_array, device='cuda')
all_values_tensor = all_values_tensor.float()
all_values_tensor = torch.round(all_values_tensor, decimals=4)
tensors_for_cartesian = []
for i in range(0, width):
if i == target_index:
t = torch.tensor([1.0], device='cuda')
tensors_for_cartesian.append(t)
else:
tensors_for_cartesian.append(all_values_tensor)
result = torch.cartesian_prod(*tensors_for_cartesian)
return result

@ -0,0 +1,364 @@
import json
import os
from time import time
import ConfigSpace
import pandas as pd
import torch
from ConfigSpace import Configuration
from ConfigSpace.read_and_write import json as csj
from pyjedai.clustering import UniqueMappingClustering
from tqdm import tqdm
from md_discovery.md_discover import md_discover
from settings import *
from pyjedai.datamodel import Data
from pyjedai.block_cleaning import BlockPurging
from pyjedai.block_cleaning import BlockFiltering
from pyjedai.matching import EntityMatching
from pyjedai.block_building import (
StandardBlocking,
QGramsBlocking,
ExtendedQGramsBlocking,
SuffixArraysBlocking,
ExtendedSuffixArraysBlocking,
)
from pyjedai.comparison_cleaning import (
WeightedEdgePruning,
WeightedNodePruning,
CardinalityEdgePruning,
CardinalityNodePruning,
BLAST,
ReciprocalCardinalityNodePruning,
ReciprocalWeightedNodePruning,
ComparisonPropagation
)
from pyjedai.clustering import *
def prepare_file_for_md_discovery(train: pd.DataFrame, t_single_tuple_path=er_output_dir + r"\t_single_tuple.csv"):
# 挑选出实际匹配的元组对
df = train[train['gold'] == '1']
# 将左表id赋值给右表
for index, row in df.iterrows():
df.loc[index, "rtable_" + rtable_id] = row["ltable_" + ltable_id]
train_columns = train.columns.values.tolist()
l_columns = []
r_columns = []
cols = []
# 左表和右表字段名分别加入两个列表
for _ in train_columns:
if _.startswith('ltable'):
l_columns.append(_)
elif _.startswith('rtable'):
r_columns.append(_)
# 将左表中字段名去掉前缀,作为统一的字段名列表(前提是两张表内对应字段名调整一致)
for _ in l_columns:
cols.append(_.replace('ltable_', ''))
ldf = df[l_columns]
rdf = df[r_columns]
ldf.columns = cols
rdf.columns = cols
t_single_tuple = pd.concat([ldf, rdf])
t_single_tuple = t_single_tuple.reset_index(drop=True)
t_single_tuple.to_csv(t_single_tuple_path, sep=',', index=False, header=True, quoting=1)
# 形成一个字典key为字段名称value为一维张量记录了预测表中这一字段每行的左右属性的相似度
def build_col_pairs_sim_tensor_dict(predictions: pd.DataFrame):
predictions_attrs = predictions.columns.values.tolist()
col_tuple_list = []
for _ in predictions_attrs:
if _.startswith('ltable'):
left_index = predictions_attrs.index(_)
right_index = predictions_attrs.index(_.replace('ltable_', 'rtable_'))
col_tuple_list.append((left_index, right_index))
df = predictions.drop(columns=['gold', 'predicted', 'confidence'], inplace=False)
length = df.shape[0]
width = df.shape[1]
sentences = []
for col in range(0, width):
for row in range(0, length):
cell_value = df.values[row, col]
sentences.append(cell_value)
if len(sentences) == 0:
return {}
embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
split_embedding = torch.split(embedding, length, dim=0)
table_tensor = torch.stack(split_embedding, dim=0, out=None)
# prediction的归一化嵌入张量
norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
sim_tensor_dict = {}
for col_tuple in col_tuple_list:
lattr_tensor = norm_table_tensor[col_tuple[0]]
rattr_tensor = norm_table_tensor[col_tuple[1]]
mul_tensor = lattr_tensor * rattr_tensor
sim_tensor = torch.sum(mul_tensor, 1)
sim_tensor = torch.round(sim_tensor, decimals=4)
sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor
return sim_tensor_dict
def load_mds(paths: list) -> list:
if len(paths) == 0:
return []
all_mds = []
# 传入md路径列表
for md_path in paths:
if not os.path.exists(md_path):
continue
mds = []
# 打开每一个md文件
with open(md_path, 'r') as f_:
# 读取每一行的md加入该文件的md列表
for line in f_.readlines():
md_metadata = line.strip().split('\t')
# todo 如果MD文件的形式改了 这里也要改
md = eval(md_metadata[1])
mds.append(md)
all_mds.extend(mds)
return all_mds
def is_explicable(row, all_mds: list, st_dict):
attrs = all_mds[0][0].keys() # 从第一条md_tuple中的md字典中读取所有字段
for md_tuple in all_mds:
explicable = True # 假设这条md能解释当前元组
for a in attrs:
if a != target_attr:
if st_dict[a][row[0]].item() < md_tuple[0][a]:
explicable = False # 任意一个字段的相似度达不到阈值这条md就不能解释当前元组
break # 不再与当前md的其他相似度阈值比较跳转到下一条md
if explicable:
return md_tuple[2] # 任意一条md能解释直接返回
return -1.0 # 遍历结束,不能解释
def er_process(config: Configuration):
print(f'\033[33mConfig: {config}\033[0m')
start = time()
ltable = pd.read_csv(ltable_path, sep='|', engine='python', na_filter=False)
rtable = pd.read_csv(rtable_path, sep='|', engine='python', na_filter=False)
mapping = pd.read_csv(mapping_path, sep='|', engine='python')
data = Data(dataset_1=ltable,
id_column_name_1=ltable_id,
dataset_2=rtable,
id_column_name_2=rtable_id,
ground_truth=mapping)
# clean data(optional)
data.clean_dataset(remove_stopwords=False,
remove_punctuation=False,
remove_numbers=False,
remove_unicodes=False)
# block building
blocker_name = config["jed_blocker"]
block_attribute = config["block_attr"]
match blocker_name:
case "Standard":
blocker = StandardBlocking()
case "QGrams":
blocker = QGramsBlocking()
case "ExtendedQG":
blocker = ExtendedQGramsBlocking()
case "SuffixArrays":
blocker = SuffixArraysBlocking()
case "ExtendedSA":
blocker = ExtendedSuffixArraysBlocking()
blocks = blocker.build_blocks(data, attributes_1=[block_attribute], attributes_2=[block_attribute])
# block purging(optional)
bp = BlockPurging()
cleaned_blocks = bp.process(blocks, data, tqdm_disable=False)
# block cleaning(optional)
bf = BlockFiltering(ratio=0.8) # todo what is ratio for?
filtered_blocks = bf.process(cleaned_blocks, data, tqdm_disable=False)
# Comparison Cleaning - Meta Blocking(optional)
pruning_method = config["meta_blocker"]
weighting_scheme_name = config["weighting_scheme"]
match pruning_method:
case "WEP":
meta_blocker = WeightedEdgePruning(weighting_scheme=weighting_scheme_name)
case "WNP":
meta_blocker = WeightedNodePruning(weighting_scheme=weighting_scheme_name)
case "CEP":
meta_blocker = CardinalityEdgePruning(weighting_scheme=weighting_scheme_name)
case "CNP":
meta_blocker = CardinalityNodePruning(weighting_scheme=weighting_scheme_name)
case "BLAST":
meta_blocker = BLAST(weighting_scheme=weighting_scheme_name)
case "RCNP":
meta_blocker = ReciprocalCardinalityNodePruning(weighting_scheme=weighting_scheme_name)
case "RWNP":
meta_blocker = ReciprocalWeightedNodePruning(weighting_scheme=weighting_scheme_name)
case "CP":
meta_blocker = ComparisonPropagation()
candidate_pairs_blocks = meta_blocker.process(blocks=filtered_blocks, data=data, tqdm_disable=True)
# entity matching
em = EntityMatching(
metric=config["matching_metric"],
tokenizer=config["matching_tokenizer"],
vectorizer=config["matching_vectorizer"],
qgram=3,
similarity_threshold=0.0
)
pairs_graph = em.predict(candidate_pairs_blocks, data, tqdm_disable=True)
# draw(pairs_graph)
# entity clustering
clusteror_name = config["clusteror_name"]
match clusteror_name:
case "CCC":
clusteror = ConnectedComponentsClustering()
case "UMC":
clusteror = UniqueMappingClustering()
case "EC":
clusteror = ExactClustering()
case "CenterC":
clusteror = CenterClustering()
case "BMC":
clusteror = BestMatchClustering()
case "MCC":
clusteror = MergeCenterClustering()
case "CC":
clusteror = CorrelationClustering()
case "CTC":
clusteror = CutClustering()
case "MCL":
clusteror = MarkovClustering()
case "KMAC":
clusteror = KiralyMSMApproximateClustering()
case "RSRC":
clusteror = RicochetSRClustering()
# 得到预测结果与评估指标
clusters = clusteror.process(pairs_graph, data, similarity_threshold=0.17)
matches_dataframe = clusteror.export_to_df(clusters)
matches_dataframe_path = er_output_dir + r'\matches_dataframe.csv'
matches_dataframe.to_csv(matches_dataframe_path, sep=',', index=False, header=True, quoting=1)
evaluation = clusteror.evaluate(clusters)
# evaluation.pop('True Positives')
# evaluation.pop('False Positives')
# evaluation.pop('True Negatives')
# evaluation.pop('False Negatives')
ltable = ltable.astype(str)
rtable = rtable.astype(str)
mapping = mapping.astype(str)
result_df = matches_dataframe.astype(str)
lcolumns_dict = {}
rcolumns_dict = {}
ltable_attrs = ltable.columns.values.tolist()
rtable_attrs = rtable.columns.values.tolist()
for _ in ltable_attrs:
lcolumns_dict[_] = 'ltable_' + _
for _ in rtable_attrs:
rcolumns_dict[_] = 'rtable_' + _
result_lid_list = result_df['id1'].tolist()
selected_ltable = ltable[ltable[ltable_id].isin(result_lid_list)]
selected_ltable = selected_ltable.rename(columns=lcolumns_dict)
selected_ltable['key'] = 1
result_rid_list = result_df['id2'].tolist()
selected_rtable = rtable[rtable[rtable_id].isin(result_rid_list)]
selected_rtable = selected_rtable.rename(columns=rcolumns_dict)
selected_rtable['key'] = 1
predictions = pd.merge(selected_ltable, selected_rtable, on='key')
predictions.drop(columns='key', inplace=True)
predictions = predictions.reset_index(drop=True)
predictions['gold'] = '0'
predictions['predicted'] = '0'
gold_match_rows = []
predicted_match_rows = []
for tuple_ in tqdm(predictions.itertuples()):
lid = getattr(tuple_, 'ltable_' + ltable_id)
map_row = mapping[mapping[mapping_lid] == lid]
result_row = result_df[result_df['id1'] == lid]
if map_row is not None:
rid = map_row[mapping_rid]
for value in rid:
if value == getattr(tuple_, 'rtable_' + rtable_id):
gold_match_rows.append(tuple_[0])
if result_row is not None:
rid = result_row['id2']
for value in rid:
if value == getattr(tuple_, 'rtable_' + rtable_id):
predicted_match_rows.append(tuple_[0])
for _ in gold_match_rows:
predictions.loc[_, 'gold'] = '1'
for _ in predicted_match_rows:
predictions.loc[_, 'predicted'] = '1'
# 挖MD 计算可解释性
prepare_file_for_md_discovery(predictions)
predictions['confidence'] = 0
predicted_match = predictions[predictions['predicted'] == '1']
predicted_match = predicted_match.reset_index(drop=True)
sim_tensor_dict = build_col_pairs_sim_tensor_dict(predicted_match)
md_discover(config, er_output_dir + r"\t_single_tuple.csv", md_output_dir + r"\mds.txt")
md_paths = [md_output_dir + r'\mds.txt']
md_list = load_mds(md_paths) # 从全局变量中读取所有的md
epl_match = 0 # 可解释预测match
if sim_tensor_dict:
if len(md_list) > 0:
for row in tqdm(predicted_match.itertuples()):
x = is_explicable(row, md_list, sim_tensor_dict)
if x > 0:
predicted_match.loc[row[0], 'confidence'] = x
epl_match += 1
interpretability = epl_match / len(predicted_match) # 可解释性
evaluation['interpretability'] = interpretability
else:
interpretability = 0
evaluation['interpretability'] = interpretability
f1 = evaluation['F1 %'] / 100
performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
evaluation['performance'] = performance
print(f'\033[33mEvaluation: {evaluation}\033[0m')
predicted_match.to_csv(er_output_dir + r'\predicted_match.csv', sep=',', index=False, header=True)
################################################################################################################
print(f'\033[33mTime consumed by ML-ER in seconds: {time() - start}\033[0m')
return evaluation
def ml_er(config: Configuration = None):
indicators = er_process(config)
output_path = er_output_dir + r"\eval_result.txt"
with open(output_path, 'w') as f_:
for key, value in indicators:
f_.write(key + " : " + value)
f_.write('\n')
# if __name__ == '__main__':
# if os.path.isfile(hpo_output_dir + r"\incumbent.json"):
# with open(hpo_output_dir + r"\configspace.json", 'r') as f:
# dict_configspace = json.load(f)
# str_configspace = json.dumps(dict_configspace)
# configspace = csj.read(str_configspace)
# with open(hpo_output_dir + r"\incumbent.json", 'r') as f:
# dic = json.load(f)
# configuration = ConfigSpace.Configuration(configspace, values=dic)
# ml_er(configuration)
if __name__ == '__main__':
with open(hpo_output_dir + r"\configspace.json", 'r') as f:
dict_configspace = json.load(f)
str_configspace = json.dumps(dict_configspace)
configspace = csj.read(str_configspace)
with open(hpo_output_dir + r"\fuck.json", 'r') as f:
dic = json.load(f)
configuration = ConfigSpace.Configuration(configspace, values=dic)
ml_er(configuration)

@ -0,0 +1,160 @@
import os
import sys
import pandas as pd
import networkx
from networkx import draw, Graph
import pyjedai
from pyjedai.utils import (
text_cleaning_method,
print_clusters,
print_blocks,
print_candidate_pairs
)
from pyjedai.block_building import (
StandardBlocking,
QGramsBlocking,
ExtendedQGramsBlocking,
SuffixArraysBlocking,
ExtendedSuffixArraysBlocking,
)
from pyjedai.comparison_cleaning import (
WeightedEdgePruning,
WeightedNodePruning,
CardinalityEdgePruning,
CardinalityNodePruning,
BLAST,
ReciprocalCardinalityNodePruning,
ReciprocalWeightedNodePruning,
ComparisonPropagation
)
from pyjedai.evaluation import Evaluation
from pyjedai.datamodel import Data
from pyjedai.block_cleaning import BlockPurging
from pyjedai.block_cleaning import BlockFiltering
from pyjedai.matching import EntityMatching
from pyjedai.clustering import ConnectedComponentsClustering, UniqueMappingClustering
from settings import er_output_dir, ltable_path, rtable_path, ltable_id, rtable_id, mapping_lid, mapping_rid, \
mapping_path
def example():
# read data
d1 = pd.read_csv(r"E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\abt.csv",
sep='|', engine='python', na_filter=False)
d2 = pd.read_csv(r"E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\buy.csv",
sep='|', engine='python', na_filter=False)
gt = pd.read_csv(r"E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\gt.csv",
sep='|', engine='python')
data = Data(dataset_1=d1,
id_column_name_1='id',
dataset_2=d2,
id_column_name_2='id',
ground_truth=gt)
# clean data(optional)
data.clean_dataset(remove_stopwords=False,
remove_punctuation=False,
remove_numbers=False,
remove_unicodes=False)
# block building
bb = StandardBlocking()
blocks = bb.build_blocks(data, attributes_1=['name'], attributes_2=['name'])
# block purging(optional)
bp = BlockPurging()
cleaned_blocks = bp.process(blocks, data, tqdm_disable=False)
# block cleaning(optional)
# todo ratio
bf = BlockFiltering(ratio=0.8)
filtered_blocks = bf.process(cleaned_blocks, data, tqdm_disable=False)
# Comparison Cleaning - Meta Blocking(optional)
# todo meta_blocking methods, weighting_scheme (more)
mb = WeightedEdgePruning(weighting_scheme='EJS')
candidate_pairs_blocks = mb.process(filtered_blocks, data, tqdm_disable=True)
# entity matching
# todo parameters(qgram, similarity_threshold)
em = EntityMatching(
metric='cosine',
tokenizer='char_tokenizer',
vectorizer='tfidf',
qgram=3,
similarity_threshold=0.0
)
# 无向权重图可视化
pairs_graph = em.predict(candidate_pairs_blocks, data, tqdm_disable=True)
draw(pairs_graph)
# entity clustering
# todo similarity_threshold
ccc = UniqueMappingClustering()
clusters = ccc.process(pairs_graph, data, similarity_threshold=0.17)
result_df = ccc.export_to_df(clusters)
p = er_output_dir + r'\result.csv'
result_df.to_csv(p, sep=',', index=False, header=True, quoting=1)
_ = ccc.evaluate(clusters)
return result_df
if __name__ == '__main__':
rdf = example()
rdf = rdf.astype(str)
ltable = pd.read_csv(ltable_path, sep='|', engine='python', na_filter=False)
rtable = pd.read_csv(rtable_path, sep='|', engine='python', na_filter=False)
mapping = pd.read_csv(mapping_path, sep='|', engine='python')
ltable = ltable.astype(str)
rtable = rtable.astype(str)
mapping = mapping.astype(str)
lcolumns_dict = {}
rcolumns_dict = {}
ltable_attrs = ltable.columns.values.tolist()
rtable_attrs = rtable.columns.values.tolist()
for _ in ltable_attrs:
lcolumns_dict[_] = 'ltable_' + _
for _ in rtable_attrs:
rcolumns_dict[_] = 'rtable_' + _
result_lid_list = rdf['id1'].tolist()
selected_ltable = ltable[ltable[ltable_id].isin(result_lid_list)]
selected_ltable = selected_ltable.rename(columns=lcolumns_dict)
selected_ltable['key'] = 1
result_rid_list = rdf['id2'].tolist()
selected_rtable = rtable[rtable[rtable_id].isin(result_rid_list)]
selected_rtable = selected_rtable.rename(columns=rcolumns_dict)
selected_rtable['key'] = 1
predictions = pd.merge(selected_ltable, selected_rtable, on='key')
predictions.drop(columns='key', inplace=True)
predictions = predictions.reset_index(drop=True)
predictions['gold'] = '0'
predictions['predicted'] = '0'
gold_match_rows = []
predicted_match_rows = []
for tuple_ in predictions.itertuples():
lid = getattr(tuple_, 'ltable_' + ltable_id)
map_row = mapping[mapping[mapping_lid] == lid]
result_row = rdf[rdf['id1'] == lid]
if map_row is not None:
rid = map_row[mapping_rid]
for value in rid:
if value == getattr(tuple_, 'rtable_' + rtable_id):
gold_match_rows.append(tuple_[0])
if result_row is not None:
rid = result_row['id2']
for value in rid:
if value == getattr(tuple_, 'rtable_' + rtable_id):
predicted_match_rows.append(tuple_[0])
for _ in gold_match_rows:
predictions.loc[_, 'gold'] = '1'
for _ in predicted_match_rows:
predictions.loc[_, 'predicted'] = '1'
predictions['confidence'] = 0
predicted_match = predictions[predictions['predicted'] == '1']
print(1)

@ -0,0 +1,22 @@
from sentence_transformers import SentenceTransformer
ltable_path = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\abt.csv'
rtable_path = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\buy.csv'
mapping_path = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\gt.csv'
mapping_lid = 'D1' # mapping表中左表id名
mapping_rid = 'D2' # mapping表中右表id名
ltable_id = 'id' # 左表id字段名称
rtable_id = 'id' # 右表id字段名称
target_attr = 'id' # 进行md挖掘时的目标字段
# lr_attrs_map = {} # 如果两个表中存在对应字段名称不一样的情况,将名称加入列表便于调整一致
model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens')
interpre_weight = 0.5 # 可解释性权重
similarity_threshold = 0.1
support_threshold = 1
confidence_threshold = 0.25
er_output_dir = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\ml_er\output'
md_output_dir = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\md_discovery\output'
hpo_output_dir = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\hpo\output'
Loading…
Cancel
Save