From 253eb0835f645e8defee56d2728e10cc3a9a2c0a Mon Sep 17 00:00:00 2001
From: HuangJintao <1447537163@qq.com>
Date: Sat, 3 Feb 2024 00:36:00 +0800
Subject: [PATCH] fuck

---
 .gitignore                  |   4 +
 README.md                   |   2 -
 draw/__init__.py            |   0
 hpo/__init__.py             |   0
 hpo/er_model_hpo.py         |  93 +++++++++
 md_discovery/__init__.py    |   0
 md_discovery/md_discover.py | 143 ++++++++++++++
 ml_er/__init__.py           |   0
 ml_er/ml_entity_resolver.py | 364 ++++++++++++++++++++++++++++++++++++
 ml_er/new.py                | 160 ++++++++++++++++
 settings.py                 |  22 +++
 11 files changed, 786 insertions(+), 2 deletions(-)
 create mode 100644 .gitignore
 delete mode 100644 README.md
 create mode 100644 draw/__init__.py
 create mode 100644 hpo/__init__.py
 create mode 100644 hpo/er_model_hpo.py
 create mode 100644 md_discovery/__init__.py
 create mode 100644 md_discovery/md_discover.py
 create mode 100644 ml_er/__init__.py
 create mode 100644 ml_er/ml_entity_resolver.py
 create mode 100644 ml_er/new.py
 create mode 100644 settings.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..241c07c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+/ml_er/output/*
+/md_discovery/output/*
+/hpo/output/*
+/datasets/*
\ No newline at end of file
diff --git a/README.md b/README.md
deleted file mode 100644
index 9cba231..0000000
--- a/README.md
+++ /dev/null
@@ -1,2 +0,0 @@
-# matching_dependency_pyJedAI
-
diff --git a/draw/__init__.py b/draw/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/hpo/__init__.py b/hpo/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/hpo/er_model_hpo.py b/hpo/er_model_hpo.py
new file mode 100644
index 0000000..13ee960
--- /dev/null
+++ b/hpo/er_model_hpo.py
@@ -0,0 +1,93 @@
+import pandas as pd
+import json
+from time import *
+from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
+from ConfigSpace.read_and_write import json as csj
+from smac import HyperparameterOptimizationFacade, Scenario
+from settings import *
+from ml_er.ml_entity_resolver import er_process
+from settings import ltable_path, ltable_id
+
+
+class Classifier:
+    @property
+    def configspace(self) -> ConfigurationSpace:
+        cs = ConfigurationSpace(seed=0)
+        ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1', sep='|')
+        selected_attrs = ltable.columns.values.tolist()
+        block_attr_items = selected_attrs[:]
+        block_attr_items.remove(ltable_id)
+
+        jed_blocker = Categorical("jed_blocker",
+                                  ["Standard", "QGrams", "ExtendedQG", "SuffixArrays", "ExtendedSA"])
+        block_attr = Categorical("block_attr", block_attr_items)
+        # filter_ratio = Float("filter_ratio", (0.0, 1.0), default=0.8)
+        meta_blocker = Categorical("meta_blocker",
+                                   ["WEP", "WNP", "CEP", "CNP", "BLAST", "RCNP", "RWNP", "CP"])
+        weighting_scheme = Categorical("weighting_scheme",
+                                       ['CN-CBS', 'CBS', 'SN-CBS', 'CNC', 'SNC', 'SND', 'CND', 'CNJ', 'SNJ',
+                                        'COSINE', 'DICE', 'ECBS', 'JS', 'EJS', 'X2'])
+
+        # todo other hyperparameters
+        matching_metric = Categorical("matching_metric",
+                                      ['cosine', 'euclidean'])
+        matching_tokenizer = Categorical("matching_tokenizer",
+                                         ['char_tokenizer', 'word_tokenizer', 'white_space_tokenizer'])
+        matching_vectorizer = Categorical("matching_vectorizer",
+                                          ['tfidf', 'tf', 'boolean'])
+        clusteror = Categorical("clusteror_name",
+                                ["CCC", "UMC", "EC", "CenterC", "BMC", "MCC", "CC", "CTC", "MCL", "KMAC", "RSRC"])
+
+        cs.add_hyperparameters([jed_blocker, block_attr, meta_blocker, weighting_scheme, matching_metric,
+                                matching_tokenizer, matching_vectorizer, clusteror])
+        return cs
+
+    def train(self, config: Configuration, seed: int = 0) -> float:
+        indicators = er_process(config)
+        return 1 - indicators['performance']
+
+
+def ml_er_hpo():
+    classifier = Classifier()
+    cs = classifier.configspace
+    str_configspace = csj.write(cs)
+    dict_configspace = json.loads(str_configspace)
+    with open(hpo_output_dir + r"\configspace.json", "w") as f:
+        json.dump(dict_configspace, f, indent=4)
+
+    scenario = Scenario(
+        cs,
+        deterministic=True,
+        n_trials=50,  # We want to run max 50 trials (combination of config and seed)
+        n_workers=1
+    )
+
+    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
+
+    smac = HyperparameterOptimizationFacade(
+        scenario,
+        classifier.train,
+        initial_design=initial_design,
+        overwrite=True,  # If the run exists, we overwrite it; alternatively, we can continue from last state
+    )
+
+    incumbent = smac.optimize()
+    incumbent_cost = smac.validate(incumbent)
+    default = cs.get_default_configuration()
+    default_cost = smac.validate(default)
+    print(f"Default Cost: {default_cost}")
+    print(f"Incumbent Cost: {incumbent_cost}")
+
+    if incumbent_cost > default_cost:
+        incumbent = default
+        print(f"Updated Incumbent Cost: {default_cost}")
+
+    print(f"Optimized Configuration:{incumbent.values()}")
+
+    with open(hpo_output_dir + r"\incumbent.json", "w") as f:
+        json.dump(dict(incumbent), f, indent=4)
+    return incumbent
+
+
+if __name__ == '__main__':
+    ml_er_hpo()
diff --git a/md_discovery/__init__.py b/md_discovery/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/md_discovery/md_discover.py b/md_discovery/md_discover.py
new file mode 100644
index 0000000..ee3fe69
--- /dev/null
+++ b/md_discovery/md_discover.py
@@ -0,0 +1,143 @@
+import math
+import operator
+
+import numpy as np
+import pandas as pd
+import torch
+from ConfigSpace import Configuration
+from settings import *
+import random
+from tqdm import tqdm
+
+
+sample_number = 100000
+step_length = 0.01
+
+
+def md_discover(config: Configuration, source_path, target_path):
+    mds_list = discover(source_path, target_attr)
+    if len(mds_list) > 0:
+        with open(target_path, 'w') as f:
+            for _ in mds_list:
+                f.write('Target:'+str(target_attr) + '\t')
+                f.write(str(_))
+                f.write('\n')
+
+
+def discover(path, target_col):
+    data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
+    data.fillna("", inplace=True)
+    data = data.astype(str)
+    columns = data.columns.values.tolist()
+    target_index = columns.index(target_col)
+    cols_but_target = list(set(columns) - {target_col})
+    length = data.shape[0]
+    width = data.shape[1]
+    # indices of every column except the target column
+    columns_indices = [_ for _ in range(0, width) if _ != target_index]
+
+    sentences = []
+    for col in range(0, width):
+        for row in range(0, length):
+            cell_value = data.values[row, col]
+            sentences.append(cell_value)
+    if len(sentences) == 0:
+        return []
+    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
+    split_embedding = torch.split(embedding, length, dim=0)
+    table_tensor = torch.stack(split_embedding, dim=0, out=None)
+    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
+    sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
+    sim_tensor = sim_tensor.float()
+    sim_tensor = torch.round(sim_tensor, decimals=4)
+
+    # with fewer than 6 columns a full Cartesian product is feasible; with more it may blow up exponentially
+    if width < 6:
+        # enumerate every candidate threshold of each non-target column and take the Cartesian product, i.e. all possible MD threshold combinations
+        cartesian = build_cartesian(width, target_index)
+        # sample sample_number / (width - 1) MDs (no -1 entries)
+        if cartesian.shape[0] > sample_number / (width - 1):
+            index = torch.tensor(random.sample(range(cartesian.shape[0]), math.ceil(sample_number / (width - 1))), device='cuda')
+            cartesian = torch.index_select(cartesian, 0, index)
+    else:
+        # randomly generate sample_number / (width - 1) MDs (no -1 entries); use randint on integers first, then divide to obtain decimals
+        cartesian = torch.randint(int(similarity_threshold * 100), 1 * 100,
+                                  (math.ceil(sample_number / (width - 1)), width - 1), device='cuda')
+        cartesian = cartesian / 100
+        # build a column of similarity 1 for the target attribute and insert it at the target index
+        ones = torch.ones((math.ceil(sample_number / (width - 1)), 1), device='cuda')
+        cartesian = torch.cat((cartesian[:, 0:target_index], ones, cartesian[:, target_index:]), 1)
+    cartesian = torch.round(cartesian, decimals=4)
+    # this tensor will be concatenated with the variants whose columns are set to -1
+    joint_md_tensor = cartesian.clone()
+    # randomly set 1 column, 2 columns, ... to -1
+    for i in range(width - 2):
+        index_list_format = []
+        for j in range(cartesian.shape[0]):
+            # for each MD, randomly pick the column indices that will be set to -1
+            index_list_format.append(random.sample(columns_indices, i + 1))
+        index = torch.tensor(index_list_format, device='cuda')
+        # the MD set after the chosen columns have been set to -1
+        modified_cartesian = cartesian.scatter(1, index, -1)
+        joint_md_tensor = torch.cat((joint_md_tensor, modified_cartesian), 0)
+
+    md_list = []
+    # get_metric_tensor(cartesian, data, sim_tensor, target_index)
+    for _ in tqdm(range(joint_md_tensor.shape[0])):
+        s, c = get_metrics(joint_md_tensor[_], data, sim_tensor, target_index)
+        if s >= support_threshold and c >= confidence_threshold:
+            md_list_format = [round(i, 4) for i in joint_md_tensor[_].tolist()]
+            md_dict_format = {}
+            for k in range(0, width):
+                md_dict_format[columns[k]] = md_list_format[k]
+            md_list.append((md_dict_format, s, c))
+    md_list.sort(key=operator.itemgetter(2), reverse=True)
+    return md_list
+
+def get_metrics(md_tensor, data, sim_tensor, target_index):
+    length = data.shape[0]
+    width = data.shape[1]
+
+    # md_tensor = list(current_md.values())
+    # md_tensor = torch.tensor(md_tensor, device='cuda')
+    md_tensor_2d = md_tensor.unsqueeze(1)
+    md_tensor_3d = md_tensor_2d.unsqueeze(2)
+    md_tensor_3d = md_tensor_3d.repeat(1, length, length)
+
+    sim_tensor = torch.round(sim_tensor, decimals=4)
+
+    sup_tensor = torch.ge(sim_tensor, md_tensor_3d)
+    ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
+    for i in range(0, width):
+        if i != target_index:
+            sup_tensor_slice = sup_tensor[i]
+            ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
+    sup_tensor_int = ini_slice.int()
+    support_Naumann = torch.count_nonzero(sup_tensor_int).item()
+    support_Naumann = (support_Naumann - length) / 2
+
+    conf_tensor = torch.logical_and(ini_slice, sup_tensor[target_index])
+    conf_tensor_int = conf_tensor.int()
+    support_Fan = torch.count_nonzero(conf_tensor_int).item()
+    support_Fan = (support_Fan - length) / 2
+    confidence = support_Fan / support_Naumann if support_Naumann > 0 else 0
+
+    return support_Fan, confidence
+
+
+def build_cartesian(width, target_index):
+    all_values_array = np.linspace(start=similarity_threshold, stop=1, endpoint=True,
+                                   num=math.ceil((1-similarity_threshold)/step_length) + 1)
+    all_values_array = np.round(all_values_array, 4)
+    all_values_tensor = torch.tensor(all_values_array, device='cuda')
+    all_values_tensor = all_values_tensor.float()
+    all_values_tensor = torch.round(all_values_tensor, decimals=4)
+    tensors_for_cartesian = []
+    for i in range(0, width):
+        if i == target_index:
+            t = torch.tensor([1.0], device='cuda')
+            tensors_for_cartesian.append(t)
+        else:
+            tensors_for_cartesian.append(all_values_tensor)
+    result = torch.cartesian_prod(*tensors_for_cartesian)
+    return result
diff --git a/ml_er/__init__.py b/ml_er/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/ml_er/ml_entity_resolver.py b/ml_er/ml_entity_resolver.py
new file mode 100644
index 0000000..a109e22
--- /dev/null
+++ b/ml_er/ml_entity_resolver.py
@@ -0,0 +1,364 @@
+import json
+import os
+from time import time
+
+import ConfigSpace
+import pandas as pd
+import torch
+from ConfigSpace import Configuration
+from ConfigSpace.read_and_write import json as csj
+from pyjedai.clustering import UniqueMappingClustering
+from tqdm import tqdm
+
+from md_discovery.md_discover import md_discover
+from settings import *
+from pyjedai.datamodel import Data
+from pyjedai.block_cleaning import BlockPurging
+from pyjedai.block_cleaning import BlockFiltering
+from pyjedai.matching import EntityMatching
+from pyjedai.block_building import (
+    StandardBlocking,
+    QGramsBlocking,
+    ExtendedQGramsBlocking,
+    SuffixArraysBlocking,
+    ExtendedSuffixArraysBlocking,
+)
+from pyjedai.comparison_cleaning import (
+    WeightedEdgePruning,
+    WeightedNodePruning,
+    CardinalityEdgePruning,
+    CardinalityNodePruning,
+    BLAST,
+    ReciprocalCardinalityNodePruning,
+    ReciprocalWeightedNodePruning,
+    ComparisonPropagation
+)
+from pyjedai.clustering import *
+
+
+def prepare_file_for_md_discovery(train: pd.DataFrame, t_single_tuple_path=er_output_dir + r"\t_single_tuple.csv"):
+    # keep only the tuple pairs that actually match
+    df = train[train['gold'] == '1']
+    # copy the left-table id onto the right-table id column
+    for index, row in df.iterrows():
+        df.loc[index, "rtable_" + rtable_id] = row["ltable_" + ltable_id]
+
+    train_columns = train.columns.values.tolist()
+    l_columns = []
+    r_columns = []
+    cols = []
+    # collect the left-table and right-table column names into separate lists
+    for _ in train_columns:
+        if _.startswith('ltable'):
+            l_columns.append(_)
+        elif _.startswith('rtable'):
+            r_columns.append(_)
+    # strip the prefix from the left-table column names to get a unified column list (assumes matching columns share names across the two tables)
+    for _ in l_columns:
+        cols.append(_.replace('ltable_', ''))
+
+    ldf = df[l_columns]
+    rdf = df[r_columns]
+    ldf.columns = cols
+    rdf.columns = cols
+    t_single_tuple = pd.concat([ldf, rdf])
+    t_single_tuple = t_single_tuple.reset_index(drop=True)
+
+    t_single_tuple.to_csv(t_single_tuple_path, sep=',', index=False, header=True, quoting=1)
+
+
+# build a dict: key = attribute name, value = 1-D tensor with the similarity between the left and right values of that attribute for every row of the prediction table
+def build_col_pairs_sim_tensor_dict(predictions: pd.DataFrame):
+    predictions_attrs = predictions.columns.values.tolist()
+    col_tuple_list = []
+    for _ in predictions_attrs:
+        if _.startswith('ltable'):
+            left_index = predictions_attrs.index(_)
+            right_index = predictions_attrs.index(_.replace('ltable_', 'rtable_'))
+            col_tuple_list.append((left_index, right_index))
+
+    df = predictions.drop(columns=['gold', 'predicted', 'confidence'], inplace=False)
+    length = df.shape[0]
+    width = df.shape[1]
+    sentences = []
+    for col in range(0, width):
+        for row in range(0, length):
+            cell_value = df.values[row, col]
+            sentences.append(cell_value)
+    if len(sentences) == 0:
+        return {}
+    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
+    split_embedding = torch.split(embedding, length, dim=0)
+    table_tensor = torch.stack(split_embedding, dim=0, out=None)
+    # normalized embedding tensor of the prediction table
+    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
+    sim_tensor_dict = {}
+    for col_tuple in col_tuple_list:
+        lattr_tensor = norm_table_tensor[col_tuple[0]]
+        rattr_tensor = norm_table_tensor[col_tuple[1]]
+        mul_tensor = lattr_tensor * rattr_tensor
+        sim_tensor = torch.sum(mul_tensor, 1)
+        sim_tensor = torch.round(sim_tensor, decimals=4)
+        sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor
+    return sim_tensor_dict
+
+
+def load_mds(paths: list) -> list:
+    if len(paths) == 0:
+        return []
+    all_mds = []
+    # iterate over the given list of MD file paths
+    for md_path in paths:
+        if not os.path.exists(md_path):
+            continue
+        mds = []
+        # open each MD file
+        with open(md_path, 'r') as f_:
+            # read the MD on each line and append it to this file's MD list
+            for line in f_.readlines():
+                md_metadata = line.strip().split('\t')
+                # todo: update this if the MD file format changes
+                md = eval(md_metadata[1])
+                mds.append(md)
+        all_mds.extend(mds)
+    return all_mds
+
+
+def is_explicable(row, all_mds: list, st_dict):
+    attrs = all_mds[0][0].keys()  # read all attribute names from the MD dict of the first md_tuple
+    for md_tuple in all_mds:
+        explicable = True  # assume this MD can explain the current tuple pair
+        for a in attrs:
+            if a != target_attr:
+                if st_dict[a][row[0]].item() < md_tuple[0][a]:
+                    explicable = False  # if any attribute's similarity is below the threshold, this MD cannot explain the pair
+                    break  # skip this MD's remaining thresholds and move on to the next MD
+        if explicable:
+            return md_tuple[2]  # return as soon as any MD explains the pair
+    return -1.0  # no MD explains the pair
+
+
+def er_process(config: Configuration):
+    print(f'\033[33mConfig: {config}\033[0m')
+    start = time()
+    ltable = pd.read_csv(ltable_path, sep='|', engine='python', na_filter=False)
+    rtable = pd.read_csv(rtable_path, sep='|', engine='python', na_filter=False)
+    mapping = pd.read_csv(mapping_path, sep='|', engine='python')
+    data = Data(dataset_1=ltable,
+                id_column_name_1=ltable_id,
+                dataset_2=rtable,
+                id_column_name_2=rtable_id,
+                ground_truth=mapping)
+
+    # clean data(optional)
+    data.clean_dataset(remove_stopwords=False,
+                       remove_punctuation=False,
+                       remove_numbers=False,
+                       remove_unicodes=False)
+
+    # block building
+    blocker_name = config["jed_blocker"]
+    block_attribute = config["block_attr"]
+    match blocker_name:
+        case "Standard":
+            blocker = StandardBlocking()
+        case "QGrams":
+            blocker = QGramsBlocking()
+        case "ExtendedQG":
+            blocker = ExtendedQGramsBlocking()
+        case "SuffixArrays":
+            blocker = SuffixArraysBlocking()
+        case "ExtendedSA":
+            blocker = ExtendedSuffixArraysBlocking()
+    blocks = blocker.build_blocks(data, attributes_1=[block_attribute], attributes_2=[block_attribute])
+
+    # block purging(optional)
+    bp = BlockPurging()
+    cleaned_blocks = bp.process(blocks, data, tqdm_disable=False)
+
+    # block cleaning(optional)
+    bf = BlockFiltering(ratio=0.8)  # todo what is ratio for?
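+    # note: the ratio appears to control how many of each entity's smallest blocks BlockFiltering retains
+    # (0.8 keeps roughly the smallest 80%); lower values prune candidate comparisons more aggressively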
+    filtered_blocks = bf.process(cleaned_blocks, data, tqdm_disable=False)
+
+    # Comparison Cleaning - Meta Blocking(optional)
+    pruning_method = config["meta_blocker"]
+    weighting_scheme_name = config["weighting_scheme"]
+    match pruning_method:
+        case "WEP":
+            meta_blocker = WeightedEdgePruning(weighting_scheme=weighting_scheme_name)
+        case "WNP":
+            meta_blocker = WeightedNodePruning(weighting_scheme=weighting_scheme_name)
+        case "CEP":
+            meta_blocker = CardinalityEdgePruning(weighting_scheme=weighting_scheme_name)
+        case "CNP":
+            meta_blocker = CardinalityNodePruning(weighting_scheme=weighting_scheme_name)
+        case "BLAST":
+            meta_blocker = BLAST(weighting_scheme=weighting_scheme_name)
+        case "RCNP":
+            meta_blocker = ReciprocalCardinalityNodePruning(weighting_scheme=weighting_scheme_name)
+        case "RWNP":
+            meta_blocker = ReciprocalWeightedNodePruning(weighting_scheme=weighting_scheme_name)
+        case "CP":
+            meta_blocker = ComparisonPropagation()
+    candidate_pairs_blocks = meta_blocker.process(blocks=filtered_blocks, data=data, tqdm_disable=True)
+
+    # entity matching
+    em = EntityMatching(
+        metric=config["matching_metric"],
+        tokenizer=config["matching_tokenizer"],
+        vectorizer=config["matching_vectorizer"],
+        qgram=3,
+        similarity_threshold=0.0
+    )
+
+    pairs_graph = em.predict(candidate_pairs_blocks, data, tqdm_disable=True)
+    # draw(pairs_graph)
+
+    # entity clustering
+    clusteror_name = config["clusteror_name"]
+    match clusteror_name:
+        case "CCC":
+            clusteror = ConnectedComponentsClustering()
+        case "UMC":
+            clusteror = UniqueMappingClustering()
+        case "EC":
+            clusteror = ExactClustering()
+        case "CenterC":
+            clusteror = CenterClustering()
+        case "BMC":
+            clusteror = BestMatchClustering()
+        case "MCC":
+            clusteror = MergeCenterClustering()
+        case "CC":
+            clusteror = CorrelationClustering()
+        case "CTC":
+            clusteror = CutClustering()
+        case "MCL":
+            clusteror = MarkovClustering()
+        case "KMAC":
+            clusteror = KiralyMSMApproximateClustering()
+        case "RSRC":
+            clusteror = RicochetSRClustering()
+    # obtain the predicted clusters and evaluation metrics
+    clusters = clusteror.process(pairs_graph, data, similarity_threshold=0.17)
+    matches_dataframe = clusteror.export_to_df(clusters)
+    matches_dataframe_path = er_output_dir + r'\matches_dataframe.csv'
+    matches_dataframe.to_csv(matches_dataframe_path, sep=',', index=False, header=True, quoting=1)
+    evaluation = clusteror.evaluate(clusters)
+    # evaluation.pop('True Positives')
+    # evaluation.pop('False Positives')
+    # evaluation.pop('True Negatives')
+    # evaluation.pop('False Negatives')
+
+
+    ltable = ltable.astype(str)
+    rtable = rtable.astype(str)
+    mapping = mapping.astype(str)
+    result_df = matches_dataframe.astype(str)
+    lcolumns_dict = {}
+    rcolumns_dict = {}
+    ltable_attrs = ltable.columns.values.tolist()
+    rtable_attrs = rtable.columns.values.tolist()
+    for _ in ltable_attrs:
+        lcolumns_dict[_] = 'ltable_' + _
+    for _ in rtable_attrs:
+        rcolumns_dict[_] = 'rtable_' + _
+    result_lid_list = result_df['id1'].tolist()
+    selected_ltable = ltable[ltable[ltable_id].isin(result_lid_list)]
+    selected_ltable = selected_ltable.rename(columns=lcolumns_dict)
+    selected_ltable['key'] = 1
+    result_rid_list = result_df['id2'].tolist()
+    selected_rtable = rtable[rtable[rtable_id].isin(result_rid_list)]
+    selected_rtable = selected_rtable.rename(columns=rcolumns_dict)
+    selected_rtable['key'] = 1
+    predictions = pd.merge(selected_ltable, selected_rtable, on='key')
+    predictions.drop(columns='key', inplace=True)
+    predictions = predictions.reset_index(drop=True)
+
+    predictions['gold'] = '0'
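+    # both flag columns default to '0'; the loop below sets 'gold' to '1' for pairs present in the
+    # ground-truth mapping and 'predicted' to '1' for pairs returned by the clustering step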
+    predictions['predicted'] = '0'
+    gold_match_rows = []
+    predicted_match_rows = []
+    for tuple_ in tqdm(predictions.itertuples()):
+        lid = getattr(tuple_, 'ltable_' + ltable_id)
+        map_row = mapping[mapping[mapping_lid] == lid]
+        result_row = result_df[result_df['id1'] == lid]
+        if not map_row.empty:
+            rid = map_row[mapping_rid]
+            for value in rid:
+                if value == getattr(tuple_, 'rtable_' + rtable_id):
+                    gold_match_rows.append(tuple_[0])
+        if not result_row.empty:
+            rid = result_row['id2']
+            for value in rid:
+                if value == getattr(tuple_, 'rtable_' + rtable_id):
+                    predicted_match_rows.append(tuple_[0])
+    for _ in gold_match_rows:
+        predictions.loc[_, 'gold'] = '1'
+    for _ in predicted_match_rows:
+        predictions.loc[_, 'predicted'] = '1'
+
+    # mine MDs and compute interpretability
+    prepare_file_for_md_discovery(predictions)
+    predictions['confidence'] = 0
+    predicted_match = predictions[predictions['predicted'] == '1']
+    predicted_match = predicted_match.reset_index(drop=True)
+    sim_tensor_dict = build_col_pairs_sim_tensor_dict(predicted_match)
+
+    md_discover(config, er_output_dir + r"\t_single_tuple.csv", md_output_dir + r"\mds.txt")
+    md_paths = [md_output_dir + r'\mds.txt']
+    md_list = load_mds(md_paths)  # load all mined MDs from the output file
+    epl_match = 0  # count of explainable predicted matches
+
+    if sim_tensor_dict:
+        if len(md_list) > 0:
+            for row in tqdm(predicted_match.itertuples()):
+                x = is_explicable(row, md_list, sim_tensor_dict)
+                if x > 0:
+                    predicted_match.loc[row[0], 'confidence'] = x
+                    epl_match += 1
+        interpretability = epl_match / len(predicted_match)  # interpretability score
+        evaluation['interpretability'] = interpretability
+    else:
+        interpretability = 0
+        evaluation['interpretability'] = interpretability
+    f1 = evaluation['F1 %'] / 100
+    performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
+    evaluation['performance'] = performance
+    print(f'\033[33mEvaluation: {evaluation}\033[0m')
+    predicted_match.to_csv(er_output_dir + r'\predicted_match.csv', sep=',', index=False, header=True)
+    ################################################################################################################
+    print(f'\033[33mTime consumed by ML-ER in seconds: {time() - start}\033[0m')
+
+    return evaluation
+
+
+def ml_er(config: Configuration = None):
+    indicators = er_process(config)
+    output_path = er_output_dir + r"\eval_result.txt"
+    with open(output_path, 'w') as f_:
+        for key, value in indicators.items():
+            f_.write(key + " : " + str(value))
+            f_.write('\n')
+
+
+# if __name__ == '__main__':
+#     if os.path.isfile(hpo_output_dir + r"\incumbent.json"):
+#         with open(hpo_output_dir + r"\configspace.json", 'r') as f:
+#             dict_configspace = json.load(f)
+#         str_configspace = json.dumps(dict_configspace)
+#         configspace = csj.read(str_configspace)
+#         with open(hpo_output_dir + r"\incumbent.json", 'r') as f:
+#             dic = json.load(f)
+#         configuration = ConfigSpace.Configuration(configspace, values=dic)
+#         ml_er(configuration)
+
+if __name__ == '__main__':
+    with open(hpo_output_dir + r"\configspace.json", 'r') as f:
+        dict_configspace = json.load(f)
+    str_configspace = json.dumps(dict_configspace)
+    configspace = csj.read(str_configspace)
+    with open(hpo_output_dir + r"\incumbent.json", 'r') as f:
+        dic = json.load(f)
+    configuration = ConfigSpace.Configuration(configspace, values=dic)
+    ml_er(configuration)
diff --git a/ml_er/new.py b/ml_er/new.py
new file mode 100644
index 0000000..17e4d89
--- /dev/null
+++ b/ml_er/new.py
@@ -0,0 +1,160 @@
+import os
+import sys
+import pandas as pd
+import networkx
+from networkx import draw, Graph
+import pyjedai
+from pyjedai.utils import (
+    text_cleaning_method,
+    print_clusters,
+    print_blocks,
+    print_candidate_pairs
+)
+from pyjedai.block_building import (
+    StandardBlocking,
+    QGramsBlocking,
+    ExtendedQGramsBlocking,
+    SuffixArraysBlocking,
+    ExtendedSuffixArraysBlocking,
+)
+from pyjedai.comparison_cleaning import (
+    WeightedEdgePruning,
+    WeightedNodePruning,
+    CardinalityEdgePruning,
+    CardinalityNodePruning,
+    BLAST,
+    ReciprocalCardinalityNodePruning,
+    ReciprocalWeightedNodePruning,
+    ComparisonPropagation
+)
+from pyjedai.evaluation import Evaluation
+from pyjedai.datamodel import Data
+from pyjedai.block_cleaning import BlockPurging
+from pyjedai.block_cleaning import BlockFiltering
+from pyjedai.matching import EntityMatching
+from pyjedai.clustering import ConnectedComponentsClustering, UniqueMappingClustering
+
+from settings import er_output_dir, ltable_path, rtable_path, ltable_id, rtable_id, mapping_lid, mapping_rid, \
+    mapping_path
+
+
+def example():
+    # read data
+    d1 = pd.read_csv(r"E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\abt.csv",
+                     sep='|', engine='python', na_filter=False)
+    d2 = pd.read_csv(r"E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\buy.csv",
+                     sep='|', engine='python', na_filter=False)
+    gt = pd.read_csv(r"E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\gt.csv",
+                     sep='|', engine='python')
+
+    data = Data(dataset_1=d1,
+                id_column_name_1='id',
+                dataset_2=d2,
+                id_column_name_2='id',
+                ground_truth=gt)
+
+    # clean data(optional)
+    data.clean_dataset(remove_stopwords=False,
+                       remove_punctuation=False,
+                       remove_numbers=False,
+                       remove_unicodes=False)
+
+    # block building
+    bb = StandardBlocking()
+    blocks = bb.build_blocks(data, attributes_1=['name'], attributes_2=['name'])
+
+    # block purging(optional)
+    bp = BlockPurging()
+    cleaned_blocks = bp.process(blocks, data, tqdm_disable=False)
+
+    # block cleaning(optional)
+    # todo ratio
+    bf = BlockFiltering(ratio=0.8)
+    filtered_blocks = bf.process(cleaned_blocks, data, tqdm_disable=False)
+
+    # Comparison Cleaning - Meta Blocking(optional)
+    # todo meta_blocking methods, weighting_scheme (more)
+    mb = WeightedEdgePruning(weighting_scheme='EJS')
+    candidate_pairs_blocks = mb.process(filtered_blocks, data, tqdm_disable=True)
+
+    # entity matching
+    # todo parameters(qgram, similarity_threshold)
+    em = EntityMatching(
+        metric='cosine',
+        tokenizer='char_tokenizer',
+        vectorizer='tfidf',
+        qgram=3,
+        similarity_threshold=0.0
+    )
+
+    # visualize the undirected weighted similarity graph
+    pairs_graph = em.predict(candidate_pairs_blocks, data, tqdm_disable=True)
+    draw(pairs_graph)
+
+    # entity clustering
+    # todo similarity_threshold
+    ccc = UniqueMappingClustering()
+    clusters = ccc.process(pairs_graph, data, similarity_threshold=0.17)
+    result_df = ccc.export_to_df(clusters)
+    p = er_output_dir + r'\result.csv'
+    result_df.to_csv(p, sep=',', index=False, header=True, quoting=1)
+    _ = ccc.evaluate(clusters)
+    return result_df
+
+
+if __name__ == '__main__':
+    rdf = example()
+    rdf = rdf.astype(str)
+    ltable = pd.read_csv(ltable_path, sep='|', engine='python', na_filter=False)
+    rtable = pd.read_csv(rtable_path, sep='|', engine='python', na_filter=False)
+    mapping = pd.read_csv(mapping_path, sep='|', engine='python')
+    ltable = ltable.astype(str)
+    rtable = rtable.astype(str)
+    mapping = mapping.astype(str)
+
+    lcolumns_dict = {}
+    rcolumns_dict = {}
+    ltable_attrs = ltable.columns.values.tolist()
+    rtable_attrs = rtable.columns.values.tolist()
+    for _ in ltable_attrs:
+        lcolumns_dict[_] = 'ltable_' + _
+    for _ in rtable_attrs:
+        rcolumns_dict[_] = 'rtable_' + _
+    result_lid_list = rdf['id1'].tolist()
+    selected_ltable = ltable[ltable[ltable_id].isin(result_lid_list)]
+    selected_ltable = selected_ltable.rename(columns=lcolumns_dict)
+    selected_ltable['key'] = 1
+    result_rid_list = rdf['id2'].tolist()
+    selected_rtable = rtable[rtable[rtable_id].isin(result_rid_list)]
+    selected_rtable = selected_rtable.rename(columns=rcolumns_dict)
+    selected_rtable['key'] = 1
+    predictions = pd.merge(selected_ltable, selected_rtable, on='key')
+    predictions.drop(columns='key', inplace=True)
+    predictions = predictions.reset_index(drop=True)
+
+    predictions['gold'] = '0'
+    predictions['predicted'] = '0'
+    gold_match_rows = []
+    predicted_match_rows = []
+    for tuple_ in predictions.itertuples():
+        lid = getattr(tuple_, 'ltable_' + ltable_id)
+        map_row = mapping[mapping[mapping_lid] == lid]
+        result_row = rdf[rdf['id1'] == lid]
+        if not map_row.empty:
+            rid = map_row[mapping_rid]
+            for value in rid:
+                if value == getattr(tuple_, 'rtable_' + rtable_id):
+                    gold_match_rows.append(tuple_[0])
+        if not result_row.empty:
+            rid = result_row['id2']
+            for value in rid:
+                if value == getattr(tuple_, 'rtable_' + rtable_id):
+                    predicted_match_rows.append(tuple_[0])
+    for _ in gold_match_rows:
+        predictions.loc[_, 'gold'] = '1'
+    for _ in predicted_match_rows:
+        predictions.loc[_, 'predicted'] = '1'
+
+    predictions['confidence'] = 0
+    predicted_match = predictions[predictions['predicted'] == '1']
+    print(1)
diff --git a/settings.py b/settings.py
new file mode 100644
index 0000000..1535028
--- /dev/null
+++ b/settings.py
@@ -0,0 +1,22 @@
+from sentence_transformers import SentenceTransformer
+
+ltable_path = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\abt.csv'
+rtable_path = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\buy.csv'
+mapping_path = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\gt.csv'
+mapping_lid = 'D1'  # name of the left-table id column in the mapping table
+mapping_rid = 'D2'  # name of the right-table id column in the mapping table
+ltable_id = 'id'  # id column name of the left table
+rtable_id = 'id'  # id column name of the right table
+target_attr = 'id'  # target attribute for MD discovery
+# lr_attrs_map = {}  # if corresponding columns are named differently in the two tables, record them here so they can be aligned
+
+model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens')
+interpre_weight = 0.5  # weight of interpretability in the combined objective
+similarity_threshold = 0.1
+support_threshold = 1
+confidence_threshold = 0.25
+
+er_output_dir = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\ml_er\output'
+md_output_dir = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\md_discovery\output'
+hpo_output_dir = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\hpo\output'
+
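
Usage sketch (illustrative only, not part of the committed files; it assumes the repository root is on PYTHONPATH, the dataset paths in settings.py exist, and a CUDA device is available for the embedding model):

    import json
    import ConfigSpace
    from ConfigSpace.read_and_write import json as csj

    from hpo.er_model_hpo import ml_er_hpo
    from ml_er.ml_entity_resolver import ml_er
    from settings import hpo_output_dir

    # run the SMAC search; this writes configspace.json and incumbent.json into hpo/output
    ml_er_hpo()

    # reload the saved configuration space and incumbent, then run ER once with that configuration
    with open(hpo_output_dir + r"\configspace.json", 'r') as f:
        configspace = csj.read(json.dumps(json.load(f)))
    with open(hpo_output_dir + r"\incumbent.json", 'r') as f:
        incumbent = ConfigSpace.Configuration(configspace, values=json.load(f))
    ml_er(incumbent)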