diff --git a/draw.py b/draw.py
deleted file mode 100644
index 83e4f0b..0000000
--- a/draw.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import os
-
-import pyecharts
-from pyecharts.charts import Line
-from pyecharts import options as opts
-from pyecharts.globals import ThemeType
-
-if __name__ == '__main__':
-    dir_path = r'E:\Data\Research\Outcome\Walmart-Amazon_dirty'
-    filename_list = os.listdir(dir_path)
-    iter_list = []
-    precision = []
-    recall = []
-    f1 = []
-    interpretability = []
-    performance = []
-    for _ in filename_list:
-        if _.startswith('eval_result'):
-            it = int(_[12:13])
-            iter_list.append(str(it))
-            with open(dir_path + '\\' + _, 'r') as f:
-                # Read each line of the file and collect the reported metric values
-                for line in f.readlines():
-                    if line.startswith('Precision'):
-                        lt = line.split(' ')
-                        value = float(lt[2].replace('%', ''))/100
-                        precision.append(value)
-                    elif line.startswith('Recall'):
-                        lt = line.split(' ')
-                        value = float(lt[2].replace('%', ''))/100
-                        recall.append(value)
-                    elif line.startswith('F1'):
-                        lt = line.split(' ')
-                        value = float(lt[2].replace('%', ''))/100
-                        f1.append(value)
-                    elif line.startswith('interpretability'):
-                        lt = line.split(':')
-                        value = float(lt[1])
-                        interpretability.append(value)
-                    elif line.startswith('performance'):
-                        lt = line.split(':')
-                        value = float(lt[1])
-                        performance.append(value)
-
-    line = (
-        Line(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
-        .add_xaxis(iter_list)
-        .add_yaxis('Precision', precision)
-        .add_yaxis('Recall', recall)
-        .add_yaxis('F1', f1)
-        .add_yaxis('Interpretability', interpretability)
-        .add_yaxis('Performance', performance)
-        .set_global_opts(title_opts=opts.TitleOpts(title=dir_path.split('\\')[-1]))
-    )
-    line.render(dir_path + '\\' + "line.html")
diff --git a/draw/__init__.py b/draw/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/draw/draw_confidence_histogram.py b/draw/draw_confidence_histogram.py
new file mode 100644
index 0000000..7eef5ed
--- /dev/null
+++ b/draw/draw_confidence_histogram.py
@@ -0,0 +1,54 @@
+import os
+import pandas as pd
+from pyecharts import options as opts
+from pyecharts.charts import Bar
+from pyecharts.faker import Faker
+from pyecharts.globals import ThemeType
+
+if __name__ == '__main__':
+    outcome_dir = r'E:\Data\Research\Outcome'
+    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens'
+    datasets_list = os.listdir(outcome_dir)
+
+    for _ in datasets_list:
+        path = outcome_dir + rf'\{_}' + configs_dir
+        statistics_files = os.listdir(path)
+        length = 0
+        for file in statistics_files:
+            if file.startswith('predictions'):
+                preds = pd.read_csv(path + rf'\{file}', encoding='ISO-8859-1')
+                preds = preds[['predicted', 'confidence']]
+                preds = preds.astype(float)
+                preds = preds[preds['predicted'] == 1.0]
+                length = len(preds)
+                li = []
+                zeros = len(preds[preds['confidence'] == 0])
+                dot_02 = len(preds[(preds['confidence'] > 0) & (preds['confidence'] <= 0.2)])
+                dot_24 = len(preds[(preds['confidence'] > 0.2) & (preds['confidence'] <= 0.4)])
+                dot_46 = len(preds[(preds['confidence'] > 0.4) & (preds['confidence'] <= 0.6)])
+                dot_68 = len(preds[(preds['confidence'] > 0.6) & (preds['confidence'] <= 0.8)])
+                dot_80 = len(preds[(preds['confidence'] > 0.8) & (preds['confidence'] <= 1.0)])
+                for number in [zeros, dot_02, dot_24, dot_46, dot_68, dot_80]:
+                    li.append(round(number * 100 / length, ndigits=3))
+
+                c = (
+                    Bar(init_opts=opts.InitOpts(theme=ThemeType.WALDEN))
+                    .add_xaxis(['conf=0', '0
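The new draw/draw_confidence_histogram.py buckets the confidence of predicted matches into six ranges by chaining boolean masks. A minimal sketch of the same bucketing using pandas.cut follows; the bucket labels and the helper name are assumptions, since the chart's original x-axis labels are not fully recoverable here.

import pandas as pd

# Sketch only: the six-bucket split done with pd.cut instead of chained masks.
# Bucket labels are assumed; adjust them to match the chart's actual x-axis.
def confidence_buckets(preds: pd.DataFrame) -> pd.Series:
    matches = preds.loc[preds['predicted'] == 1.0, 'confidence'].astype(float)
    bins = [-0.001, 0.0, 0.2, 0.4, 0.6, 0.8, 1.0]   # first bin captures confidence == 0 exactly
    labels = ['conf=0', '(0,0.2]', '(0.2,0.4]', '(0.4,0.6]', '(0.6,0.8]', '(0.8,1.0]']
    counts = pd.cut(matches, bins=bins, labels=labels).value_counts().sort_index()
    return (counts * 100 / len(matches)).round(3)   # percentages, as in the script above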
 ConfigurationSpace:
-        # Build Configuration Space which defines all parameters and their ranges
         cs = ConfigurationSpace(seed=0)
         ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
         selected_attrs = ltable.columns.values.tolist()
@@ -22,12 +21,8 @@ class Classifier:
         block_attr = Categorical("block_attr", block_attr_items)
         ml_matcher = Categorical("ml_matcher", ["dt", "svm", "rf", "lg", "ln", "nb"], default="rf")
         ml_blocker = Categorical("ml_blocker", ["over_lap", "attr_equiv"], default="over_lap")
-        similarity_thresh = Float("similarity_thresh", (0, 0.2), default=0.2)
-        support_thresh = Integer("support_thresh", (1, 5), default=1)
-        confidence_thresh = Float("confidence_thresh", (0.25, 0.5), default=0.25)
-        cs.add_hyperparameters([block_attr, ml_matcher, ml_blocker, similarity_thresh,
-                                support_thresh, confidence_thresh])
+        cs.add_hyperparameters([block_attr, ml_matcher, ml_blocker])

         return cs

     def train(self, config: Configuration, seed: int = 0) -> float:
@@ -47,8 +42,7 @@ def ml_er_hpo():
     scenario = Scenario(
         cs,
         deterministic=True,
-        n_trials=50,  # We want to run max 50 trials (combination of config and seed)
-        walltime_limit=28800,  # Max time limit in seconds (14400s = 4h)
+        n_trials=12,  # We want to run at most 12 trials (combinations of config and seed)
         n_workers=1
     )
diff --git a/md_discovery/discovery_executor.py b/md_discovery/discovery_executor.py
index 071b485..ba01a44 100644
--- a/md_discovery/discovery_executor.py
+++ b/md_discovery/discovery_executor.py
@@ -6,7 +6,7 @@ import copy
 import torch
 from ConfigSpace import Configuration
 from tqdm import tqdm
-from settings import model
+from settings import model, similarity_threshold, support_threshold, confidence_threshold


 def is_minimal(md, md_list, target_col):
@@ -36,11 +36,7 @@ def is_minimal(md, md_list, target_col):
     return minimal


-def pairs_inference(path, target_col, conf: Configuration):
-    simt = conf["similarity_thresh"]
-    # simt = round(simt, ndigits=3)
-    supt = conf["support_thresh"]
-    cont = conf["confidence_thresh"]
+def pairs_inference(path, target_col):
     data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
     data.fillna("", inplace=True)
     data = data.astype(str)
@@ -109,12 +105,12 @@ def pairs_inference(path, target_col):
         for vio_md in violated_mds:
             vio_md_support, vio_md_confidence = get_metrics(vio_md, data, sim_tensor, target_col, target_index)
-            if vio_md_support >= supt:
+            if vio_md_support >= support_threshold:
                 for col in cols_but_target:
                     if sims[col] < 1.0:
                         spec_l_md = copy.deepcopy(vio_md)
-                        if sims[col] < simt:
-                            spec_l_md[col] = simt
+                        if sims[col] < similarity_threshold:
+                            spec_l_md[col] = similarity_threshold
                         else:
                             if sims[col] + 0.01 <= 1.0:
                                 spec_l_md[col] = sims[col] + 0.01
@@ -134,7 +130,7 @@ def pairs_inference(path, target_col):
     if len(minimal_vio) > 0:
         for md in minimal_vio[:]:
             support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
-            if support >= supt and confidence >= cont:
+            if support >= support_threshold and confidence >= confidence_threshold:
                 minimal_vio.append((md, support, confidence))
                 minimal_vio.remove(md)
@@ -148,7 +144,7 @@ def pairs_inference(path, target_col):
     # Drop MDs whose support falls below the threshold
     for _ in md_list[:]:
         support, confidence = get_metrics(_, data, sim_tensor, target_col, target_index)
-        if support >= supt and confidence >= cont:
+        if support >= support_threshold and confidence >= confidence_threshold:
             md_list.append((_, support, confidence))
             md_list.remove(_)
     # Drop MDs that are not minimal
@@ -170,7 +166,6 @@
     result.extend(minimal_vio)
     result.sort(key=operator.itemgetter(2), reverse=True)
     print(f'\033[33mList Length: {len(result)}\033[0m')
-    print(f'\033[33mSupport: {supt}\tConfidence: {cont}\033[0m')
     return result
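Both discovery executors score a candidate MD (matching dependency) the same way: an MD assigns each column a similarity threshold, with the target column fixed at 1.0 and -1 meaning the column is unconstrained; support counts the tuple pairs whose non-target similarities all reach their thresholds, and confidence is the fraction of those pairs that also satisfy the target threshold. A naive, CPU-only sketch of these semantics follows (the layout of sims is illustrative); the tensorised version lives in the new discovery_executor_gpu.py below. Candidate MDs are then kept only when support >= support_threshold and confidence >= confidence_threshold, mirroring the filters above.

# Naive sketch of the support/confidence semantics used by get_metrics below.
# md maps column -> similarity threshold (target fixed at 1.0, -1 = unconstrained);
# sims[(i, j)] maps a tuple pair to its per-column similarity dict (illustrative layout).
def md_metrics(md, sims, target_col):
    lhs_pairs = 0   # pairs meeting every non-target threshold ("support" in Naumann's sense)
    rhs_pairs = 0   # of those, pairs also meeting the target threshold ("support" in Fan's sense)
    for _, col_sims in sims.items():
        if all(col_sims[col] >= t for col, t in md.items() if col != target_col):
            lhs_pairs += 1
            if col_sims[target_col] >= md[target_col]:
                rhs_pairs += 1
    confidence = rhs_pairs / lhs_pairs if lhs_pairs > 0 else 0
    return rhs_pairs, confidence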
diff --git a/md_discovery/discovery_executor_gpu.py b/md_discovery/discovery_executor_gpu.py
new file mode 100644
index 0000000..30a491c
--- /dev/null
+++ b/md_discovery/discovery_executor_gpu.py
@@ -0,0 +1,138 @@
+import math
+import operator
+import random
+import time
+from tqdm import tqdm
+import numpy as np
+import pandas as pd
+import torch
+
+from settings import model, similarity_threshold, support_threshold, confidence_threshold, md_output_dir
+
+sample_number = 100000
+step_length = 0.01
+
+
+def get_metrics(md_tensor, data, sim_tensor, target_index):
+    length = data.shape[0]
+    width = data.shape[1]
+
+    # md_tensor = list(current_md.values())
+    # md_tensor = torch.tensor(md_tensor, device='cuda')
+    md_tensor_2d = md_tensor.unsqueeze(1)
+    md_tensor_3d = md_tensor_2d.unsqueeze(2)
+    md_tensor_3d = md_tensor_3d.repeat(1, length, length)
+
+    sim_tensor = torch.round(sim_tensor, decimals=4)
+
+    sup_tensor = torch.ge(sim_tensor, md_tensor_3d)
+    ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
+    for i in range(0, width):
+        if i != target_index:
+            sup_tensor_slice = sup_tensor[i]
+            ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
+    sup_tensor_int = ini_slice.int()
+    support_Naumann = torch.count_nonzero(sup_tensor_int).item()
+    support_Naumann = (support_Naumann - length) / 2
+
+    conf_tensor = torch.logical_and(ini_slice, sup_tensor[target_index])
+    conf_tensor_int = conf_tensor.int()
+    support_Fan = torch.count_nonzero(conf_tensor_int).item()
+    support_Fan = (support_Fan - length) / 2
+    confidence = support_Fan / support_Naumann if support_Naumann > 0 else 0
+
+    return support_Fan, confidence
+
+
+def build_cartesian(width, target_index):
+    all_values_array = np.linspace(start=similarity_threshold, stop=1, endpoint=True,
+                                   num=math.ceil((1 - similarity_threshold) / step_length) + 1)
+    all_values_array = np.round(all_values_array, 4)
+    all_values_tensor = torch.tensor(all_values_array, device='cuda')
+    all_values_tensor = all_values_tensor.float()
+    all_values_tensor = torch.round(all_values_tensor, decimals=4)
+    tensors_for_cartesian = []
+    for i in range(0, width):
+        if i == target_index:
+            t = torch.tensor([1.0], device='cuda')
+            tensors_for_cartesian.append(t)
+        else:
+            tensors_for_cartesian.append(all_values_tensor)
+    result = torch.cartesian_prod(*tensors_for_cartesian)
+    return result
+
+
+def get_metric_tensor(cartesian_product, data, sim_tensor, target_index):
+    length = data.shape[0]
+    width = data.shape[1]
+    cartesian_product = cartesian_product.unsqueeze(2)
+    cartesian_product = cartesian_product.unsqueeze(3)
+    cartesian_product = cartesian_product.repeat(1, 1, length, length)
+
+
+def discover(path, target_col):
+    data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
+    data.fillna("", inplace=True)
+    data = data.astype(str)
+    columns = data.columns.values.tolist()
+    target_index = columns.index(target_col)
+    cols_but_target = list(set(columns) - {target_col})
+    length = data.shape[0]
+    width = data.shape[1]
+    # Indices of all columns except the target column
+    columns_indices = [_ for _ in range(0, width) if _ != target_index]
+
+    sentences = []
+    for col in range(0, width):
+        for row in range(0, length):
+            cell_value = data.values[row, col]
+            sentences.append(cell_value)
+    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
+    split_embedding = torch.split(embedding, length, dim=0)
+    table_tensor = torch.stack(split_embedding, dim=0, out=None)
+    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
+    sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
+    sim_tensor = sim_tensor.float()
+    sim_tensor = torch.round(sim_tensor, decimals=4)
+
+    # With 6 or fewer columns a full Cartesian product is feasible; with more it may blow up exponentially
+    if width <= 6:
+        # Enumerate every candidate value of each non-target column and take the Cartesian product:
+        # the result covers all possible MD threshold combinations
+        cartesian = build_cartesian(width, target_index)
+        # Sample sample_number / (width - 1) MDs (none containing -1)
+        if cartesian.shape[0] > sample_number / (width - 1):
+            index = torch.tensor(random.sample(range(cartesian.shape[0]), math.ceil(sample_number / (width - 1))), device='cuda')
+            cartesian = torch.index_select(cartesian, 0, index)
+    else:
+        # Randomly generate sample_number / (width - 1) MDs; draw integers with randint first,
+        # then divide to obtain decimals (none containing -1)
+        cartesian = torch.randint(int(similarity_threshold * 100), 1 * 100,
+                                  (math.ceil(sample_number / (width - 1)), width - 1), device='cuda')
+        cartesian = cartesian / 100
+        # Build a column of similarity 1.0 for the target attribute and insert it at the target column's position
+        ones = torch.ones((math.ceil(sample_number / (width - 1)), 1), device='cuda')
+        cartesian = torch.cat((cartesian[:, 0:target_index], ones, cartesian[:, target_index:]), 1)
+        cartesian = torch.round(cartesian, decimals=4)
+    # This tensor will be concatenated with the variants whose columns are set to -1
+    joint_md_tensor = cartesian.clone()
+    # Randomly set 1, 2, ... columns to -1
+    for i in range(width - 2):
+        index_list_format = []
+        for j in range(cartesian.shape[0]):
+            # For each MD, randomly pick the column indices that will be set to -1
+            index_list_format.append(random.sample(columns_indices, i + 1))
+        index = torch.tensor(index_list_format, device='cuda')
+        # The MD set after the chosen columns have been set to -1
+        modified_cartesian = cartesian.scatter(1, index, -1)
+        joint_md_tensor = torch.cat((joint_md_tensor, modified_cartesian), 0)
+
+    md_list = []
+    # get_metric_tensor(cartesian, data, sim_tensor, target_index)
+    for _ in tqdm(range(joint_md_tensor.shape[0])):
+        s, c = get_metrics(joint_md_tensor[_], data, sim_tensor, target_index)
+        if s >= support_threshold and c >= confidence_threshold:
+            md_list_format = [round(i, 4) for i in joint_md_tensor[_].tolist()]
+            md_dict_format = {}
+            for k in range(0, width):
+                md_dict_format[columns[k]] = md_list_format[k]
+            md_list.append((md_dict_format, s, c))
+    md_list.sort(key=operator.itemgetter(2), reverse=True)
+    return md_list
diff --git a/md_discovery/md_discover.py b/md_discovery/md_discover.py
index f244dfa..0617990 100644
--- a/md_discovery/md_discover.py
+++ b/md_discovery/md_discover.py
@@ -1,6 +1,7 @@
 from ConfigSpace import Configuration

 from md_discovery.discovery_executor import pairs_inference
+from md_discovery.discovery_executor_gpu import discover
 from settings import *

 # # If support and confidence should not be written to the output, use the following two code blocks instead
@@ -17,16 +18,12 @@ from settings import *
 #             f.write(str(_) + '\n')


-def md_discover(config: Configuration):
-    t_single_tuple_path = er_output_dir + "t_single_tuple.csv"
+def md_discover(config: Configuration, source_path, target_path):
     # Input: path of a csv file, similarity thresholds on the MD's LHS, target attribute on the MD's RHS
     # Output: two MD lists; MDs in list 1 have no violations, MDs in list 2 have violations but meet the confidence threshold
-    mds_list = pairs_inference(t_single_tuple_path, target_attr, config)
-
-    # Write list 1 to disk; adjust the path as needed
-    mds_path = md_output_dir + "mds.txt"
-
-    with open(mds_path, 'w') as f:
+    # mds_list = pairs_inference(source_path, target_attr)
+    mds_list = discover(source_path, target_attr)
+    with open(target_path, 'w') as f:
         for _ in mds_list:
             f.write('Target:'+str(target_attr) + '\t')
             f.write(str(_))
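md_discover now takes explicit source and target paths instead of hard-coding them. A hedged usage sketch, mirroring the call er_process() makes in ml_er/ml_entity_resolver.py below (the wrapper name is hypothetical; config is the SMAC Configuration being evaluated):

from md_discovery.md_discover import md_discover
from settings import er_output_dir, md_output_dir

def run_md_discovery(config):   # hypothetical wrapper, for illustration only
    source = er_output_dir + "t_single_tuple.csv"   # single-tuple file written by prepare_file_for_md_discovery
    target = md_output_dir + "mds.txt"              # receives one "Target:<attr>\t(<md>, support, confidence)" line per MD
    md_discover(config, source, target)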
diff --git a/ml_er/ml_entity_resolver.py b/ml_er/ml_entity_resolver.py
index 24b7f20..145f640 100644
--- a/ml_er/ml_entity_resolver.py
+++ b/ml_er/ml_entity_resolver.py
@@ -1,6 +1,7 @@
 import json
 import os
 import sys
+import time

 import ConfigSpace
 import pandas
@@ -12,28 +13,24 @@
 import py_entitymatching.catalog.catalog_manager as cm
 import pandas as pd
 import six
 from ConfigSpace import Configuration
+from tqdm import tqdm

 from md_discovery.md_discover import md_discover
 from settings import *


-def process_prediction_for_md_discovery(pred: pd.DataFrame,
-                                        t_single_tuple_path: str = er_output_dir + "t_single_tuple.csv"):
-    # Extract the true-positive and false-negative parts of the prediction table
-    tp = pred[(pred['gold'] == 1) & (pred['predicted'] == 1)]
-    fn = pred[(pred['gold'] == 1) & (pred['predicted'] == 0)]
-    # Concatenate them into one table
-    df = pd.concat([tp, fn])
-    # Align the left and right IDs in the TP/FN table
+def prepare_file_for_md_discovery(train, t_single_tuple_path=er_output_dir + "t_single_tuple.csv"):
+    df = train[train['gold'] == 1]
+    # Align the left and right IDs of each tuple pair
     for index, row in df.iterrows():
         df.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id]
-    pred_columns = pred.columns.values.tolist()
+    train_columns = train.columns.values.tolist()
     l_columns = []
     r_columns = []
     cols = []
-    # Collect the left-table and right-table field names of the prediction table into two lists
-    for _ in pred_columns:
+    # Collect the left-table and right-table field names into two lists
+    for _ in train_columns:
         if _.startswith('ltable'):
             l_columns.append(_)
         elif _.startswith('rtable'):
@@ -47,6 +44,7 @@ def process_prediction_for_md_discovery(pred: pd.DataFrame,
     ldf.columns = cols
     rdf.columns = cols
     t_single_tuple = pd.concat([ldf, rdf])
+    t_single_tuple = t_single_tuple.reset_index(drop=True)
     t_single_tuple.to_csv(t_single_tuple_path, sep=',', index=False, header=True, quoting=1)


@@ -148,6 +146,7 @@ def build_col_pairs_sim_tensor_dict(predictions: pandas.DataFrame):


 def er_process(config: Configuration):
+    start = time.time()
     ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
     cm.set_key(ltable, ltable_id)
     # ltable.fillna("", inplace=True)
@@ -295,27 +294,42 @@ def er_process(config: Configuration):
     predictions_attrs.extend(attrs_with_r_prefix)
     predictions_attrs.extend(['gold', 'predicted'])
     predictions = predictions[predictions_attrs]
-    process_prediction_for_md_discovery(predictions)
+    train_attrs = predictions_attrs[:]
+    train_attrs.remove('predicted')
+    train_set = train_set[train_attrs]
+    prepare_file_for_md_discovery(train_set)
     predictions = predictions.reset_index(drop=True)
     predictions = predictions.astype(str)
     sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)
     predictions['confidence'] = 0
-    md_discover(config)
+    md_discover(config, er_output_dir + "t_single_tuple.csv", md_output_dir + "mds.txt")
     md_paths = [md_output_dir + 'mds.txt']
     md_list = load_mds(md_paths)  # load all MDs from the given paths
     epl_match = 0  # explainable pairs predicted as matches
+
+    unexplainable = pd.DataFrame()
+
     if len(md_list) > 0:
-        for row in predictions.itertuples():
+        for row in tqdm(predictions.itertuples()):
             x = is_explicable(row, md_list, sim_tensor_dict)
             if x > 0 and str(getattr(row, 'predicted')) == str(1):
                 predictions.loc[row[0], 'confidence'] = x
                 epl_match += 1
+            # else:
+            #     series = pd.Series(row)
+            #     unexplainable = unexplainable._append(series, ignore_index=True)
+    # unexplainable.drop(columns=unexplainable.columns[[-1, 0]], inplace=True)
+    # unexplainable.columns = predictions_attrs
+    # unexplainable = unexplainable[train_attrs]
+    # if len(unexplainable[unexplainable['gold'] == str(1)]) > 0:
+    #     prepare_file_for_md_discovery(unexplainable, t_single_tuple_path=er_output_dir + 'unexplainable_tst.csv')
+    #     md_discover(config, er_output_dir + 'unexplainable_tst.csv', md_output_dir + "from_unexplainable.txt")

     df = predictions[predictions['predicted'] == str(1)]
     interpretability = epl_match / len(df)  # interpretability
     indicators['interpretability'] = interpretability
-    if (indicators["block_recall"] < 0.8) and (indicators["block_recall"] < indicators["recall"]):
+    if indicators["block_recall"] < indicators["recall"]:
         f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (
                 indicators["precision"] + indicators["block_recall"])
     else:
@@ -326,6 +340,7 @@ def er_process(config: Configuration):
     print(indicators)
     predictions.to_csv(er_output_dir + 'predictions.csv', sep=',', index=False, header=True)
     ################################################################################################################
+    print(f'\033[33mTime consumed by ML-ER in seconds: {time.time() - start}\033[0m')
     return indicators
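Two evaluation details from the hunk above are worth spelling out: interpretability is the share of predicted matches covered by at least one MD (epl_match divided by the number of predicted matches), and the reported F1 now falls back to block_recall whenever blocking loses more matches than the matcher does (the old 0.8 cut-off was dropped). A small sketch of that F1 selection, with a zero-division guard added for safety:

# Sketch of the F1 selection in er_process(): if block_recall < recall, F1 is computed
# against block_recall so a lossy blocker is penalised; otherwise recall is used.
def f1_with_blocking(precision: float, recall: float, block_recall: float) -> float:
    effective_recall = block_recall if block_recall < recall else recall
    if precision + effective_recall == 0:   # guard not present in the original code
        return 0.0
    return 2.0 * precision * effective_recall / (precision + effective_recall)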
diff --git a/settings.py b/settings.py
index be79ceb..6a7475d 100644
--- a/settings.py
+++ b/settings.py
@@ -1,10 +1,10 @@
 from sentence_transformers import SentenceTransformer

-ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Amazon-Google\Amazon.csv'
-rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Amazon-Google\GoogleProducts.csv'
-mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Amazon-Google\Mapping.csv'
-mapping_lid = 'idAmazon'  # name of the left-table id column in the mapping table
-mapping_rid = 'idGoogleBase'  # name of the right-table id column in the mapping table
+ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\tableA.csv'
+rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\tableB.csv'
+mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\matches.csv'
+mapping_lid = 'idDBLP'  # name of the left-table id column in the mapping table
+mapping_rid = 'idScholar'  # name of the right-table id column in the mapping table
 ltable_id = 'id'  # name of the id field in the left table
 rtable_id = 'id'  # name of the id field in the right table
 target_attr = 'id'  # target attribute for MD discovery
@@ -12,9 +12,9 @@ target_attr = 'id'  # target attribute for MD discovery
 model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens')
 interpre_weight = 0.5  # weight of interpretability in the objective
-# similarity_threshold = 0.2
-# support_threshold = 100
-# confidence_threshold = 0.4
+similarity_threshold = 0.1
+support_threshold = 1
+confidence_threshold = 0.25

 er_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\ml_er\\output\\'
 md_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\output\\'
diff --git a/tfile.py b/tfile.py
index ee18fe5..2a98cab 100644
--- a/tfile.py
+++ b/tfile.py
@@ -147,3 +147,13 @@ def test12():
         dic = json.load(f)
         for _ in dic.keys():
             print(f'Key:{_}\tValue:{dic[_]}\tType:{type(dic[_])}')
+
+
+def test13():
+    outcome_dir = r'E:\Data\Research\Outcome'
+    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens'
+    datasets_list = os.listdir(outcome_dir)
+    f = []
+    for _ in datasets_list:
+        f.append(outcome_dir + rf'\{_}' + configs_dir)
+    print(f)
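test13 only prints the per-dataset configuration directories. A hypothetical test14, sketched below, extends it to report whether each directory already holds the files the plotting scripts look for ('predictions*' read by draw_confidence_histogram.py, 'eval_result*' read by the removed draw.py); the function name and checks are assumptions, not part of the patch.

import os

def test14():
    outcome_dir = r'E:\Data\Research\Outcome'
    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens'
    for dataset in os.listdir(outcome_dir):
        path = outcome_dir + rf'\{dataset}' + configs_dir
        files = os.listdir(path)
        has_preds = any(f.startswith('predictions') for f in files)
        has_evals = any(f.startswith('eval_result') for f in files)
        print(f'{dataset}: predictions={has_preds}, eval_results={has_evals}')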