HuangJintao 1 year ago
parent 882c25d20f
commit 24985da169

.gitignore

@@ -1,5 +1,4 @@
 /deprecated/
-/datasets/
 /ml_er/output/*
 /md_discovery/output/*
 /hpo/output/*

Nine file diffs suppressed because they are too large; one file diff suppressed because one or more lines are too long.

@@ -1,27 +0,0 @@
-# this is the entrance of the auto-ER procedure
-from md_discovery.md_discover import md_discover
-from ml_er.ml_entity_resolver import ml_er
-from hpo.er_model_hpo import ml_er_hpo
-from settings import *
-
-
-def run(rounds: int):
-    hp_config = None
-    iter_round = 1
-    for i in range(0, rounds):
-        ml_er(iter_round, hp_config)
-        md_discover()
-        hp_config = ml_er_hpo()
-        iter_round += 1
-    ml_er(iter_round, hp_config)
-    return
-
-
-if __name__ == '__main__':
-    # todo
-    #  use drop to remove columns from the feature vectors (e.g. id-related features)
-    run(1)  # iterate for 3 rounds
-    # ml_er(1)
-    # todo: write the optimization results and the parameters to a file
-    #  output via ml_entity_resolver.ml_er(), together with the parameter configuration
-    print(ltable_path)

@@ -17,17 +17,17 @@ from ml_er.ml_entity_resolver import evaluate_prediction, load_mds, is_explicable
 # data is loaded outside
 ########################################################################################################################
 ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
-ltable.fillna("", inplace=True)
+# ltable.fillna("", inplace=True)
 rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
-rtable.fillna("", inplace=True)
+# rtable.fillna("", inplace=True)
 mappings = pd.read_csv(mapping_path)
 lid_mapping_list = []
 rid_mapping_list = []
 # convert everything to strings
-ltable = ltable.astype(str)
-rtable = rtable.astype(str)
-mappings = mappings.astype(str)
+# ltable = ltable.astype(str)
+# rtable = rtable.astype(str)
+# mappings = mappings.astype(str)
 matching_number = len(mappings)  # total number of positive samples (1300 for the product dataset)
 for index, row in mappings.iterrows():
@@ -35,7 +35,8 @@ for index, row in mappings.iterrows():
     rid_mapping_list.append(row[mapping_rid])
 # keep only the rows of both tables that appear in the mapping table, to raise the positive-sample ratio
 selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
-selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # align left-table column names that differ from the corresponding right-table columns
+if len(lr_attrs_map) > 0:
+    selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # align left-table column names that differ from the corresponding right-table columns
 tables_id = rtable_id  # whether or not the two id column names match, after the line above the right table's name is used for both
 selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
 selected_attrs = selected_ltable.columns.values.tolist()  # column names shared by the two tables
@@ -72,11 +73,13 @@ class Classifier:
             blocker = em.OverlapBlocker()
             candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
                                              l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
-                                             overlap_size=config["overlap_size"], show_progress=False)
+                                             overlap_size=config["overlap_size"], show_progress=False,
+                                             allow_missing=True)
         elif config["ml_blocker"] == "attr_equiv":
             blocker = em.AttrEquivalenceBlocker()
             candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
-                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs)
+                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
+                                             allow_missing=True)

         candidate['gold'] = 0
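Both blockers now pass allow_missing=True. In py_entitymatching this keeps candidate pairs whose blocking attribute is missing in either tuple (such tuples are paired with every tuple from the other table) instead of dropping them, which matters here because the fillna("")/astype(str) preprocessing above is now commented out. A minimal, self-contained sketch with made-up toy tables, not taken from this repository:

# Illustrative sketch only: how allow_missing=True changes OverlapBlocker when NaN
# values survive in the blocking attribute. Table and column names are placeholders.
import pandas as pd
import py_entitymatching as em

A = pd.DataFrame({'id': [1, 2], 'title': ['apple iphone 12', None]})
B = pd.DataFrame({'id': [10, 20], 'title': ['iphone 12 case', 'galaxy s21']})
em.set_key(A, 'id')
em.set_key(B, 'id')

ob = em.OverlapBlocker()
# With allow_missing=True, record 2 of A (missing title) still appears in the
# candidate set instead of being silently excluded by the blocker.
cand = ob.block_tables(A, B, 'title', 'title', overlap_size=1,
                       l_output_attrs=['title'], r_output_attrs=['title'],
                       allow_missing=True, show_progress=False)
print(len(cand))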
@@ -159,6 +162,8 @@ class Classifier:
                                                     attrs_after=test_feature_after, show_progress=False)

         fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold']
+        train_feature_vecs.fillna(0, inplace=True)
+        test_feature_vecs.fillna(0, inplace=True)
         matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold')

         test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id])
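The new fillna(0, inplace=True) calls follow from the same change: with NaN values left in the tables, the feature vectors built just above (presumably via em.extract_feature_vecs) can contain NaN similarity features, and the scikit-learn random forest behind em.RFMatcher rejects NaN input. A tiny illustration with a placeholder feature table; the feature column name is made up:

# Placeholder data only: NaN feature values must be imputed before fitting the matcher.
import numpy as np
import pandas as pd

feature_vecs = pd.DataFrame({'_id': [0, 1],
                             'title_title_jac_qgm_3_qgm_3': [0.8, np.nan],  # hypothetical feature
                             'gold': [1, 0]})
feature_vecs.fillna(0, inplace=True)  # same treatment the commit applies to train/test vectors
print(feature_vecs)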
@@ -176,7 +181,7 @@ class Classifier:
         predictions_attrs.extend(['gold', 'predicted'])
         predictions = predictions[predictions_attrs]
         predictions = predictions.reset_index(drop=True)
-        predictions = predictions.astype(str)
+        # predictions = predictions.astype(str)
         sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)

         # default path is "../md_discovery/output/xxx.txt"
@@ -191,14 +196,14 @@ class Classifier:
         ppre = predictions[predictions['predicted'] == str(1)]
         interpretability = epl_match / len(ppre)  # interpretability
-        # todo block_recall: consider the commented-out version below
-        # if indicators["block_recall"] >= 0.8:
-        #     f1 = indicators["F1"]
-        # else:
-        #     f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (indicators["precision"] + indicators["block_recall"])
-        if indicators["block_recall"] < 0.8:
-            return 1
-        f1 = indicators["F1"]
+        if indicators["block_recall"] >= 0.8:
+            f1 = indicators["F1"]
+        else:
+            f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (indicators["precision"] + indicators["block_recall"])
+        # if indicators["block_recall"] < 0.8:
+        #     return 1
+        # f1 = indicators["F1"]
         performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
         return 1 - performance
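The objective no longer pins the cost to 1 when block recall drops below 0.8; instead it falls back to the harmonic mean of precision and block recall in place of F1. Restated as a standalone sketch (the names follow the diff; the values in the trailing comment are only an example):

# Sketch of the HPO objective after this change; SMAC minimizes the returned cost.
def objective_value(indicators, interpretability, interpre_weight=0.3):
    if indicators["block_recall"] >= 0.8:
        f1 = indicators["F1"]
    else:
        # harmonic mean of precision and block recall replaces the old hard "return 1"
        f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (
                indicators["precision"] + indicators["block_recall"])
    performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
    return 1 - performance

# e.g. objective_value({"F1": 0.9, "precision": 0.92, "block_recall": 0.85}, 0.6)
#      == 1 - (0.3 * 0.6 + 0.7 * 0.9)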
@@ -230,17 +235,20 @@ def ml_er_hpo():
     )

     incumbent = smac.optimize()
-    incumbent_ndarray = incumbent.get_array()
-    np.save(hpo_output_dir + 'incumbent.npy', incumbent_ndarray)
-
-    # Get cost of default configuration
-    default_cost = smac.validate(cs.get_default_configuration())
-    print(f"Default cost: {default_cost}")
-
-    # Let's calculate the cost of the incumbent
-    incumbent_cost = smac.validate(incumbent)
-    print(f"Incumbent cost: {incumbent_cost}")
-
-    print(f"Optimized_Configuration:{incumbent.values()}")
+    incumbent_cost = smac.validate(incumbent)
+    default = cs.get_default_configuration()
+    default_cost = smac.validate(default)
+    print(f"Default Cost: {default_cost}")
+    print(f"Incumbent Cost: {incumbent_cost}")
+
+    if incumbent_cost > default_cost:
+        incumbent = default
+        print(f"Updated Incumbent Cost: {default_cost}")
+
+    print(f"Optimized Configuration:{incumbent.values()}")
+
+    incumbent_ndarray = incumbent.get_array()
+    np.save(hpo_output_dir + 'incumbent.npy', incumbent_ndarray)
     return incumbent
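The reordered tail of ml_er_hpo now validates the incumbent against the default configuration and falls back to the default when it is cheaper, then saves whichever configuration won. For orientation, a minimal SMAC3 sketch with a toy target function, unrelated to this project's Classifier:

# Minimal SMAC3 usage sketch (toy hyperparameter and target function, illustration only).
from ConfigSpace import ConfigurationSpace
from smac import HyperparameterOptimizationFacade, Scenario

cs = ConfigurationSpace({"x": (-5.0, 5.0)})        # one float hyperparameter

def train(config, seed: int = 0) -> float:
    return (config["x"] - 2.0) ** 2                # cost to be minimized

smac = HyperparameterOptimizationFacade(Scenario(cs, n_trials=20), train)
incumbent = smac.optimize()

# validate() re-evaluates a configuration and returns its cost; the commit uses it to
# compare the incumbent against the default configuration and keep the cheaper one.
if smac.validate(incumbent) > smac.validate(cs.get_default_configuration()):
    incumbent = cs.get_default_configuration()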

@@ -10,9 +10,7 @@ import copy
 import torch
 from tqdm import tqdm

-from settings import model, md_output_dir
-
-conf_thresh = 0.8
+from settings import model, md_output_dir, confidence_threshold


 def is_minimal(md, md_list, target_col):
@@ -60,7 +58,6 @@ def pairs_inference(path, threshold, target_col):
     table_tensor = torch.stack(split_embedding, dim=0, out=None)
     norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
     sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
-    sim_tensor = sim_tensor/2 + 0.5

     torch.save(sim_tensor, md_output_dir + "tensor.pt")
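Dropping the sim_tensor/2 + 0.5 rescaling leaves the similarities as raw cosines in [-1, 1] rather than values mapped into [0, 1]; this is presumably why similarity_threshold falls from 0.7 to 0.2 in settings.py further down. A short illustration of the two ranges, using random placeholder embeddings:

# Illustration only: raw cosine range vs. the removed affine mapping into [0, 1].
import torch

emb = torch.nn.functional.normalize(torch.randn(4, 3, 8), dim=2)  # 4 columns, 3 rows, dim 8
sim = torch.matmul(emb, emb.transpose(1, 2))   # raw cosine similarities, in [-1, 1]
rescaled = sim / 2 + 0.5                       # the removed mapping into [0, 1]
print(sim.min().item(), rescaled.min().item())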
@@ -130,7 +127,7 @@ def pairs_inference(path, threshold, target_col):
             if support < 1:
                 print('delete by support')
                 remove_list.append(md)
-            if confidence < 0.5:
+            if confidence < confidence_threshold:
                 print('delete by confidence')
                 remove_list.append(md)
         # fuck_me = sorted(fuck, key=lambda x: x[1], reverse=True)
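With the hard-coded 0.5 replaced by confidence_threshold from settings (0.5 in the new settings.py), MD pruning is the usual support/confidence filter. A compact restatement for reference (illustrative only; the md entries are per-column similarity-threshold dicts and the metrics come from get_metrics in this file):

# Sketch of the pruning rule, assuming metrics[i] == (support, confidence) for md_list[i].
def prune_mds(md_list, metrics, support_threshold=1, confidence_threshold=0.5):
    kept = []
    for md, (support, confidence) in zip(md_list, metrics):
        if support < support_threshold or confidence < confidence_threshold:
            continue  # "delete by support" / "delete by confidence" in the diff above
        kept.append(md)
    return kept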
@@ -172,54 +169,3 @@ def get_metrics(current_md, data, sim_tensor, target_col, target_index):
     confidence = confidence_numerator / support

     return support, confidence
-
-
-def get_mds_metadata(md_list, dataset_path, sim_tensor, target_col):
-    data = pd.read_csv(dataset_path, low_memory=False, encoding='ISO-8859-1')
-    data.fillna("", inplace=True)
-    data = data.astype(str)
-
-    manager = multiprocessing.Manager()
-    if len(md_list) == 0:
-        return []
-    pool_size = 16
-    pool = multiprocessing.Pool(pool_size)
-    result = []
-    with manager:
-        for _ in md_list:
-            task = pool.apply_async(get_one_md_metadata, args=(_, data, sim_tensor, target_col))
-            support, confidence = task.get()
-            result.append({"md": _, "support": support, "confidence": confidence})
-    pool.close()
-    pool.join()
-    return result
-
-
-def get_one_md_metadata(md, dataframe, sim_tensor, target_col):
-    support = 0
-    pre_confidence = 0
-    columns = dataframe.columns.values.tolist()
-    length = dataframe.shape[0]
-    width = dataframe.shape[1]
-    for row1 in range(0, length - 1):
-        for row2 in range(row1 + 1, length):
-            left_satisfy = True
-            both_satisfy = True
-            for col_index in range(0, width):
-                col = columns[col_index]
-                sim = sim_tensor[col_index, row1, row2].item()
-                if col == target_col:
-                    if sim < 1:
-                        both_satisfy = False
-                else:
-                    if sim < md[col]:
-                        left_satisfy = False
-                        both_satisfy = False
-            if left_satisfy:
-                support += 1
-            if both_satisfy:
-                pre_confidence += 1
-
-    confidence = 0 if support == 0 else pre_confidence / support
-    # return {"md": md, "support": support, "confidence": confidence}
-    return support, confidence

@@ -46,7 +46,7 @@ def process_prediction_for_md_discovery(pred: pd.DataFrame,
     rdf.columns = cols
     t_single_tuple = pd.concat([ldf, rdf])

-    t_single_tuple.to_csv(t_single_tuple_path, sep=',', index=False, header=True)
+    t_single_tuple.to_csv(t_single_tuple_path, sep=',', index=False, header=True, quoting=1)


 def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int,
@@ -140,45 +140,42 @@ def build_col_pairs_sim_tensor_dict(predictions: pandas.DataFrame):
             rattr_tensor = norm_table_tensor[col_tuple[1]]
             mul_tensor = lattr_tensor * rattr_tensor
             sim_tensor = torch.sum(mul_tensor, 1)
-            sim_tensor = sim_tensor / 2 + 0.5
             sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor
     return sim_tensor_dict


 def ml_er(iter_round: int, config: Configuration = None, ):
-    # todo:
-    #  if config is not None -> load configs
-    #  else -> use default configs
     ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
     cm.set_key(ltable, ltable_id)
-    ltable.fillna("", inplace=True)
+    # ltable.fillna("", inplace=True)
     rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
     cm.set_key(rtable, rtable_id)
-    rtable.fillna("", inplace=True)
+    # rtable.fillna("", inplace=True)
-    mappings = pd.read_csv(mapping_path)
+    mappings = pd.read_csv(mapping_path, encoding='ISO-8859-1')
     # keep only the rows of both tables that appear in the mapping table, to raise the positive-sample ratio
     lid_mapping_list = []
     rid_mapping_list = []
     # convert everything to strings
-    ltable = ltable.astype(str)
-    rtable = rtable.astype(str)
-    mappings = mappings.astype(str)
-    matching_number = len(mappings)  # total number of positive samples (1300 for the product dataset)
+    # ltable = ltable.astype(str)
+    # rtable = rtable.astype(str)
+    # mappings = mappings.astype(str)
+    matching_number = len(mappings)  # total number of positive samples
     for index, row in mappings.iterrows():
         lid_mapping_list.append(row[mapping_lid])
         rid_mapping_list.append(row[mapping_rid])
     selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
-    selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # align left-table column names that differ from the corresponding right-table columns
+    if len(lr_attrs_map) > 0:
+        selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # align left-table column names that differ from the corresponding right-table columns
     tables_id = rtable_id
     selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
     selected_attrs = selected_ltable.columns.values.tolist()  # column names shared by the two tables
     items_but_id = selected_attrs[:]
     items_but_id.remove(tables_id)  # column names of both tables, excluding the id
-    attrs_with_l_prefix = ['ltable_'+i for i in selected_attrs]
-    attrs_with_r_prefix = ['rtable_'+i for i in selected_attrs]
+    attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs]
+    attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs]
     cm.set_key(selected_ltable, tables_id)
     cm.set_key(selected_rtable, tables_id)
@@ -199,19 +196,21 @@ def ml_er(iter_round: int, config: Configuration = None, ):
         if config["ml_blocker"] == "over_lap":
             blocker = em.OverlapBlocker()
-            candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
-                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
-                                             overlap_size=config["overlap_size"], show_progress=False)
+            candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"],
+                                             config["block_attr"], allow_missing=True,
+                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
+                                             overlap_size=config["overlap_size"], show_progress=False)
         elif config["ml_blocker"] == "attr_equiv":
             blocker = em.AttrEquivalenceBlocker()
-            candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
-                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs)
+            candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"],
+                                             config["block_attr"], allow_missing=True,
+                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs)
     else:
         matcher = em.RFMatcher(name='RF', random_state=0)
         blocker = em.OverlapBlocker()
-        candidate = blocker.block_tables(selected_ltable, selected_rtable, items_but_id[0], items_but_id[0],
-                                         l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
-                                         overlap_size=1, show_progress=False)
+        candidate = blocker.block_tables(selected_ltable, selected_rtable, items_but_id[0], items_but_id[0],
+                                         l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
+                                         overlap_size=1, show_progress=False, allow_missing=True)

     candidate['gold'] = 0
@@ -267,6 +266,8 @@ def ml_er(iter_round: int, config: Configuration = None, ):
                                                 attrs_after=test_feature_after, show_progress=False)

     fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold']
+    train_feature_vecs.fillna(0, inplace=True)
+    test_feature_vecs.fillna(0, inplace=True)
     matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold')

     test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id])
     predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after,
@@ -301,7 +302,8 @@ def ml_er(iter_round: int, config: Configuration = None, ):
     if indicators["block_recall"] >= 0.8:
         f1 = indicators["F1"]
     else:
-        f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (indicators["precision"] + indicators["block_recall"])
+        f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (
+                indicators["precision"] + indicators["block_recall"])
     performance = interpre_weight * interpretability + (1 - interpre_weight) * f1

     ################################################################################################################

@@ -1,20 +1,20 @@
 from sentence_transformers import SentenceTransformer
 import numpy as np

-ltable_path = 'E:\\Data\\Research\\Projects\\matching_dependency\\datasets\\Amazon.csv'
-rtable_path = 'E:\\Data\\Research\\Projects\\matching_dependency\\datasets\\GoogleProducts.csv'
-mapping_path = 'E:\\Data\\Research\\Projects\\matching_dependency\\datasets\\Amzon_GoogleProducts_perfectMapping.csv'
-mapping_lid = 'idAmazon'  # left-table id column name in the mapping table
-mapping_rid = 'idGoogleBase'  # right-table id column name in the mapping table
+ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Walmart-Amazon_dirty\tableA.csv'
+rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Walmart-Amazon_dirty\tableB.csv'
+mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Walmart-Amazon_dirty\matches.csv'
+mapping_lid = 'id1'  # left-table id column name in the mapping table
+mapping_rid = 'id2'  # right-table id column name in the mapping table
 ltable_id = 'id'  # left-table id column name
 rtable_id = 'id'  # right-table id column name
 target_attr = 'id'  # target column for MD discovery
-lr_attrs_map = {'title': 'name'}  # map left-table column names to the right table's when corresponding columns are named differently
-similarity_threshold = 0.7
+lr_attrs_map = {}  # map left-table column names to the right table's when corresponding columns are named differently
+similarity_threshold = 0.2
 support_threshold = 1
-confidence_threshold = 0.8
+confidence_threshold = 0.5
 interpre_weight = 0.3  # weight of the interpretability term
 er_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\ml_er\\output\\'
 md_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\output\\'
 hpo_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\hpo\\output\\'
-model = SentenceTransformer('E:\\Data\\Research\\Models\\paraphrase-MiniLM-L6-v2')
+model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens')
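The embedding model moves from paraphrase-MiniLM-L6-v2 to roberta-large-nli-stsb-mean-tokens, and similarity_threshold drops to 0.2, consistent with comparing raw cosine similarities now that the rescaling into [0, 1] has been removed. A small usage sketch of these two settings; the product strings below are placeholders:

# Sketch only: how model and similarity_threshold interact with raw cosine similarities.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens')
similarity_threshold = 0.2

emb = model.encode(['sharp 50" led tv', 'sharp 50 inch led television'],
                   convert_to_tensor=True)
cos = util.cos_sim(emb[0], emb[1]).item()   # cosine similarity in [-1, 1]
print(cos, cos >= similarity_threshold)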

@@ -116,6 +116,13 @@ def test8():
     print(cum)


+def test9():
+    df = pd.read_json(r'./datasets/t.json', encoding='ISO-8859-1', lines=True)
+    df.to_csv(r'./datasets/s.csv')
+    d = pd.read_csv(r'./datasets/s.csv', encoding='ISO-8859-1')
+    print(1)
+
+
 if __name__ == '__main__':
     start = time.time()
     t_single_tuple_path = er_output_dir + "t_single_tuple.csv"
