Add a maximum HPO walltime

Output support and confidence; try MDs in descending order of confidence to explain each prediction, and take the matching MD's confidence as the prediction's confidence
MD-metrics-HPO
HuangJintao 1 year ago
parent 6bdfd9eb51
commit cf278b91db
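The gist of the change: pairs_inference now returns every discovered MD as a (md, support, confidence) tuple sorted by confidence, and er_process explains each predicted match with the first (highest-confidence) MD that covers it, storing that confidence on the prediction. A minimal sketch of the explanation step, assuming that tuple layout; the function name and toy similarity values below are illustrative, not part of the commit:

# Sketch: MDs are (md_dict, support, confidence) tuples sorted by confidence, descending.
def explain_prediction(row_sims: dict, md_tuples: list, target_attr: str) -> float:
    """Return the confidence of the strongest MD covering the row, or -1.0 if none does."""
    for md, support, confidence in md_tuples:
        # The MD applies if every LHS attribute similarity reaches its threshold.
        if all(row_sims[attr] >= md[attr] for attr in md if attr != target_attr):
            return confidence  # this becomes the prediction's confidence
    return -1.0


print(explain_prediction({'title': 0.92, 'id': 0.0},
                         [({'title': 0.9, 'id': 1}, 3, 0.8)], 'id'))  # 0.8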

@@ -0,0 +1,133 @@
ltable_id,rtable_id
110,53087
147,50730
205,41179
249,53087
251,52967
251,53047
254,49104
416,33321
426,33282
431,33296
433,33307
467,362
498,29283
508,55899
537,1878
582,378
588,55899
592,43689
598,43689
642,24635
668,24621
836,23706
1069,11277
1148,7757
1151,53282
1161,27990
1164,27994
1193,28008
1216,33988
1358,42278
1406,48287
1413,51590
1489,3898
1514,49668
1595,52925
1636,27530
1655,49479
1683,23727
1683,23781
1735,10926
1736,10938
1776,33397
1777,20232
1897,51626
1918,9358
1946,23456
2222,815
2337,54230
2368,33786
2375,3581
2394,29834
2399,29849
2456,24297
2479,33786
2490,4511
2666,35475
2742,17181
2749,17161
2782,31534
2812,47863
2834,53255
2845,52867
2901,21458
2913,21458
3093,22994
3104,22995
3292,7440
3324,7403
3354,17660
3354,17643
3356,39335
3358,39320
3376,7414
3421,34320
3503,54798
3507,23084
3734,47848
3762,38431
3771,38442
3850,9120
3864,8306
3870,23889
3870,23901
3878,23925
3887,16397
3926,16397
3944,18970
4301,29695
4312,6130
4326,23361
4384,23361
4413,42103
4468,31072
4547,26140
4587,26140
4610,26152
4659,41090
4669,41083
4747,5682
4823,20588
5001,3099
5077,40465
5083,40471
5122,34113
5143,43647
5227,27254
5363,47909
5406,27824
5518,9828
5542,10251
5702,37654
5712,35057
5726,4470
5733,4465
5761,21517
5806,6499
5827,6502
5885,19151
6047,4667
6104,7981
6218,29583
6234,13713
6280,47818
6425,31102
6429,31088
6458,18807
6532,38301
6592,38375
6634,824
6724,17069
6764,51438
6857,39318

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -8,13 +8,14 @@ from sentence_transformers import SentenceTransformer
 from torch import nn

 if __name__ == '__main__':
-    train = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\Fodors-Zagats\train.csv', encoding='ISO-8859-1')
-    valid = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\Fodors-Zagats\valid.csv', encoding='ISO-8859-1')
-    test = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\Fodors-Zagats\test.csv', encoding='ISO-8859-1')
+    directory = r'E:\Data\Research\Projects\matching_dependency\datasets\iTunes-Amazon'
+    train = pd.read_csv(directory + r'\train.csv', encoding='ISO-8859-1')
+    valid = pd.read_csv(directory + r'\valid.csv', encoding='ISO-8859-1')
+    test = pd.read_csv(directory + r'\test.csv', encoding='ISO-8859-1')
     train = train[train['label'] == 1]
     valid = valid[valid['label'] == 1]
     test = test[test['label'] == 1]
     matches = pd.concat([train, valid, test])
     matches.drop(columns=['label'], inplace=True)
     matches = matches.sort_values(by='ltable_id')
-    matches.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\Fodors-Zagats\matches.csv', sep=',', index=False, header=True)
+    matches.to_csv(directory + r'\matches.csv', sep=',', index=False, header=True)

@@ -24,13 +24,12 @@ class Classifier:
         ml_blocker = Categorical("ml_blocker", ["over_lap", "attr_equiv"], default="over_lap")
         similarity_thresh = Float("similarity_thresh", (0, 0.2), default=0.2)
         support_thresh = Integer("support_thresh", (1, 5), default=1)
-        confidence_thresh = Float("confidence_thresh", (0.3, 0.7), default=0.4)
+        confidence_thresh = Float("confidence_thresh", (0.25, 0.5), default=0.25)
         cs.add_hyperparameters([block_attr, ml_matcher, ml_blocker, similarity_thresh,
                                 support_thresh, confidence_thresh])
         return cs

-    # train is the whole pipeline; only the return value changes from the predictions to an evaluation of the predictions
     def train(self, config: Configuration, seed: int = 0) -> float:
         cm.del_catalog()
         indicators = er_process(config)
@@ -49,6 +48,7 @@ def ml_er_hpo():
         cs,
         deterministic=True,
         n_trials=50,  # We want to run max 50 trials (combination of config and seed)
+        walltime_limit=28800,  # Max time limit in seconds (28800s = 8h)
         n_workers=1
     )
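For context, a self-contained sketch of how the new walltime_limit slots into a SMAC3 Scenario; the toy configuration space and objective below are stand-ins for Classifier.configspace and Classifier.train from this repository:

from ConfigSpace import ConfigurationSpace, Float
from smac import HyperparameterOptimizationFacade, Scenario

# Toy stand-ins so the sketch runs on its own.
cs = ConfigurationSpace()
cs.add_hyperparameters([Float("confidence_thresh", (0.25, 0.5), default=0.25)])

def train(config, seed: int = 0) -> float:
    return 1.0 - config["confidence_thresh"]  # placeholder objective to minimize

scenario = Scenario(
    cs,
    deterministic=True,
    n_trials=50,           # at most 50 (config, seed) combinations
    walltime_limit=28800,  # new hard wall-clock budget: 28800 s = 8 h
    n_workers=1,
)
smac = HyperparameterOptimizationFacade(scenario, train)
incumbent = smac.optimize()  # stops at whichever budget is exhausted first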

@@ -1,3 +1,5 @@
+import operator
 import numpy as np
 import pandas as pd
 import copy
@@ -13,6 +15,8 @@ def is_minimal(md, md_list, target_col):
         return True
     minimal = True
     for _ in md_list:
+        if isinstance(_, tuple):
+            _ = _[0]
         if _ != md:
             other_cols = list(set(_.keys()) - {target_col})
             # assume each md in the list makes the current md non-minimal
@@ -34,6 +38,7 @@ def is_minimal(md, md_list, target_col):
 def pairs_inference(path, target_col, conf: Configuration):
     simt = conf["similarity_thresh"]
+    # simt = round(simt, ndigits=3)
     supt = conf["support_thresh"]
     cont = conf["confidence_thresh"]
     data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
@@ -55,6 +60,7 @@ def pairs_inference(path, target_col, conf: Configuration):
     table_tensor = torch.stack(split_embedding, dim=0, out=None)
     norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
     sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
+    # sim_tensor = torch.round(sim_tensor, decimals=3)
     # torch.save(sim_tensor, md_output_dir + "tensor.pt")
@@ -65,7 +71,7 @@ def pairs_inference(path, target_col, conf: Configuration):
         init_md[col] = 1 if col == target_col else -1
     md_list.append(init_md)
-    for row1 in range(0, length - 1):
+    for row1 in tqdm(range(0, length - 1)):
         terminate = False
         for row2 in range(row1 + 1, length):
             violated_mds = []
@@ -126,13 +132,11 @@ def pairs_inference(path, target_col, conf: Configuration):
                     break
     if len(minimal_vio) > 0:
-        remove_list = []
-        for md in minimal_vio:
+        for md in minimal_vio[:]:
             support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
-            if confidence < cont:
-                remove_list.append(md)
-        for _ in remove_list:
-            minimal_vio.remove(_)
+            if support >= supt and confidence >= cont:
+                minimal_vio.append((md, support, confidence))
+            minimal_vio.remove(md)

     if len(md_list) > 0:
         # remove duplicate MDs
@@ -142,32 +146,32 @@ def pairs_inference(path, target_col, conf: Configuration):
                 tmp.append(_)
         md_list = tmp
         # drop MDs whose support is below the threshold
-        md_rm_list = []
-        for _ in md_list:
+        for _ in md_list[:]:
             support, confidence = get_metrics(_, data, sim_tensor, target_col, target_index)
-            if support < supt:
-                md_rm_list.append(_)
-        for _ in md_rm_list:
-            md_list.remove(_)
+            if support >= supt and confidence >= cont:
+                md_list.append((_, support, confidence))
+            md_list.remove(_)
         # drop non-minimal MDs
-        for _ in md_list[:]:
-            if not is_minimal(_, md_list, target_col):
-                md_list.remove(_)
+        for md_tuple in md_list[:]:
+            if not is_minimal(md_tuple[0], md_list, target_col) and md_tuple[2] < 0.5:
+                md_list.remove(md_tuple)
     if len(minimal_vio) > 0:
-        for vio in minimal_vio[:]:
-            if not is_minimal(vio, md_list, target_col):
-                minimal_vio.remove(vio)
+        for vio_tuple in minimal_vio[:]:
+            if not is_minimal(vio_tuple[0], md_list, target_col) and vio_tuple[2] < 0.5:
+                minimal_vio.remove(vio_tuple)
     if len(minimal_vio) > 0:
-        for _ in minimal_vio[:]:
-            if not is_minimal(_, minimal_vio, target_col):
-                minimal_vio.remove(_)
+        for vio_tuple in minimal_vio[:]:
+            if not is_minimal(vio_tuple[0], minimal_vio, target_col) and vio_tuple[2] < 0.5:
+                minimal_vio.remove(vio_tuple)

-    print(f'\033[33mList Length: {len(md_list)}\033[0m')
-    print(f'\033[33mVio Length: {len(minimal_vio)}\033[0m')
+    result = []
+    result.extend(md_list)
+    result.extend(minimal_vio)
+    result.sort(key=operator.itemgetter(2), reverse=True)
+    print(f'\033[33mList Length: {len(result)}\033[0m')
     print(f'\033[33mSupport: {supt}\tConfidence: {cont}\033[0m')
-    return md_list, minimal_vio
+    return result


 def get_metrics(current_md, data, sim_tensor, target_col, target_index):
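The rewritten filtering above keeps each surviving MD together with its metrics and returns a single confidence-sorted list. A condensed, self-contained sketch of that pattern; the candidate MDs, thresholds, and the metrics stub are illustrative only:

import operator

candidates = [{'title': 0.9, 'id': 1}, {'title': 0.7, 'id': 1}]  # toy MD left-hand sides
supt, cont = 1, 0.25  # support / confidence thresholds from the configuration

def get_metrics_stub(md):
    # Stand-in for get_metrics(md, data, sim_tensor, target_col, target_index).
    return 3, 1.0 - md['title'] / 2

md_list = list(candidates)
for md in md_list[:]:                      # iterate over a copy while mutating md_list
    support, confidence = get_metrics_stub(md)
    if support >= supt and confidence >= cont:
        md_list.append((md, support, confidence))  # keep the MD as a tuple with its metrics
    md_list.remove(md)                             # drop the bare dict either way

md_list.sort(key=operator.itemgetter(2), reverse=True)  # highest confidence first
print(md_list)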
@@ -181,6 +185,7 @@ def get_metrics(current_md, data, sim_tensor, target_col, target_index):
     md_tensor_3d = md_tensor_2d.unsqueeze(2)
     md_tensor_3d = md_tensor_3d.repeat(1, length, length)
+    sim_tensor = torch.round(sim_tensor, decimals=4)
     sup_tensor = torch.ge(sim_tensor, md_tensor_3d)
     ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
     for i in range(0, width):
@@ -189,10 +194,12 @@ def get_metrics(current_md, data, sim_tensor, target_col, target_index):
         ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
     sup_tensor_int = ini_slice.int()
     support_Naumann = torch.count_nonzero(sup_tensor_int).item()
+    support_Naumann = (support_Naumann - length) / 2
     ini_slice = torch.logical_and(ini_slice, sup_tensor[target_index])
     conf_tensor_int = ini_slice.int()
     support_Fan = torch.count_nonzero(conf_tensor_int).item()
-    confidence = support_Fan / support_Naumann
+    support_Fan = (support_Fan - length) / 2
+    confidence = support_Fan / support_Naumann if support_Naumann > 0 else 0
     return support_Fan, confidence
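The two corrections above account for the similarity mask being a symmetric N x N tensor whose diagonal (each tuple compared with itself) always passes: count_nonzero therefore counts every unordered pair twice plus N self-pairs, so (count - N) / 2 gives the number of distinct tuple pairs, and confidence is the share of LHS-supporting pairs that also satisfy the target threshold, guarded against division by zero. A toy check of the arithmetic (mask values are made up):

import torch

length = 4
# Symmetric mask: True where a tuple pair meets every MD LHS threshold (diagonal = self-pairs).
lhs_mask = torch.tensor([[1, 1, 0, 0],
                         [1, 1, 1, 0],
                         [0, 1, 1, 0],
                         [0, 0, 0, 1]], dtype=torch.bool)
# Mask for the target (RHS) threshold, i.e. sup_tensor[target_index] in get_metrics.
rhs_mask = torch.tensor([[1, 1, 0, 0],
                         [1, 1, 0, 0],
                         [0, 0, 1, 0],
                         [0, 0, 0, 1]], dtype=torch.bool)

raw_lhs = torch.count_nonzero(lhs_mask.int()).item()   # 8: diagonal once, each pair twice
support_Naumann = (raw_lhs - length) / 2                # 2 distinct pairs satisfy the LHS

raw_rhs = torch.count_nonzero((lhs_mask & rhs_mask).int()).item()  # 6
support_Fan = (raw_rhs - length) / 2                               # 1 pair also satisfies the RHS
confidence = support_Fan / support_Naumann if support_Naumann > 0 else 0
print(support_Fan, confidence)  # 1.0 0.5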

@@ -21,11 +21,10 @@ def md_discover(config: Configuration):
     t_single_tuple_path = er_output_dir + "t_single_tuple.csv"
     # input: csv file path, similarity thresholds for the MD left-hand side, target attribute for the MD right-hand side
     # output: two MD lists; MDs in list 1 have no violations, MDs in list 2 have violations but their confidence meets the threshold
-    mds_list, vio_list = pairs_inference(t_single_tuple_path, target_attr, config)
+    mds_list = pairs_inference(t_single_tuple_path, target_attr, config)
     # write list 1 to a local path (adjust the path yourself)
     mds_path = md_output_dir + "mds.txt"
-    vio_path = md_output_dir + "vio.txt"
     with open(mds_path, 'w') as f:
         for _ in mds_list:
@@ -33,12 +32,6 @@ def md_discover(config: Configuration):
             f.write(str(_))
             f.write('\n')
-    with open(vio_path, 'w') as f:
-        for _ in vio_list:
-            f.write('Target:'+str(target_attr) + '\t')
-            f.write(str(_))
-            f.write('\n')
 # if __name__ == '__main__':
 #     md_discover()

@@ -101,18 +101,18 @@ def load_mds(paths: list) -> list:
     return all_mds


-def is_explicable(row, all_mds: list, st_dict) -> bool:
-    attrs = all_mds[0].keys()  # read all attributes from the first md
-    for md in all_mds:
+def is_explicable(row, all_mds: list, st_dict):
+    attrs = all_mds[0][0].keys()  # read all attributes from the md dict inside the first md_tuple
+    for md_tuple in all_mds:
         explicable = True  # assume this md can explain the current tuple
         for a in attrs:
             if a != target_attr:
-                if st_dict[a][row[0]].item() < md[a]:
+                if st_dict[a][row[0]].item() < md_tuple[0][a]:
                     explicable = False  # if any attribute's similarity is below the threshold, this md cannot explain the current tuple
                     break  # stop checking this md's remaining thresholds and move on to the next md
         if explicable:
-            return True  # return as soon as any md can explain the tuple
-    return False  # traversal finished; the tuple cannot be explained
+            return md_tuple[2]  # return as soon as any md can explain the tuple
+    return -1.0  # traversal finished; the tuple cannot be explained


 def build_col_pairs_sim_tensor_dict(predictions: pandas.DataFrame):
@@ -142,6 +142,7 @@ def build_col_pairs_sim_tensor_dict(predictions: pandas.DataFrame):
         rattr_tensor = norm_table_tensor[col_tuple[1]]
         mul_tensor = lattr_tensor * rattr_tensor
         sim_tensor = torch.sum(mul_tensor, 1)
+        sim_tensor = torch.round(sim_tensor, decimals=4)
         sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor
     return sim_tensor_dict
@@ -298,14 +299,17 @@ def er_process(config: Configuration):
     predictions = predictions.reset_index(drop=True)
     predictions = predictions.astype(str)
     sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)
+    predictions['confidence'] = 0
     md_discover(config)
-    md_paths = [md_output_dir + 'mds.txt', md_output_dir + 'vio.txt']
+    md_paths = [md_output_dir + 'mds.txt']
     md_list = load_mds(md_paths)  # read all MDs via the global path variables
     epl_match = 0  # count of explainable predicted matches
     if len(md_list) > 0:
         for row in predictions.itertuples():
-            if is_explicable(row, md_list, sim_tensor_dict) and str(getattr(row, 'predicted')) == str(1):
+            x = is_explicable(row, md_list, sim_tensor_dict)
+            if x > 0 and str(getattr(row, 'predicted')) == str(1):
+                predictions.loc[row[0], 'confidence'] = x
                 epl_match += 1

     df = predictions[predictions['predicted'] == str(1)]
@@ -320,6 +324,7 @@ def er_process(config: Configuration):
     indicators['performance'] = performance
     indicators['eval_result'] = eval_result
     print(indicators)
+    predictions.to_csv(er_output_dir + 'predictions.csv', sep=',', index=False, header=True)
     ################################################################################################################
     return indicators
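Since the predictions, now carrying a per-row confidence, are written to predictions.csv, downstream checks can filter matches by how strongly an MD backs them. A minimal sketch; the path mirrors er_output_dir + 'predictions.csv' from the hunk above, and the 0.5 cut-off is only an example:

import pandas as pd

predictions = pd.read_csv('ml_er/output/predictions.csv', encoding='ISO-8859-1')

matches = predictions[predictions['predicted'].astype(str) == '1']
explained = matches[matches['confidence'] > 0]     # covered by at least one discovered MD
high_conf = matches[matches['confidence'] > 0.5]   # illustrative confidence cut-off
print(len(matches), len(explained), len(high_conf))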

@@ -1,10 +1,10 @@
 from sentence_transformers import SentenceTransformer

-ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Fodors-Zagats\tableA.csv'
-rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Fodors-Zagats\tableB.csv'
-mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Fodors-Zagats\matches.csv'
-mapping_lid = 'ltable_id'  # name of the left-table id column in the mapping table
-mapping_rid = 'rtable_id'  # name of the right-table id column in the mapping table
+ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Amazon-Google\Amazon.csv'
+rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Amazon-Google\GoogleProducts.csv'
+mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Amazon-Google\Mapping.csv'
+mapping_lid = 'idAmazon'  # name of the left-table id column in the mapping table
+mapping_rid = 'idGoogleBase'  # name of the right-table id column in the mapping table
 ltable_id = 'id'  # left-table id attribute name
 rtable_id = 'id'  # right-table id attribute name
 target_attr = 'id'  # target attribute for MD discovery

@@ -121,8 +121,9 @@ def test9():
 def test10():
-    rtable = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\tableB.csv',
+    rtable = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\ml_er\output\predictions.csv',
                          encoding='ISO-8859-1')
+    print(1)
     rtable.columns = ["id", "title", "authors", "venue", "year"]
     rtable.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\tableB.csv',
                   sep=',', index=False, header=True, quoting=1)
