|
|
@ -70,6 +70,7 @@ def inference_from_record_pairs(path, threshold, target_col):
|
|
|
|
data.fillna("", inplace=True)
|
|
|
|
data.fillna("", inplace=True)
|
|
|
|
data = data.astype(str)
|
|
|
|
data = data.astype(str)
|
|
|
|
columns = data.columns.values.tolist()
|
|
|
|
columns = data.columns.values.tolist()
|
|
|
|
|
|
|
|
target_index = columns.index(target_col)
|
|
|
|
cols_but_target = list(set(columns) - {target_col})
|
|
|
|
cols_but_target = list(set(columns) - {target_col})
|
|
|
|
length = data.shape[0]
|
|
|
|
length = data.shape[0]
|
|
|
|
width = data.shape[1]
|
|
|
|
width = data.shape[1]
|
|
|
@ -95,7 +96,6 @@ def inference_from_record_pairs(path, threshold, target_col):
|
|
|
|
init_md[col] = 1 if col == target_col else 0
|
|
|
|
init_md[col] = 1 if col == target_col else 0
|
|
|
|
md_list.append(init_md)
|
|
|
|
md_list.append(init_md)
|
|
|
|
|
|
|
|
|
|
|
|
start = time.time()
|
|
|
|
|
|
|
|
for row1 in range(0, length - 1):
|
|
|
|
for row1 in range(0, length - 1):
|
|
|
|
terminate = False
|
|
|
|
terminate = False
|
|
|
|
for row2 in range(row1 + 1, length):
|
|
|
|
for row2 in range(row1 + 1, length):
|
|
|
@ -132,9 +132,9 @@ def inference_from_record_pairs(path, threshold, target_col):
|
|
|
|
# md_list.append(spec_r_md)
|
|
|
|
# md_list.append(spec_r_md)
|
|
|
|
# 特殊化左侧
|
|
|
|
# 特殊化左侧
|
|
|
|
for col in list(set(columns) - {target_col}):
|
|
|
|
for col in list(set(columns) - {target_col}):
|
|
|
|
if sims[col] + 0.05 <= 1:
|
|
|
|
if sims[col] + 0.01 <= 1:
|
|
|
|
spec_l_md = copy.deepcopy(vio_md)
|
|
|
|
spec_l_md = copy.deepcopy(vio_md)
|
|
|
|
spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.05
|
|
|
|
spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.01
|
|
|
|
if if_minimal(spec_l_md, md_list, target_col):
|
|
|
|
if if_minimal(spec_l_md, md_list, target_col):
|
|
|
|
md_list.append(spec_l_md)
|
|
|
|
md_list.append(spec_l_md)
|
|
|
|
if vio_md not in minimal_vio:
|
|
|
|
if vio_md not in minimal_vio:
|
|
|
@ -151,9 +151,8 @@ def inference_from_record_pairs(path, threshold, target_col):
|
|
|
|
if not if_minimal(vio, md_list, target_col):
|
|
|
|
if not if_minimal(vio, md_list, target_col):
|
|
|
|
minimal_vio.remove(vio)
|
|
|
|
minimal_vio.remove(vio)
|
|
|
|
|
|
|
|
|
|
|
|
print(time.time()-start, '\n')
|
|
|
|
print('mds_list\t', len(md_list), '\n')
|
|
|
|
print(len(md_list), '\n')
|
|
|
|
print('vio_list\t', len(minimal_vio), '\n')
|
|
|
|
print(len(minimal_vio), '\n')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if len(minimal_vio) == 0:
|
|
|
|
if len(minimal_vio) == 0:
|
|
|
|
return md_list, []
|
|
|
|
return md_list, []
|
|
|
@ -170,49 +169,87 @@ def inference_from_record_pairs(path, threshold, target_col):
|
|
|
|
# pool.join()
|
|
|
|
# pool.join()
|
|
|
|
# minimal_vio = list(proxy_minimal_vio)
|
|
|
|
# minimal_vio = list(proxy_minimal_vio)
|
|
|
|
|
|
|
|
|
|
|
|
start = time.time()
|
|
|
|
# minimal_vio.reverse()
|
|
|
|
minimal_vio.reverse()
|
|
|
|
|
|
|
|
i = 0
|
|
|
|
i = 0
|
|
|
|
while i < len(minimal_vio):
|
|
|
|
remove_list = []
|
|
|
|
print(i)
|
|
|
|
fuck = []
|
|
|
|
print(len(minimal_vio))
|
|
|
|
for md in minimal_vio:
|
|
|
|
current_md = minimal_vio[i]
|
|
|
|
support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
|
|
|
|
support, confidence = get_one_md_metadata(current_md, data, sim_tensor, target_col)
|
|
|
|
fuck.append((support, confidence))
|
|
|
|
if support < 50:
|
|
|
|
if support < 1:
|
|
|
|
minimal_vio_length = len(minimal_vio)
|
|
|
|
print('delete by support')
|
|
|
|
j = i + 1
|
|
|
|
remove_list.append(md)
|
|
|
|
while j < len(minimal_vio):
|
|
|
|
|
|
|
|
specialization = True
|
|
|
|
|
|
|
|
next_md = minimal_vio[j]
|
|
|
|
|
|
|
|
for col in cols_but_target:
|
|
|
|
|
|
|
|
if current_md[col] > next_md[col]:
|
|
|
|
|
|
|
|
specialization = False
|
|
|
|
|
|
|
|
break
|
|
|
|
|
|
|
|
if specialization:
|
|
|
|
|
|
|
|
minimal_vio.remove(next_md)
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
j += 1
|
|
|
|
|
|
|
|
print('sup')
|
|
|
|
|
|
|
|
minimal_vio.remove(current_md)
|
|
|
|
|
|
|
|
if confidence < 0.8:
|
|
|
|
if confidence < 0.8:
|
|
|
|
print('conf')
|
|
|
|
print('delete by confidence')
|
|
|
|
minimal_vio.remove(current_md)
|
|
|
|
remove_list.append(md)
|
|
|
|
if support >= 50 and confidence >= 0.8:
|
|
|
|
fuck_me = sorted(fuck, key=lambda x: x[1], reverse=True)
|
|
|
|
i += 1
|
|
|
|
# while i < len(minimal_vio):
|
|
|
|
print(time.time()-start)
|
|
|
|
# print('vio_index\t', i)
|
|
|
|
|
|
|
|
# print('vio_length', len(minimal_vio))
|
|
|
|
|
|
|
|
# current_md = minimal_vio[i]
|
|
|
|
|
|
|
|
# support, confidence = get_metrics(current_md, data, sim_tensor, target_col, target_index)
|
|
|
|
|
|
|
|
# # if support < 50:
|
|
|
|
|
|
|
|
# # minimal_vio_length = len(minimal_vio)
|
|
|
|
|
|
|
|
# # j = i + 1
|
|
|
|
|
|
|
|
# # while j < len(minimal_vio):
|
|
|
|
|
|
|
|
# # specialization = True
|
|
|
|
|
|
|
|
# # next_md = minimal_vio[j]
|
|
|
|
|
|
|
|
# # for col in cols_but_target:
|
|
|
|
|
|
|
|
# # if current_md[col] > next_md[col]:
|
|
|
|
|
|
|
|
# # specialization = False
|
|
|
|
|
|
|
|
# # break
|
|
|
|
|
|
|
|
# # if specialization:
|
|
|
|
|
|
|
|
# # minimal_vio.remove(next_md)
|
|
|
|
|
|
|
|
# # else:
|
|
|
|
|
|
|
|
# # j += 1
|
|
|
|
|
|
|
|
# # print('sup')
|
|
|
|
|
|
|
|
# # minimal_vio.remove(current_md)
|
|
|
|
|
|
|
|
# if support < 1:
|
|
|
|
|
|
|
|
# print('delete by support')
|
|
|
|
|
|
|
|
# minimal_vio.remove(current_md)
|
|
|
|
|
|
|
|
# if confidence < 0.8:
|
|
|
|
|
|
|
|
# print('delete by confidence')
|
|
|
|
|
|
|
|
# minimal_vio.remove(current_md)
|
|
|
|
|
|
|
|
# if support >= 1 and confidence >= 0.8:
|
|
|
|
|
|
|
|
# i += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
t1 = time.time()
|
|
|
|
|
|
|
|
for _ in minimal_vio[:]:
|
|
|
|
for _ in minimal_vio[:]:
|
|
|
|
if not if_minimal(_, minimal_vio, target_col):
|
|
|
|
if not if_minimal(_, minimal_vio, target_col):
|
|
|
|
minimal_vio.remove(_)
|
|
|
|
minimal_vio.remove(_)
|
|
|
|
print(time.time() - t1)
|
|
|
|
|
|
|
|
|
|
|
|
print('\033[31m' + 'vio_length\t' + str(len(minimal_vio)) + '\033[0m')
|
|
|
|
|
|
|
|
|
|
|
|
return md_list, minimal_vio
|
|
|
|
return md_list, minimal_vio
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_metrics(current_md, data, sim_tensor, target_col, target_index):
    """Compute the support and confidence of one matching dependency (MD).

    Args:
        current_md: Mapping from column name to similarity threshold. Its
            values are consumed in iteration order and compared against
            ``sim_tensor`` along its first axis.
            NOTE(review): this assumes the mapping's key order matches the
            column order used to build ``sim_tensor`` -- confirm at callers.
        data: Tabular object exposing ``shape``; only ``shape[0]`` (number of
            records) and ``shape[1]`` (number of columns) are read.
        sim_tensor: Tensor of pairwise similarities, indexed as
            ``sim_tensor[column][row1][row2]``.
            NOTE(review): presumably shaped (width, length, length) -- confirm.
        target_col: Unused; kept so existing callers keep working.
        target_index: Index along the first axis of ``sim_tensor`` of the
            right-hand-side (target) column.

    Returns:
        A ``(support, confidence)`` tuple. ``support`` is the number of
        (row1, row2) cells whose similarity reaches the threshold on every
        non-target column. ``confidence`` is the fraction of those cells that
        also reach the threshold on the target column, or ``0.0`` when
        ``support`` is zero.
    """
    length = data.shape[0]
    width = data.shape[1]

    # Follow the device of the similarity tensor instead of assuming CUDA, so
    # the same code works on CPU or on any GPU.
    device = sim_tensor.device

    # One threshold per column, shaped (n, 1, 1) so that it broadcasts across
    # the two record axes without materialising a (n, length, length) copy.
    thresholds = torch.tensor(list(current_md.values()), device=device)
    satisfied = torch.ge(sim_tensor, thresholds.view(-1, 1, 1))

    # A cell supports the MD when every non-target column meets its threshold.
    lhs_match = torch.ones((length, length), dtype=torch.bool, device=device)
    for i in range(width):
        if i != target_index:
            lhs_match = torch.logical_and(lhs_match, satisfied[i])
    support = int(lhs_match.sum().item())

    # No supporting cell: confidence is undefined, report 0.0 rather than
    # raising ZeroDivisionError.
    if support == 0:
        return support, 0.0

    both_match = torch.logical_and(lhs_match, satisfied[target_index])
    confidence = int(both_match.sum().item()) / support

    return support, confidence
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_mds_metadata(md_list, dataset_path, sim_tensor, target_col):
|
|
|
|
def get_mds_metadata(md_list, dataset_path, sim_tensor, target_col):
|
|
|
|
data = pd.read_csv(dataset_path, low_memory=False, encoding='ISO-8859-1')
|
|
|
|
data = pd.read_csv(dataset_path, low_memory=False, encoding='ISO-8859-1')
|
|
|
|
data.fillna("", inplace=True)
|
|
|
|
data.fillna("", inplace=True)
|
|
|
|