You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
matching_dependency/md_discovery/multi_process_infer_by_pair...

277 lines
11 KiB

1 year ago
import multiprocessing
import pandas as pd
import Levenshtein
import copy
import numpy as np
import time
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from settings import model, er_output_dir
1 year ago
from sentence_transformers.util import cos_sim
# Presumably the minimum confidence an MD must reach to be kept (the name
# suggests "confidence threshold") -- TODO confirm. Nothing in the visible part
# of this module reads it; remove_by_confidence compares against a literal 0.8.
conf_thresh = 0.8
def my_Levenshtein_ratio(str1, str2):
    """Return a similarity score derived from the Levenshtein edit distance.

    The score is ``1 - distance / len(longer string)``. Two empty strings are
    treated as identical and yield ``1`` without calling the distance routine
    (which would otherwise lead to a division by zero).
    """
    longest = max(len(str1), len(str2))
    if longest == 0:
        return 1
    edit_distance = Levenshtein.distance(str1, str2)
    return 1 - edit_distance / longest
def norm_cos_sim(embed1, embed2):
    """Return the cosine similarity of two embeddings, rescaled by ``x / 2 + 0.5``.

    ``cos_sim`` returns a matrix-like result; only its ``[0][0]`` entry is used,
    so the inputs are presumably single vectors -- TODO confirm against callers.
    """
    similarity_matrix = cos_sim(embed1, embed2)
    raw_score = similarity_matrix.tolist()[0][0]
    return raw_score/2 + 0.5
def _encode_csv_cells(csv_path, device):
    """Read one CSV and return ``(cell_strings, embeddings)`` in row-major order.

    Missing values become empty strings and every cell is cast to ``str``
    before being handed to ``model.encode``. ``embeddings`` is a plain list
    (one vector per cell) aligned with ``cell_strings``.
    """
    frame = pd.read_csv(csv_path, low_memory=False, encoding='ISO-8859-1')
    frame.fillna("", inplace=True)
    frame = frame.astype(str)
    # Flatten once (row by row) instead of re-materialising ``.values`` per cell.
    sentences = frame.values.ravel().tolist()
    if not sentences:
        # Nothing to encode; avoid calling the model with an empty batch.
        return [], []
    embeddings = model.encode(sentences, convert_to_tensor=True, device=device)
    return sentences, embeddings.tolist()


def table_encode(tp_path, fn_path):
    """Embed every cell of two CSV files and save a text -> embedding mapping.

    Both files are processed the same way, first ``tp_path`` and then
    ``fn_path``. The resulting dictionary maps each cell's text to its
    embedding (a list of floats); cells with identical text share one entry and
    a later occurrence overwrites an earlier one. The dictionary is written to
    ``embedding_dic.npy`` in the current working directory.

    Args:
        tp_path: Path of the first CSV file (ISO-8859-1 encoded).
        fn_path: Path of the second CSV file (ISO-8859-1 encoded).
    """
    # Use the GPU when one is present, otherwise fall back to the CPU instead
    # of failing on machines without CUDA.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    embedding_dic = {}
    for csv_path in (tp_path, fn_path):
        sentences, vectors = _encode_csv_cells(csv_path, device)
        embedding_dic.update(zip(sentences, vectors))

    np.save('embedding_dic.npy', embedding_dic)
def test_table_encode():
    """Run table_encode on the two single-tuple CSVs and print the elapsed seconds."""
    began_at = time.time()
    table_encode(er_output_dir+'tp_single_tuple.csv', er_output_dir+'fn_single_tuple.csv')
    elapsed = time.time()-began_at
    print(elapsed)
def test_load():
    """Load the saved embedding dictionary and print the entry of one sample cell.

    The file is a pickled dict stored through ``np.save``, hence
    ``allow_pickle=True`` and ``.item()`` to unwrap it.
    """
    sample_cell = 'model- bdcd00105wi venor- bitdefender features- bitdefender antivirus v10- small box antivirus v10 delivers a one-two security punch integrating todays most powerful antivirus and antispyware modules into one convenient package. its easy to use and updates itself automatically making it truly an install and forget solution. * antivirus the purpose of the antivirus module is to ensure detection and removal of all viruses in the wild. bitdefender antivirus uses robust scan engines certified by icsa labs virus bulletin checkmark checkvir and tuv. - improved proactive detection b-have (behavioral heuristic analyzer in virtual environments) emulates a virtual computer-inside-a-computer where pieces of software are run in order to check for potential malware behavior. this bitdefender proprietary technology represents a new security layer that keeps the operating system safe from unknown viruses by detecting malicious pieces of code for which signatures have not yet been released. - permanent antivirus protection the new and improved bitdefender scanning engines will scan and disinfect infected files on access minimizing data loss. infected documents can now be recovered instead of being deleted. - new rootkit detection and removal a new bitdefender module looks for rootkits (malicious programs designed to control victim computers while staying hidden) and removes them on detection. - new web scanning web traffic is now filtered in real-time even before reaching your browser providing a safe and enjoyable web experience. - peer-2-peer and im applications protection filters against viruses that spread'
    embeddings_by_cell = np.load('embedding_dic.npy', allow_pickle=True).item()
    print(embeddings_by_cell[sample_cell])
    print(1)
# def test_lm_similarity():
# print(time.time())
# sentences = ['fun with reading & writing! is designed to help kids learn to read and write better through exercises puzzle-solving creative writing decoding and more!',
# 'based on the tween lifestyle brand launched in 2004 this action/adventure game will contain loads of adventures tailored specifically to the player\'s personality type. the evergirl brand features a clothing and accessories line with a companion web ...']
# embeddings = model.encode(sentences, convert_to_tensor=True)
# print(time.time())
# sim = cos_sim(embeddings[0], embeddings[1])
# print(time.time())
# # print(sim.tolist()[0][0]/2 + 0.5)
def is_minimal(md, md_list, target_col):
    """Tell whether ``md`` is minimal with respect to the MDs in ``md_list``.

    An MD is a dict of per-column thresholds; ``target_col`` is its right-hand
    side and every other key belongs to the left-hand side.

    Returns:
        ``True`` when ``md_list`` is empty. ``False`` when ``md`` occurs more
        than once in ``md_list``. Otherwise ``False`` exactly when some MD in
        the list that differs from ``md`` has no left-hand threshold greater
        than ``md``'s and a right-hand threshold not smaller than ``md``'s.
    """
    if len(md_list) == 0:
        return True
    if md_list.count(md) > 1:
        return False

    for other in md_list:
        if other == md:
            continue
        lhs_columns = set(other.keys()) - {target_col}
        # A single larger left-hand threshold means ``other`` does not cover ``md``.
        # (List, not generator: every column is compared, as in a plain loop.)
        lhs_covers = all([not other[col] > md[col] for col in lhs_columns])
        # A smaller right-hand threshold rules ``other`` out as well.
        rhs_covers = not other[target_col] < md[target_col]
        # One covering MD is enough to make ``md`` non-minimal.
        if lhs_covers and rhs_covers:
            return False
    return True
def remove_by_confidence(md, l, relation, target_col, lock):
    """Remove ``md`` from the list ``l`` when its confidence is below 0.8.

    The confidence is obtained from ``get_one_md_metadata(md, relation,
    target_col)``; the removal itself happens while holding ``lock``.
    """
    _, md_confidence = get_one_md_metadata(md, relation, target_col)
    if not md_confidence < 0.8:
        return
    with lock:
        l.remove(md)
# def remove_by_confidence(md, l, relation, target_col):
# boolean, conf = satisfy_confidence(md, relation, 0.8, target_col)
# if not boolean:
# l.remove(md)
# print(md, '\t', conf)
def inference_from_record_pairs(path, threshold, target_col):
    """Infer matching dependencies (MDs) by comparing every pair of CSV records.

    An MD is a dict mapping each column name to a similarity threshold; the
    entry for ``target_col`` is the right-hand side, all other entries form the
    left-hand side. The search starts from a single MD with 0 for every
    left-hand column and 1 for ``target_col``. Whenever a record pair satisfies
    the left side of an MD but not its right side, that MD is removed and
    replaced by copies in which one left-hand threshold is raised.

    Args:
        path: CSV file to read (ISO-8859-1). Missing values become ``""`` and
            every cell is cast to ``str``.
        threshold: Smallest value a raised left-hand threshold may take.
        target_col: Name of the right-hand-side column.

    Returns:
        A tuple ``(md_list, minimal_vio)``. ``md_list`` holds the MDs left
        after a final minimality filter. ``minimal_vio`` is always an empty
        list at present, because the code that used to fill it is disabled.
    """
    data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
    data.fillna("", inplace=True)
    data = data.astype(str)
    columns = data.columns.values.tolist()
    other_columns = list(set(columns) - {target_col})

    # Rows as plain tuples with positional lookup. Attribute access on
    # itertuples() namedtuples fails for column names that are not valid
    # identifiers, because pandas renames such fields to positional names.
    column_position = {col: pos for pos, col in enumerate(columns)}
    records = list(data.itertuples(index=False, name=None))

    # Slack added to a similarity before comparing it with a threshold.
    eps = 0.0000001

    md_list = [{col: 1 if col == target_col else 0 for col in columns}]
    minimal_vio = []

    for i, row1 in enumerate(tqdm(records)):
        # Pair the current row with every row after it.
        for row2 in records[i + 1:]:
            # Per-column similarity of the two rows.
            # NOTE(review): the raw cell values (strings after the cast above)
            # are passed to norm_cos_sim -- confirm it accepts them.
            sims = {
                col: norm_cos_sim(row1[pos], row2[pos])
                for col, pos in column_position.items()
            }

            # Collect the MDs violated by this pair and drop them from md_list.
            violated_mds = []
            for md in md_list[:]:
                lhs_satisfied = all(
                    not sims[col] + eps < md[col] for col in other_columns
                )
                rhs_satisfied = not sims[target_col] + eps < md[target_col]
                if lhs_satisfied and not rhs_satisfied:
                    md_list.remove(md)
                    violated_mds.append(md)

            # Specialize only the left-hand side of each violated MD; the
            # right-hand threshold is left unchanged.
            for vio_md in violated_mds:
                for col in other_columns:
                    if sims[col] + 0.01 <= 1:
                        spec_l_md = copy.deepcopy(vio_md)
                        spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.01
                        if is_minimal(spec_l_md, md_list, target_col):
                            md_list.append(spec_l_md)

    # Final pass: iterate over a snapshot while removing non-minimal MDs from
    # the live list, so each check sees the removals made so far.
    for candidate in md_list[:]:
        if not is_minimal(candidate, md_list, target_col):
            md_list.remove(candidate)

    return md_list, minimal_vio
def get_mds_metadata(md_list, dataset_path, target_col):
    """Compute support and confidence for every MD in ``md_list`` in parallel.

    Args:
        md_list: MDs (dicts of per-column thresholds) to evaluate.
        dataset_path: CSV file to evaluate against (ISO-8859-1). Missing values
            become ``""`` and every cell is cast to ``str``.
        target_col: Name of the right-hand-side column.

    Returns:
        A list with one ``{"md", "support", "confidence"}`` dict per MD, in the
        order of ``md_list``; an empty list when ``md_list`` is empty.
    """
    if len(md_list) == 0:
        # Nothing to evaluate: skip reading the dataset and starting workers.
        return []

    data = pd.read_csv(dataset_path, low_memory=False, encoding='ISO-8859-1')
    data.fillna("", inplace=True)
    data = data.astype(str)

    pool_size = min(len(md_list), 16)
    result = []
    # The context manager tears the pool down even if a worker raises.
    with multiprocessing.Pool(pool_size) as pool:
        # Submit everything first; calling get() right after each submission
        # would wait for that task and run the MDs one at a time.
        pending = [
            (md, pool.apply_async(get_one_md_metadata, args=(md, data, target_col)))
            for md in md_list
        ]
        pool.close()
        for md, task in pending:
            support, confidence = task.get()
            result.append({"md": md, "support": support, "confidence": confidence})
        pool.join()
    return result
def get_one_md_metadata(md, dataframe, target_col):
    """Compute support and confidence of one MD over all record pairs.

    For every unordered pair of rows, the left side is satisfied when each
    non-target column's similarity reaches the MD's threshold for that column.
    The right side is satisfied when the target column's similarity reaches 1;
    the MD's own target threshold is not consulted here.

    Args:
        md: Dict mapping column names to thresholds.
        dataframe: Records to compare; cell values are passed to
            ``norm_cos_sim`` as they are.
        target_col: Name of the right-hand-side column.

    Returns:
        ``(support, confidence)``: the number of pairs satisfying the left
        side, and the fraction of those that also satisfy the right side
        (``0`` when the support is zero).
    """
    columns = dataframe.columns.values.tolist()
    # Plain tuples and positional pairing. This avoids attribute lookups that
    # break on column names pandas has to rename, and avoids treating index
    # labels as row positions.
    records = list(dataframe.itertuples(index=False, name=None))

    # Slack added to a similarity before comparing it with a threshold.
    eps = 0.0000001
    support = 0
    pre_confidence = 0

    for i, row1 in enumerate(records):
        for row2 in records[i + 1:]:
            left_satisfy = True
            both_satisfy = True
            for pos, col in enumerate(columns):
                sim = norm_cos_sim(row1[pos], row2[pos])
                if col == target_col:
                    if sim + eps < 1:
                        both_satisfy = False
                elif sim + eps < md[col]:
                    left_satisfy = False
                    both_satisfy = False
                    # Both flags are final now; skip the remaining columns.
                    break
            if left_satisfy:
                support += 1
            if both_satisfy:
                pre_confidence += 1

    confidence = 0 if support == 0 else pre_confidence / support
    return support, confidence