parent
d03d300f8c
commit
a6a58e178f
@ -0,0 +1,75 @@
|
||||
import time
|
||||
import pandas as pd
|
||||
import py_entitymatching as em
|
||||
import py_entitymatching.catalog.catalog_manager as cm
|
||||
from tqdm import tqdm
|
||||
|
||||
from md_discovery.md_mining import mining
|
||||
from settings import *
|
||||
|
||||
|
||||
def _label_gold(candidate, mappings, l_key_col, r_key_col):
    """Return a 0/1 label per candidate row: 1 if the pair is in ``mappings``.

    Args:
        candidate: Candidate-pair DataFrame containing ``l_key_col`` and
            ``r_key_col``.
        mappings: Ground-truth DataFrame; its ``ltable_id`` and ``rtable_id``
            columns hold the ids of each true matching pair.
        l_key_col: Candidate column holding the left record id.
        r_key_col: Candidate column holding the right record id.

    Returns:
        list[int]: One label per candidate row, in row order.
    """
    # Build the lookup once so labelling is a single pass over the candidates
    # instead of a full candidate scan per mapping row.
    gold_pairs = set(zip(mappings['ltable_id'], mappings['rtable_id']))
    candidate_pairs = zip(candidate[l_key_col], candidate[r_key_col])
    return [int(pair in gold_pairs) for pair in candidate_pairs]


def _sample_training_pairs(candidate, negatives_per_match=3):
    """Return all gold matches plus a random sample of non-matches.

    Args:
        candidate: Labelled candidate DataFrame with a 0/1 ``gold`` column.
        negatives_per_match: Desired number of non-matches per match.

    Returns:
        pandas.DataFrame: Sampled non-matches followed by all matches, with a
        fresh 0..n-1 index.
    """
    matches = candidate[candidate['gold'] == 1]
    mismatches = candidate[candidate['gold'] == 0]
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (replace=False), so never request more non-matches than are available.
    sample_size = min(len(mismatches), negatives_per_match * len(matches))
    mismatches = mismatches.sample(n=sample_size)
    # Concatenation keeps the original row labels; reset them so the index
    # has no duplicates or gaps.
    return pd.concat([mismatches, matches]).reset_index(drop=True)


def blocking_mining():
    """Block the two tables, label candidate pairs, and mine on a train split.

    Reads the left table, right table and ground-truth mapping from the paths
    configured in ``settings``, generates candidate pairs with an overlap
    blocker, marks the pairs listed in the mapping as gold matches, samples up
    to three non-matches per match, splits the result 50/50 into train and
    test sets, prints the elapsed blocking time in seconds, and passes the
    training set to ``mining``.

    Returns:
        int: Always ``1``.
    """
    start = time.time()
    ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
    cm.set_key(ltable, ltable_id)
    rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
    cm.set_key(rtable, rtable_id)
    mappings = pd.read_csv(mapping_path, encoding='ISO-8859-1')

    # NOTE(review): the left table's column list is used as the output
    # attributes for both sides, which presumably assumes the two tables
    # share a schema - confirm.
    attributes = ltable.columns.values.tolist()
    l_key_col = 'ltable_' + ltable_id
    r_key_col = 'rtable_' + rtable_id

    blocker = em.OverlapBlocker()
    candidate = blocker.block_tables(ltable, rtable, ltable_block_attr, rtable_block_attr, allow_missing=True,
                                     l_output_attrs=attributes, r_output_attrs=attributes, n_jobs=1,
                                     overlap_size=1, show_progress=False)
    candidate = candidate.reset_index(drop=True)

    # Label the candidate pairs using the mapping table.
    candidate['gold'] = _label_gold(candidate, mappings, l_key_col, r_key_col)
    candidate.fillna(value="", inplace=True)

    candidate_for_train_test = _sample_training_pairs(candidate)
    # The sampled frame is a new object, so its catalog metadata must be
    # registered before it is handed to split_train_test.
    cm.set_key(candidate_for_train_test, '_id')
    cm.set_fk_ltable(candidate_for_train_test, l_key_col)
    cm.set_fk_rtable(candidate_for_train_test, r_key_col)
    cm.set_ltable(candidate_for_train_test, ltable)
    cm.set_rtable(candidate_for_train_test, rtable)

    # Split into training and test sets.
    train_proportion = 0.5
    sets = em.split_train_test(candidate_for_train_test, train_proportion=train_proportion, random_state=0)
    train_set = sets['train']
    end_blocking = time.time()
    print(end_blocking - start)

    mining(train_set)
    return 1
|
||||
|
||||
|
||||
def matching():
    """Placeholder for the matching stage; it currently performs no work.

    Returns:
        int: Always ``1``.
    """
    status = 1
    return status
|
||||
|
||||
|
||||
# Script entry point: run the blocking + mining pipeline only when this file
# is executed directly, not when it is imported.
if __name__ == '__main__':
    blocking_mining()
|
Loading…
Reference in new issue