You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
161 lines
5.6 KiB
161 lines
5.6 KiB
import os
|
|
import sys
|
|
import pandas as pd
|
|
import networkx
|
|
from networkx import draw, Graph
|
|
import pyjedai
|
|
from pyjedai.utils import (
|
|
text_cleaning_method,
|
|
print_clusters,
|
|
print_blocks,
|
|
print_candidate_pairs
|
|
)
|
|
from pyjedai.block_building import (
|
|
StandardBlocking,
|
|
QGramsBlocking,
|
|
ExtendedQGramsBlocking,
|
|
SuffixArraysBlocking,
|
|
ExtendedSuffixArraysBlocking,
|
|
)
|
|
from pyjedai.comparison_cleaning import (
|
|
WeightedEdgePruning,
|
|
WeightedNodePruning,
|
|
CardinalityEdgePruning,
|
|
CardinalityNodePruning,
|
|
BLAST,
|
|
ReciprocalCardinalityNodePruning,
|
|
ReciprocalWeightedNodePruning,
|
|
ComparisonPropagation
|
|
)
|
|
from pyjedai.evaluation import Evaluation
|
|
from pyjedai.datamodel import Data
|
|
from pyjedai.block_cleaning import BlockPurging
|
|
from pyjedai.block_cleaning import BlockFiltering
|
|
from pyjedai.matching import EntityMatching
|
|
from pyjedai.clustering import ConnectedComponentsClustering, UniqueMappingClustering
|
|
|
|
from settings import er_output_dir, ltable_path, rtable_path, ltable_id, rtable_id, mapping_lid, mapping_rid, \
|
|
mapping_path
|
|
|
|
|
|
def example():
    """Run an end-to-end pyJedAI entity-resolution pipeline on the Abt-Buy dataset.

    Reads the two source tables and the ground truth, then performs block
    building, block purging/filtering, meta-blocking, entity matching and
    clustering.  The resulting matched id pairs are written to
    ``<er_output_dir>/result.csv`` and returned.

    Returns:
        pandas.DataFrame: the clustered match pairs (columns include the
        entity ids of both tables, e.g. ``id1``/``id2``).
    """
    # Read data. na_filter=False keeps empty cells as '' rather than NaN.
    # NOTE(review): absolute machine-specific paths — consider moving these
    # into settings.py alongside ltable_path/rtable_path.
    d1 = pd.read_csv(r"E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\abt.csv",
                     sep='|', engine='python', na_filter=False)
    d2 = pd.read_csv(r"E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\buy.csv",
                     sep='|', engine='python', na_filter=False)
    gt = pd.read_csv(r"E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\gt.csv",
                     sep='|', engine='python')

    data = Data(dataset_1=d1,
                id_column_name_1='id',
                dataset_2=d2,
                id_column_name_2='id',
                ground_truth=gt)

    # Clean data (optional) — all cleaning steps are currently disabled.
    data.clean_dataset(remove_stopwords=False,
                       remove_punctuation=False,
                       remove_numbers=False,
                       remove_unicodes=False)

    # Block building on the 'name' attribute of both tables.
    bb = StandardBlocking()
    blocks = bb.build_blocks(data, attributes_1=['name'], attributes_2=['name'])

    # Block purging (optional): drop oversized, low-signal blocks.
    bp = BlockPurging()
    cleaned_blocks = bp.process(blocks, data, tqdm_disable=False)

    # Block cleaning (optional).
    # todo ratio
    bf = BlockFiltering(ratio=0.8)
    filtered_blocks = bf.process(cleaned_blocks, data, tqdm_disable=False)

    # Comparison cleaning - meta-blocking (optional).
    # todo meta_blocking methods, weighting_scheme (more)
    mb = WeightedEdgePruning(weighting_scheme='EJS')
    candidate_pairs_blocks = mb.process(filtered_blocks, data, tqdm_disable=True)

    # Entity matching.
    # todo parameters(qgram, similarity_threshold)
    em = EntityMatching(
        metric='cosine',
        tokenizer='char_tokenizer',
        vectorizer='tfidf',
        qgram=3,
        similarity_threshold=0.0
    )

    # Visualize the undirected weighted similarity graph.
    pairs_graph = em.predict(candidate_pairs_blocks, data, tqdm_disable=True)
    draw(pairs_graph)

    # Entity clustering.
    # todo similarity_threshold
    ccc = UniqueMappingClustering()
    clusters = ccc.process(pairs_graph, data, similarity_threshold=0.17)
    result_df = ccc.export_to_df(clusters)
    # BUG FIX: build the output path portably instead of concatenating a
    # hard-coded Windows backslash separator.
    p = os.path.join(er_output_dir, 'result.csv')
    result_df.to_csv(p, sep=',', index=False, header=True, quoting=1)
    _ = ccc.evaluate(clusters)
    return result_df
|
|
|
|
|
|
if __name__ == '__main__':
    # Run the ER pipeline; cast everything to str so id comparisons below
    # are type-consistent across rdf, the tables and the mapping.
    rdf = example()
    rdf = rdf.astype(str)
    ltable = pd.read_csv(ltable_path, sep='|', engine='python', na_filter=False)
    rtable = pd.read_csv(rtable_path, sep='|', engine='python', na_filter=False)
    mapping = pd.read_csv(mapping_path, sep='|', engine='python')
    ltable = ltable.astype(str)
    rtable = rtable.astype(str)
    mapping = mapping.astype(str)

    # Prefix every column so the cross join below yields unambiguous names
    # (ltable_<col> / rtable_<col>).
    lcolumns_dict = {col: 'ltable_' + col for col in ltable.columns}
    rcolumns_dict = {col: 'rtable_' + col for col in rtable.columns}

    # Restrict each table to rows that appear in the ER result, then build
    # their Cartesian product via a constant 'key' column.
    result_lid_list = rdf['id1'].tolist()
    selected_ltable = ltable[ltable[ltable_id].isin(result_lid_list)]
    selected_ltable = selected_ltable.rename(columns=lcolumns_dict)
    selected_ltable['key'] = 1
    result_rid_list = rdf['id2'].tolist()
    selected_rtable = rtable[rtable[rtable_id].isin(result_rid_list)]
    selected_rtable = selected_rtable.rename(columns=rcolumns_dict)
    selected_rtable['key'] = 1
    predictions = pd.merge(selected_ltable, selected_rtable, on='key')
    predictions.drop(columns='key', inplace=True)
    predictions = predictions.reset_index(drop=True)

    # Label every candidate pair: 'gold' = pair is in the ground-truth
    # mapping, 'predicted' = pair was emitted by the ER pipeline.
    predictions['gold'] = '0'
    predictions['predicted'] = '0'
    gold_match_rows = []
    predicted_match_rows = []
    for tuple_ in predictions.itertuples():
        lid = getattr(tuple_, 'ltable_' + ltable_id)
        # Hoisted: the right-side id was recomputed for every comparison.
        rid_value = getattr(tuple_, 'rtable_' + rtable_id)
        map_row = mapping[mapping[mapping_lid] == lid]
        result_row = rdf[rdf['id1'] == lid]
        # BUG FIX: boolean indexing returns a (possibly empty) DataFrame,
        # never None, so the original `is not None` checks were always
        # true; test emptiness instead.
        if not map_row.empty:
            for value in map_row[mapping_rid]:
                if value == rid_value:
                    gold_match_rows.append(tuple_[0])
        if not result_row.empty:
            for value in result_row['id2']:
                if value == rid_value:
                    predicted_match_rows.append(tuple_[0])
    for row_idx in gold_match_rows:
        predictions.loc[row_idx, 'gold'] = '1'
    for row_idx in predicted_match_rows:
        predictions.loc[row_idx, 'predicted'] = '1'

    predictions['confidence'] = 0
    predicted_match = predictions[predictions['predicted'] == '1']
    print(1)