You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

161 lines
5.6 KiB

10 months ago
import os
import sys
import pandas as pd
import networkx
from networkx import draw, Graph
import pyjedai
from pyjedai.utils import (
text_cleaning_method,
print_clusters,
print_blocks,
print_candidate_pairs
)
from pyjedai.block_building import (
StandardBlocking,
QGramsBlocking,
ExtendedQGramsBlocking,
SuffixArraysBlocking,
ExtendedSuffixArraysBlocking,
)
from pyjedai.comparison_cleaning import (
WeightedEdgePruning,
WeightedNodePruning,
CardinalityEdgePruning,
CardinalityNodePruning,
BLAST,
ReciprocalCardinalityNodePruning,
ReciprocalWeightedNodePruning,
ComparisonPropagation
)
from pyjedai.evaluation import Evaluation
from pyjedai.datamodel import Data
from pyjedai.block_cleaning import BlockPurging
from pyjedai.block_cleaning import BlockFiltering
from pyjedai.matching import EntityMatching
from pyjedai.clustering import ConnectedComponentsClustering, UniqueMappingClustering
from settings import er_output_dir, ltable_path, rtable_path, ltable_id, rtable_id, mapping_lid, mapping_rid, \
mapping_path
def example(
    d1_path=r"E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\abt.csv",
    d2_path=r"E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\buy.csv",
    gt_path=r"E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\gt.csv",
):
    """Run a pyJedAI entity-resolution workflow and return the clustered pairs.

    The workflow reads two '|'-separated tables and a ground-truth file, then
    applies block building, block purging, block filtering, meta-blocking,
    entity matching and unique-mapping clustering. The clustering result is
    written to ``result.csv`` inside ``er_output_dir`` and also returned.

    Args:
        d1_path: Path of the first dataset (CSV with '|' separator).
        d2_path: Path of the second dataset (CSV with '|' separator).
        gt_path: Path of the ground-truth file (CSV with '|' separator).

    Returns:
        The DataFrame produced by ``export_to_df`` for the clusters. The
        script section of this file reads its ``id1`` and ``id2`` columns.
    """
    # Read data. Empty strings are kept as-is in the datasets (na_filter=False).
    d1 = pd.read_csv(d1_path, sep='|', engine='python', na_filter=False)
    d2 = pd.read_csv(d2_path, sep='|', engine='python', na_filter=False)
    gt = pd.read_csv(gt_path, sep='|', engine='python')
    data = Data(dataset_1=d1,
                id_column_name_1='id',
                dataset_2=d2,
                id_column_name_2='id',
                ground_truth=gt)

    # Clean data (optional); every cleaning option is switched off here.
    data.clean_dataset(remove_stopwords=False,
                       remove_punctuation=False,
                       remove_numbers=False,
                       remove_unicodes=False)

    # Block building on the 'name' attribute of both datasets.
    bb = StandardBlocking()
    blocks = bb.build_blocks(data, attributes_1=['name'], attributes_2=['name'])

    # Block purging (optional).
    bp = BlockPurging()
    cleaned_blocks = bp.process(blocks, data, tqdm_disable=False)

    # Block filtering (optional).
    # TODO: tune ratio
    bf = BlockFiltering(ratio=0.8)
    filtered_blocks = bf.process(cleaned_blocks, data, tqdm_disable=False)

    # Comparison cleaning / meta-blocking (optional).
    # TODO: try other meta-blocking methods and weighting schemes
    mb = WeightedEdgePruning(weighting_scheme='EJS')
    candidate_pairs_blocks = mb.process(filtered_blocks, data, tqdm_disable=True)

    # Entity matching.
    # TODO: tune parameters (qgram, similarity_threshold)
    em = EntityMatching(
        metric='cosine',
        tokenizer='char_tokenizer',
        vectorizer='tfidf',
        qgram=3,
        similarity_threshold=0.0
    )
    pairs_graph = em.predict(candidate_pairs_blocks, data, tqdm_disable=True)
    # Visualize the undirected weighted graph of candidate pairs.
    draw(pairs_graph)

    # Entity clustering.
    # TODO: tune similarity_threshold
    clustering = UniqueMappingClustering()
    clusters = clustering.process(pairs_graph, data, similarity_threshold=0.17)
    result_df = clustering.export_to_df(clusters)

    # os.path.join keeps the output path valid on every OS, unlike appending
    # a literal backslash to the directory name.
    output_path = os.path.join(er_output_dir, 'result.csv')
    result_df.to_csv(output_path, sep=',', index=False, header=True, quoting=1)

    # Called for its evaluation side effect; the returned value is not used.
    _ = clustering.evaluate(clusters)
    return result_df
def build_predictions(result_df, ltable, rtable, mapping,
                      lid_col, rid_col, map_lid_col, map_rid_col):
    """Cross-join the matched left/right records and label every pair.

    Values are compared as given, so callers should convert all frames to
    strings first (the script section below does this with ``astype(str)``).

    Args:
        result_df: Predicted matches with columns ``id1`` (left id) and
            ``id2`` (right id).
        ltable: Left table; must contain the column ``lid_col``.
        rtable: Right table; must contain the column ``rid_col``.
        mapping: Gold-standard matches with the columns ``map_lid_col`` and
            ``map_rid_col``.
        lid_col: Name of the id column in ``ltable``.
        rid_col: Name of the id column in ``rtable``.
        map_lid_col: Name of the left-id column in ``mapping``.
        map_rid_col: Name of the right-id column in ``mapping``.

    Returns:
        A DataFrame holding the cross product of the left rows whose id occurs
        in ``result_df['id1']`` and the right rows whose id occurs in
        ``result_df['id2']``. Left columns are prefixed with ``ltable_`` and
        right columns with ``rtable_``. Three columns are appended: ``gold``
        and ``predicted`` ('1' when the id pair occurs in ``mapping`` /
        ``result_df``, otherwise '0') and ``confidence`` (always 0).
    """
    # Keep only records that take part in at least one predicted match.
    left = ltable[ltable[lid_col].isin(result_df['id1'].tolist())].add_prefix('ltable_')
    right = rtable[rtable[rid_col].isin(result_df['id2'].tolist())].add_prefix('rtable_')

    # The prefixes guarantee disjoint column names, so no suffixes are added.
    predictions = pd.merge(left, right, how='cross').reset_index(drop=True)

    # Set lookups replace a per-row scan of the mapping and result frames.
    gold_pairs = set(zip(mapping[map_lid_col], mapping[map_rid_col]))
    predicted_pairs = set(zip(result_df['id1'], result_df['id2']))
    candidate_pairs = list(zip(predictions['ltable_' + lid_col],
                               predictions['rtable_' + rid_col]))

    predictions['gold'] = pd.Series(
        ['1' if pair in gold_pairs else '0' for pair in candidate_pairs],
        index=predictions.index, dtype=object)
    predictions['predicted'] = pd.Series(
        ['1' if pair in predicted_pairs else '0' for pair in candidate_pairs],
        index=predictions.index, dtype=object)
    predictions['confidence'] = 0
    return predictions


if __name__ == '__main__':
    # Every frame is converted to strings so ids compare equal regardless of
    # the dtype pandas inferred while reading.
    rdf = example().astype(str)
    ltable = pd.read_csv(ltable_path, sep='|', engine='python', na_filter=False).astype(str)
    rtable = pd.read_csv(rtable_path, sep='|', engine='python', na_filter=False).astype(str)
    mapping = pd.read_csv(mapping_path, sep='|', engine='python').astype(str)

    predictions = build_predictions(rdf, ltable, rtable, mapping,
                                    ltable_id, rtable_id, mapping_lid, mapping_rid)
    predicted_match = predictions[predictions['predicted'] == '1']
    # Output kept from the original script (presumably a debugging anchor).
    print(1)