import os
import sys

import pandas as pd
import networkx
from networkx import draw, Graph

import pyjedai
from pyjedai.utils import (
    text_cleaning_method, print_clusters, print_blocks, print_candidate_pairs
)
from pyjedai.block_building import (
    StandardBlocking,
    QGramsBlocking,
    ExtendedQGramsBlocking,
    SuffixArraysBlocking,
    ExtendedSuffixArraysBlocking,
)
from pyjedai.comparison_cleaning import (
    WeightedEdgePruning,
    WeightedNodePruning,
    CardinalityEdgePruning,
    CardinalityNodePruning,
    BLAST,
    ReciprocalCardinalityNodePruning,
    ReciprocalWeightedNodePruning,
    ComparisonPropagation,
)
from pyjedai.evaluation import Evaluation
from pyjedai.datamodel import Data
from pyjedai.block_cleaning import BlockPurging
from pyjedai.block_cleaning import BlockFiltering
from pyjedai.matching import EntityMatching
from pyjedai.clustering import ConnectedComponentsClustering, UniqueMappingClustering

from settings import er_output_dir, ltable_path, rtable_path, ltable_id, rtable_id, mapping_lid, mapping_rid, \
    mapping_path

# Default location of the JedAI D2 (Abt-Buy) dataset; hoisted to a constant so
# example() no longer hard-codes the same absolute path three times.
_DEFAULT_DATASET_DIR = r"E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2"


def example(dataset_dir=_DEFAULT_DATASET_DIR):
    """Run the full pyJedAI entity-resolution pipeline on the Abt-Buy dataset.

    Pipeline stages: block building -> block purging -> block filtering ->
    meta-blocking (comparison cleaning) -> entity matching -> unique-mapping
    clustering. The clustered matches are both written to
    ``<er_output_dir>/result.csv`` and returned.

    Parameters
    ----------
    dataset_dir : str, optional
        Directory containing ``abt.csv``, ``buy.csv`` and ``gt.csv``
        (pipe-separated). Defaults to the project's local dataset path.

    Returns
    -------
    pd.DataFrame
        Matched pairs as exported by ``UniqueMappingClustering.export_to_df``
        (columns include ``id1``/``id2``).
    """
    # Read the two entity collections and the ground-truth mapping.
    d1 = pd.read_csv(os.path.join(dataset_dir, "abt.csv"), sep='|', engine='python', na_filter=False)
    d2 = pd.read_csv(os.path.join(dataset_dir, "buy.csv"), sep='|', engine='python', na_filter=False)
    gt = pd.read_csv(os.path.join(dataset_dir, "gt.csv"), sep='|', engine='python')
    data = Data(dataset_1=d1, id_column_name_1='id',
                dataset_2=d2, id_column_name_2='id',
                ground_truth=gt)

    # Clean data (optional) — every normalization is deliberately disabled.
    data.clean_dataset(remove_stopwords=False,
                       remove_punctuation=False,
                       remove_numbers=False,
                       remove_unicodes=False)

    # Block building on the 'name' attribute of both collections.
    bb = StandardBlocking()
    blocks = bb.build_blocks(data, attributes_1=['name'], attributes_2=['name'])

    # Block purging (optional).
    bp = BlockPurging()
    cleaned_blocks = bp.process(blocks, data, tqdm_disable=False)

    # Block cleaning (optional).
    # TODO: tune ratio.
    bf = BlockFiltering(ratio=0.8)
    filtered_blocks = bf.process(cleaned_blocks, data, tqdm_disable=False)

    # Comparison cleaning / meta-blocking (optional).
    # TODO: evaluate other meta-blocking methods and weighting schemes.
    mb = WeightedEdgePruning(weighting_scheme='EJS')
    candidate_pairs_blocks = mb.process(filtered_blocks, data, tqdm_disable=True)

    # Entity matching.
    # TODO: tune parameters (qgram, similarity_threshold).
    em = EntityMatching(
        metric='cosine',
        tokenizer='char_tokenizer',
        vectorizer='tfidf',
        qgram=3,
        similarity_threshold=0.0
    )
    pairs_graph = em.predict(candidate_pairs_blocks, data, tqdm_disable=True)
    # Visualize the undirected weighted similarity graph.
    draw(pairs_graph)

    # Entity clustering.
    # TODO: tune similarity_threshold.
    ccc = UniqueMappingClustering()
    clusters = ccc.process(pairs_graph, data, similarity_threshold=0.17)
    result_df = ccc.export_to_df(clusters)
    result_path = os.path.join(er_output_dir, 'result.csv')
    result_df.to_csv(result_path, sep=',', index=False, header=True, quoting=1)
    _ = ccc.evaluate(clusters)
    return result_df


if __name__ == '__main__':
    # Run the ER pipeline, then label every candidate pair with gold /
    # predicted flags for downstream evaluation.
    rdf = example()
    rdf = rdf.astype(str)

    ltable = pd.read_csv(ltable_path, sep='|', engine='python', na_filter=False).astype(str)
    rtable = pd.read_csv(rtable_path, sep='|', engine='python', na_filter=False).astype(str)
    mapping = pd.read_csv(mapping_path, sep='|', engine='python').astype(str)

    # Prefix column names so the joined frame distinguishes left/right attributes.
    lcolumns_dict = {col: 'ltable_' + col for col in ltable.columns}
    rcolumns_dict = {col: 'rtable_' + col for col in rtable.columns}

    # Keep only rows whose ids appear in the ER result, then build the cross
    # product of the surviving left/right rows via a constant join key.
    result_lid_list = rdf['id1'].tolist()
    selected_ltable = ltable[ltable[ltable_id].isin(result_lid_list)].rename(columns=lcolumns_dict)
    selected_ltable['key'] = 1
    result_rid_list = rdf['id2'].tolist()
    selected_rtable = rtable[rtable[rtable_id].isin(result_rid_list)].rename(columns=rcolumns_dict)
    selected_rtable['key'] = 1
    predictions = pd.merge(selected_ltable, selected_rtable, on='key')
    predictions.drop(columns='key', inplace=True)
    predictions = predictions.reset_index(drop=True)
    predictions['gold'] = '0'
    predictions['predicted'] = '0'

    gold_match_rows = []
    predicted_match_rows = []
    for tuple_ in predictions.itertuples():
        lid = getattr(tuple_, 'ltable_' + ltable_id)
        rid = getattr(tuple_, 'rtable_' + rtable_id)
        map_row = mapping[mapping[mapping_lid] == lid]
        result_row = rdf[rdf['id1'] == lid]
        # BUG FIX: boolean-mask indexing never returns None, so the former
        # `is not None` guards were always true; test emptiness instead.
        if not map_row.empty and rid in map_row[mapping_rid].values:
            gold_match_rows.append(tuple_[0])
        if not result_row.empty and rid in result_row['id2'].values:
            predicted_match_rows.append(tuple_[0])

    # Vectorized flag assignment (replaces per-row .loc loops).
    predictions.loc[gold_match_rows, 'gold'] = '1'
    predictions.loc[predicted_match_rows, 'predicted'] = '1'
    predictions['confidence'] = 0
    predicted_match = predictions[predictions['predicted'] == '1']
    print(1)