parent
0835b6df1c
commit
253eb0835f
@ -0,0 +1,4 @@
/ml_er/output/*
/md_discovery/output/*
/hpo/output/*
/datasets/*
@ -0,0 +1,93 @@
import json
import os

import pandas as pd
from ConfigSpace import Categorical, Configuration, ConfigurationSpace
from ConfigSpace.read_and_write import json as csj
from smac import HyperparameterOptimizationFacade, Scenario

from ml_er.ml_entity_resolver import er_process
from settings import hpo_output_dir, ltable_path, ltable_id


class Classifier:
    """Wraps the ER pipeline as a SMAC target: a search space plus a train() cost function."""

    @property
    def configspace(self) -> ConfigurationSpace:
        cs = ConfigurationSpace(seed=0)

        # Every non-id column of the left table is a candidate blocking attribute.
        ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1', sep='|')
        selected_attrs = ltable.columns.values.tolist()
        block_attr_items = selected_attrs[:]
        block_attr_items.remove(ltable_id)

        # Block building: which pyJedAI blocker to use, and on which attribute.
        jed_blocker = Categorical("jed_blocker",
                                  ["Standard", "QGrams", "ExtendedQG", "SuffixArrays", "ExtendedSA"])
        block_attr = Categorical("block_attr", block_attr_items)
        # filter_ratio = Float("filter_ratio", (0.0, 1.0), default=0.8)  # needs ConfigSpace.Float if re-enabled

        # Comparison cleaning: meta-blocking algorithm and its weighting scheme.
        meta_blocker = Categorical("meta_blocker",
                                   ["WEP", "WNP", "CEP", "CNP", "BLAST", "RCNP", "RWNP", "CP"])
        weighting_scheme = Categorical("weighting_scheme",
                                       ['CN-CBS', 'CBS', 'SN-CBS', 'CNC', 'SNC', 'SND', 'CND', 'CNJ', 'SNJ',
                                        'COSINE', 'DICE', 'ECBS', 'JS', 'EJS', 'X2'])

        # Entity matching: similarity metric, tokenizer and vectorizer.
        # TODO: expose further hyperparameters (e.g. qgram size, similarity threshold).
        matching_metric = Categorical("matching_metric", ['cosine', 'euclidean'])
        matching_tokenizer = Categorical("matching_tokenizer",
                                         ['char_tokenizer', 'word_tokenizer', 'white_space_tokenizer'])
        matching_vectorizer = Categorical("matching_vectorizer", ['tfidf', 'tf', 'boolean'])

        # Entity clustering algorithm.
        clusteror = Categorical("clusteror_name",
                                ["CCC", "UMC", "EC", "CenterC", "BMC", "MCC", "CC", "CTC", "MCL", "KMAC", "RSRC"])

        cs.add_hyperparameters([jed_blocker, block_attr, meta_blocker, weighting_scheme, matching_metric,
                                matching_tokenizer, matching_vectorizer, clusteror])
        return cs
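
    # ConfigSpace takes the first listed choice of each Categorical as its
    # default, so cs.get_default_configuration() in ml_er_hpo() corresponds to
    # the first-listed option everywhere. A quick smoke test of the space
    # (a sketch, assuming the CSV paths in settings are valid):
    #   print(Classifier().configspace.sample_configuration())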

    def train(self, config: Configuration, seed: int = 0) -> float:
        # SMAC minimizes the returned cost, so we return 1 - performance,
        # where er_process() runs the ER pipeline under the given config.
        indicators = er_process(config)
        return 1 - indicators['performance']


def ml_er_hpo():
    classifier = Classifier()
    cs = classifier.configspace

    # Serialize the search space so later runs can reconstruct it.
    str_configspace = csj.write(cs)
    dict_configspace = json.loads(str_configspace)
    with open(os.path.join(hpo_output_dir, "configspace.json"), "w") as f:
        json.dump(dict_configspace, f, indent=4)

    scenario = Scenario(
        cs,
        deterministic=True,
        n_trials=50,  # run at most 50 trials (combinations of config and seed)
        n_workers=1
    )

    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)

    smac = HyperparameterOptimizationFacade(
        scenario,
        classifier.train,
        initial_design=initial_design,
        overwrite=True,  # if the run exists, overwrite it; alternatively, continue from the last state
    )
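
    # With deterministic=True SMAC fixes the seed, so each of the 50 trials
    # calls classifier.train(config, seed) on a distinct configuration.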

    incumbent = smac.optimize()
    incumbent_cost = smac.validate(incumbent)
    default = cs.get_default_configuration()
    default_cost = smac.validate(default)
    print(f"Default Cost: {default_cost}")
    print(f"Incumbent Cost: {incumbent_cost}")

    # Fall back to the default configuration if the search did not beat it.
    if incumbent_cost > default_cost:
        incumbent = default
        print(f"Updated Incumbent Cost: {default_cost}")

    print(f"Optimized Configuration: {dict(incumbent)}")

    with open(os.path.join(hpo_output_dir, "incumbent.json"), "w") as f:
        json.dump(dict(incumbent), f, indent=4)
    return incumbent
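
# The two JSON files written above can be rebuilt into a Configuration later
# (a sketch, not part of the original script):
#   with open(os.path.join(hpo_output_dir, "configspace.json")) as f:
#       space = csj.read(f.read())
#   with open(os.path.join(hpo_output_dir, "incumbent.json")) as f:
#       config = Configuration(space, values=json.load(f))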


if __name__ == '__main__':
    ml_er_hpo()
@ -0,0 +1,160 @@
import os

import pandas as pd
from networkx import draw

# Only StandardBlocking and WeightedEdgePruning are used below; the remaining
# blockers and pruners are the options exposed to the HPO search space.
from pyjedai.block_building import (
    StandardBlocking,
    QGramsBlocking,
    ExtendedQGramsBlocking,
    SuffixArraysBlocking,
    ExtendedSuffixArraysBlocking,
)
from pyjedai.comparison_cleaning import (
    WeightedEdgePruning,
    WeightedNodePruning,
    CardinalityEdgePruning,
    CardinalityNodePruning,
    BLAST,
    ReciprocalCardinalityNodePruning,
    ReciprocalWeightedNodePruning,
    ComparisonPropagation
)
from pyjedai.datamodel import Data
from pyjedai.block_cleaning import BlockPurging, BlockFiltering
from pyjedai.matching import EntityMatching
from pyjedai.clustering import UniqueMappingClustering

from settings import er_output_dir, ltable_path, rtable_path, ltable_id, rtable_id, mapping_lid, mapping_rid, \
    mapping_path


def example():
    # Read the two source tables and the ground truth. na_filter=False keeps
    # empty strings instead of NaN.
    d1 = pd.read_csv(r"E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\abt.csv",
                     sep='|', engine='python', na_filter=False)
    d2 = pd.read_csv(r"E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\buy.csv",
                     sep='|', engine='python', na_filter=False)
    gt = pd.read_csv(r"E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\gt.csv",
                     sep='|', engine='python')

    data = Data(dataset_1=d1,
                id_column_name_1='id',
                dataset_2=d2,
                id_column_name_2='id',
                ground_truth=gt)

    # Clean data (optional).
    data.clean_dataset(remove_stopwords=False,
                       remove_punctuation=False,
                       remove_numbers=False,
                       remove_unicodes=False)

    # Block building.
    bb = StandardBlocking()
    blocks = bb.build_blocks(data, attributes_1=['name'], attributes_2=['name'])

    # Block purging (optional): drop oversized blocks.
    bp = BlockPurging()
    cleaned_blocks = bp.process(blocks, data, tqdm_disable=False)

    # Block cleaning (optional).
    # TODO: tune the filtering ratio.
    bf = BlockFiltering(ratio=0.8)
    filtered_blocks = bf.process(cleaned_blocks, data, tqdm_disable=False)
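
    # Pipeline order matters: purging first removes blocks too large to be
    # informative, then filtering keeps each entity only in its best blocks
    # (per the ratio), shrinking the candidate pairs the meta-blocker weighs.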

    # Comparison cleaning - meta-blocking (optional).
    # TODO: try other meta-blocking methods and weighting schemes.
    mb = WeightedEdgePruning(weighting_scheme='EJS')
    candidate_pairs_blocks = mb.process(filtered_blocks, data, tqdm_disable=True)

    # Entity matching.
    # TODO: expose parameters (qgram, similarity_threshold) to the HPO space.
    em = EntityMatching(
        metric='cosine',
        tokenizer='char_tokenizer',
        vectorizer='tfidf',
        qgram=3,
        similarity_threshold=0.0
    )

    # Visualize the undirected weighted similarity graph. Note that draw()
    # renders via matplotlib, so a standalone script additionally needs
    # matplotlib.pyplot.show() to actually display the figure.
    pairs_graph = em.predict(candidate_pairs_blocks, data, tqdm_disable=True)
    draw(pairs_graph)
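
    # pairs_graph is a networkx graph: nodes are record ids and edge weights
    # are match similarities; it is the direct input to the clusterer below.
    # A quick inspection sketch (not in the original script):
    #   top = sorted(pairs_graph.edges(data='weight'), key=lambda e: -e[2])[:10]
    #   print(top)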

    # Entity clustering.
    # TODO: tune similarity_threshold.
    ccc = UniqueMappingClustering()
    clusters = ccc.process(pairs_graph, data, similarity_threshold=0.17)
    result_df = ccc.export_to_df(clusters)
    p = os.path.join(er_output_dir, 'result.csv')
    result_df.to_csv(p, sep=',', index=False, header=True, quoting=1)
    _ = ccc.evaluate(clusters)
    return result_df
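
# export_to_df yields one matched pair per row with columns 'id1' (left id)
# and 'id2' (right id); the evaluation block below depends on those names.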


if __name__ == '__main__':
    rdf = example()
    # Compare everything as strings so ids from the CSVs and from pyJedAI agree.
    rdf = rdf.astype(str)
    ltable = pd.read_csv(ltable_path, sep='|', engine='python', na_filter=False)
    rtable = pd.read_csv(rtable_path, sep='|', engine='python', na_filter=False)
    mapping = pd.read_csv(mapping_path, sep='|', engine='python')
    ltable = ltable.astype(str)
    rtable = rtable.astype(str)
    mapping = mapping.astype(str)

    # Prefix column names so the two tables can share one frame.
    lcolumns_dict = {}
    rcolumns_dict = {}
    ltable_attrs = ltable.columns.values.tolist()
    rtable_attrs = rtable.columns.values.tolist()
    for attr in ltable_attrs:
        lcolumns_dict[attr] = 'ltable_' + attr
    for attr in rtable_attrs:
        rcolumns_dict[attr] = 'rtable_' + attr

    # Keep only rows that appear in the ER result, then build the cross
    # product of the selected left and right rows via a constant merge key.
    result_lid_list = rdf['id1'].tolist()
    selected_ltable = ltable[ltable[ltable_id].isin(result_lid_list)]
    selected_ltable = selected_ltable.rename(columns=lcolumns_dict)
    selected_ltable['key'] = 1
    result_rid_list = rdf['id2'].tolist()
    selected_rtable = rtable[rtable[rtable_id].isin(result_rid_list)]
    selected_rtable = selected_rtable.rename(columns=rcolumns_dict)
    selected_rtable['key'] = 1
    predictions = pd.merge(selected_ltable, selected_rtable, on='key')
    predictions.drop(columns='key', inplace=True)
    predictions = predictions.reset_index(drop=True)
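
    # Note: the constant-key merge is a full cross join, so predictions has
    # |selected_ltable| x |selected_rtable| rows; each row is then labeled
    # against the ground-truth mapping and the ER output below.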

    predictions['gold'] = '0'
    predictions['predicted'] = '0'
    gold_match_rows = []
    predicted_match_rows = []
    for tuple_ in predictions.itertuples():
        lid = getattr(tuple_, 'ltable_' + ltable_id)
        map_row = mapping[mapping[mapping_lid] == lid]
        result_row = rdf[rdf['id1'] == lid]
        # A boolean filter returns a (possibly empty) DataFrame, never None,
        # so test emptiness rather than identity.
        if not map_row.empty:
            rid = map_row[mapping_rid]
            for value in rid:
                if value == getattr(tuple_, 'rtable_' + rtable_id):
                    gold_match_rows.append(tuple_[0])
        if not result_row.empty:
            rid = result_row['id2']
            for value in rid:
                if value == getattr(tuple_, 'rtable_' + rtable_id):
                    predicted_match_rows.append(tuple_[0])
    for row in gold_match_rows:
        predictions.loc[row, 'gold'] = '1'
    for row in predicted_match_rows:
        predictions.loc[row, 'predicted'] = '1'

    predictions['confidence'] = 0
    predicted_match = predictions[predictions['predicted'] == '1']
    print(f"predicted matches: {len(predicted_match)}")
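
    # With 'gold' and 'predicted' filled in, standard metrics follow directly
    # (a sketch, not part of the original script):
    #   tp = len(predictions[(predictions['gold'] == '1') & (predictions['predicted'] == '1')])
    #   precision = tp / max(len(predicted_match), 1)
    #   recall = tp / max((predictions['gold'] == '1').sum(), 1)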
@ -0,0 +1,22 @@
from sentence_transformers import SentenceTransformer

ltable_path = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\abt.csv'
rtable_path = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\buy.csv'
mapping_path = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\gt.csv'
mapping_lid = 'D1'  # name of the left-table id column in the mapping table
mapping_rid = 'D2'  # name of the right-table id column in the mapping table
ltable_id = 'id'  # id column name of the left table
rtable_id = 'id'  # id column name of the right table
target_attr = 'id'  # target attribute for MD (matching dependency) discovery
# lr_attrs_map = {}  # if corresponding columns are named differently in the two tables, map them here to align the names

model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens')
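# Note: the model above is loaded at import time, so every module that does
# `from settings import ...` pays that startup cost once per process.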
interpre_weight = 0.5  # weight given to interpretability
similarity_threshold = 0.1
support_threshold = 1
confidence_threshold = 0.25

er_output_dir = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\ml_er\output'
md_output_dir = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\md_discovery\output'
hpo_output_dir = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\hpo\output'