diff --git a/entrance.py b/entrance.py
index eca4e6a..d51bf8a 100644
--- a/entrance.py
+++ b/entrance.py
@@ -25,4 +25,6 @@ if __name__ == '__main__':
     # use drop to remove columns from the feature vectors? (e.g. drop the id-related features)
     run(1)
     # ml_er(1)
+    # TODO: write the optimization results and parameters to a file,
+    #       output via ml_entity_resolver.ml_er(), together with the parameter configuration
     print(ltable_path)
diff --git a/hpo/er_model_hpo.py b/hpo/er_model_hpo.py
index 395b499..81c1c76 100644
--- a/hpo/er_model_hpo.py
+++ b/hpo/er_model_hpo.py
@@ -1,4 +1,6 @@
 import os
+import time
+
 from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer
 from ConfigSpace.conditions import InCondition
 import py_entitymatching as em
@@ -37,7 +39,7 @@ selected_attrs = selected_ltable.columns.values.tolist()  # fields in the two tables
 ########################################################################################################################
 
 
-def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int,
+def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, couple_number: int,
                         test_proportion: float) -> dict:
     new_df = df.reset_index(drop=False, inplace=False)
     gold = new_df[labeled_attr]
@@ -61,7 +63,7 @@ def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str
     precision = 0.0 if precision_denominator == 0.0 else num_true_positives / precision_denominator
     recall = 0.0 if recall_denominator == 0.0 else num_true_positives / recall_denominator
     F1 = 0.0 if precision == 0.0 and recall == 0.0 else (2.0 * precision * recall) / (precision + recall)
-    my_recall = num_true_positives / (matching_number * test_proportion)
+    my_recall = num_true_positives / (couple_number * test_proportion)
 
     return {"precision": precision, "recall": recall, "F1": F1, "my_recall": my_recall}
 
@@ -88,13 +90,13 @@ def load_mds(paths: list) -> list:
     return all_mds
 
 
-def is_explicable(row, all_mds: list) -> bool:
+def is_explicable(line, all_mds: list) -> bool:
     attrs = all_mds[0].keys()  # read all fields from the first md
     for md in all_mds:
         explicable = True  # assume this md can explain the current tuple
         for a in attrs:
             threshold = md[a]
-            if my_Levenshtein_ratio(str(getattr(row, 'ltable_' + a)), str(getattr(row, 'rtable_' + a))) < threshold:
+            if my_Levenshtein_ratio(str(getattr(line, 'ltable_' + a)), str(getattr(line, 'rtable_' + a))) < threshold:
                 explicable = False  # if any field's similarity misses its threshold, this md cannot explain the tuple
                 break  # skip the remaining thresholds of this md and move on to the next md
         if explicable:
@@ -122,7 +124,8 @@ class Classifier:
 
     # train is the whole pipeline; only the return value changes from the predictions to an evaluation of them
     def train(self, config: Configuration, seed: int = 0) -> float:
-
+        # print(f"BRAINFUCK:{config.values()}")
+        cm.del_catalog()
         attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs]  # field names with the left-table prefix
         attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs]  # field names with the right-table prefix
         cm.set_key(selected_ltable, tables_id)
@@ -131,28 +134,28 @@ class Classifier:
             blocker = em.OverlapBlocker()
             candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
                                              l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
-                                             overlap_size=config["overlap_size"], show_progress=False, n_jobs=-1)
+                                             overlap_size=config["overlap_size"], show_progress=False)
         elif config["ml_blocker"] == "attr_equiv":
             blocker = em.AttrEquivalenceBlocker()
             candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
-                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs, n_jobs=-1)
+                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs)
 
         candidate['gold'] = 0
 
         candidate_match_rows = []
-        for index, row in candidate.iterrows():
-            l_id = row['ltable_' + tables_id]
+        for index_num, line in candidate.iterrows():
+            l_id = line['ltable_' + tables_id]
             map_row = mappings[mappings[mapping_lid] == l_id]
 
             if map_row is not None:
                 r_id = map_row[mapping_rid]
                 for value in r_id:
-                    if value == row['rtable_' + tables_id]:
-                        candidate_match_rows.append(row["_id"])
+                    if value == line['rtable_' + tables_id]:
+                        candidate_match_rows.append(line["_id"])
                     else:
                         continue
-        for row in candidate_match_rows:
-            candidate.loc[row, 'gold'] = 1
+        for line in candidate_match_rows:
+            candidate.loc[line, 'gold'] = 1
 
         # down-sample the negatives so that positive and negative samples stay balanced
         candidate_mismatch = candidate[candidate['gold'] == 0]
@@ -243,12 +246,12 @@ class Classifier:
         nepl_mismatch = 0  # not explicable, predicted as mismatch
         md_list = load_mds(md_paths)  # read all mds from the global variable
         if len(md_list) > 0:
-            for row in predictions.itertuples():
-                if is_explicable(row, md_list):
-                    if getattr(row, 'predicted') == 1:
+            for line in predictions.itertuples():
+                if is_explicable(line, md_list):
+                    if getattr(line, 'predicted') == 1:
                         epl_match += 1
                 else:
-                    if getattr(row, 'predicted') == 0:
+                    if getattr(line, 'predicted') == 0:
                         nepl_mismatch += 1
         interpretability = (epl_match + nepl_mismatch) / len(predictions)  # interpretability
         # if indicators["my_recall"] >= 0.8:
@@ -270,6 +273,7 @@ def ml_er_hpo():
         classifier.configspace,
         deterministic=True,
         n_trials=10,  # we want to run at most 10 trials (combinations of config and seed)
+        n_workers=2
     )
 
     initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
@@ -291,6 +295,6 @@ def ml_er_hpo():
     # Let's calculate the cost of the incumbent
     incumbent_cost = smac.validate(incumbent)
     print(f"Incumbent cost: {incumbent_cost}")
-    print(f"Configuration:{incumbent.values()}")
+    print(f"Optimized_Configuration:{incumbent.values()}")
 
     return incumbent
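
Note on the renamed parameter: in evaluate_prediction(), my_recall = num_true_positives / (couple_number * test_proportion) reads the second argument as the total number of true matching pairs (couples) in the whole dataset, so the denominator is the number of true pairs expected to fall into the test split. For example, with couple_number = 1000, test_proportion = 0.25 and 200 true positives, my_recall = 200 / (1000 * 0.25) = 0.8. This reading is inferred from the formula itself and is not stated in the diff.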
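
For reference, a minimal sketch of how the pieces touched by the last two hunks of ml_er_hpo() fit together. It assumes SMAC3's standard API (smac >= 2.0); the facade construction itself is not visible in the diff, so that part is reconstructed and may differ from the actual code:

# Sketch of the HPO driver in ml_er_hpo(); Classifier is the class defined above in er_model_hpo.py.
from smac import HyperparameterOptimizationFacade, Scenario

classifier = Classifier()

scenario = Scenario(
    classifier.configspace,   # ConfigurationSpace holding block_attr, overlap_size, ml_blocker, ...
    deterministic=True,       # the seed is fixed, so each configuration is evaluated once
    n_trials=10,              # budget: at most 10 configurations
    n_workers=2,              # added in this diff: evaluate two trials in parallel
)

initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)

# The facade call below is reconstructed (assumption): it wires the scenario,
# the Classifier.train target function and the initial design together.
smac = HyperparameterOptimizationFacade(
    scenario,
    classifier.train,         # returns the cost to minimize
    initial_design=initial_design,
)

incumbent = smac.optimize()                 # best configuration found within the budget
incumbent_cost = smac.validate(incumbent)   # re-evaluate the incumbent to get its cost
print(f"Incumbent cost: {incumbent_cost}")
print(f"Optimized_Configuration:{incumbent.values()}")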