HuangJintao committed 1 year ago
parent 27795aac1d
commit c349768eaf

@@ -2,10 +2,18 @@
from md_discovery.script.md_discover import md_discover
# todo: the magellan ER module should read either the initial configuration or the HPO configuration
# todo: automate the calls between modules
# entry point -> ER / HPO -> ER
ltable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amazon.csv'
rtable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/GoogleProducts.csv'
mapping_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv'
mapping_lid = 'idAmazon'  # name of the left-table id column in the mapping table
mapping_rid = 'idGoogleBase'  # name of the right-table id column in the mapping table
ltable_id = 'id'  # name of the id column in the left table
rtable_id = 'id'  # name of the id column in the right table
target_attr = 'id'  # target attribute for MD mining
lr_attrs_map = {'title': 'name'}  # if corresponding columns have different names in the two tables, list them here so they can be renamed consistently
similarity_threshold = 0.7
confidence_threshold = 0.8
interpretability_weight = 0.3
def run(l_table_path, r_table_path, mapping_path):
# while the termination condition is not met:
@@ -17,18 +25,5 @@ def run(l_table_path, r_table_path, mapping_path):
if __name__ == '__main__':
# todo: reading these values via input() is inconvenient, so input() is not used
# 7. distance metric
ltable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amazon.csv'
rtable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/GoogleProducts.csv'
mapping_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv'
mapping_lid = 'idAmazon'  # name of the left-table id column in the mapping table
mapping_rid = 'idGoogleBase'  # name of the right-table id column in the mapping table
ltable_id = 'id'  # name of the id column in the left table
rtable_id = 'id'  # name of the id column in the right table
target_attr = 'id'  # target attribute for MD mining
lr_attrs_map = {'title': 'name'}  # if corresponding columns have different names in the two tables, list them here so they can be renamed consistently
similarity_threshold = 0.7
confidence_threshold = 0.8
interpretability_weight = 0.3
# todo: should the distance metric be user-configurable?
print(ltable_path)
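# ---------------------------------------------------------------------------
# Illustrative sketch only: one way run() above could wire the modules together
# as the todos describe (entry point -> ER -> MD discovery -> HPO -> ER).
# md_discover() is imported above and ml_er() exists in this repo's ml_er module
# (not imported here); hpo_search() and the termination test are hypothetical
# placeholders, not calls confirmed by this commit.
#
#   config = None                           # start from the default configuration
#   while not termination_condition_met():  # hypothetical stopping criterion
#       ml_er(config)                       # run matching with the current config
#       md_discover()                       # mine MDs from the latest predictions
#       config = hpo_search()               # hypothetical: pick the next config via SMAC
# ---------------------------------------------------------------------------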

@@ -1,20 +1,50 @@
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
from ConfigSpace.conditions import InCondition
import py_entitymatching as em
import py_entitymatching.catalog.catalog_manager as cm
import pandas as pd
from py_entitymatching.blocker.blocker import Blocker
from py_entitymatching.matcher.mlmatcher import MLMatcher
from smac import HyperparameterOptimizationFacade, Scenario
from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio
from entrance import *
# todo: the distance metric should be user-configurable
# global variable: after each iteration, clear the list and add the new MD paths
# todo:
# default path is "../md_discovery/output/xxx.txt"
# true-positive / false-negative, mds / vio: 4 MD files in total
# data is loaded here, at module level
########################################################################################################################
ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
cm.set_key(ltable, ltable_id)
ltable.fillna("", inplace=True)
rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
cm.set_key(rtable, rtable_id)
rtable.fillna("", inplace=True)
mappings = pd.read_csv(mapping_path)
lid_mapping_list = []
rid_mapping_list = []
# convert all values to strings
ltable = ltable.astype(str)
rtable = rtable.astype(str)
mappings = mappings.astype(str)
matching_number = len(mappings)  # total number of positive samples (should be 1300 for the product dataset)
for index, row in mappings.iterrows():
lid_mapping_list.append(row[mapping_lid])
rid_mapping_list.append(row[mapping_rid])
# keep only the rows of both tables that appear in the mapping table, to increase the proportion of positive samples
selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # rename left-table columns that differ in name from their right-table counterparts, using the right table as the reference
selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
selected_attrs = selected_ltable.columns.values.tolist()  # column names shared by both tables
attrs_with_l_prefix = ['ltable_'+i for i in selected_attrs]
attrs_with_r_prefix = ['rtable_'+i for i in selected_attrs]
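# For the Amazon-GoogleProducts tables loaded above (columns: id, title/name,
# description, manufacturer, price), after the lr_attrs_map rename this gives,
# for example, attrs_with_l_prefix == ['ltable_id', 'ltable_name',
# 'ltable_description', 'ltable_manufacturer', 'ltable_price'] (illustrative).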
cm.set_key(selected_ltable, ltable_id)
cm.set_key(selected_rtable, rtable_id)
########################################################################################################################
def test_test():
block_attr_items = selected_attrs[:]
block_attr_items.remove(rtable_id)
print(block_attr_items)
def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int,
@@ -80,54 +110,27 @@ def is_explicable(row, all_mds: list) -> bool:
return False  # traversal finished without a matching MD, so the row cannot be explained
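# Hedged sketch of the per-MD check that is_explicable() above iterates over.
# It assumes each mined MD is a dict mapping a (non-prefixed) attribute name to
# a minimum similarity threshold; that representation is an assumption for
# illustration, not necessarily the format written by md_discover.
def _row_satisfies_md(row, md: dict) -> bool:
    for attr, min_sim in md.items():
        if attr == target_attr:
            continue  # the target attribute is the consequent, not a premise
        sim = my_Levenshtein_ratio(str(row['ltable_' + attr]), str(row['rtable_' + attr]))
        if sim < min_sim:
            return False  # a premise is violated, so this MD does not apply
    return True  # every premise holds, so this MD explains the prediction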
class SVM:
class Classifier:
@property
def configspace(self) -> ConfigurationSpace:
# Build Configuration Space which defines all parameters and their ranges
cs = ConfigurationSpace(seed=0)
# todo
# block_attr is no longer hard-coded
block_attr = Categorical("block_attr", ["name", "description", "manufacturer", "price"], default="title")
block_attr_items = selected_attrs[:]
block_attr_items.remove(rtable_id)
block_attr = Categorical("block_attr", block_attr_items)
overlap_size = Integer("overlap_size", (1, 3), default=1)
ml_matcher = Categorical("ml_matcher", ["dt", "svm", "rf", "lg", "ln", "nb"], default="rf")
ml_blocker = Categorical("ml_blocker", ["over_lap", "attr_equiv"], default="over_lap")
# todo: other tunable parameters (e.g. dropping a column from the feature table)
use_overlap_size = InCondition(child=overlap_size, parent=ml_blocker, values=["over_lap"])
cs.add_hyperparameters([block_attr, overlap_size, ml_matcher, ml_blocker])
cs.add_conditions([use_overlap_size])
return cs
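# Illustration (not part of the pipeline): sampling this space with the standard
# ConfigSpace API shows what a candidate configuration looks like; overlap_size
# is only present when ml_blocker == "over_lap" because of the InCondition above.
#
#   cs = Classifier().configspace
#   print(cs.sample_configuration())
#   # e.g. block_attr='name', ml_blocker='over_lap', ml_matcher='rf',
#   #      overlap_size=2   (values are illustrative)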
# train runs the whole pipeline; only the return value changes from the predictions themselves to an evaluation of those predictions
def train(self, config: Configuration) -> float:
ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
cm.set_key(ltable, ltable_id)
ltable.fillna("", inplace=True)
rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
cm.set_key(rtable, rtable_id)
rtable.fillna("", inplace=True)
mappings = pd.read_csv(mapping_path)
# keep only the rows of both tables that appear in the mapping table, to increase the proportion of positive samples
lid_mapping_list = []
rid_mapping_list = []
# convert all values to strings
ltable = ltable.astype(str)
rtable = rtable.astype(str)
mappings = mappings.astype(str)
matching_number = len(mappings)  # total number of positive samples (should be 1300 for the product dataset)
for index, row in mappings.iterrows():
lid_mapping_list.append(row[mapping_lid])
rid_mapping_list.append(row[mapping_rid])
selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # rename left-table columns that differ in name from their right-table counterparts, using the right table as the reference
selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
selected_attrs = selected_ltable.columns.values.tolist()  # column names shared by both tables
attrs_with_l_prefix = ['ltable_'+i for i in selected_attrs]
attrs_with_r_prefix = ['rtable_'+i for i in selected_attrs]
cm.set_key(selected_ltable, ltable_id)
cm.set_key(selected_rtable, rtable_id)
blocker = None
def train(self, config: Configuration, seed: int = 0) -> float:
if config["ml_blocker"] == "over_lap":
blocker = em.OverlapBlocker()
candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
@@ -195,6 +198,7 @@ class SVM:
attrs_after=['gold'],
show_progress=False)
# todo: decouple the attribute names
test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
attrs_after=['ltable_title', 'ltable_description', 'ltable_manufacturer',
'ltable_price', 'rtable_name', 'rtable_description',
@@ -225,6 +229,8 @@ class SVM:
predictions_attrs.extend(['gold', 'predicted'])
predictions = predictions[predictions_attrs]
# default path is "../md_discovery/output/xxx.txt"
# true-positive / false-negative, mds / vio: 4 MD files in total
md_paths = ['../md_discovery/output/tp_mds.txt', '../md_discovery/output/tp_vio.txt',
'../md_discovery/output/fn_mds.txt', '../md_discovery/output/fn_vio.txt']
epl_match = 0  # count of explicable predictions labeled as match
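# Hedged note on the objective: a natural way to fold counts such as epl_match
# into the scalar SMAC minimizes is a weighted mix of matching quality and
# explicability, e.g.
#   performance = interpretability_weight * (epl_match / max(1, len(predictions))) \
#                 + (1 - interpretability_weight) * f1
#   return 1 - performance
# Only interpretability_weight comes from entrance.py; the formula itself is an
# illustrative assumption, not necessarily what this commit computes.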
@@ -244,7 +250,7 @@ class SVM:
if __name__ == "__main__":
classifier = SVM()
classifier = Classifier()
# Next, we create an object, holding general information about the run
scenario = Scenario(

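# Hedged sketch of how the truncated __main__ above typically continues with the
# SMAC3 facade imported at the top of this file; the concrete arguments below
# (deterministic, n_trials) are illustrative assumptions, not this repo's values.
#
#   scenario = Scenario(classifier.configspace, deterministic=True, n_trials=50)
#   smac = HyperparameterOptimizationFacade(scenario, classifier.train)
#   incumbent = smac.optimize()   # best configuration found within the budget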
@@ -141,7 +141,10 @@ def ml_er(config: Configuration):
# todo:
# if config is not None -> load configs
# else use default configs
# 1. overlap_attr
# 1. block_attr
# 2. overlap_size
# 3. ml_matcher
# 4. ml_blocker
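# Hedged sketch of the config handling the todo above asks for; the fallback
# defaults below are illustrative assumptions, not values chosen by this repo.
if config is not None:
    block_attr = config["block_attr"]
    ml_matcher = config["ml_matcher"]
    ml_blocker = config["ml_blocker"]
    overlap_size = config["overlap_size"] if ml_blocker == "over_lap" else None
else:
    block_attr, ml_matcher, ml_blocker, overlap_size = 'name', 'rf', 'over_lap', 1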
ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
cm.set_key(ltable, ltable_id)
ltable.fillna("", inplace=True)
@@ -172,8 +175,6 @@ def ml_er(config: Configuration):
cm.set_key(selected_ltable, ltable_id)
cm.set_key(selected_rtable, rtable_id)
# todo: all tunable parameters should be read from the configuration and have default values
# run blocking and label gold as 0
blocker = em.OverlapBlocker()
candidate = blocker.block_tables(selected_ltable, selected_rtable, 'name', 'name',
l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
