画图

2 years ago · 4f61d93a30
parent 3d10e93bdf
commit 4f61d93a30
4 changed files with 107 additions and 48 deletions
--- a/draw/draw_confidence_histogram.py
+++ b/draw/draw_confidence_histogram.py
@ -7,48 +7,50 @@ from pyecharts.globals import ThemeType
 if __name__ == '__main__':
    outcome_dir = r'E:\Data\Research\Outcome'
-    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens+inter-0.5'
+    inter_list = ['0', '0.5', '0.7', '0.9', '1']
    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens+inter-'
    datasets_list = os.listdir(outcome_dir)
    for _ in datasets_list:
-        path = outcome_dir + rf'\{_}' + configs_dir
+        for inter in inter_list:
-        statistics_files = os.listdir(path)
+            path = outcome_dir + rf'\{_}' + configs_dir + inter
-        length = 0
+            statistics_files = os.listdir(path)
-        for file in statistics_files:
+            length = 0
-            if file.startswith('predictions'):
+            for file in statistics_files:
-                preds = pd.read_csv(path + rf'\{file}', encoding='ISO-8859-1')
+                if file.startswith('predictions'):
-                preds = preds[['predicted', 'confidence']]
+                    preds = pd.read_csv(path + rf'\{file}', encoding='ISO-8859-1')
-                preds = preds.astype(float)
+                    preds = preds[['predicted', 'confidence']]
-                preds = preds[preds['predicted'] == 1.0]
+                    preds = preds.astype(float)
-                length = len(preds)
+                    preds = preds[preds['predicted'] == 1.0]
-                li = []
+                    length = len(preds)
-                zeros = len(preds[preds['confidence'] == 0])
+                    li = []
-                dot_02 = len(preds[(preds['confidence'] > 0) & (preds['confidence'] <= 0.2)])
+                    zeros = len(preds[preds['confidence'] == 0])
-                dot_24 = len(preds[(preds['confidence'] > 0.2) & (preds['confidence'] <= 0.4)])
+                    dot_02 = len(preds[(preds['confidence'] > 0) & (preds['confidence'] <= 0.2)])
-                dot_46 = len(preds[(preds['confidence'] > 0.4) & (preds['confidence'] <= 0.6)])
+                    dot_24 = len(preds[(preds['confidence'] > 0.2) & (preds['confidence'] <= 0.4)])
-                dot_68 = len(preds[(preds['confidence'] > 0.6) & (preds['confidence'] <= 0.8)])
+                    dot_46 = len(preds[(preds['confidence'] > 0.4) & (preds['confidence'] <= 0.6)])
-                dot_80 = len(preds[(preds['confidence'] > 0.8) & (preds['confidence'] <= 1.0)])
+                    dot_68 = len(preds[(preds['confidence'] > 0.6) & (preds['confidence'] <= 0.8)])
-                for number in [zeros, dot_02, dot_24, dot_46, dot_68, dot_80]:
+                    dot_80 = len(preds[(preds['confidence'] > 0.8) & (preds['confidence'] <= 1.0)])
-                    li.append(round(number * 100 / length, ndigits=3))
+                    for number in [zeros, dot_02, dot_24, dot_46, dot_68, dot_80]:
                        li.append(round(number * 100 / length, ndigits=3))
-                c = (
+                    c = (
-                    Bar(init_opts=opts.InitOpts(theme=ThemeType.WALDEN))
+                        Bar(init_opts=opts.InitOpts(theme=ThemeType.WALDEN))
-                    .add_xaxis(['conf=0', '0<conf≤0.2', '0.2<conf≤0.4', '0.4<conf≤0.6', '0.6<conf≤0.8', '0.8<conf≤1'])
+                        .add_xaxis(['conf=0', '0<conf≤0.2', '0.2<conf≤0.4', '0.4<conf≤0.6', '0.6<conf≤0.8', '0.8<conf≤1'])
-                    .add_yaxis(_, li, category_gap=2)
+                        .add_yaxis(_, li, category_gap=2)
-                    .set_global_opts(
+                        .set_global_opts(
-                        yaxis_opts=opts.AxisOpts(
+                            yaxis_opts=opts.AxisOpts(
-                            name="Proportion",
+                                name="Proportion",
-                            type_="value",
+                                type_="value",
-                            min_=0,
+                                min_=0,
-                            max_=100,
+                                max_=100,
-                            position="left",
+                                position="left",
-                            axisline_opts=opts.AxisLineOpts(
+                                axisline_opts=opts.AxisLineOpts(
-                                linestyle_opts=opts.LineStyleOpts()
+                                    linestyle_opts=opts.LineStyleOpts()
                                ),
                                axislabel_opts=opts.LabelOpts(formatter="{value}%"),
                            ),
-                            axislabel_opts=opts.LabelOpts(formatter="{value}%"),
+                            title_opts=opts.TitleOpts(title="Confidence Histogram"),
-                        ),
+                            xaxis_opts=opts.AxisOpts(name="Intervals")
-                        title_opts=opts.TitleOpts(title="Confidence Histogram"),
+                        )
-                        xaxis_opts=opts.AxisOpts(name="Intervals")
+                        .render(path + r"\confidence_histogram.html")
                    )
                    .render(path + r"\confidence_histogram.html")
                )
--- a/draw/draw_inter_weight_lines.py
+++ b/draw/draw_inter_weight_lines.py
@ -0,0 +1,51 @@
 import os
 import pyecharts.options as opts
 from pyecharts.charts import Line
 from pyecharts.faker import Faker
 if __name__ == '__main__':
    # For every dataset directory under outcome_dir, plot block recall, F1 and
    # interpretability against the interpretability weight of each run, and
    # write the chart to <dataset>\inter_weight_lines.html.
    outcome_dir = r'E:\Data\Research\Outcome'
    inter_list = ['0', '0.5', '0.7', '0.9', '1']
    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens+inter-'
    for dataset in os.listdir(outcome_dir):
        block_recall_list = []
        f1 = []
        interpretability = []
        for inter in inter_list:
            path = outcome_dir + rf'\{dataset}' + configs_dir + inter
            try:
                statistics_files = os.listdir(path)
            except (FileNotFoundError, NotADirectoryError):
                # The run for this weight does not exist (or `dataset` is a
                # stray file rather than a directory): treat it as "no data"
                # instead of aborting the whole batch.
                statistics_files = []
            # Metrics parsed for this weight; a later eval_result file
            # overrides an earlier one.
            metrics = {}
            for file_name in statistics_files:
                if not file_name.startswith('eval_result'):
                    continue
                with open(path + rf'\{file_name}', 'r') as f:
                    for line in f:
                        if line.startswith('block_recall'):
                            metrics['block_recall'] = float(line.split(':')[1])
                        elif line.startswith('F1'):
                            # The third space-separated token is a percentage
                            # (e.g. "87.5%"); convert it to a 0-1 fraction.
                            percent = line.split(' ')[2].replace('%', '')
                            metrics['f1'] = float(percent) / 100
                        elif line.startswith('interpretability'):
                            metrics['interpretability'] = float(line.split(':')[1])
            # Append exactly one entry per weight to every series so the
            # series stay aligned with inter_list; None marks a missing point.
            for series, key in (
                (block_recall_list, 'block_recall'),
                (f1, 'f1'),
                (interpretability, 'interpretability'),
            ):
                value = metrics.get(key)
                series.append(None if value is None else round(value, ndigits=3))
        if all(v is None for v in block_recall_list + f1 + interpretability):
            # Nothing was parsed for this entry, so there is nothing to draw.
            continue
        # The first point (weight '0') is always plotted as 0, whatever value
        # was read from the result file.
        interpretability[0] = 0
        (
            Line()
            .add_xaxis(inter_list)
            .add_yaxis("Block Recall", block_recall_list)
            .add_yaxis("F1", f1)
            .add_yaxis("Interpretability", interpretability)
            .set_global_opts(
                title_opts=opts.TitleOpts(title=dataset),
                xaxis_opts=opts.AxisOpts(name="Interpretability Weight", name_location="middle", name_gap=25)
            )
            .render(outcome_dir + rf'\{dataset}\inter_weight_lines.html')
        )
--- a/settings.py
+++ b/settings.py
@ -1,17 +1,17 @@
 from sentence_transformers import SentenceTransformer
-ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableA.csv'
+ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Walmart-Amazon_dirty\tableA.csv'
-rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableB.csv'
+rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Walmart-Amazon_dirty\tableB.csv'
-mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\matches.csv'
+mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Walmart-Amazon_dirty\matches.csv'
-mapping_lid = 'idAbt'  # mapping表中左表id名
+mapping_lid = 'ltable_id'  # mapping表中左表id名
-mapping_rid = 'idBuy'  # mapping表中右表id名
+mapping_rid = 'rtable_id'  # mapping表中右表id名
 ltable_id = 'id'  # 左表id字段名称
 rtable_id = 'id'  # 右表id字段名称
 target_attr = 'id'  # 进行md挖掘时的目标字段
 # lr_attrs_map = {}  # 如果两个表中存在对应字段名称不一样的情况，将名称加入列表便于调整一致
 model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens')
-interpre_weight = 0.9  # 可解释性权重
+interpre_weight = 1  # 可解释性权重
 similarity_threshold = 0.1
 support_threshold = 1
 confidence_threshold = 0.25
--- a/table_embedding.py
+++ b/table_embedding.py
@ -1,7 +1,13 @@
 import pandas as pd
 from sentence_transformers import SentenceTransformer
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTableQuestionAnswering
 if __name__ == '__main__':
-    model = SentenceTransformer('E:\\Data\\Research\\Models\\tapex-base-finetuned-wikisql')
+    path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-ACM\tableA.csv'
-    sentences = ['公积金转入深圳', '公积金转出深圳']
+    table = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
-    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
+
    tokenizer = AutoTokenizer.from_pretrained(r'E:\Data\Research\Models\tapas-large-finetuned-wtq')
    model = AutoModelForTableQuestionAnswering.from_pretrained(r'E:\Data\Research\Models\tapas-large-finetuned-wtq')
    encoding = tokenizer(table, return_tensors="pt")
    embedding = tokenizer.encode(table, return_tensors="pt")
    print(1)