diff --git a/bert.py b/bert.py new file mode 100644 index 0000000..197ee7b --- /dev/null +++ b/bert.py @@ -0,0 +1,27 @@ +import csv + +import pandas as pd +import json +import sentence_transformers.util +import torch +from sentence_transformers import SentenceTransformer +from torch import nn + +if __name__ == '__main__': + # model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens') + # sentences = ['公积金转入深圳', '公积金转出深圳', None, None, 114514, 114514, 1919810] + # embedding = model.encode(sentences, device='cuda') + # outcome1 = sentence_transformers.util.cos_sim(embedding[4], embedding[5]) + # outcome2 = sentence_transformers.util.cos_sim(embedding[4], embedding[6]) + # print(outcome1.item()) + # print(outcome2.item()) + train = pd.read_csv(r'E:\Data\Research\Datasets\DeepMatcher dataset\Dirty\Walmart-Amazon\train.csv', encoding='ISO-8859-1') + valid = pd.read_csv(r'E:\Data\Research\Datasets\DeepMatcher dataset\Dirty\Walmart-Amazon\valid.csv', encoding='ISO-8859-1') + test = pd.read_csv(r'E:\Data\Research\Datasets\DeepMatcher dataset\Dirty\Walmart-Amazon\test.csv', encoding='ISO-8859-1') + train = train[train['label'] == 1] + valid = valid[valid['label'] == 1] + test = test[test['label'] == 1] + matches = pd.concat([train, valid, test]) + matches.drop(columns=['label'], inplace=True) + matches = matches.sort_values(by='ltable_id') + matches.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\Walmart-Amazon_dirty\matches.csv', sep=',', index=False, header=True) diff --git a/datasets/Walmart-Amazon_dirty/matches.csv b/datasets/Walmart-Amazon_dirty/matches.csv index 2110b54..1ab7285 100644 --- a/datasets/Walmart-Amazon_dirty/matches.csv +++ b/datasets/Walmart-Amazon_dirty/matches.csv @@ -1,1155 +1,963 @@ -id1,id2 -395,1768 -1234,12408 -1341,8497 -1093,13748 -1719,12857 -596,14870 -1906,7801 -595,20844 -1785,11404 -2104,2964 -723,12996 -469,20646 -475,18530 -2156,4510 -1973,3591 -1535,1091 -2508,5039 -932,6452 -524,7631 -1559,4509 -1267,4233 -2122,4433 -538,791 -1386,17537 -1467,13593 -1844,9692 -2244,482 -1154,13246 -1923,20241 -2344,4785 -2041,9551 -1642,11045 -2333,18201 -1479,1767 -1965,14804 -1502,5286 -1440,19655 -2325,20694 -1418,7249 -2141,6966 -314,8871 -425,8535 -2107,1254 -1611,10451 -247,19032 -89,3270 -1316,7784 -499,5578 -1629,6289 -1194,4494 -331,16953 -640,7372 -1008,11117 -2202,17199 -1146,4788 -2409,6809 -1397,2439 -2197,19485 -604,21732 -1937,18141 -2230,14756 -2220,246 -318,1083 -247,13590 -1316,15123 -2232,17840 -1108,12701 -710,1097 -89,18981 -786,10846 -2089,22026 -1317,4507 -1950,17303 -1705,16232 -337,20175 -195,20940 -2015,11741 -2210,17163 -1698,16579 -1695,20398 -2095,15248 -1704,17702 -432,18465 -854,6748 -1721,20523 -2417,5900 -236,21784 -726,16801 -397,7150 -1209,18171 -222,16625 -541,9384 -1164,6863 -2093,13768 -1696,17275 -1288,15034 -1335,8619 -34,11684 -1885,17579 -1529,16917 -1935,9579 -837,8740 -1209,3255 -1719,8846 -1652,13805 -2406,18333 -1281,7638 -2483,19120 -863,5783 -92,17688 -865,11895 -348,14459 -740,11688 -484,9131 -1002,6639 -560,20714 -532,19818 -322,20698 -2365,16367 -1427,10693 -551,19540 -81,6527 -2018,16794 -320,21484 -2354,13477 -1125,15648 -115,9224 -1238,6975 -109,12442 -2005,19521 -1791,19801 -2379,11831 -482,14854 -35,3068 -1483,8567 -1523,21030 -464,11441 -585,966 -1023,12117 -2528,13283 -562,18153 -1143,14278 -646,6761 -1608,13173 -2482,6886 -2408,12312 -1909,20112 -1695,11027 -16,4283 -377,9390 -927,4427 -2517,20310 -1109,10673 -455,1892 -1371,5173 -1578,13541 -2409,5519 -2520,2669 -1263,12735 -1500,4638 -2370,20053 -2441,6644 -1928,19027 -1749,17508 -1360,1904 -206,10518 -858,7348 -1861,5561 -503,11436 -1803,14925 -477,2718 -2543,6691 -150,8944 -1775,18403 -1039,5011 -1110,6058 -1031,10556 -2156,17865 -2015,9457 -1039,19036 -2532,19977 -40,10873 -1844,4901 -2148,2888 -921,3247 -743,8236 -1618,2749 -586,14868 -1300,14224 -1883,10855 -2014,9227 -66,7157 -481,377 -2240,327 -1012,13414 -612,8430 -1178,20812 -192,1244 -545,13665 -462,18390 -738,6760 -543,13160 -1218,5250 -1856,12475 -515,14882 -70,14642 -87,7611 -1892,15305 -2512,21504 -262,5193 -976,13423 -2057,3068 -2200,18139 -2361,7250 -2191,2311 -1682,13298 -194,19881 -1219,5268 -8,1304 -21,16838 -1064,3146 -100,4521 -2267,1617 -1150,9482 -1505,4818 -862,11943 -1718,17262 -2316,9549 -636,7146 -4,4379 -1530,1442 -2217,17495 -2134,8991 -448,13175 -79,12348 -1071,18621 -1770,5883 -1135,8790 -202,6829 -1316,21113 -1401,14944 -811,1481 -2509,11583 -65,8392 -416,17244 -54,17893 -1067,8298 -556,19010 -830,8898 -108,12091 -2211,16561 -1149,16757 -984,18365 -2245,595 -1152,18331 -30,6032 -95,9052 -742,11538 -1237,7616 -376,5042 -1727,18464 -1864,21326 -1395,19581 -87,11618 -261,3778 -2516,4425 -158,16206 -2539,2280 -2410,11715 -1094,5463 -661,10274 -2117,14496 -829,10888 -842,7504 -1657,14170 -1631,10852 -1902,10883 -2217,20676 -2125,11565 -1739,20133 -1945,829 -2227,8671 -42,6817 -2112,3378 -1199,14824 -2472,19284 -1773,2223 -2072,8815 -650,1428 -1066,10601 -497,4497 -2070,21449 -1184,11289 -2296,21435 -2540,10706 -876,4227 -2022,17375 -2026,2699 -1549,9246 -1981,4279 -1894,11121 -1828,2579 -1622,16059 -873,15269 -1633,20071 -918,17318 -590,16954 -1000,4924 -1954,11439 -449,13174 -1210,14110 -1610,2510 -2256,14016 -2027,3204 -1462,3781 -1512,4520 -1321,885 -945,14178 -149,16351 -2193,3145 -1074,2523 -1414,2789 -626,18444 -700,4284 -933,2814 -173,1153 -195,6877 -975,19754 -766,15416 -504,10207 -1415,2486 -1834,9343 -1082,21588 -819,980 -2098,21458 -1120,3472 -2365,17916 -1350,12192 -1985,1121 -2187,13584 -872,20289 -2345,6971 -721,13399 -1072,16127 -1440,1637 -91,21918 -470,19258 -2246,3567 -70,21810 -706,16537 -66,11130 -700,3857 -2073,4160 -600,557 -529,21348 -2413,7234 -1603,19808 -1037,10881 -1734,21813 -833,4598 -2094,16312 -86,9497 -2108,8359 -1211,1325 -121,14796 -216,6602 -1933,9096 -1363,17389 -821,6299 -2038,21544 -599,3627 -2477,6956 -1366,18373 -2034,18680 -2469,10814 -1269,4381 -2089,17264 -55,6573 -1596,13391 -1910,11285 -2497,12025 -185,2960 -417,19476 -2423,8664 -145,7971 -2053,6287 -1851,17489 -2238,1697 -414,285 -1474,3251 -1940,20178 -1249,16299 -2142,11049 -1969,10525 -303,16808 -2124,1795 -888,12784 -951,15943 -1030,16792 -390,10544 -1326,10508 -228,2203 -1253,9636 -2524,10366 -1318,20507 -785,12734 -1348,12432 -804,4122 -2260,14835 -531,18532 -1084,20645 -936,14801 -2502,4363 -160,5665 -770,6363 -2225,14687 -2140,102 -338,6116 -578,19368 -2218,19468 -812,454 -2274,5256 -1989,2477 -679,8963 -1485,15559 -470,2721 -1658,16333 -94,8351 -1543,10128 -962,14318 -2060,5735 -20,17430 -2040,7444 -1142,9254 -1863,15435 -1247,17192 -1869,4495 -760,8382 -726,16278 -223,17535 -2550,19120 -1667,6195 -903,6662 -1550,18949 -724,1690 -2150,4785 -904,16990 -282,7825 -383,12583 -1469,2294 -226,12628 -1452,4720 -1910,20494 -323,9991 -1179,18154 -325,8520 -2419,7680 -1592,3206 -548,9296 -174,1250 -1964,6962 -1209,16936 -361,19151 -2463,8176 -2137,19007 -785,21197 -1814,1608 -1814,3254 -16,17365 -513,9061 -1642,15010 -379,21691 -1339,17544 -578,21737 -512,1393 -2184,10974 -2031,13168 -219,5047 -1973,3594 -476,20393 -1476,8651 -1356,560 -899,7384 -2038,8520 -782,17484 -1368,12147 -1603,10609 -152,692 -811,11429 -1691,4227 -2253,16784 -1513,20845 -311,4721 -1687,20039 -667,12515 -899,11344 -1717,17397 -1138,4430 -1436,7325 -817,11999 -708,11830 -1687,17666 -2479,16863 -2168,14881 -992,12313 -2077,18298 -1852,13349 -2237,1663 -1792,19539 -653,7446 -1876,9387 -1036,21948 -1762,4577 -2152,7339 -82,17195 -1201,160 -2544,2534 -2081,16296 -2019,11228 -546,7625 -1559,12853 -1167,3904 -232,10437 -2436,20420 -1939,19139 -2268,2528 -399,16515 -1476,12655 -1105,1398 -863,11819 -1110,8092 -1458,8347 -2185,11690 -1467,15576 -148,21193 -1529,812 -1467,11642 -849,3154 -1838,13678 -2397,18102 -524,3627 -59,21240 -1778,3751 -1759,13846 -1220,14407 -453,16646 -772,5518 -1278,1881 -1933,9017 -971,6931 -165,18938 -781,14279 -2102,15844 -346,8387 -2164,8470 -121,13523 -2357,5528 -192,11198 -1010,781 -400,714 -708,8043 -1953,1770 -2118,14558 -2200,20396 -616,17520 -1299,8456 -2382,3483 -1685,17949 -798,6509 -1886,2011 -1306,2559 -1878,2928 -1616,12974 -2097,2975 -1551,11327 -624,14736 -501,12545 -1075,20741 -1842,2327 -2090,7049 -1977,12094 -26,6985 -1188,5480 -508,11954 -1833,433 -2161,9859 -1590,2646 -2165,3695 -58,13303 -1361,995 -1186,11040 -1931,15376 -2064,6309 -137,2577 -1455,10761 -980,9554 -105,11792 -149,18937 -1392,2977 -2157,14546 -2051,21913 -361,11567 -109,21710 -2551,15808 -182,12013 -2195,18385 -694,2612 -539,17175 -871,14931 -1296,6406 -352,10049 -2160,17900 -1404,21316 -2329,16353 -682,20252 -1793,8495 -1561,19670 -946,3104 -1627,13620 -476,12747 -2084,14540 -1283,1241 -839,716 -2088,11144 -1406,14895 -67,3669 -2393,15894 -466,7305 -6,10423 -673,2945 -605,15830 -262,3239 -568,12174 -1166,3696 -298,15654 -1752,20118 -310,19292 -939,15253 -1264,22052 -720,6973 -2195,5102 -659,21943 -2521,15684 -2159,9027 -1433,6231 -789,14902 -1716,20592 -736,12274 -2277,12761 -653,6709 -1987,6414 -177,4855 -1015,7029 -735,5992 -2194,5683 -718,20107 -1517,889 -2380,265 -2030,14036 -2480,10670 -1461,14499 -2255,346 -1759,9790 -146,15892 -230,13259 -937,14005 -1830,19354 -1875,14195 -471,10875 -2469,16833 -2156,4319 -1976,8181 -414,1678 -2074,15480 -2055,7602 -695,20007 -1574,5416 -1944,15781 -396,100 -387,4059 -1743,15528 -2079,3895 -1553,9157 -640,10129 -1113,10801 -2380,10268 -2318,15826 -2134,10435 -663,17259 -322,4630 -633,13150 -1754,16221 -2120,20315 -1565,6915 -324,1245 -1190,2760 -129,10641 -926,21758 -1790,11779 -1457,18232 -2297,12893 -430,3274 -907,3470 -1403,20184 -1032,13439 -10,2457 -530,17880 -908,13995 -1511,19043 -31,11089 -1523,1026 -557,6722 -2361,17906 -1889,6730 -1350,4660 -160,9710 -96,10967 -694,105 -7,9016 -2151,21344 -1025,12675 -135,3741 -1196,5365 -1378,12162 -1094,16870 -1974,3156 -1230,5843 -1908,10482 -347,8305 -751,10531 -715,16479 -1292,17830 -1929,13790 -689,4784 -576,1129 -1984,6914 -685,17574 -1998,12452 -649,4760 -1490,2078 -2439,2862 -2168,17734 -1078,11885 -1661,11173 -601,12882 -1176,19246 -1398,20806 -260,10735 -1369,12443 -1728,14919 -1526,12544 -2341,3759 -454,10900 -57,21965 -2027,18649 -893,21097 -2537,12334 -295,19912 -1293,18603 -1346,14258 -1257,20412 -277,15427 -179,19150 -1141,9098 -1624,6593 -1801,256 -102,2594 -2514,20427 -357,21592 -1424,9728 -1798,6005 -1952,20444 -1729,12126 -1905,15685 -218,5275 -625,4040 -329,14370 -1900,5459 -1644,2780 -1449,7903 -331,1600 -1195,14917 -1507,16667 -724,9676 -540,11543 -688,20213 -1641,9565 -1234,18429 -917,8911 -2020,16544 -1399,4587 -1607,18688 -523,14015 -265,16641 -212,401 -1981,1503 -1415,11940 -1488,3574 -2486,8096 -2428,1157 -1563,12170 -2062,353 -1938,8501 -2007,13922 -273,3966 -2261,13723 -2214,21799 -1475,3962 -1225,8071 -1499,5235 -598,22044 -601,4692 -531,6616 -304,19300 -1381,20891 -2327,19269 -1961,2285 -1884,19124 -1247,21595 -705,18207 -2189,2672 -1777,15490 -1354,6820 -2022,17452 -492,19958 -2191,21662 -1126,8268 -920,17372 -993,19593 -693,18705 -2498,19526 -2094,11401 -590,15496 -906,19637 -139,11469 -169,20287 -2332,21977 -2019,5208 -1139,9086 -997,2751 -1592,17448 -1904,10675 -180,8546 -2119,20622 -833,19557 -365,9533 -2330,14074 -1281,17950 -2442,21374 -308,10210 -1510,9333 -1811,578 -802,8932 -187,1929 -1920,544 -2222,11341 -300,4645 -881,5987 -2048,8525 -164,9825 -1357,11270 -1208,6899 -57,9165 -259,10647 -2294,6549 -2005,1748 -2462,13244 -559,5884 -20,13058 -668,7080 -2375,3052 -938,19776 -1243,2466 -1137,17384 -267,2950 -519,2139 -1560,14554 -560,21311 -799,6374 -542,9190 -132,5596 -1096,2703 -2025,18711 -2345,14945 -2340,2237 -1038,12169 -2100,12942 -2000,20190 -2298,10870 -655,14490 -1536,19000 -739,18198 -1300,15463 -2029,12892 -2503,16033 -287,18627 -1896,7661 -1287,16536 -529,77 -2343,639 -223,1355 -2155,11212 -2008,20462 -1579,3324 -1580,11461 -1993,1275 -2388,20822 -156,3447 -2143,5921 -1312,14761 -589,20109 -1777,16438 -1855,10031 -1252,2074 -1011,15004 -1833,13397 -2478,4885 -389,1451 -2192,16174 -1937,11296 -2401,10788 -303,16642 -1947,1316 -2101,20732 -1988,10906 -956,4703 -1664,10772 -482,987 -746,11811 -935,13149 -454,4893 -200,5512 -16,17811 -637,9152 -1822,10712 -2541,5176 -1540,4757 -1514,1593 -335,10662 -2366,8619 -2335,10109 -1482,4163 -2197,17367 -1370,14865 -240,10618 -466,15279 -1478,2433 -657,10559 -564,18916 -670,12491 -696,12570 -706,12896 -580,15016 -806,14295 -950,7347 -2350,18167 -875,5645 -1302,6495 -220,8697 -103,11760 -1442,15285 -1509,19515 -2395,13252 -2085,8988 -1325,913 -1201,19355 -1262,8267 -1459,16056 -1862,18358 -1733,5721 -2017,12007 -2328,8545 -134,18759 -712,7672 -2267,13319 -371,15072 -2346,15062 -316,7706 -2137,4706 -1932,11215 -1045,3010 -940,3616 -397,17849 -505,15432 -618,19541 -528,486 -2276,19663 -456,19348 -1772,236 -551,2556 -1477,11084 -1081,14455 -1989,9475 -576,5076 -1156,14874 -802,13030 -27,538 -942,2806 -388,3170 -1707,17027 -2504,20052 -539,10462 -711,2946 -2275,357 -1091,16635 -1654,1650 -2212,4647 -592,5595 -2398,10118 -638,15921 -987,6670 -1413,16819 -427,14869 -2001,14203 -2372,10924 -1958,17571 -2032,9534 -1486,1376 -138,21121 -2342,7894 -363,3236 -664,7136 -1846,186 -1419,14716 -313,13300 -2523,3325 -2463,14118 -416,20386 -1187,16093 -2345,4985 -1835,16332 -2306,10516 -2126,16073 -1625,5014 -2369,15352 -1191,10707 -2166,21077 -193,11132 -1546,21375 -1867,14360 -1231,4668 -851,8444 -98,15905 -2549,19118 -2258,8357 -1210,9718 -2336,8280 -1194,5593 -285,2676 -2511,11626 -903,18251 -1160,13110 -1346,20146 -1468,16060 -998,4380 -1767,5889 -1636,20244 -149,16399 -2265,19008 -1673,9028 -2489,5636 -50,8097 -974,21569 -1585,11476 -164,124 -750,15431 -2075,17804 -363,16308 -1146,2868 -1181,4538 -2180,18388 -183,7400 -681,844 -131,9891 -1006,11658 -546,7623 -2143,13935 -1967,657 -849,603 -1777,13180 -2406,8669 -330,7315 -473,15368 -1425,8193 -2460,7487 -203,2448 -753,7076 -1198,4815 -1638,1271 -631,12818 -1438,11705 -1572,6999 -691,12897 -1762,12648 -909,13024 -1804,6873 -549,3415 -2218,18301 -1701,13660 -43,173 -1709,16227 -2249,8369 -2254,11266 -2484,11842 -1842,18624 -1619,6365 -834,18900 -1564,431 -2194,9731 -129,2702 -1103,7854 -1154,9238 -2266,18699 -894,11621 -447,5775 -2166,18478 -168,15438 -1615,11351 -1401,18728 -1614,17219 -476,782 -716,7851 -1122,14110 -397,7145 -1847,15548 -1653,14385 -429,6991 -896,12464 -1593,16659 -127,1556 -771,11224 -1714,21094 -1695,3794 -6,20933 -1474,3249 -1245,10289 -1672,20795 -1969,21512 -1706,14646 -1702,18550 -1008,18143 -1715,18513 -2475,14438 -1988,16253 -615,22046 -2147,6471 -1700,17104 -899,15336 -2133,7359 -1698,21668 -2327,1682 -2231,12950 -1656,10362 -458,8019 +ltable_id,rtable_id +3,4378 +5,20932 +6,9015 +7,1303 +15,4282 +15,17364 +15,17810 +19,17429 +20,16837 +25,6984 +29,6031 +30,11088 +33,11683 +34,3067 +39,10872 +41,6816 +42,172 +49,8096 +53,17892 +54,6572 +56,9164 +56,21964 +57,13302 +64,8391 +65,7156 +65,11129 +66,3668 +69,14641 +78,12347 +80,6526 +81,17194 +86,11617 +86,7610 +88,18980 +88,3269 +90,21917 +91,17687 +93,8350 +94,9051 +95,10966 +97,15904 +99,4520 +101,2593 +102,11759 +107,12090 +108,21709 +108,12441 +114,9223 +120,14795 +128,2701 +128,10640 +130,9890 +131,5595 +134,3740 +136,2576 +137,21120 +138,11468 +144,7970 +145,15891 +147,21192 +148,16398 +148,16350 +148,18936 +151,691 +155,3446 +157,16205 +159,9709 +159,5664 +164,18937 +172,1152 +173,1249 +178,19149 +179,8545 +181,12012 +182,7399 +184,2959 +186,1928 +191,1243 +192,11131 +194,6876 +194,20939 +199,5511 +201,6828 +205,10517 +211,400 +215,6601 +217,5274 +218,5046 +219,8696 +221,16624 +222,1354 +222,17534 +225,12627 +229,13258 +231,10436 +235,21783 +239,10617 +246,13589 +246,19031 +258,10646 +261,5192 +264,16640 +266,2949 +272,3965 +276,15426 +281,7824 +284,2675 +286,18626 +294,19911 +297,15653 +302,16807 +302,16641 +303,19299 +309,19291 +310,4720 +312,13299 +313,8870 +315,7705 +317,1082 +319,21483 +322,9990 +323,1244 +324,8519 +330,1599 +334,10661 +336,20174 +337,6115 +345,8386 +346,8304 +351,10048 +356,21591 +360,19150 +360,11566 +362,16307 +370,15071 +376,9389 +378,21690 +382,12582 +387,3169 +388,1450 +395,99 +396,7149 +396,7144 +396,17848 +398,16514 +399,713 +413,284 +415,17243 +415,20385 +424,8534 +426,14868 +428,6990 +429,3273 +431,18464 +446,5774 +447,13174 +448,13173 +452,16645 +453,10899 +453,4892 +454,1891 +455,19347 +461,18389 +463,11440 +465,7304 +465,15278 +468,20645 +469,2720 +469,19257 +470,10874 +472,15367 +475,12746 +475,20392 +476,2717 +480,376 +481,14853 +483,9130 +491,19957 +496,4496 +498,5577 +500,12544 +502,11435 +503,10206 +507,11953 +512,9060 +514,14881 +518,2138 +522,14014 +523,3626 +523,7630 +527,485 +528,21347 +528,76 +529,17879 +530,18531 +530,6615 +531,19817 +537,790 +538,17174 +539,11542 +540,9383 +541,9189 +542,13159 +544,13664 +545,7624 +545,7622 +547,9295 +548,3414 +550,19539 +550,2555 +555,19009 +558,5883 +559,20713 +559,21310 +561,18152 +563,18915 +575,1128 +575,5075 +577,21736 +577,19367 +579,15015 +584,965 +585,14867 +588,20108 +589,16953 +589,15495 +591,5594 +594,20843 +595,14869 +597,22043 +598,3626 +599,556 +600,4691 +600,12881 +604,15829 +611,8429 +614,22045 +615,17519 +617,19540 +625,18443 +630,12817 +632,13149 +635,7145 +636,9151 +639,7371 +639,10128 +645,6760 +648,4759 +652,7445 +652,6708 +656,10558 +658,21942 +660,10273 +662,17258 +663,7135 +669,12490 +678,8962 +680,843 +681,20251 +684,17573 +687,20212 +688,4783 +692,18704 +693,2611 +693,104 +694,20006 +695,12569 +699,4283 +699,3856 +705,16536 +705,12895 +707,11829 +707,8042 +709,1096 +714,16478 +715,7850 +717,20106 +719,6972 +720,13398 +722,12995 +723,9675 +725,16800 +725,16277 +734,5991 +735,12273 +737,6759 +738,18197 +739,11687 +741,11537 +742,8235 +745,11810 +752,7075 +759,8381 +765,15415 +769,6362 +770,11223 +771,5517 +781,17483 +784,21196 +784,12733 +785,10845 +788,14901 +797,6508 +801,13029 +801,8931 +803,4121 +805,14294 +810,1480 +810,11428 +811,453 +816,11998 +818,979 +828,10887 +829,8897 +832,19556 +832,4597 +836,8739 +838,715 +841,7503 +848,3153 +848,602 +853,6747 +857,7347 +861,11942 +862,5782 +864,11894 +870,14930 +871,20288 +872,15268 +874,5644 +875,4226 +887,12783 +892,21096 +893,11620 +895,12463 +898,7383 +898,11343 +898,15335 +902,6661 +903,16989 +905,19636 +906,3469 +907,13994 +908,13023 +916,8910 +917,17317 +919,17371 +920,3246 +925,21757 +926,4426 +931,6451 +932,2813 +934,13148 +935,14800 +936,14004 +938,15252 +939,3615 +941,2805 +944,14177 +949,7346 +950,15942 +955,4702 +961,14317 +973,21568 +974,19753 +975,13422 +979,9553 +983,18364 +986,6669 +991,12312 +992,19592 +996,2750 +997,4379 +999,4923 +1007,18142 +1007,11116 +1009,780 +1010,15003 +1011,13413 +1014,7028 +1022,12116 +1024,12674 +1029,16791 +1030,10555 +1031,13438 +1037,12168 +1038,5010 +1044,3009 +1066,8297 +1070,18620 +1071,16126 +1073,2522 +1074,20740 +1077,11884 +1080,14454 +1081,21587 +1083,20644 +1090,16634 +1092,13747 +1093,5462 +1093,16869 +1102,7853 +1104,1397 +1107,12700 +1108,10672 +1109,8091 +1119,3471 +1121,14109 +1125,8267 +1134,8789 +1136,17383 +1137,4429 +1138,9085 +1140,9097 +1141,9253 +1145,4787 +1145,2867 +1148,16756 +1149,9481 +1151,18330 +1153,9237 +1153,13245 +1155,14873 +1159,13109 +1165,3695 +1177,20811 +1178,18153 +1183,11288 +1186,16092 +1187,5479 +1189,2759 +1190,10706 +1193,4493 +1193,5592 +1194,14916 +1195,5364 +1197,4814 +1200,159 +1207,6898 +1208,16935 +1208,3254 +1208,18170 +1209,9717 +1209,14109 +1210,1324 +1217,5249 +1218,5267 +1224,8070 +1229,5842 +1230,4667 +1233,18428 +1233,12407 +1236,7615 +1237,6974 +1242,2465 +1244,10288 +1246,21594 +1246,17191 +1248,16298 +1252,9635 +1256,20411 +1262,12734 +1263,22051 +1266,4232 +1268,4380 +1280,17949 +1280,7637 +1282,1240 +1286,16535 +1287,15033 +1291,17829 +1292,18602 +1295,6405 +1298,8455 +1299,14223 +1299,15462 +1301,6494 +1305,2558 +1311,14760 +1315,15122 +1315,7783 +1315,21112 +1316,4506 +1317,20506 +1320,884 +1324,912 +1325,10507 +1334,8618 +1338,17543 +1340,8496 +1345,20145 +1345,14257 +1349,4659 +1349,12191 +1353,6819 +1355,559 +1356,11269 +1359,1903 +1360,994 +1362,17388 +1365,18372 +1367,12146 +1370,5172 +1377,12161 +1380,20890 +1385,17536 +1391,2976 +1394,19580 +1396,2438 +1397,20805 +1398,4586 +1400,14943 +1400,18727 +1402,20183 +1405,14894 +1412,16818 +1413,2788 +1417,7248 +1418,14715 +1423,9727 +1424,8192 +1426,10692 +1437,11704 +1439,19654 +1441,15284 +1451,4719 +1454,10760 +1456,18231 +1457,8346 +1460,14498 +1466,15575 +1466,13592 +1466,11641 +1467,16059 +1473,3250 +1473,3248 +1474,3961 +1475,8650 +1475,12654 +1476,11083 +1477,2432 +1478,1766 +1481,4162 +1482,8566 +1484,15558 +1485,1375 +1489,2077 +1498,5234 +1499,4637 +1504,4817 +1506,16666 +1508,19514 +1509,9332 +1511,4519 +1512,20844 +1513,1592 +1522,1025 +1522,21029 +1525,12543 +1528,16916 +1528,811 +1529,1441 +1534,1090 +1535,18999 +1539,4756 +1542,10127 +1545,21374 +1549,18948 +1550,11326 +1552,9156 +1558,12852 +1559,14553 +1560,19669 +1562,12169 +1563,430 +1564,6914 +1571,6998 +1573,5415 +1577,13540 +1578,3323 +1579,11460 +1584,11475 +1589,2645 +1591,3205 +1592,16658 +1595,13390 +1602,10608 +1602,19807 +1606,18687 +1607,13172 +1609,2509 +1610,10450 +1613,17218 +1614,11350 +1618,6364 +1621,16058 +1624,5013 +1626,13619 +1628,6288 +1632,20070 +1637,1270 +1640,9564 +1641,15009 +1641,11044 +1643,2779 +1651,13804 +1652,14384 +1653,1649 +1656,14169 +1657,16332 +1660,11172 +1666,6194 +1671,20794 +1684,17948 +1686,20038 +1686,17665 +1694,11026 +1694,20397 +1694,3793 +1695,17274 +1697,16578 +1697,21667 +1699,17103 +1701,18549 +1703,17701 +1704,16231 +1705,14645 +1706,17026 +1708,16226 +1713,21093 +1714,18512 +1716,17396 +1717,17261 +1718,8845 +1720,20522 +1726,18463 +1727,14918 +1728,12125 +1732,5720 +1733,21812 +1738,20132 +1742,15527 +1751,20117 +1753,16220 +1758,13845 +1758,9789 +1761,12647 +1761,4576 +1766,5888 +1771,235 +1774,18402 +1776,16437 +1776,15489 +1776,13179 +1777,3750 +1784,11403 +1789,11778 +1790,19800 +1791,19538 +1792,8494 +1802,14924 +1803,6872 +1810,577 +1813,3253 +1813,1607 +1827,2578 +1829,19353 +1832,432 +1832,13396 +1833,9342 +1834,16331 +1837,13677 +1841,18623 +1841,2326 +1843,4900 +1846,15547 +1850,17488 +1851,13348 +1860,5560 +1861,18357 +1862,15434 +1863,21325 +1866,14359 +1868,4494 +1875,9386 +1877,2927 +1882,10854 +1883,19123 +1884,17578 +1888,6729 +1891,15304 +1893,11120 +1895,7660 +1899,5458 +1901,10882 +1903,10674 +1904,15684 +1905,7800 +1907,10481 +1908,20111 +1919,543 +1922,20240 +1927,19026 +1930,15375 +1931,11214 +1932,9095 +1932,9016 +1934,9578 +1936,18140 +1936,11295 +1937,8500 +1943,15780 +1944,828 +1946,1315 +1949,17302 +1951,20443 +1952,1769 +1953,11438 +1957,17570 +1960,2284 +1963,6961 +1964,14803 +1966,656 +1968,21511 +1972,3590 +1972,3593 +1973,3155 +1975,8180 +1976,12093 +1980,4278 +1983,6913 +1984,1120 +1986,6413 +1987,10905 +1987,16252 +1988,2476 +1988,9474 +1992,1274 +1997,12451 +1999,20189 +2000,14202 +2004,19520 +2004,1747 +2006,13921 +2007,20461 +2013,9226 +2014,11740 +2014,9456 +2017,16793 +2018,11227 +2018,5207 +2019,16543 +2021,17451 +2021,17374 +2024,18710 +2028,12891 +2029,14035 +2030,13167 +2033,18679 +2037,8519 +2037,21543 +2039,7443 +2040,9550 +2047,8524 +2050,21912 +2052,6286 +2054,7601 +2056,3067 +2061,352 +2063,6308 +2069,21448 +2071,8814 +2074,17803 +2076,18297 +2078,3894 +2080,16295 +2083,14539 +2087,11143 +2088,22025 +2088,17263 +2089,7048 +2092,13767 +2093,16311 +2094,15247 +2096,2974 +2100,20731 +2101,15843 +2103,2963 +2106,1253 +2107,8358 +2118,20621 +2119,20314 +2121,4432 +2123,1794 +2125,16072 +2132,7358 +2133,8990 +2133,10434 +2136,4705 +2136,19006 +2139,101 +2140,6965 +2141,11048 +2142,13934 +2147,2887 +2149,4784 +2150,21343 +2151,7338 +2154,11211 +2155,4509 +2155,17864 +2155,4318 +2156,14545 +2158,9026 +2159,17899 +2160,9858 +2163,8469 +2164,3694 +2165,21076 +2167,17733 +2179,18387 +2184,11689 +2186,13583 +2188,2671 +2190,2310 +2190,21661 +2191,16173 +2192,3144 +2193,9730 +2194,5101 +2196,19484 +2196,17366 +2199,20395 +2199,18138 +2201,17198 +2209,17162 +2210,16560 +2211,4646 +2213,21798 +2216,17494 +2216,20675 +2219,245 +2221,11340 +2224,14686 +2229,14755 +2230,12949 +2231,17839 +2236,1662 +2237,1696 +2239,326 +2244,594 +2245,3566 +2248,8368 +2252,16783 +2253,11265 +2254,345 +2255,14015 +2257,8356 +2259,14834 +2260,13722 +2264,19007 +2265,18698 +2266,1616 +2266,13318 +2267,2527 +2273,5255 +2274,356 +2275,19662 +2276,12760 +2293,6548 +2295,21434 +2296,12892 +2297,10869 +2305,10515 +2315,9548 +2317,15825 +2324,20693 +2326,19268 +2327,8544 +2328,16352 +2331,21976 +2332,18200 +2335,8279 +2339,2236 +2340,3758 +2341,7893 +2342,638 +2343,4784 +2344,4984 +2344,6970 +2344,14944 +2345,15061 +2349,18166 +2353,13476 +2356,5527 +2360,17905 +2360,7249 +2364,16366 +2364,17915 +2365,8618 +2368,15351 +2371,10923 +2374,3051 +2378,11830 +2379,10267 +2379,264 +2381,3482 +2387,20821 +2392,15893 +2394,13251 +2396,18101 +2397,10117 +2405,8668 +2407,12311 +2408,6808 +2409,11714 +2412,7233 +2418,7679 +2422,8663 +2435,20419 +2438,2861 +2440,6643 +2441,21373 +2461,13243 +2462,14117 +2462,8175 +2468,16832 +2468,10813 +2471,19283 +2474,14437 +2476,6955 +2477,4884 +2478,16862 +2479,10669 +2481,6885 +2482,19119 +2483,11841 +2488,5635 +2496,12024 +2497,19525 +2501,4362 +2502,16032 +2503,20051 +2507,5038 +2508,11582 +2510,11625 +2511,21503 +2513,20426 +2515,4424 +2516,20309 +2519,2668 +2522,3324 +2523,10365 +2527,13282 +2531,19976 +2536,12333 +2538,2279 +2539,10705 +2540,5175 +2542,6690 +2543,2533 +2548,19117 diff --git a/draw.py b/draw.py new file mode 100644 index 0000000..2074f11 --- /dev/null +++ b/draw.py @@ -0,0 +1,55 @@ +import os + +import pyecharts +from pyecharts.charts import Line +from pyecharts import options as opts +from pyecharts.globals import ThemeType + +if __name__ == '__main__': + dir_path = r'E:\Data\Research\Outcome\Abt-Buy' + filename_list = os.listdir(dir_path) + iter_list = [] + precision = [] + recall = [] + f1 = [] + interpretability = [] + performance = [] + for _ in filename_list: + if _.startswith('eval_result'): + it = int(_[12:13]) + iter_list.append(str(it)) + with open(dir_path + '\\' + _, 'r') as f: + # 读取每一行的md,加入该文件的md列表 + for line in f.readlines(): + if line.startswith('Precision'): + lt = line.split(' ') + value = float(lt[2].replace('%', ''))/100 + precision.append(value) + elif line.startswith('Recall'): + lt = line.split(' ') + value = float(lt[2].replace('%', ''))/100 + recall.append(value) + elif line.startswith('F1'): + lt = line.split(' ') + value = float(lt[2].replace('%', ''))/100 + f1.append(value) + elif line.startswith('interpretability'): + lt = line.split(':') + value = float(lt[1]) + interpretability.append(value) + elif line.startswith('performance'): + lt = line.split(':') + value = float(lt[1]) + performance.append(value) + + line = ( + Line(init_opts=opts.InitOpts(theme=ThemeType.LIGHT)) + .add_xaxis(iter_list) + .add_yaxis('Precision', precision) + .add_yaxis('Recall', recall) + .add_yaxis('F1', f1) + .add_yaxis('Interpretability', interpretability) + .add_yaxis('Performance', performance) + .set_global_opts(title_opts=opts.TitleOpts(title=dir_path.split('\\')[-1])) + ) + line.render(dir_path + '\\' + "line.html") diff --git a/hpo/er_model_hpo.py b/hpo/er_model_hpo.py index c0b9323..6b879df 100644 --- a/hpo/er_model_hpo.py +++ b/hpo/er_model_hpo.py @@ -17,17 +17,17 @@ from ml_er.ml_entity_resolver import evaluate_prediction, load_mds, is_explicabl # 数据在外部加载 ######################################################################################################################## ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1') -# ltable.fillna("", inplace=True) +ltable.fillna("", inplace=True) rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1') -# rtable.fillna("", inplace=True) +rtable.fillna("", inplace=True) mappings = pd.read_csv(mapping_path) lid_mapping_list = [] rid_mapping_list = [] # 全部转为字符串 -# ltable = ltable.astype(str) -# rtable = rtable.astype(str) -# mappings = mappings.astype(str) +ltable = ltable.astype(str) +rtable = rtable.astype(str) +mappings = mappings.astype(str) matching_number = len(mappings) # 所有阳性样本数,商品数据集应为1300 for index, row in mappings.iterrows(): @@ -162,8 +162,6 @@ class Classifier: attrs_after=test_feature_after, show_progress=False) fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold'] - train_feature_vecs.fillna(0, inplace=True) - test_feature_vecs.fillna(0, inplace=True) matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold') test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id]) @@ -181,7 +179,7 @@ class Classifier: predictions_attrs.extend(['gold', 'predicted']) predictions = predictions[predictions_attrs] predictions = predictions.reset_index(drop=True) - # predictions = predictions.astype(str) + predictions = predictions.astype(str) sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions) # 默认路径为 "../md_discovery/output/xxx.txt" @@ -197,10 +195,11 @@ class Classifier: ppre = predictions[predictions['predicted'] == str(1)] interpretability = epl_match / len(ppre) # 可解释性 - if indicators["block_recall"] >= 0.8: - f1 = indicators["F1"] + if (indicators["block_recall"] < 0.8) and (indicators["block_recall"] < indicators["recall"]): + f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / ( + indicators["precision"] + indicators["block_recall"]) else: - f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (indicators["precision"] + indicators["block_recall"]) + f1 = indicators["F1"] # if indicators["block_recall"] < 0.8: # return 1 # f1 = indicators["F1"] diff --git a/ml_er/ml_entity_resolver.py b/ml_er/ml_entity_resolver.py index bd345ce..934ed37 100644 --- a/ml_er/ml_entity_resolver.py +++ b/ml_er/ml_entity_resolver.py @@ -157,9 +157,9 @@ def ml_er(iter_round: int, config: Configuration = None, ): lid_mapping_list = [] rid_mapping_list = [] # 全部转为字符串 - # ltable = ltable.astype(str) - # rtable = rtable.astype(str) - # mappings = mappings.astype(str) + ltable = ltable.astype(str) + rtable = rtable.astype(str) + mappings = mappings.astype(str) matching_number = len(mappings) # 所有阳性样本数 for index, row in mappings.iterrows(): @@ -206,9 +206,9 @@ def ml_er(iter_round: int, config: Configuration = None, ): config["block_attr"], allow_missing=True, l_output_attrs=selected_attrs, r_output_attrs=selected_attrs) else: - matcher = em.RFMatcher(name='RF', random_state=0) + matcher = em.SVMMatcher(name='SVM', random_state=0) blocker = em.OverlapBlocker() - candidate = blocker.block_tables(selected_ltable, selected_rtable, items_but_id[0], items_but_id[0], + candidate = blocker.block_tables(selected_ltable, selected_rtable, selected_attrs[-1], selected_attrs[-1], l_output_attrs=selected_attrs, r_output_attrs=selected_attrs, overlap_size=1, show_progress=False, allow_missing=True) @@ -229,6 +229,8 @@ def ml_er(iter_round: int, config: Configuration = None, ): for row in candidate_match_rows: candidate.loc[row, 'gold'] = 1 + candidate.fillna("", inplace=True) + # 裁剪负样本,保持正负样本数量一致 candidate_mismatch = candidate[candidate['gold'] == 0] candidate_match = candidate[candidate['gold'] == 1] @@ -266,8 +268,6 @@ def ml_er(iter_round: int, config: Configuration = None, ): attrs_after=test_feature_after, show_progress=False) fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold'] - train_feature_vecs.fillna(0, inplace=True) - test_feature_vecs.fillna(0, inplace=True) matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold') test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id]) predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after, @@ -299,11 +299,11 @@ def ml_er(iter_round: int, config: Configuration = None, ): df = predictions[predictions['predicted'] == str(1)] interpretability = epl_match / len(df) # 可解释性 - if indicators["block_recall"] >= 0.8: - f1 = indicators["F1"] - else: + if (indicators["block_recall"] < 0.8) and (indicators["block_recall"] < indicators["recall"]): f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / ( indicators["precision"] + indicators["block_recall"]) + else: + f1 = indicators["F1"] performance = interpre_weight * interpretability + (1 - interpre_weight) * f1 ################################################################################################################ diff --git a/settings.py b/settings.py index 2e359e1..039c04d 100644 --- a/settings.py +++ b/settings.py @@ -4,16 +4,16 @@ import numpy as np ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Walmart-Amazon_dirty\tableA.csv' rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Walmart-Amazon_dirty\tableB.csv' mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Walmart-Amazon_dirty\matches.csv' -mapping_lid = 'id1' # mapping表中左表id名 -mapping_rid = 'id2' # mapping表中右表id名 +mapping_lid = 'ltable_id' # mapping表中左表id名 +mapping_rid = 'rtable_id' # mapping表中右表id名 ltable_id = 'id' # 左表id字段名称 rtable_id = 'id' # 右表id字段名称 target_attr = 'id' # 进行md挖掘时的目标字段 lr_attrs_map = {} # 如果两个表中存在对应字段名称不一样的情况,将名称加入列表便于调整一致 similarity_threshold = 0.2 support_threshold = 1 -confidence_threshold = 0.5 -interpre_weight = 0.3 # 可解释性权重 +confidence_threshold = 0.4 +interpre_weight = 0.4 # 可解释性权重 er_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\ml_er\\output\\' md_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\output\\' hpo_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\hpo\\output\\'