You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
30 lines
1.4 KiB
30 lines
1.4 KiB
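# The train/valid/test sets contain only (left id, right id) mappings; this script builds the complete sets by joining in the left and right tables on those ids.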
|
|
import pandas as pd
|
|
|
|
from setting import directory_path
|
|
|
|
|
|
def build_whole_X_set(X_set, _ltable, _rtable):
    """Expand an id-pair set into full rows by joining both source tables.

    Args:
        X_set: DataFrame containing at least the columns ``ltable_id`` and
            ``rtable_id``.
        _ltable: DataFrame with an ``ltable_id`` column; its remaining
            columns are attached to each row of ``X_set``.
        _rtable: DataFrame with an ``rtable_id`` column; its remaining
            columns are attached to each row of ``X_set``.

    Returns:
        A new DataFrame: ``X_set`` left-joined with ``_ltable`` and then with
        ``_rtable``, with a running ``_id`` column (0..n-1) inserted as the
        first column. The inputs are not modified.

    Raises:
        ValueError: if the joined frame already has a column named ``_id``
            (raised by ``DataFrame.insert``).
    """
    # Left joins keep every row of X_set, even when an id has no match in
    # the corresponding table (the attached columns are then NaN).
    merged_set = X_set.merge(_ltable, on='ltable_id', how='left')
    merged_set = merged_set.merge(_rtable, on='rtable_id', how='left')

    # Sequential row identifier placed in front of all other columns.
    merged_set.insert(0, '_id', range(len(merged_set)))
    return merged_set
|
|
|
|
|
|
if __name__ == '__main__':
    # Local import keeps this script block self-contained; os.path.join
    # replaces the hard-coded '\\' separator so the paths also resolve on
    # non-Windows systems.
    import os

    # Read the two source tables and prefix every column name so that the
    # id columns become 'ltable_id' / 'rtable_id' (the join keys).
    ltable = pd.read_csv(
        os.path.join(directory_path, 'tableA.csv'), encoding='ISO-8859-1'
    ).add_prefix('ltable_')
    rtable = pd.read_csv(
        os.path.join(directory_path, 'tableB.csv'), encoding='ISO-8859-1'
    ).add_prefix('rtable_')

    # For each split: read the id-pair file, join in both tables, and write
    # the result next to the input as '<split>_whole.csv'.
    for split in ('train', 'valid', 'test'):
        pairs = pd.read_csv(
            os.path.join(directory_path, f'{split}.csv'), encoding='ISO-8859-1'
        )
        whole = build_whole_X_set(pairs, ltable, rtable)
        whole.to_csv(
            os.path.join(directory_path, f'{split}_whole.csv'),
            sep=',',
            index=False,
            header=True,
        )