You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

30 lines
1.4 KiB

# The train/valid/test sets only contain left/right id mappings; this script builds the complete sets from the left and right tables using those ids.
import pandas as pd
from setting import directory_path
def build_whole_X_set(
    X_set: pd.DataFrame, _ltable: pd.DataFrame, _rtable: pd.DataFrame
) -> pd.DataFrame:
    """Expand a set of id pairs into full rows from the left and right tables.

    Args:
        X_set: Frame with at least the columns ``ltable_id`` and ``rtable_id``.
        _ltable: Left table; must contain an ``ltable_id`` column.
        _rtable: Right table; must contain an ``rtable_id`` column.

    Returns:
        A new frame holding the columns of ``X_set`` followed by the remaining
        columns of ``_ltable`` and then ``_rtable``, with an ``_id`` column
        (0, 1, 2, ...) inserted in first position. The inputs are not modified.

    Raises:
        KeyError: If a join key column is missing from one of the frames.
        ValueError: If the merged result already has an ``_id`` column.
    """
    # Left joins keep every row of X_set; an id with no match in a lookup
    # table leaves that table's columns as NaN instead of dropping the pair.
    # NOTE: a lookup table with duplicate ids yields one output row per match.
    merged_set = pd.merge(X_set, _ltable, on='ltable_id', how='left')
    merged_set = pd.merge(merged_set, _rtable, on='rtable_id', how='left')

    # Running row number of the merged result, placed as the first column.
    merged_set.insert(0, '_id', range(len(merged_set)))
    return merged_set
if __name__ == '__main__':
    # Imported locally: only the script entry point needs it.
    import os

    source_encoding = 'ISO-8859-1'
    split_names = ('train', 'valid', 'test')

    # Read the two tables and prefix every column name, so their columns
    # become 'ltable_*' / 'rtable_*' and line up with the merge keys
    # 'ltable_id' / 'rtable_id' used by build_whole_X_set.
    # NOTE(review): this assumes each table's key column is named 'id' - confirm.
    # Paths are built with os.path.join rather than a hard-coded '\\' so the
    # script also works on non-Windows systems.
    ltable = pd.read_csv(
        os.path.join(directory_path, 'tableA.csv'), encoding=source_encoding
    ).rename(columns=lambda x: f'ltable_{x}')
    rtable = pd.read_csv(
        os.path.join(directory_path, 'tableB.csv'), encoding=source_encoding
    ).rename(columns=lambda x: f'rtable_{x}')

    # Read every split before building or writing anything, so a missing
    # input file fails before any output file is produced.
    id_pairs = {
        name: pd.read_csv(
            os.path.join(directory_path, f'{name}.csv'), encoding=source_encoding
        )
        for name in split_names
    }

    # Expand each id-pair set into full rows.
    whole_sets = {
        name: build_whole_X_set(pairs, ltable, rtable)
        for name, pairs in id_pairs.items()
    }

    # Write '<split>_whole.csv' next to the inputs.
    for name, whole in whole_sets.items():
        whole.to_csv(
            os.path.join(directory_path, f'{name}_whole.csv'),
            sep=',',
            index=False,
            header=True,
        )