#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Imbalanced-classification pipeline for customs-declaration screening.

Loads ``train.csv`` / ``test.csv`` (GBK-encoded), preprocesses the mixed
categorical/ordinal features, trains a cost-sensitive AsymBoost ensemble
(from the ``imbalanced_ensemble`` package), evaluates it on a stratified
hold-out split, and writes test-set predictions to ``result.txt``.
"""
import sys

import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import preprocessing
import imbalanced_ensemble as imbens
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.pylabtools import figsize

# Accumulator for '|'-separated sub-tokens of the 'g_model' field.
# Only used by a now-removed one-hot-encoding experiment; kept for
# backward compatibility with any external code that imports it.
subs = []


def container_count(field):
    """Return the number of comma-separated container entries in *field*.

    N commas delimit N + 1 entries, so an input with no comma counts as
    a single container.
    """
    # BUG FIX (style): the original parameter was named ``str``, shadowing
    # the builtin; the manual character loop is replaced by str.count().
    return field.count(',') + 1


def pre_process(data):
    """Convert the raw declaration DataFrame to an all-numeric one, in place.

    Drops identifier/free-text columns, integer-encodes categorical fields,
    maps ordinal risk labels to their rank, extracts numeric suffixes from
    company ids, counts containers, and fills missing dates.

    Returns the same (mutated) DataFrame for call-chaining convenience.
    """
    # Identifier and free-text columns carry no predictive signal here.
    data.drop(columns=['id', 'entry_id', 'g_no', 'g_name', 'note_s', 'bill_no',
                       'loginname', 'username', 'trade_name', 'agent_name',
                       'owner_name'],
              inplace=True)

    # Integer-encode the categorical fields.
    # NOTE(review): pd.Categorical is fitted per-DataFrame, so the same
    # category can receive DIFFERENT codes in train vs. test (pre_process is
    # called separately on each in main()). Confirm this is acceptable, or
    # fit one shared encoder on the union of both frames.
    for field in ['hs', 'g_model', 'trade_curr', 'origin_country', 'g_unit',
                  'wrap_type', 'dest', 'country', 'i_e_flag', 'trade_mode',
                  'traf_name', 'traf_mode']:
        data[field] = pd.Categorical(data[field]).codes

    # Ordinal risk labels -> their rank in the ordered list (0, 1, 2).
    status_dict = ['低', '中', '超高']
    for field in ['decl_price', 'decl_total', 'g_qty', 'gross_wt', 'pack_no',
                  'net_wt']:
        data[field] = data[field].apply(lambda x: status_dict.index(x))

    # Company ids look like '<prefix>_<number>'; keep the numeric suffix.
    for field in ['trade_co_id', 'agent_co_id', 'owner_co_id']:
        data[field] = [int(x.split('_')[-1]) for x in data[field]]

    # Comma-separated container list -> container count.
    data['container_nums'] = data['container_nums'].apply(container_count)

    # Missing dates are replaced with the sentinel value 2.
    data['i_e_date'] = data['i_e_date'].apply(lambda x: 2 if np.isnan(x) else x)
    return data


def plot_cost_matrix(cost_matrix, title: str, **kwargs):
    """Render a 2x2 misclassification-cost matrix as a labelled heatmap.

    Extra keyword arguments are forwarded verbatim to ``seaborn.heatmap``
    (e.g. ``annot``, ``cmap``, ``vmax``).  The caller is responsible for
    ``plt.show()``.
    """
    ax = sns.heatmap(data=cost_matrix, **kwargs)
    ax.set_ylabel("Predicted Label")
    ax.set_xlabel("Ground Truth")
    ax.set_title(title)


def imbens_classify(X_train, y_train, X_valid, y_valid, X_test):
    """Train a cost-sensitive AsymBoost ensemble and predict on *X_test*.

    Trains on (X_train, y_train) with (X_valid, y_valid) as the evaluation
    set, prints confusion matrices / F1 scores for both splits, shows a
    per-iteration performance plot, and returns the label predictions for
    X_test.
    """
    # Candidate misclassification-cost matrices (rows = predicted label,
    # columns = ground truth).  Index 1 was the winner of the original
    # grid search; see NOTE below.
    my_matrixs = [
        [[1, 20], [0.051, 1]],
        [[2, 20], [1, 2]],
        [[2, 10], [5, 1]],
        [[2, 100], [0.5, 2]],
        [[1, 0.05], [20, 1]],
    ]
    plot_cost_matrix(my_matrixs[0], "test", annot=True, cmap='YlOrRd', vmax=6)
    plt.show()

    # NOTE(review): a large commented-out hyper-parameter search (base-tree
    # depth x cost matrix x n_estimators, scored by macro-F1 on the hold-out
    # set) previously lived here; its winning configuration — max_depth=34,
    # cost matrix 1, 50 estimators — is hard-coded below.
    estimator = DecisionTreeClassifier(max_depth=34)
    clf = imbens.ensemble.AsymBoostClassifier(
        random_state=42,
        base_estimator=estimator,
        n_estimators=50
    ).fit(X_train, y_train,
          eval_datasets={
              'valid': (X_valid, y_valid),  # add validation data
          },
          eval_metrics={
              # BUG FIX: the 'recall' entry was previously bound to
              # sklearn.metrics.f1_score, so the training log reported F1
              # under a recall heading; use recall_score, consistent with
              # the visualizer's metric dict below.
              'recall': (sklearn.metrics.recall_score, {}),
              'f1_macro': (sklearn.metrics.f1_score, {'average': 'macro'}),
          },
          cost_matrix=my_matrixs[1],
          train_verbose={
              'granularity': 1,
          })

    evaluate_predict(y_train, clf.predict(X_train), "train")
    evaluate_predict(y_valid, clf.predict(X_valid), "valid")

    # Per-iteration performance curves on both splits.
    fitted_ensembles = {'test': clf}
    visualizer = imbens.visualizer.ImbalancedEnsembleVisualizer(
        eval_datasets={
            'training': (X_train, y_train),
            'validation': (X_valid, y_valid),
        },
        eval_metrics={
            'f1_micro': (sklearn.metrics.f1_score, {'average': 'micro'}),
            'f1_macro': (sklearn.metrics.f1_score, {'average': 'macro'}),
            'recall': (sklearn.metrics.recall_score, {'average': 'binary'})
        },
    )
    visualizer.fit(fitted_ensembles)
    fig, axes = visualizer.performance_lineplot()
    plt.show()

    return clf.predict(X_test)


def evaluate_predict(y, y_pred, title=""):
    """Show a confusion-matrix heatmap and print F1 / recall / precision.

    *y* is the ground truth, *y_pred* the predictions; *title* labels both
    the plot and the printed report.  Binary-average metrics treat label 1
    ("passed") as the positive class.
    """
    matrix = sklearn.metrics.confusion_matrix(y, y_pred)
    sns.heatmap(matrix, annot=True, cmap='Purples', fmt='g')
    plt.ylabel(u'True Value')
    plt.xlabel(u'Predict Value')
    plt.title(title)
    plt.show()

    print(title + "-----------------------------------------------------------")
    f1_micro = sklearn.metrics.f1_score(y, y_pred, average='micro')
    f1_macro = sklearn.metrics.f1_score(y, y_pred, average='macro')
    recall_1 = sklearn.metrics.recall_score(y, y_pred, average='binary')
    precision_1 = sklearn.metrics.precision_score(y, y_pred, average='binary')
    f1_1 = sklearn.metrics.f1_score(y, y_pred, average='binary')
    print('f1_micro: {:.3f}'.format(f1_micro))
    print('f1_macro: {:.3f}'.format(f1_macro))
    print('recall(passed=1): {:.3f}'.format(recall_1))
    print('precision(passed=1): {:.3f}'.format(precision_1))
    print('f1(passed=1): {:.3f}'.format(f1_1))


def main(argv):
    """Entry point: load, preprocess, visualize, train, and write predictions."""
    data = pd.read_csv('./train.csv', encoding='gbk')
    X_test = pd.read_csv('./test.csv', encoding='gbk')
    print(data.shape)

    data = pre_process(data)
    X_test = pre_process(X_test)

    # Stratified hold-out split so the class ratio of 'passed' is preserved.
    train, valid = train_test_split(data, test_size=0.2,
                                    stratify=data[['passed']], random_state=0)
    X_train = train.drop(columns=['passed'])
    y_train = train['passed']
    X_valid = valid.drop(columns=['passed'])
    y_valid = valid['passed']
    print(type(X_train))

    # plt.rcParams['figure.dpi'] = 300
    colnm = X_train.columns.tolist()
    # Spearman rank-correlation matrix between every pair of features.
    mcorr = X_train[colnm].corr(method="spearman")
    # BUG FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24
    # (AttributeError at runtime); the builtin bool is the documented
    # replacement and is behaviorally identical here.
    mask = np.zeros_like(mcorr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True  # hide the redundant upper triangle
    cmap = sns.diverging_palette(220, 10, as_cmap=True)  # diverging colormap
    # Pairwise-correlation heatmap of the training features.
    g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, fmt='0.2f',
                    xticklabels=False, yticklabels=False)
    plt.show()

    # Standardize features (zero mean, unit variance) before boosting.
    X_train = preprocessing.scale(X_train)
    X_valid = preprocessing.scale(X_valid)
    X_test = preprocessing.scale(X_test)

    y_test_pred = imbens_classify(X_train, y_train, X_valid, y_valid, X_test)
    pd.DataFrame(y_test_pred).to_csv('./result.txt', index=False, header=False)


if __name__ == '__main__':
    main(sys.argv)