#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import sys

import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import preprocessing

import imbalanced_ensemble as imbens

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.pylabtools import figsize
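
# NOTE: `imbalanced_ensemble` is the import name used by earlier releases of the
# imbalanced-ensemble package (later releases expose the top-level module as
# `imbens`); this script assumes a release where the import above works as written.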

# Sub-tokens of the g_model field; filled and used only by the commented-out
# g_model split experiment in main() and pre_process().
subs = []


def container_count(value):
    """Count how many comma-separated entries a container_nums string contains."""
    ct = 0
    for ch in value:
        if ch == ',':
            ct = ct + 1
    return ct + 1
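

# Quick sanity check for container_count (hypothetical container numbers, assuming
# the container_nums field stores a comma-separated list of container IDs):
#   container_count("TCLU1234567")              -> 1
#   container_count("TCLU1234567,MSKU7654321")  -> 2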


def pre_process(data):
    """Drop identifier columns and encode the remaining fields as numeric values."""
    data.drop(columns=['id', 'entry_id', 'g_no', 'g_name', 'note_s', 'bill_no',
                       'loginname', 'username', 'trade_name', 'agent_name', 'owner_name'],
              inplace=True)

    # Tried splitting g_model completely into one-hot sub-tokens; it did not help.
    # global subs
    # temp_data = pd.DataFrame(np.zeros((data.shape[0], len(subs))), columns=subs, index=range(0, data.shape[0]))
    #
    # for i in data.index:
    #     string = data.loc[i, 'g_model']
    #     if isinstance(string, str):
    #         elms = string.split('|')
    #         for elm in elms:
    #             if elm in subs:
    #                 temp_data.loc[i, elm] = 1
    # print(temp_data['杂色,粗洗'])
    # print(data['g_model'])

    # Nominal fields: replace each value with its integer category code.
    for field in ['hs', 'g_model', 'trade_curr', 'origin_country', 'g_unit', 'wrap_type',
                  'dest', 'country', 'i_e_flag', 'trade_mode', 'traf_name', 'traf_mode']:
        data[field] = pd.Categorical(data[field]).codes

    # Ordinal fields stored as Chinese level labels: 低 (low), 中 (medium), 超高 (very high).
    status_dict = ['低', '中', '超高']
    for field in ['decl_price', 'decl_total', 'g_qty', 'gross_wt', 'pack_no', 'net_wt']:
        data[field] = data[field].apply(lambda x: status_dict.index(x))

    # Company id fields appear to be strings ending in '_<number>'; keep only the numeric suffix.
    for field in ['trade_co_id', 'agent_co_id', 'owner_co_id']:
        data[field] = [int(x.split('_')[-1]) for x in data[field]]

    data['container_nums'] = data['container_nums'].apply(container_count)
    data['i_e_date'] = data['i_e_date'].apply(lambda x: 2 if np.isnan(x) else x)
    # data = pd.concat([data, temp_data], axis=1)

    return data
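
# Illustration of the nominal encoding used in pre_process (illustrative values only):
#   pd.Categorical(['CNY', 'USD', 'CNY']).codes  -> array([0, 1, 0], dtype=int8)
# Codes are assigned per call, so train.csv and test.csv are encoded independently
# and the same raw value may receive different codes in the two files.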


def plot_cost_matrix(cost_matrix, title: str, **kwargs):
    ax = sns.heatmap(data=cost_matrix, **kwargs)
    ax.set_ylabel("Predicted Label")
    ax.set_xlabel("Ground Truth")
    ax.set_title(title)


def imbens_classify(X_train, y_train, X_valid, y_valid, X_test):
    # estimator = DecisionTreeClassifier()
    # estimator = RandomForestClassifier()
    # estimator = SVC(probability=True)

    # Candidate misclassification cost matrices explored during tuning.
    my_matrixs = [
        [[1, 20],
         [0.051, 1]],
        [[2, 20],
         [1, 2]],
        [[2, 10],
         [5, 1]],
        [[2, 100],
         [0.5, 2]],
        [[1, 0.05],
         [20, 1]],
    ]
    plot_cost_matrix(my_matrixs[0], "test", annot=True, cmap='YlOrRd', vmax=6)
    plt.show()

    depth_best = 1
    matrix_best = 0
    n_best = 1
    f1_best = 0

    # --- Hyperparameter searches kept from earlier experiments (disabled) ---
    # estimator = DecisionTreeClassifier()
    # for var in range(1, 70):
    #     clf = imbens.ensemble.SelfPacedEnsembleClassifier(
    #         random_state=42,
    #         base_estimator=estimator,
    #         n_estimators=var
    #     ).fit(X_train, y_train,
    #           eval_datasets={
    #               'valid': (X_valid, y_valid),  # add validation data
    #           },
    #           eval_metrics={
    #               'f1': (sklearn.metrics.f1_score, {}),
    #               'f1_macro': (sklearn.metrics.f1_score, {'average': 'macro'}),
    #           },
    #           # train_verbose={
    #           #     'granularity': 1,
    #           # }
    #           )
    #
    #     f1 = sklearn.metrics.f1_score(y_test, clf.predict(X_test))
    #     if f1 > f1_best:
    #         f1_best = f1
    #         var_best = var
    # print(var_best, f1_best)
    # exit(-1)
    #
    # for depth in range(11, 100, 10):
    #     estimator = DecisionTreeClassifier(max_depth=depth_best)
    #     clf = imbens.ensemble.AsymBoostClassifier(
    #         random_state=42,
    #         base_estimator=estimator,
    #         # n_estimators=50
    #     ).fit(X_train, y_train,
    #           eval_datasets={
    #               'valid': (X_valid, y_valid),  # add validation data
    #           },
    #           eval_metrics={
    #               'f1_micro': (sklearn.metrics.f1_score, {'average': 'micro'}),
    #               'f1_macro': (sklearn.metrics.f1_score, {'average': 'macro'}),
    #           },
    #           cost_matrix=my_matrixs[0]
    #           )
    #
    #     f1_macro = sklearn.metrics.f1_score(y_test, clf.predict(X_test),
    #                                         average='macro')
    #     if f1_macro > f1_best:
    #         f1_best = f1_macro
    #         depth_best = depth
    # print("sub best depth: " + str(depth_best))
    #
    # f1_best = 0
    # for depth in range(depth_best - 10, depth_best + 10):
    #     for matrix in range(0, len(my_matrixs)):
    #         for n in range(1, 102):
    #             estimator = DecisionTreeClassifier(max_depth=depth)
    #             clf = imbens.ensemble.AsymBoostClassifier(
    #                 random_state=42,
    #                 base_estimator=estimator,
    #                 n_estimators=n
    #             ).fit(X_train, y_train,
    #                   eval_datasets={
    #                       'valid': (X_valid, y_valid),  # add validation data
    #                   },
    #                   eval_metrics={
    #                       'f1_micro': (
    #                           sklearn.metrics.f1_score, {'average': 'micro'}),
    #                       'f1_macro': (
    #                           sklearn.metrics.f1_score, {'average': 'macro'}),
    #                   },
    #                   cost_matrix=my_matrixs[matrix],
    #                   # train_verbose={
    #                   #     'granularity': 1,
    #                   # }
    #                   )
    #
    #             f1_macro = sklearn.metrics.f1_score(y_test, clf.predict(X_test),
    #                                                 average='macro')
    #             if f1_macro > f1_best:
    #                 f1_best = f1_macro
    #                 depth_best = depth
    #                 matrix_best = matrix
    #                 n_best = n
    # print("max_depth: %d, matrix_num: %d, n_estimators: %d"
    #       % (depth_best, matrix_best, n_best))

    # Final model: AsymBoost over a depth-34 decision tree with the second cost matrix.
    estimator = DecisionTreeClassifier(max_depth=34)
    clf = imbens.ensemble.AsymBoostClassifier(
        random_state=42,
        base_estimator=estimator,
        n_estimators=50
    ).fit(X_train, y_train,
          eval_datasets={
              'valid': (X_valid, y_valid),  # add validation data
          },
          eval_metrics={
              'recall': (sklearn.metrics.recall_score, {}),
              'f1_macro': (sklearn.metrics.f1_score, {'average': 'macro'}),
          },
          cost_matrix=my_matrixs[1],
          train_verbose={
              'granularity': 1,
          }
          )

    evaluate_predict(y_train, clf.predict(X_train), "train")
    evaluate_predict(y_valid, clf.predict(X_valid), "valid")

    fitted_ensembles = {'test': clf}
    visualizer = imbens.visualizer.ImbalancedEnsembleVisualizer(
        eval_datasets={
            'training': (X_train, y_train),
            'validation': (X_valid, y_valid),
        },
        eval_metrics={
            'f1_micro': (sklearn.metrics.f1_score, {'average': 'micro'}),
            'f1_macro': (sklearn.metrics.f1_score, {'average': 'macro'}),
            'recall': (sklearn.metrics.recall_score, {'average': 'binary'})
        },
    )
    visualizer.fit(fitted_ensembles)
    fig, axes = visualizer.performance_lineplot()
    plt.show()

    return clf.predict(X_test)


def evaluate_predict(y, y_pred, title=""):
    """Plot the confusion matrix and print micro/macro and binary (passed=1) metrics."""
    matrix = sklearn.metrics.confusion_matrix(y, y_pred)
    sns.heatmap(matrix, annot=True, cmap='Purples', fmt='g')
    plt.ylabel(u'True Value')
    plt.xlabel(u'Predicted Value')
    plt.title(title)
    plt.show()

    print(title + "-----------------------------------------------------------")
    f1_micro = sklearn.metrics.f1_score(y, y_pred, average='micro')
    f1_macro = sklearn.metrics.f1_score(y, y_pred, average='macro')
    recall_1 = sklearn.metrics.recall_score(y, y_pred, average='binary')
    precision_1 = sklearn.metrics.precision_score(y, y_pred, average='binary')
    f1_1 = sklearn.metrics.f1_score(y, y_pred, average='binary')
    print('f1_micro: {:.3f}'.format(f1_micro))
    print('f1_macro: {:.3f}'.format(f1_macro))
    print('recall(passed=1): {:.3f}'.format(recall_1))
    print('precision(passed=1): {:.3f}'.format(precision_1))
    print('f1(passed=1): {:.3f}'.format(f1_1))
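
# sklearn's confusion_matrix puts true labels on rows and predicted labels on columns,
# which is why the heatmap in evaluate_predict labels the y-axis as the true value:
#   sklearn.metrics.confusion_matrix([0, 1, 1], [0, 0, 1]) -> [[1, 0],
#                                                              [1, 1]]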


def main(argv):
    data = pd.read_csv('./train.csv', encoding='gbk')
    X_test = pd.read_csv('./test.csv', encoding='gbk')

    print(data.shape)

    # Collect every '|'-separated g_model sub-token (used only by the disabled
    # one-hot experiment in pre_process).
    # for string in data['g_model']:
    #     if isinstance(string, str):
    #         elementes = string.split('|')
    #         for element in elementes:
    #             if element not in subs:
    #                 subs.append(element)

    data = pre_process(data)
    X_test = pre_process(X_test)

    train, valid = train_test_split(data, test_size=0.2,
                                    stratify=data[['passed']], random_state=0)
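    # stratify=data[['passed']] keeps the class ratio of 'passed' the same in both splits.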
    X_train = train.drop(columns=['passed'])
    y_train = train['passed']
    X_valid = valid.drop(columns=['passed'])
    y_valid = valid['passed']

    print(type(X_train))
    # plt.rcParams['figure.dpi'] = 300
    colnm = X_train.columns.tolist()
    # Pairwise Spearman correlation between all feature columns.
    mcorr = X_train[colnm].corr(method="spearman")
    # Boolean mask of the same shape as mcorr; entries above the diagonal are True,
    # so the heatmap shows only the lower triangle.
    mask = np.zeros_like(mcorr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    # Diverging matplotlib colormap for the heatmap.
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    # Heatmap of pairwise feature correlations.
    g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True,
                    fmt='0.2f', xticklabels=False, yticklabels=False)
    plt.show()

    X_train = preprocessing.scale(X_train)
    X_valid = preprocessing.scale(X_valid)
    X_test = preprocessing.scale(X_test)
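    # Note: preprocessing.scale standardizes each array independently. A sketch of an
    # alternative (assuming a StandardScaler fit on the training split only) that keeps
    # all three splits on the same scale:
    #   scaler = preprocessing.StandardScaler().fit(X_train)
    #   X_train, X_valid, X_test = (scaler.transform(a) for a in (X_train, X_valid, X_test))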

    y_test_pred = imbens_classify(X_train, y_train, X_valid, y_valid, X_test)
    pd.DataFrame(y_test_pred).to_csv('./result.txt', index=False, header=False)


if __name__ == '__main__':
    main(sys.argv)
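
# Usage (assuming train.csv and test.csv sit next to this script, GBK-encoded, with a
# binary 'passed' label column in train.csv):
#   python3 <this script>
# Predictions for test.csv are written to ./result.txt, one label per line.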