You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

297 lines
11 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import preprocessing
import imbalanced_ensemble as imbens
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.pylabtools import figsize
subs = []
def container_count(text):
    """Return the number of comma-separated container entries in *text*.

    A field with no comma holds one container, so the count is
    (number of commas) + 1.

    Fixes: the original shadowed the builtin ``str`` with its parameter
    name and counted commas with a manual loop; ``str.count`` does the
    same in one C-level pass.
    """
    return text.count(',') + 1
def pre_process(data: pd.DataFrame) -> pd.DataFrame:
    """Encode a raw customs-declaration frame into numeric features.

    NOTE(review): mutates *data* in place (``drop(..., inplace=True)`` and
    column reassignment) and also returns it; callers should still use the
    return value.
    """
    # Identifier and free-text columns: no learnable signal, drop them.
    data.drop(columns=['id', 'entry_id', 'g_no', 'g_name', 'note_s', 'bill_no'
        , 'loginname', 'username', 'trade_name', 'agent_name', 'owner_name'],
        inplace=True)
    # Tried splitting g_model fully into indicator columns — no improvement.
    # (experiment kept for reference, disabled)
    # global subs
    # temp_data = pd.DataFrame(np.zeros((data.shape[0], len(subs))), columns=subs, index=range(0, data.shape[0]))
    #
    # for i in data.index:
    #     string = data.loc[i, 'g_model']
    #     if isinstance(string, str):
    #         elms = string.split('|')
    #         for elm in elms:
    #             if elm in subs:
    #                 temp_data.loc[i, elm] = 1
    # print(temp_data['杂色,粗洗'])
    # print(data['g_model'])
    # Label-encode every categorical column to integer codes.
    for field in ['hs', 'g_model', 'trade_curr', 'origin_country', 'g_unit', 'wrap_type'
        , 'dest', 'country', 'i_e_flag', 'trade_mode', 'traf_name', 'traf_mode']:
        data[field] = pd.Categorical(data[field]).codes
    # Status strings mapped to ordinals: '' -> 0, '超高' ("too high") -> 2.
    # NOTE(review): list.index returns the FIRST match, so the duplicate ''
    # at index 1 is unreachable, and any unexpected value raises ValueError
    # — presumably these columns only ever hold '' or '超高'; confirm.
    status_dict = ['', '', '超高']
    for field in ['decl_price', 'decl_total', 'g_qty', 'gross_wt', 'pack_no',
        'net_wt']:
        data[field] = data[field].apply(lambda x: status_dict.index(x))
    # Company ids look like 'prefix_123': keep only the numeric suffix.
    for field in ['trade_co_id', 'agent_co_id', 'owner_co_id']:
        data[field] = [int(x.split('_')[-1]) for x in data[field]]
    data['container_nums'] = data['container_nums'].apply(container_count)
    # Missing dates (NaN) become the sentinel value 2.
    data['i_e_date'] = data['i_e_date'].apply(lambda x: 2 if np.isnan(x) else x)
    # data = pd.concat([data, temp_data], axis=1)
    return data
def plot_cost_matrix(cost_matrix, title: str, **kwargs):
    """Render *cost_matrix* as a seaborn heatmap titled *title*.

    Any extra keyword arguments are forwarded verbatim to ``sns.heatmap``
    (e.g. ``annot``, ``cmap``, ``vmax``).
    """
    axes = sns.heatmap(data=cost_matrix, **kwargs)
    axes.set_title(title)
    axes.set_xlabel("Ground Truth")
    axes.set_ylabel("Predicted Label")
def imbens_classify(X_train, y_train, X_valid, y_valid, X_test):
    """Train a cost-sensitive AsymBoost ensemble and predict on X_test.

    Fits an imbalanced-ensemble ``AsymBoostClassifier`` with decision-tree
    base learners on the training data, reports metrics on the train and
    validation splits, shows diagnostic plots, and returns the predicted
    labels for *X_test*.

    The hyper-parameters used below (max_depth=34, n_estimators=50,
    cost matrix index 1) come from a grid search over tree depth, cost
    matrix, and ensemble size that was previously inlined here as
    commented-out code; the search has been removed for readability.
    """
    # Candidate misclassification-cost matrices (rows = predicted label,
    # per plot_cost_matrix's axis labels). Index 1 is used by the final
    # model; index 0 is only visualized below.
    my_matrixs = [
        [
            [1, 20],
            [0.051, 1]
        ],
        [
            [2, 20],
            [1, 2]
        ],
        [
            [2, 10],
            [5, 1]
        ],
        [
            [2, 100],
            [0.5, 2]
        ],
        [
            [1, 0.05],
            [20, 1]
        ],
    ]
    plot_cost_matrix(my_matrixs[0], "test", annot=True, cmap='YlOrRd', vmax=6)
    plt.show()

    estimator = DecisionTreeClassifier(max_depth=34)
    clf = imbens.ensemble.AsymBoostClassifier(
        random_state=42,
        base_estimator=estimator,
        n_estimators=50
    ).fit(X_train, y_train,
          eval_datasets={
              'valid': (X_valid, y_valid),  # add validation data
          },
          eval_metrics={
              # BUG FIX: this entry was labeled 'recall' but computed
              # f1_score; use recall_score so the reported metric matches
              # its name (the visualizer below already does this).
              'recall': (sklearn.metrics.recall_score, {}),
              'f1_macro': (sklearn.metrics.f1_score, {'average': 'macro'}),
          },
          cost_matrix=my_matrixs[1],
          train_verbose={
              'granularity': 1,
          }
          )

    evaluate_predict(y_train, clf.predict(X_train), "train")
    evaluate_predict(y_valid, clf.predict(X_valid), "valid")

    # Plot per-iteration performance curves for the fitted ensemble.
    fitted_ensembles = {'test': clf}
    visualizer = imbens.visualizer.ImbalancedEnsembleVisualizer(
        eval_datasets={
            'training': (X_train, y_train),
            'validation': (X_valid, y_valid),
        },
        eval_metrics={
            'f1_micro': (sklearn.metrics.f1_score, {'average': 'micro'}),
            'f1_macro': (sklearn.metrics.f1_score, {'average': 'macro'}),
            'recall': (sklearn.metrics.recall_score, {'average': 'binary'})
        },
    )
    visualizer.fit(fitted_ensembles)
    fig, axes = visualizer.performance_lineplot()
    plt.show()

    return clf.predict(X_test)
def evaluate_predict(y, y_pred, title=""):
    """Show a confusion-matrix heatmap and print classification metrics.

    Plots the confusion matrix for (*y*, *y_pred*), then prints micro and
    macro F1 plus binary recall, precision and F1 (positive class
    ``passed == 1``), each section prefixed with *title*.
    """
    confusion = sklearn.metrics.confusion_matrix(y, y_pred)
    sns.heatmap(confusion, annot=True, cmap='Purples', fmt='g')
    plt.ylabel(u'True Value')
    plt.xlabel(u'Predict Value')
    plt.title(title)
    plt.show()

    print(title + "-----------------------------------------------------------")
    # Compute each score, then print them all in a fixed order.
    metric_rows = [
        ('f1_micro', sklearn.metrics.f1_score(y, y_pred, average='micro')),
        ('f1_macro', sklearn.metrics.f1_score(y, y_pred, average='macro')),
        ('recall(passed=1)',
         sklearn.metrics.recall_score(y, y_pred, average='binary')),
        ('precision(passed=1)',
         sklearn.metrics.precision_score(y, y_pred, average='binary')),
        ('f1(passed=1)', sklearn.metrics.f1_score(y, y_pred, average='binary')),
    ]
    for label, score in metric_rows:
        print('{}: {:.3f}'.format(label, score))
def main(argv):
    """Load train/test CSVs, preprocess, plot feature correlations, train
    the cost-sensitive ensemble, and write test predictions to ./result.txt.

    *argv* is accepted for a conventional CLI signature but is unused.
    """
    data = pd.read_csv('./train.csv', encoding='gbk')
    X_test = pd.read_csv('./test.csv', encoding='gbk')
    print(data.shape)
    # Disabled: building the g_model sub-token vocabulary for the
    # one-hot experiment in pre_process (no improvement observed).
    # for string in data['g_model']:
    #     if isinstance(string, str):
    #         elementes = string.split('|')
    #         for element in elementes:
    #             if element not in subs:
    #                 subs.append(element)
    data = pre_process(data)
    X_test = pre_process(X_test)

    # Stratify on the target so train/valid keep the class-imbalance ratio.
    train, valid = train_test_split(data, test_size=0.2
        , stratify=data[['passed']], random_state=0)
    X_train = train.drop(columns=['passed'])
    y_train = train['passed']
    X_valid = valid.drop(columns=['passed'])
    y_valid = valid['passed']
    print(type(X_train))

    # plt.rcParams['figure.dpi'] = 300
    # Spearman rank-correlation matrix between every pair of features,
    # drawn as a lower-triangle heatmap.
    colnm = X_train.columns.tolist()
    mcorr = X_train[colnm].corr(method="spearman")
    # BUG FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin bool is the documented replacement (same behavior).
    mask = np.zeros_like(mcorr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True  # hide the redundant upper triangle
    cmap = sns.diverging_palette(220, 10, as_cmap=True)  # diverging colormap
    g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True,
                    fmt='0.2f', xticklabels=False, yticklabels=False)
    plt.show()

    # NOTE(review): each split is scaled independently (scale() fits on the
    # data it is given), so valid/test are standardized with their own
    # statistics — a scaler fit on train only would avoid leakage. Left
    # unchanged to preserve existing behavior.
    X_train = preprocessing.scale(X_train)
    X_valid = preprocessing.scale(X_valid)
    X_test = preprocessing.scale(X_test)

    y_test_pred = imbens_classify(X_train, y_train, X_valid, y_valid, X_test)
    pd.DataFrame(y_test_pred).to_csv('./result.txt', index=False, header=False)


if __name__ == '__main__':
    main(sys.argv)