You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

175 lines
6.1 KiB

import argparse
import json
import os
from datetime import timedelta,datetime
import pandas as pd
from prophet import Prophet
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib
from src.data_preprocessing import load_data, preprocess_data, split_data
from src.time_series_prediction import extract_features, label_churners
# Data preprocessing
# Load the raw commit records from the JSON file.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale (JSON interchange text is UTF-8).
with open('commits_data_large.json', 'r', encoding='utf-8') as f:
    commits = json.load(f)
# Convert the records into a DataFrame
df = pd.DataFrame(commits)
# 1. Handle missing values: drop every row that contains a missing value
df = df.dropna()
# 2. Convert the author timestamp column to pandas datetimes
df['author_date'] = pd.to_datetime(df['author_date'])
# Time-series forecasting
def time_series_predict(time_series_data, periods=30, freq='D'):
    """Fit a multiplicative-seasonality Prophet model and forecast ahead.

    Args:
        time_series_data: Training frame passed straight to ``Prophet.fit``
            (per the Prophet documentation this is a DataFrame with ``ds``
            and ``y`` columns).
        periods: Number of future steps to append to the history before
            predicting. ``make_future_dataframe`` requires this argument;
            the previous call omitted it and raised ``TypeError``. The
            default of 30 is a reviewer-chosen horizon -- adjust as needed.
        freq: Step size of the future dates, forwarded to
            ``make_future_dataframe`` (``'D'`` is Prophet's own default).

    Returns:
        The DataFrame produced by ``Prophet.predict`` for the frame built by
        ``make_future_dataframe`` (history plus the requested future steps).
    """
    model = Prophet(seasonality_mode='multiplicative').fit(time_series_data)
    future = model.make_future_dataframe(periods=periods, freq=freq)
    return model.predict(future)
def time_series_prediction(time_series_data):
    """Return ``time_series_data`` unchanged.

    This is currently a pass-through: the argument is handed back as-is,
    with no copy and no forecasting applied.
    """
    return time_series_data
# Feature engineering
# 1. Extract calendar features from the commit timestamp
df['year'] = df['author_date'].dt.year
df['month'] = df['author_date'].dt.month
df['day'] = df['author_date'].dt.day
df['day_of_week'] = df['author_date'].dt.dayofweek
df['hour'] = df['author_date'].dt.hour
# 2. Commit frequency: number of commits per developer
commit_counts = df.groupby('author_email').size().reset_index(name='commit_count')
# 2b. Average gap between consecutive commits per developer, in days.
# The merge in step 4 used `avg_interval` without it ever being defined
# (NameError). NOTE(review): this definition -- mean of consecutive commit
# gaps, expressed in days -- is reconstructed from the variable name; confirm
# it matches the intended feature. Developers with a single commit have no
# gap and therefore get NaN.
_ordered = df.sort_values(['author_email', 'author_date'])
_gap_days = (
    _ordered.groupby('author_email')['author_date'].diff().dt.total_seconds() / 86400
)
avg_interval = (
    _gap_days.groupby(_ordered['author_email'])
    .mean()
    .reset_index(name='avg_interval')
)
# 3. Active period per developer: days between first and last commit
df['first_commit_date'] = df.groupby('author_email')['author_date'].transform('min')
df['last_commit_date'] = df.groupby('author_email')['author_date'].transform('max')
df['active_days'] = (df['last_commit_date'] - df['first_commit_date']).dt.days
# 4. Merge the per-developer features into one table keyed by author_email
df_features = pd.merge(commit_counts, avg_interval, on='author_email')
df_features = pd.merge(
    df_features,
    # active_days is constant per developer, so de-duplicating leaves one row each
    df[['author_email', 'active_days']].drop_duplicates(),
    on='author_email',
)
output_path = 'Dataset.csv'
df_features.to_csv(output_path, index=False)
print("数据预处理完成。")
# Model training
# Load data
# NOTE(review): the preprocessing step earlier in this file writes
# 'Dataset.csv' (capital D) with different columns than the ones selected
# below; this read presumably targets a separately prepared file -- confirm.
data = pd.read_csv('dataset.csv', index_col=0)
# Drop non-numeric features and direct-predecessor features
X = data.loc[:, 'prs': 'sig_cluster'].drop(labels=['last_contribute_to_now', 'user_login_pr'], axis=1)
print(X)
X = X.values
y = data['tag'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
# Fit the scaler on the training split only and reuse it for the test split.
# Scaling the full matrix before splitting lets test-set min/max leak into
# training. The row partition is unchanged (same sample count and seed).
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Results table: one row per classifier, one column per metric
results = pd.DataFrame(
    columns=['Acc', 'Pre', 'Rec', 'F1'],
    index=['LR', 'SVM', 'LDA', 'NB', 'KNN', 'NN', 'DT', 'RF', 'GBT'],
)
print(results)
# Train and score every classifier on the same train/test split.
# The stanzas were nine copies of the same fit/predict/score sequence; they
# are driven from one table here, in the original execution order.
# DT, RF and GBT now receive random_state=0 like the other seeded models so
# the results table is reproducible from run to run.
_CLASSIFIERS = [
    ('LR', LogisticRegression(random_state=0)),
    ('SVM', SVC(random_state=0)),
    ('NN', MLPClassifier(random_state=0, max_iter=10000)),
    ('LDA', LinearDiscriminantAnalysis()),
    ('NB', GaussianNB()),
    ('KNN', KNeighborsClassifier()),
    ('DT', DecisionTreeClassifier(random_state=0)),
    ('RF', RandomForestClassifier(random_state=0)),
    ('GBT', GradientBoostingClassifier(random_state=0)),
]
# Column name in `results` -> scoring function taking (y_true, y_pred)
_METRICS = [
    ('Acc', accuracy_score),
    ('Pre', precision_score),
    ('Rec', recall_score),
    ('F1', f1_score),
]
for _model_name, clf in _CLASSIFIERS:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    for _metric_name, _metric in _METRICS:
        results.loc[_model_name, _metric_name] = _metric(y_test, y_pred)
    # Per-model progress output, same as before
    print(results.loc[_model_name])
print(results)