parent
9cfd3b4415
commit
b50aed47cb
@ -0,0 +1,174 @@
|
|||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from datetime import timedelta,datetime
|
||||||
|
import pandas as pd
|
||||||
|
from prophet import Prophet
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from sklearn.svm import SVC
|
||||||
|
from sklearn.neural_network import MLPClassifier
|
||||||
|
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
||||||
|
from sklearn.naive_bayes import GaussianNB
|
||||||
|
from sklearn.neighbors import KNeighborsClassifier
|
||||||
|
from sklearn.tree import DecisionTreeClassifier
|
||||||
|
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.preprocessing import MinMaxScaler
|
||||||
|
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
|
||||||
|
import joblib
|
||||||
|
from src.data_preprocessing import load_data, preprocess_data, split_data
|
||||||
|
from src.time_series_prediction import extract_features, label_churners
|
||||||
|
|
||||||
|
# --- Data preprocessing ---

# Load the raw commit records exported to JSON.
with open('commits_data_large.json') as f:
    commits = json.load(f)

# Build a DataFrame from the records and drop rows with missing values
# in a single step (step 1: missing-value handling).
df = pd.DataFrame(commits).dropna()

# Step 2: parse the author timestamp into a proper datetime column so the
# .dt accessor can be used downstream.
df['author_date'] = pd.to_datetime(df['author_date'])
||||||
|
|
||||||
|
# Time-series forecasting
def time_series_predict(time_series_data, periods=30, freq='D'):
    """Fit a Prophet model on *time_series_data* and return a forecast.

    Args:
        time_series_data: DataFrame in Prophet's expected format
            (columns ``ds`` and ``y`` — assumed from Prophet's API; confirm
            against the caller).
        periods: number of future periods to forecast (default 30).
        freq: pandas frequency string for the future periods (default daily).

    Returns:
        Prophet forecast DataFrame covering history plus the future periods.
    """
    model = Prophet(seasonality_mode='multiplicative').fit(time_series_data)
    # BUG FIX: Prophet.make_future_dataframe() has no default for `periods`;
    # the original call passed no arguments and raised TypeError. The new
    # parameters default sensibly and keep existing call sites working.
    future = model.make_future_dataframe(periods=periods, freq=freq)
    return model.predict(future)
|
||||||
|
|
||||||
|
def time_series_prediction(time_series_data):
    """Identity pass-through: return *time_series_data* unchanged.

    Placeholder kept for API compatibility; the actual forecasting is done
    by ``time_series_predict``.
    """
    return time_series_data
|
||||||
|
|
||||||
|
|
||||||
|
# --- Feature engineering ---

# 1. Extract calendar features from the commit timestamp.
df['year'] = df['author_date'].dt.year
df['month'] = df['author_date'].dt.month
df['day'] = df['author_date'].dt.day
df['day_of_week'] = df['author_date'].dt.dayofweek
df['hour'] = df['author_date'].dt.hour

# 2. Commit frequency per developer.
commit_counts = df.groupby('author_email').size().reset_index(name='commit_count')

# BUG FIX: `avg_interval` was merged below but never defined, so this script
# crashed with NameError. Define it as each developer's mean gap, in days,
# between consecutive commits (0 for developers with a single commit).
avg_interval = (
    df.sort_values('author_date')
      .groupby('author_email')['author_date']
      .apply(lambda dates: dates.diff().dt.days.mean())
      .fillna(0)
      .reset_index(name='avg_interval')
)

# 3. Active period per developer: days between first and last commit.
df['first_commit_date'] = df.groupby('author_email')['author_date'].transform('min')
df['last_commit_date'] = df.groupby('author_email')['author_date'].transform('max')
df['active_days'] = (df['last_commit_date'] - df['first_commit_date']).dt.days

# 4. Merge all per-developer features into a single table keyed by email.
df_features = pd.merge(commit_counts, avg_interval, on='author_email')
df_features = pd.merge(df_features, df[['author_email', 'active_days']].drop_duplicates(), on='author_email')

output_path = 'Dataset.csv'
df_features.to_csv(output_path, index=False)
print("数据预处理完成。")
|
||||||
|
# --- Model training ---

# Load the labelled dataset.
# NOTE(review): this reads 'dataset.csv' while the preprocessing step writes
# 'Dataset.csv' — different files on case-sensitive filesystems, and the
# column schema used below ('prs'..'sig_cluster', 'tag') does not match the
# features written above. Confirm which dataset is intended.
data = pd.read_csv('dataset.csv', index_col=0)

# Keep the numeric feature columns only, dropping non-numeric and
# direct-predecessor (leakage-prone) features.
X = data.loc[:, 'prs':'sig_cluster'].drop(labels=['last_contribute_to_now', 'user_login_pr'], axis=1)
print(X)

# Scale features to [0, 1], then split into train / test partitions.
X = MinMaxScaler().fit_transform(X.values)
y = data['tag'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Metrics table: one row per classifier, one column per score.
results = pd.DataFrame(
    columns=['Acc', 'Pre', 'Rec', 'F1'],
    index=['LR', 'SVM', 'LDA', 'NB', 'KNN', 'NN', 'DT', 'RF', 'GBT'],
)
print(results)
|
||||||
|
|
||||||
|
# --- Train and evaluate each classifier ---
# The original script repeated the same six-line train/score/print stanza
# nine times; it is collapsed here into one loop over a name -> estimator
# mapping. Insertion order of the dict preserves the original training and
# printing order (LR, SVM, NN, LDA, NB, KNN, DT, RF, GBT), and the metric
# assignments into `results` are identical.
classifiers = {
    'LR': LogisticRegression(random_state=0),
    'SVM': SVC(random_state=0),
    'NN': MLPClassifier(random_state=0, max_iter=10000),
    'LDA': LinearDiscriminantAnalysis(),
    'NB': GaussianNB(),
    'KNN': KNeighborsClassifier(),
    'DT': DecisionTreeClassifier(),
    'RF': RandomForestClassifier(),
    'GBT': GradientBoostingClassifier(),
}

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    results.loc[name, 'Acc'] = accuracy_score(y_test, y_pred)
    results.loc[name, 'Pre'] = precision_score(y_test, y_pred)
    results.loc[name, 'Rec'] = recall_score(y_test, y_pred)
    results.loc[name, 'F1'] = f1_score(y_test, y_pred)
    print(results.loc[name])

print(results)
|
Loading…
Reference in new issue