From b50aed47cbb9f797a9c383c8185a902a430a47f6 Mon Sep 17 00:00:00 2001
From: poxf2i6c4 <2437587625@qq.com>
Date: Mon, 20 Jan 2025 19:25:58 +0800
Subject: [PATCH] ADD file via upload

---
 main.py | 174 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 174 insertions(+)
 create mode 100644 main.py

diff --git a/main.py b/main.py
new file mode 100644
index 0000000..c0335db
--- /dev/null
+++ b/main.py
@@ -0,0 +1,174 @@
+import argparse
+import json
+import os
+from datetime import timedelta, datetime
+import pandas as pd
+from prophet import Prophet
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVC
+from sklearn.neural_network import MLPClassifier
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+from sklearn.naive_bayes import GaussianNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
+import joblib
+from src.data_preprocessing import load_data, preprocess_data, split_data
+from src.time_series_prediction import extract_features, label_churners
+
+# Data preprocessing
+# Load the commit records from a JSON file
+with open('commits_data_large.json', 'r') as f:
+    commits = json.load(f)
+
+# Convert the records into a DataFrame
+df = pd.DataFrame(commits)
+
+# 1. Handle missing values
+df = df.dropna()
+
+# 2. Convert the author date to datetime
+df['author_date'] = pd.to_datetime(df['author_date'])
+
+
+# Time series forecasting
+def time_series_predict(time_series_data):
+    # Prophet expects a DataFrame with 'ds' (date) and 'y' (value) columns
+    time_series = Prophet(seasonality_mode='multiplicative').fit(time_series_data)
+    # make_future_dataframe requires an explicit horizon; 30 days is an arbitrary default here
+    future = time_series.make_future_dataframe(periods=30)
+    future_time_series_data = time_series.predict(future)
+    return future_time_series_data
+
+def time_series_prediction(time_series_data):
+    return time_series_data
+
+
+# Feature engineering
+
+# 1. Extract time-related features
+df['year'] = df['author_date'].dt.year
+df['month'] = df['author_date'].dt.month
+df['day'] = df['author_date'].dt.day
+df['day_of_week'] = df['author_date'].dt.dayofweek
+df['hour'] = df['author_date'].dt.hour
+
+# 2. Compute each developer's commit count
+commit_counts = df.groupby('author_email').size().reset_index(name='commit_count')
+
+# 3. Compute each developer's active period
+df['first_commit_date'] = df.groupby('author_email')['author_date'].transform('min')
+df['last_commit_date'] = df.groupby('author_email')['author_date'].transform('max')
+df['active_days'] = (df['last_commit_date'] - df['first_commit_date']).dt.days
+
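+# Average number of days between a developer's consecutive commits (an assumed
+# definition of the `avg_interval` feature merged in step 4; developers with a
+# single commit yield NaN)
+df = df.sort_values(['author_email', 'author_date'])
+avg_interval = (df.groupby('author_email')['author_date']
+                .apply(lambda s: s.diff().dt.days.mean())
+                .reset_index(name='avg_interval'))
+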
+# 4. Merge the engineered features
+df_features = pd.merge(commit_counts, avg_interval, on='author_email')
+df_features = pd.merge(df_features, df[['author_email', 'active_days']].drop_duplicates(), on='author_email')
+
+output_path = 'Dataset.csv'
+df_features.to_csv(output_path, index=False)
+print("Data preprocessing finished.")
+
+
+# Model training
+
+# Load the data; the CSV is expected to provide per-developer features from
+# 'prs' through 'sig_cluster' plus a 'tag' label column
+data = pd.read_csv('dataset.csv', index_col=0)
+# Drop non-numeric features and direct predecessor features
+X = data.loc[:, 'prs': 'sig_cluster'].drop(labels=['last_contribute_to_now', 'user_login_pr'], axis=1)
+print(X)
+
+X = MinMaxScaler().fit_transform(X.values)
+y = data['tag'].values
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
+
+# Initialize the results table
+results = pd.DataFrame(columns=['Acc', 'Pre', 'Rec', 'F1'],
+                       index=['LR', 'SVM', 'LDA', 'NB', 'KNN', 'NN', 'DT', 'RF', 'GBT'])
+print(results)
+
+# Logistic Regression
+clf = LogisticRegression(random_state=0).fit(X_train, y_train)
+y_pred = clf.predict(X_test)
+results.loc['LR', 'Acc'] = accuracy_score(y_test, y_pred)
+results.loc['LR', 'Pre'] = precision_score(y_test, y_pred)
+results.loc['LR', 'Rec'] = recall_score(y_test, y_pred)
+results.loc['LR', 'F1'] = f1_score(y_test, y_pred)
+print(results.loc['LR'])
+
+# SVM
+clf = SVC(random_state=0).fit(X_train, y_train)
+y_pred = clf.predict(X_test)
+results.loc['SVM', 'Acc'] = accuracy_score(y_test, y_pred)
+results.loc['SVM', 'Pre'] = precision_score(y_test, y_pred)
+results.loc['SVM', 'Rec'] = recall_score(y_test, y_pred)
+results.loc['SVM', 'F1'] = f1_score(y_test, y_pred)
+print(results.loc['SVM'])
+
+# NN (multi-layer perceptron)
+clf = MLPClassifier(random_state=0, max_iter=10000).fit(X_train, y_train)
+y_pred = clf.predict(X_test)
+results.loc['NN', 'Acc'] = accuracy_score(y_test, y_pred)
+results.loc['NN', 'Pre'] = precision_score(y_test, y_pred)
+results.loc['NN', 'Rec'] = recall_score(y_test, y_pred)
+results.loc['NN', 'F1'] = f1_score(y_test, y_pred)
+print(results.loc['NN'])
+
+# LDA
+clf = LinearDiscriminantAnalysis().fit(X_train, y_train)
+y_pred = clf.predict(X_test)
+results.loc['LDA', 'Acc'] = accuracy_score(y_test, y_pred)
+results.loc['LDA', 'Pre'] = precision_score(y_test, y_pred)
+results.loc['LDA', 'Rec'] = recall_score(y_test, y_pred)
+results.loc['LDA', 'F1'] = f1_score(y_test, y_pred)
+print(results.loc['LDA'])
+
+# NB
+clf = GaussianNB().fit(X_train, y_train)
+y_pred = clf.predict(X_test)
+results.loc['NB', 'Acc'] = accuracy_score(y_test, y_pred)
+results.loc['NB', 'Pre'] = precision_score(y_test, y_pred)
+results.loc['NB', 'Rec'] = recall_score(y_test, y_pred)
+results.loc['NB', 'F1'] = f1_score(y_test, y_pred)
+print(results.loc['NB'])
+
+# KNN
+clf = KNeighborsClassifier().fit(X_train, y_train)
+y_pred = clf.predict(X_test)
+results.loc['KNN', 'Acc'] = accuracy_score(y_test, y_pred)
+results.loc['KNN', 'Pre'] = precision_score(y_test, y_pred)
+results.loc['KNN', 'Rec'] = recall_score(y_test, y_pred)
+results.loc['KNN', 'F1'] = f1_score(y_test, y_pred)
+print(results.loc['KNN'])
+
+# DT
+clf = DecisionTreeClassifier().fit(X_train, y_train)
+y_pred = clf.predict(X_test)
+results.loc['DT', 'Acc'] = accuracy_score(y_test, y_pred)
+results.loc['DT', 'Pre'] = precision_score(y_test, y_pred)
+results.loc['DT', 'Rec'] = recall_score(y_test, y_pred)
+results.loc['DT', 'F1'] = f1_score(y_test, y_pred)
+print(results.loc['DT'])
+
+# RF
+clf = RandomForestClassifier().fit(X_train, y_train)
+y_pred = clf.predict(X_test)
+results.loc['RF', 'Acc'] = accuracy_score(y_test, y_pred)
+results.loc['RF', 'Pre'] = precision_score(y_test, y_pred)
+results.loc['RF', 'Rec'] = recall_score(y_test, y_pred)
+results.loc['RF', 'F1'] = f1_score(y_test, y_pred)
+print(results.loc['RF'])
+
+# GBT
+clf = GradientBoostingClassifier().fit(X_train, y_train)
+y_pred = clf.predict(X_test)
+results.loc['GBT', 'Acc'] = accuracy_score(y_test, y_pred)
+results.loc['GBT', 'Pre'] = precision_score(y_test, y_pred)
+results.loc['GBT', 'Rec'] = recall_score(y_test, y_pred)
+results.loc['GBT', 'F1'] = f1_score(y_test, y_pred)
+print(results.loc['GBT'])
+
+print(results)
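+
+# A possible use of the joblib import: persist the evaluation table and the
+# last trained model for later reuse (both output paths are placeholders)
+results.to_csv('model_results.csv')
+joblib.dump(clf, 'gbt_model.joblib')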