# Recovered from git patch e64d0e8 ("ADD file via upload"), which added this
# script as the new file `src`.
"""Pre-process a JSON dump of commits into per-developer activity features.

Run as a script, this reads ``commits_data_large.json``, derives one row per
``author_email`` (commit count, average interval between commits, active
days) and writes the result to ``Dataset.csv``.
"""

import json
from datetime import timedelta, datetime

import pandas as pd

INPUT_PATH = 'commits_data_large.json'
OUTPUT_PATH = 'Dataset.csv'

# Calendar features derived from each commit's author date.
TIME_FEATURE_COLUMNS = ['year', 'month', 'day', 'day_of_week', 'hour']
# Column order of the exported per-developer table.
FEATURE_COLUMNS = ['author_email', 'commit_count', 'avg_interval', 'active_days']

_SECONDS_PER_DAY = 86400


def load_commits(path=INPUT_PATH):
    """Return the parsed JSON content of *path*.

    The file is opened explicitly as UTF-8 so decoding does not depend on
    the platform's default codec.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def prepare_commits(commits):
    """Clean raw commit records and add calendar features.

    Args:
        commits: Records accepted by ``pd.DataFrame``; each record is
            expected to carry ``author_email`` and ``author_date``.

    Returns:
        A DataFrame without rows that had any missing value, with
        ``author_date`` parsed to UTC timestamps and the columns listed in
        ``TIME_FEATURE_COLUMNS`` added.

    Raises:
        KeyError: If non-empty input lacks an ``author_date`` field.
    """
    # 1. Handle missing values: drop every row that has any missing field.
    df = pd.DataFrame(commits).dropna().copy()
    if df.empty:
        # No usable rows: return an empty frame with the expected columns
        # instead of failing on the missing 'author_date' column.
        return pd.DataFrame(
            columns=['author_email', 'author_date', *TIME_FEATURE_COLUMNS]
        )

    # 2. Convert to datetime. utc=True normalises values that carry
    #    different UTC offsets into one tz-aware dtype, so the `.dt`
    #    accessor and the date arithmetic below work on every row.
    df['author_date'] = pd.to_datetime(df['author_date'], utc=True)

    # Feature engineering, step 1: time-related features.
    stamps = df['author_date'].dt
    df['year'] = stamps.year
    df['month'] = stamps.month
    df['day'] = stamps.day
    df['day_of_week'] = stamps.dayofweek
    df['hour'] = stamps.hour
    return df


def build_developer_features(df):
    """Aggregate prepared commits into one feature row per developer.

    Args:
        df: Output of ``prepare_commits`` (needs ``author_email`` and a
            datetime ``author_date`` column).

    Returns:
        A DataFrame with the columns in ``FEATURE_COLUMNS``:

        * ``commit_count`` - number of commits by the author.
        * ``avg_interval`` - mean gap in days between the author's
          consecutive commits; NaN when the author has a single commit.
        * ``active_days`` - whole days between first and last commit.
    """
    if df.empty:
        return pd.DataFrame(columns=FEATURE_COLUMNS)

    dates_by_author = df.groupby('author_email')['author_date']

    # 2. Commit frequency per developer.
    commit_count = dates_by_author.size()

    # 3. Active period per developer: first commit to last commit.
    active_span = dates_by_author.max() - dates_by_author.min()

    # NOTE(review): the original script merged in `avg_interval` without ever
    # defining it (NameError at runtime). It is reconstructed here as the
    # mean gap between consecutive commits, in days - confirm this is the
    # intended definition. The gaps between sorted commits sum to the active
    # span, so their mean is simply span / (commits - 1).
    gap_count = commit_count - 1
    avg_interval = (
        active_span.dt.total_seconds()
        / _SECONDS_PER_DAY
        / gap_count.where(gap_count > 0)  # no gaps -> NaN, not a division by 0
    )

    # 4. Combine the features; all series share the author_email index.
    features = pd.DataFrame({
        'commit_count': commit_count,
        'avg_interval': avg_interval,
        'active_days': active_span.dt.days,
    })
    return features.reset_index()[FEATURE_COLUMNS]


def main():
    """Load the commit dump, build the features and write them as CSV."""
    # Load data from the JSON file and convert it to a DataFrame.
    df = prepare_commits(load_commits(INPUT_PATH))
    df_features = build_developer_features(df)

    output_path = OUTPUT_PATH
    df_features.to_csv(output_path, index=False)
    print("数据预处理完成。")


if __name__ == "__main__":
    main()