ADD file via upload

main
poxf2i6c4 7 months ago
parent b50aed47cb
commit e64d0e8bab

44
src

@ -0,0 +1,44 @@
import json
import pandas as pd
from datetime import timedelta,datetime
# Default locations of the raw commit dump and of the exported feature table.
INPUT_PATH = 'commits_data_large.json'
OUTPUT_PATH = 'Dataset.csv'

# Column layout of the exported per-author feature table.
FEATURE_COLUMNS = ['author_email', 'commit_count', 'avg_interval', 'active_days']

_SECONDS_PER_DAY = 86400


def _prepare_commits(commits):
    """Return a cleaned per-commit DataFrame with parsed dates and time features.

    Rows containing any missing value are dropped. When rows remain,
    ``author_date`` is parsed to datetimes and the calendar columns ``year``,
    ``month``, ``day``, ``day_of_week`` and ``hour`` are added. These calendar
    columns are per-commit and are not part of the exported per-author table.
    """
    df = pd.DataFrame(commits)
    # 1. Handle missing values.
    df = df.dropna()
    if df.empty:
        return df
    # 2. Convert the date strings to datetimes.
    df['author_date'] = pd.to_datetime(df['author_date'])
    # Feature engineering: calendar features of each commit.
    df['year'] = df['author_date'].dt.year
    df['month'] = df['author_date'].dt.month
    df['day'] = df['author_date'].dt.day
    df['day_of_week'] = df['author_date'].dt.dayofweek
    df['hour'] = df['author_date'].dt.hour
    return df


def build_features(commits):
    """Build the per-author feature table from a list of commit records.

    Args:
        commits: Records (as loaded from the JSON dump) that carry at least
            the ``author_email`` and ``author_date`` fields.

    Returns:
        A DataFrame with one row per author and the columns
        ``author_email``, ``commit_count``, ``avg_interval`` and
        ``active_days``. ``avg_interval`` is the mean gap, in days, between
        an author's consecutive commits; it is NaN for authors with a single
        commit because no gap exists. An input with no usable rows yields an
        empty table with the same columns.
    """
    df = _prepare_commits(commits)
    if df.empty:
        return pd.DataFrame(columns=FEATURE_COLUMNS)

    # Commit frequency per developer.
    commit_counts = df.groupby('author_email').size().reset_index(name='commit_count')

    # Average interval between consecutive commits of the same developer.
    # The commits must be in chronological order within each author before
    # taking differences, otherwise the gaps would be meaningless.
    ordered = df.sort_values(['author_email', 'author_date'])
    gaps = ordered.groupby('author_email')['author_date'].diff()
    gap_days = gaps.dt.total_seconds() / _SECONDS_PER_DAY
    avg_interval = (
        gap_days.groupby(ordered['author_email'])
        .mean()
        .reset_index(name='avg_interval')
    )

    # Active period per developer: whole days between first and last commit.
    df['first_commit_date'] = df.groupby('author_email')['author_date'].transform('min')
    df['last_commit_date'] = df.groupby('author_email')['author_date'].transform('max')
    df['active_days'] = (df['last_commit_date'] - df['first_commit_date']).dt.days

    # Merge the per-author features into one table.
    df_features = pd.merge(commit_counts, avg_interval, on='author_email')
    df_features = pd.merge(
        df_features,
        df[['author_email', 'active_days']].drop_duplicates(),
        on='author_email',
    )
    return df_features


def main():
    """Load the commit dump, build the feature table and write it as CSV."""
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(INPUT_PATH, 'r', encoding='utf-8') as f:
        commits = json.load(f)
    df_features = build_features(commits)
    df_features.to_csv(OUTPUT_PATH, index=False)
    print("数据预处理完成。")


if __name__ == "__main__":
    main()
Loading…
Cancel
Save