parent
b50aed47cb
commit
e64d0e8bab
@ -0,0 +1,44 @@
|
||||
import json
from datetime import timedelta, datetime

import pandas as pd

# Default locations of the raw commit dump and of the generated feature table.
INPUT_PATH = 'commits_data_large.json'
OUTPUT_PATH = 'Dataset.csv'

# Columns the feature computation cannot work without.
REQUIRED_COLUMNS = ('author_email', 'author_date')


def preprocess_commits(commits):
    """Clean raw commit records and attach calendar features.

    Args:
        commits: Data accepted by ``pd.DataFrame`` (as loaded from the JSON
            file) that yields at least the ``author_email`` and
            ``author_date`` columns.

    Returns:
        A new DataFrame without rows containing missing values, with
        ``author_date`` parsed to datetimes and the extra columns ``year``,
        ``month``, ``day``, ``day_of_week`` and ``hour``.

    Raises:
        KeyError: If a required column is absent from the input.
    """
    df = pd.DataFrame(commits)

    missing = [col for col in REQUIRED_COLUMNS if col not in df.columns]
    if missing:
        raise KeyError(f"commit data is missing required column(s): {missing}")

    # 1. Handle missing values: drop every row that has any missing field.
    #    The explicit copy gives an independent frame to add columns to.
    df = df.dropna().copy()

    # 2. Convert the date strings to datetimes.
    df['author_date'] = pd.to_datetime(df['author_date'])

    # Feature engineering, part 1: time-related features of each commit.
    when = df['author_date'].dt
    df['year'] = when.year
    df['month'] = when.month
    df['day'] = when.day
    df['day_of_week'] = when.dayofweek
    df['hour'] = when.hour
    return df


def build_developer_features(df):
    """Aggregate cleaned commits into one feature row per developer.

    Args:
        df: DataFrame with an ``author_email`` column and a datetime
            ``author_date`` column (see ``preprocess_commits``).

    Returns:
        DataFrame with the columns ``author_email``, ``commit_count``,
        ``avg_interval`` and ``active_days``, sorted by ``author_email``.
    """
    # Commit frequency plus first/last commit per developer in a single pass.
    stats = df.groupby('author_email')['author_date'].agg(
        commit_count='size',
        first_commit_date='min',
        last_commit_date='max',
    )

    # Active period: time between a developer's first and last commit.
    span = stats['last_commit_date'] - stats['first_commit_date']

    # Average interval between consecutive commits, in fractional days.
    # The consecutive gaps sum to (last - first) and there are
    # (commit_count - 1) of them; a developer with a single commit has no
    # gap, so the value is NaN.
    # NOTE(review): `avg_interval` was merged but never defined in the
    # original script (NameError); this definition and its unit are inferred
    # from the name - confirm against what consumers of the CSV expect.
    gap_count = stats['commit_count'] - 1
    stats['avg_interval'] = (
        (span / pd.Timedelta(days=1)) / gap_count.where(gap_count > 0)
    )

    # Whole days between first and last commit.
    stats['active_days'] = span.dt.days

    columns = ['author_email', 'commit_count', 'avg_interval', 'active_days']
    return stats.reset_index()[columns]


def main(input_path=INPUT_PATH, output_path=OUTPUT_PATH):
    """Load commits from JSON, build developer features, write them as CSV.

    Args:
        input_path: Path of the JSON file holding the commit records.
        output_path: Path of the CSV file to write.
    """
    # Load the data from the JSON file. The encoding is explicit so the
    # result does not depend on the platform's locale default.
    with open(input_path, 'r', encoding='utf-8') as f:
        commits = json.load(f)

    df = preprocess_commits(commits)
    df_features = build_developer_features(df)

    df_features.to_csv(output_path, index=False)
    print("数据预处理完成。")


if __name__ == "__main__":
    main()
|
Loading…
Reference in new issue