"""Fill missing numeric values in ``row_data.csv`` using linear regression.

For each numeric column that contains missing values, a linear model is
fitted on the rows where the column is present (using the other numeric
columns as features) and its predictions replace the missing entries. The
result is written to ``filled_row_data.csv``.
"""
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression


def fill_missing_values_with_regression(df, target_column):
    """Fill missing values of ``target_column`` with regression predictions.

    A ``LinearRegression`` model is trained on the rows where
    ``target_column`` is present and used to predict the rows where it is
    missing. Only numeric columns are used as features; gaps in those
    features are mean-imputed (means are learned from the training rows).

    Args:
        df: DataFrame to fill. It is modified in place.
        target_column: Name of the column whose missing values are filled.

    Returns:
        The same ``df`` object, with ``target_column`` filled.

    Raises:
        ValueError: If ``target_column`` has missing values but no observed
            values to train on, or if there is no numeric feature column.
    """
    # Compute the mask once so the training rows, the prediction rows and the
    # final assignment are guaranteed to refer to the same rows.
    missing_mask = df[target_column].isna()
    if not missing_mask.any():
        # Nothing to fill; predicting on zero rows would raise in scikit-learn.
        return df

    observed_mask = ~missing_mask
    if not observed_mask.any():
        raise ValueError(
            f"Column {target_column!r} has no observed values to train on."
        )

    # Mean imputation and linear regression both require numeric input, so
    # text/object columns are left out of the feature matrix.
    features = df.drop(columns=[target_column]).select_dtypes(include="number")
    if features.shape[1] == 0:
        raise ValueError(
            f"No numeric feature columns available to predict {target_column!r}."
        )

    # Fit the imputer on the training rows only, then apply the same
    # statistics to the rows being predicted.
    imputer = SimpleImputer(strategy="mean")
    X_train = imputer.fit_transform(features.loc[observed_mask])
    X_test = imputer.transform(features.loc[missing_mask])
    y_train = df.loc[observed_mask, target_column]

    model = LinearRegression()
    model.fit(X_train, y_train)

    df.loc[missing_mask, target_column] = model.predict(X_test)
    return df


# Load the dataset.
file_path = 'row_data.csv'
data = pd.read_csv(file_path)

# Convert the date column to a numeric timestamp in seconds so it can be
# used as a regression feature.
# NOTE(review): dividing by 10 ** 9 assumes the int64 view is in nanoseconds;
# confirm against the pandas version in use.
data['date'] = pd.to_datetime(data['date']).astype('int64') / 10 ** 9

# Apply regression filling to numeric columns only. Columns are processed in
# order, so a column filled earlier serves as a feature for later ones.
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
for column in numeric_columns:
    if data[column].isna().sum() > 0:
        data = fill_missing_values_with_regression(data, column)

# Restore the date column from seconds back to datetimes.
data['date'] = pd.to_datetime(data['date'], unit='s')

# Show the first few rows of the filled dataset.
print("填补后的数据:")
print(data.head())

# Save the filled dataset.
data.to_csv('filled_row_data.csv', index=False)