You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
54 lines
1.6 KiB
54 lines
1.6 KiB
5 months ago
|
import pandas as pd
|
||
|
from sklearn.linear_model import LinearRegression
|
||
|
from sklearn.impute import SimpleImputer
|
||
|
|
||
|
# Load the dataset.
file_path = 'row_data.csv'
data = pd.read_csv(file_path)

# Convert the date column to Unix epoch seconds (float) so that it can be used
# as a numeric regression feature.
# Subtracting the epoch and dividing by a one-second Timedelta, rather than
# casting to int64 and dividing by 10 ** 9, does not depend on the datetime
# column having nanosecond resolution, and unparsable/missing dates (NaT)
# become NaN instead of a huge bogus integer.
parsed_dates = pd.to_datetime(data['date'])
# Match the epoch's timezone to the parsed column (None for naive dates) so
# the subtraction is valid for both naive and timezone-aware input.
unix_epoch = pd.Timestamp(0, tz=parsed_dates.dt.tz)
data['date'] = (parsed_dates - unix_epoch) / pd.Timedelta(seconds=1)
|
||
|
|
||
|
|
||
|
# Regression-based imputation of a single column.
def fill_missing_values_with_regression(df: pd.DataFrame, target_column: str) -> pd.DataFrame:
    """Fill the missing values of ``target_column`` with a linear regression.

    Rows where ``target_column`` is present are used to fit a
    ``LinearRegression`` that predicts it from every other column of ``df``.
    The fitted model then predicts the rows where it is missing. Gaps in the
    feature columns are replaced with the per-column mean learned from the
    training rows.

    NOTE: every column other than ``target_column`` is passed to
    ``SimpleImputer(strategy='mean')`` unfiltered, so callers are expected to
    pass a frame whose remaining columns are numeric.

    Args:
        df: Frame to impute. It is modified in place.
        target_column: Name of the column whose missing values are filled.

    Returns:
        The same ``df`` object, with ``target_column`` filled.

    Raises:
        ValueError: If ``target_column`` has no observed value, so there is
            nothing to fit the regression on.
    """
    # Compute the mask once; it selects both the rows to predict and, negated,
    # the rows to train on.
    missing_mask = df[target_column].isna()

    # Nothing to impute: return early instead of handing the imputer and the
    # model an empty prediction set.
    if not missing_mask.any():
        return df

    # Nothing to learn from: fail with a clear message rather than an opaque
    # error from fitting on zero rows.
    if missing_mask.all():
        raise ValueError(
            f"Cannot impute column {target_column!r}: it has no observed values "
            "to train the regression on."
        )

    # Split into features and target for the observed (train) rows and the
    # missing (test) rows.
    features = df.drop(columns=[target_column])
    X_train = features[~missing_mask]
    y_train = df.loc[~missing_mask, target_column]
    X_test = features[missing_mask]

    # Mean-impute gaps in the other features; the means come from the
    # training rows only and are reused for the rows being predicted.
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # Fit the regression and predict the missing target values.
    model = LinearRegression()
    model.fit(X_train, y_train)
    predicted_values = model.predict(X_test)

    # Write the predictions back into the original frame (in place).
    df.loc[missing_mask, target_column] = predicted_values

    return df
|
||
|
|
||
|
|
||
|
# Only numeric columns are candidates for regression-based imputation.
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns

for column in numeric_columns:
    # Skip columns that have no gaps.
    if not data[column].isna().any():
        continue
    data = fill_missing_values_with_regression(data, column)

# Turn the epoch-seconds date column back into datetimes.
data['date'] = pd.to_datetime(data['date'], unit='s')

# Show the first few rows of the imputed dataset.
print("填补后的数据:")
print(data.head())

# Save the imputed dataset.
data.to_csv('filled_row_data.csv', index=False)
|