You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Influenza_fund_linkage_system/row_data_interpolation.py

54 lines
1.6 KiB

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
# Load the dataset.
file_path = 'row_data.csv'
data = pd.read_csv(file_path)

# Convert the date column to float seconds since the Unix epoch so it can be
# used as a numeric regression feature.
# The elapsed time is computed by subtracting the epoch and dividing by one
# second rather than casting to int64: an integer cast yields counts in the
# column's own resolution (not necessarily nanoseconds) and has no way to
# represent a missing date, whereas this form is resolution-independent and
# turns NaT into NaN.
parsed_dates = pd.to_datetime(data['date'])
# Build the epoch with the same timezone as the parsed column (None for naive
# timestamps) so the subtraction is valid in both cases.
unix_epoch = pd.Timestamp(0, unit='s', tz=parsed_dates.dt.tz)
data['date'] = (parsed_dates - unix_epoch) / pd.Timedelta(seconds=1)
# Regression-based imputation of a single column.
def fill_missing_values_with_regression(df, target_column):
    """Fill missing entries of ``target_column`` with linear-regression predictions.

    Rows where the target is present are used to train a ``LinearRegression``
    model on the other numeric/boolean columns; the model then predicts the
    target for the rows where it is missing. Gaps in the feature columns are
    replaced by the per-column mean of the training rows before fitting and
    predicting.

    Args:
        df: DataFrame to fill. It is modified in place.
        target_column: Name of the column whose missing values are replaced.

    Returns:
        The same ``df`` object, with the missing target entries filled. If the
        target has no missing entries, ``df`` is returned untouched.

    Raises:
        ValueError: If the target column has no observed values to train on,
            or if there are no numeric/boolean feature columns.
    """
    missing_mask = df[target_column].isna()

    # Nothing to fill: return early instead of handing scikit-learn an empty
    # prediction matrix, which it rejects.
    if not missing_mask.any():
        return df

    # No observed targets means there is nothing to fit the model on.
    if missing_mask.all():
        raise ValueError(
            f"Column {target_column!r} has no observed values to train on."
        )

    # Mean imputation and linear regression only work on numeric data, so
    # restrict the features to numeric/boolean columns.
    features = df.drop(columns=[target_column]).select_dtypes(
        include=['number', 'bool']
    )
    if features.shape[1] == 0:
        raise ValueError(
            f"No numeric feature columns available to predict {target_column!r}."
        )

    # Split into rows with a known target (training) and rows to predict.
    observed_mask = ~missing_mask
    X_train = features[observed_mask]
    y_train = df.loc[observed_mask, target_column]
    X_test = features[missing_mask]

    # Mean-impute gaps in the features; the means are learned from the
    # training rows only and then applied to the rows being predicted.
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # Fit the regression model and predict the missing targets.
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Write the predictions back into the missing positions.
    df.loc[missing_mask, target_column] = model.predict(X_test)
    return df
# Run the regression fill only on numeric columns, skipping any column that
# has nothing missing.
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
for column in numeric_columns:
    if not data[column].isna().any():
        continue
    data = fill_missing_values_with_regression(data, column)

# Turn the epoch-seconds date column back into datetimes.
data['date'] = pd.to_datetime(data['date'], unit='s')

# Show the first few rows of the filled dataset.
print("填补后的数据:")
print(data.head())

# Save the filled dataset.
data.to_csv('filled_row_data.csv', index=False)