You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
933 lines
33 KiB
933 lines
33 KiB
7 months ago
|
import numpy as np
|
||
|
import pytest
|
||
|
|
||
|
from pandas._config import using_pyarrow_string_dtype
|
||
|
|
||
|
import pandas.util._test_decorators as td
|
||
|
|
||
|
from pandas import (
|
||
|
Categorical,
|
||
|
DataFrame,
|
||
|
DatetimeIndex,
|
||
|
NaT,
|
||
|
PeriodIndex,
|
||
|
Series,
|
||
|
TimedeltaIndex,
|
||
|
Timestamp,
|
||
|
date_range,
|
||
|
to_datetime,
|
||
|
)
|
||
|
import pandas._testing as tm
|
||
|
from pandas.tests.frame.common import _check_mixed_float
|
||
|
|
||
|
|
||
|
class TestFillNA:
    """Tests for ``DataFrame.fillna`` and the related ``ffill``/``bfill`` methods.

    Arguments such as ``datetime_frame``, ``float_string_frame``,
    ``mixed_float_frame``, ``float_frame``, ``frame_or_series``,
    ``using_copy_on_write``, ``warn_copy_on_write`` and ``using_infer_string``
    are not defined in this module; presumably they are pytest fixtures
    supplied by a conftest -- confirm against the test-suite configuration.
    """

    def test_fillna_dict_inplace_nonunique_columns(
        self, using_copy_on_write, warn_copy_on_write
    ):
        """Fill in place with a dict when all three columns share the label "A"."""
        df = DataFrame(
            {"A": [np.nan] * 3, "B": [NaT, Timestamp(1), NaT], "C": [np.nan, "foo", 2]}
        )
        df.columns = ["A", "A", "A"]
        orig = df[:]

        # TODO(CoW-warn) better warning message
        with tm.assert_cow_warning(warn_copy_on_write):
            df.fillna({"A": 2}, inplace=True)
        # The first and third columns can be set inplace, while the second cannot.

        expected = DataFrame(
            {"A": [2.0] * 3, "B": [2, Timestamp(1), 2], "C": [2, "foo", 2]}
        )
        expected.columns = ["A", "A", "A"]
        tm.assert_frame_equal(df, expected)

        # TODO: what's the expected/desired behavior with CoW?
        if not using_copy_on_write:
            assert tm.shares_memory(df.iloc[:, 0], orig.iloc[:, 0])
        assert not tm.shares_memory(df.iloc[:, 1], orig.iloc[:, 1])
        if not using_copy_on_write:
            assert tm.shares_memory(df.iloc[:, 2], orig.iloc[:, 2])

    @td.skip_array_manager_not_yet_implemented
    def test_fillna_on_column_view(self, using_copy_on_write):
        """Fill one column, in place, of a frame built with ``copy=False``."""
        # GH#46149 avoid unnecessary copies
        arr = np.full((40, 50), np.nan)
        df = DataFrame(arr, copy=False)

        if using_copy_on_write:
            with tm.raises_chained_assignment_error():
                df[0].fillna(-1, inplace=True)
            assert np.isnan(arr[:, 0]).all()
        else:
            with tm.assert_produces_warning(FutureWarning, match="inplace method"):
                df[0].fillna(-1, inplace=True)
            assert (arr[:, 0] == -1).all()

        # i.e. we didn't create a new 49-column block
        assert len(df._mgr.arrays) == 1
        assert np.shares_memory(df.values, arr)

    def test_fillna_datetime(self, datetime_frame):
        """Fill by value and by ``method="pad"``; check the argument errors."""
        tf = datetime_frame
        tf.loc[tf.index[:5], "A"] = np.nan
        tf.loc[tf.index[-5:], "A"] = np.nan

        zero_filled = datetime_frame.fillna(0)
        assert (zero_filled.loc[zero_filled.index[:5], "A"] == 0).all()

        msg = "DataFrame.fillna with 'method' is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            padded = datetime_frame.fillna(method="pad")
        # leading NaNs have nothing before them to pad from
        assert np.isnan(padded.loc[padded.index[:5], "A"]).all()
        assert (
            padded.loc[padded.index[-5:], "A"] == padded.loc[padded.index[-5], "A"]
        ).all()

        msg = "Must specify a fill 'value' or 'method'"
        with pytest.raises(ValueError, match=msg):
            datetime_frame.fillna()
        msg = "Cannot specify both 'value' and 'method'"
        with pytest.raises(ValueError, match=msg):
            datetime_frame.fillna(5, method="ffill")

    @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string")
    def test_fillna_mixed_type(self, float_string_frame):
        """Smoke-test value and ``method="pad"`` fills on ``float_string_frame``."""
        mf = float_string_frame
        mf.loc[mf.index[5:20], "foo"] = np.nan
        mf.loc[mf.index[-10:], "A"] = np.nan
        # TODO: make stronger assertion here, GH 25640
        mf.fillna(value=0)
        msg = "DataFrame.fillna with 'method' is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            mf.fillna(method="pad")

    def test_fillna_mixed_float(self, mixed_float_frame):
        """Run ``_check_mixed_float`` on the results of value and pad fills."""
        # mixed numeric (but no float16)
        mf = mixed_float_frame.reindex(columns=["A", "B", "D"])
        mf.loc[mf.index[-10:], "A"] = np.nan
        result = mf.fillna(value=0)
        _check_mixed_float(result, dtype={"C": None})

        msg = "DataFrame.fillna with 'method' is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = mf.fillna(method="pad")
        _check_mixed_float(result, dtype={"C": None})

    def test_fillna_empty(self, using_copy_on_write):
        """Method-based fills on a column of an empty frame only warn."""
        if using_copy_on_write:
            pytest.skip("condition is unnecessary complex and is deprecated anyway")
        # empty frame (GH#2778)
        df = DataFrame(columns=["x"])
        msg = "Series.fillna with 'method' is deprecated"
        for m in ["pad", "backfill"]:
            with tm.assert_produces_warning(FutureWarning, match=msg):
                df.x.fillna(method=m, inplace=True)
                df.x.fillna(method=m)

    def test_fillna_different_dtype(self, using_infer_string):
        """Fill an all-NaN column of a string frame with a string via a dict."""
        # with different dtype (GH#3386)
        df = DataFrame(
            [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]]
        )

        if using_infer_string:
            with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
                result = df.fillna({2: "foo"})
        else:
            result = df.fillna({2: "foo"})
        expected = DataFrame(
            [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]]
        )
        tm.assert_frame_equal(result, expected)

        if using_infer_string:
            with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
                return_value = df.fillna({2: "foo"}, inplace=True)
        else:
            return_value = df.fillna({2: "foo"}, inplace=True)
        tm.assert_frame_equal(df, expected)
        assert return_value is None

    def test_fillna_limit_and_value(self):
        """With ``limit=1`` only the first NaN of each column is replaced."""
        # limit and value
        df = DataFrame(np.random.default_rng(2).standard_normal((10, 3)))
        df.iloc[2:7, 0] = np.nan
        df.iloc[3:5, 2] = np.nan

        expected = df.copy()
        expected.iloc[2, 0] = 999
        expected.iloc[3, 2] = 999
        result = df.fillna(999, limit=1)
        tm.assert_frame_equal(result, expected)

    def test_fillna_datelike(self):
        """Fill a datetime column from another datetime column via a dict."""
        # with datelike
        # GH#6344
        df = DataFrame(
            {
                "Date": [NaT, Timestamp("2014-1-1")],
                "Date2": [Timestamp("2013-1-1"), NaT],
            }
        )

        expected = df.copy()
        expected["Date"] = expected["Date"].fillna(df.loc[df.index[0], "Date2"])
        result = df.fillna(value={"Date": df["Date2"]})
        tm.assert_frame_equal(result, expected)

    def test_fillna_tzaware(self):
        """Pad and backfill a column of timezone-aware timestamps."""
        # with timezone
        # GH#15855
        msg = "DataFrame.fillna with 'method' is deprecated"

        df = DataFrame({"A": [Timestamp("2012-11-11 00:00:00+01:00"), NaT]})
        exp = DataFrame(
            {
                "A": [
                    Timestamp("2012-11-11 00:00:00+01:00"),
                    Timestamp("2012-11-11 00:00:00+01:00"),
                ]
            }
        )
        with tm.assert_produces_warning(FutureWarning, match=msg):
            res = df.fillna(method="pad")
        tm.assert_frame_equal(res, exp)

        df = DataFrame({"A": [NaT, Timestamp("2012-11-11 00:00:00+01:00")]})
        exp = DataFrame(
            {
                "A": [
                    Timestamp("2012-11-11 00:00:00+01:00"),
                    Timestamp("2012-11-11 00:00:00+01:00"),
                ]
            }
        )
        with tm.assert_produces_warning(FutureWarning, match=msg):
            res = df.fillna(method="bfill")
        tm.assert_frame_equal(res, exp)

    def test_fillna_tzaware_different_column(self):
        """Pad a float column that sits next to a timezone-aware column."""
        # with timezone in another column
        # GH#15522
        df = DataFrame(
            {
                "A": date_range("20130101", periods=4, tz="US/Eastern"),
                "B": [1, 2, np.nan, np.nan],
            }
        )
        msg = "DataFrame.fillna with 'method' is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = df.fillna(method="pad")
        expected = DataFrame(
            {
                "A": date_range("20130101", periods=4, tz="US/Eastern"),
                "B": [1.0, 2.0, 2.0, 2.0],
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_na_actions_categorical(self):
        """``fillna`` and ``dropna`` on a frame holding a Categorical column."""
        cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
        vals = ["a", "b", np.nan, "d"]
        df = DataFrame({"cats": cat, "vals": vals})
        cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
        vals2 = ["a", "b", "b", "d"]
        df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
        cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
        vals3 = ["a", "b", np.nan]
        df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
        cat4 = Categorical([1, 2], categories=[1, 2, 3])
        vals4 = ["a", "b"]
        df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})

        # fillna
        res = df.fillna(value={"cats": 3, "vals": "b"})
        tm.assert_frame_equal(res, df_exp_fill)

        # 4 is not one of the declared categories
        msg = "Cannot setitem on a Categorical with a new category"
        with pytest.raises(TypeError, match=msg):
            df.fillna(value={"cats": 4, "vals": "c"})

        msg = "DataFrame.fillna with 'method' is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            res = df.fillna(method="pad")
        tm.assert_frame_equal(res, df_exp_fill)

        # dropna
        res = df.dropna(subset=["cats"])
        tm.assert_frame_equal(res, df_exp_drop_cats)

        res = df.dropna()
        tm.assert_frame_equal(res, df_exp_drop_all)

        # make sure that fillna takes missing values into account
        c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
        df = DataFrame({"cats": c, "vals": [1, 2, 3]})

        cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
        df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})

        res = df.fillna("a")
        tm.assert_frame_equal(res, df_exp)

    def test_fillna_categorical_nan(self):
        """Filling Categorical data with NaN/NaT leaves it unchanged."""
        # GH#14021
        # np.nan should always be a valid filler
        cat = Categorical([np.nan, 2, np.nan])
        val = Categorical([np.nan, np.nan, np.nan])
        df = DataFrame({"cats": cat, "vals": val})

        # GH#32950 df.median() is poorly behaved because there is no
        #  Categorical.median
        median = Series({"cats": 2.0, "vals": np.nan})

        res = df.fillna(median)
        v_exp = [np.nan, np.nan, np.nan]
        df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype="category")
        tm.assert_frame_equal(res, df_exp)

        result = df.cats.fillna(np.nan)
        tm.assert_series_equal(result, df.cats)

        result = df.vals.fillna(np.nan)
        tm.assert_series_equal(result, df.vals)

        idx = DatetimeIndex(
            ["2011-01-01 09:00", "2016-01-01 23:45", "2011-01-01 09:00", NaT, NaT]
        )
        df = DataFrame({"a": Categorical(idx)})
        tm.assert_frame_equal(df.fillna(value=NaT), df)

        idx = PeriodIndex(["2011-01", "2011-01", "2011-01", NaT, NaT], freq="M")
        df = DataFrame({"a": Categorical(idx)})
        tm.assert_frame_equal(df.fillna(value=NaT), df)

        idx = TimedeltaIndex(["1 days", "2 days", "1 days", NaT, NaT])
        df = DataFrame({"a": Categorical(idx)})
        tm.assert_frame_equal(df.fillna(value=NaT), df)

    def test_fillna_downcast(self):
        """``downcast="infer"`` warns and yields an integer result."""
        # GH#15277
        # infer int64 from float64
        df = DataFrame({"a": [1.0, np.nan]})
        msg = "The 'downcast' keyword in fillna is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = df.fillna(0, downcast="infer")
        expected = DataFrame({"a": [1, 0]})
        tm.assert_frame_equal(result, expected)

        # infer int64 from float64 when fillna value is a dict
        df = DataFrame({"a": [1.0, np.nan]})
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = df.fillna({"a": 0}, downcast="infer")
        expected = DataFrame({"a": [1, 0]})
        tm.assert_frame_equal(result, expected)

    def test_fillna_downcast_false(self, frame_or_series):
        """``downcast=False`` returns object-dtype data equal to the input."""
        # GH#45603 preserve object dtype with downcast=False
        obj = frame_or_series([1, 2, 3], dtype="object")
        msg = "The 'downcast' keyword in fillna"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = obj.fillna("", downcast=False)
        tm.assert_equal(result, obj)

    def test_fillna_downcast_noop(self, frame_or_series):
        """``downcast`` is still applied when there is nothing to fill."""
        # GH#45423
        # Two relevant paths:
        #  1) not _can_hold_na (e.g. integer)
        #  2) _can_hold_na + noop + not can_hold_element

        obj = frame_or_series([1, 2, 3], dtype=np.int64)

        msg = "The 'downcast' keyword in fillna"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            # GH#40988
            res = obj.fillna("foo", downcast=np.dtype(np.int32))
        expected = obj.astype(np.int32)
        tm.assert_equal(res, expected)

        obj2 = obj.astype(np.float64)
        with tm.assert_produces_warning(FutureWarning, match=msg):
            res2 = obj2.fillna("foo", downcast="infer")
        expected2 = obj  # get back int64
        tm.assert_equal(res2, expected2)

        with tm.assert_produces_warning(FutureWarning, match=msg):
            # GH#40988
            res3 = obj2.fillna("foo", downcast=np.dtype(np.int32))
        tm.assert_equal(res3, expected)

    @pytest.mark.parametrize("columns", [["A", "A", "B"], ["A", "A"]])
    def test_fillna_dictlike_value_duplicate_colnames(self, columns):
        """A dict fill applies to every column carrying a duplicated label."""
        # GH#43476
        df = DataFrame(np.nan, index=[0, 1], columns=columns)
        with tm.assert_produces_warning(None):
            result = df.fillna({"A": 0})

        expected = df.copy()
        expected["A"] = 0.0
        tm.assert_frame_equal(result, expected)

    def test_fillna_dtype_conversion(self, using_infer_string):
        """Fill frames that contain only missing values."""
        # make sure that fillna on an empty frame works
        df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
        result = df.dtypes
        expected = Series([np.dtype("object")] * 5, index=[1, 2, 3, 4, 5])
        tm.assert_series_equal(result, expected)

        msg = "Downcasting object dtype arrays"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = df.fillna(1)
        expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
        tm.assert_frame_equal(result, expected)

        # empty block
        df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64")
        if using_infer_string:
            with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
                result = df.fillna("nan")
        else:
            result = df.fillna("nan")
        expected = DataFrame("nan", index=range(3), columns=["A", "B"])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("val", ["", 1, np.nan, 1.0])
    def test_fillna_dtype_conversion_equiv_replace(self, val):
        """``fillna(val)`` gives the same frame as ``replace(np.nan, val)``."""
        df = DataFrame({"A": [1, np.nan], "B": [1.0, 2.0]})
        expected = df.replace(np.nan, val)
        result = df.fillna(val)
        tm.assert_frame_equal(result, expected)

    def test_fillna_datetime_columns(self):
        """Fill a frame that mixes numeric, datetime and string columns with "?"."""
        # GH#7095
        df = DataFrame(
            {
                "A": [-1, -2, np.nan],
                "B": date_range("20130101", periods=3),
                "C": ["foo", "bar", None],
                "D": ["foo2", "bar2", None],
            },
            index=date_range("20130110", periods=3),
        )
        result = df.fillna("?")
        expected = DataFrame(
            {
                "A": [-1, -2, "?"],
                "B": date_range("20130101", periods=3),
                "C": ["foo", "bar", "?"],
                "D": ["foo2", "bar2", "?"],
            },
            index=date_range("20130110", periods=3),
        )
        tm.assert_frame_equal(result, expected)

        df = DataFrame(
            {
                "A": [-1, -2, np.nan],
                "B": [Timestamp("2013-01-01"), Timestamp("2013-01-02"), NaT],
                "C": ["foo", "bar", None],
                "D": ["foo2", "bar2", None],
            },
            index=date_range("20130110", periods=3),
        )
        result = df.fillna("?")
        expected = DataFrame(
            {
                "A": [-1, -2, "?"],
                "B": [Timestamp("2013-01-01"), Timestamp("2013-01-02"), "?"],
                "C": ["foo", "bar", "?"],
                "D": ["foo2", "bar2", "?"],
            },
            index=date_range("20130110", periods=3),
        )
        tm.assert_frame_equal(result, expected)

    def test_ffill(self, datetime_frame):
        """``ffill()`` matches the deprecated ``fillna(method="ffill")``."""
        datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan
        datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan

        msg = "DataFrame.fillna with 'method' is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            alt = datetime_frame.fillna(method="ffill")
        tm.assert_frame_equal(datetime_frame.ffill(), alt)

    def test_bfill(self, datetime_frame):
        """``bfill()`` matches the deprecated ``fillna(method="bfill")``."""
        datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan
        datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan

        msg = "DataFrame.fillna with 'method' is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            alt = datetime_frame.fillna(method="bfill")

        tm.assert_frame_equal(datetime_frame.bfill(), alt)

    def test_frame_pad_backfill_limit(self):
        """``reindex(..., method=..., limit=5)`` fills at most five rows."""
        index = np.arange(10)
        df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)), index=index)

        result = df[:2].reindex(index, method="pad", limit=5)

        msg = "DataFrame.fillna with 'method' is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            expected = df[:2].reindex(index).fillna(method="pad")
        expected.iloc[-3:] = np.nan
        tm.assert_frame_equal(result, expected)

        result = df[-2:].reindex(index, method="backfill", limit=5)

        with tm.assert_produces_warning(FutureWarning, match=msg):
            expected = df[-2:].reindex(index).fillna(method="backfill")
        expected.iloc[:3] = np.nan
        tm.assert_frame_equal(result, expected)

    def test_frame_fillna_limit(self):
        """``fillna(method=..., limit=5)`` fills at most five rows."""
        index = np.arange(10)
        df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)), index=index)

        result = df[:2].reindex(index)
        msg = "DataFrame.fillna with 'method' is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = result.fillna(method="pad", limit=5)

        with tm.assert_produces_warning(FutureWarning, match=msg):
            expected = df[:2].reindex(index).fillna(method="pad")
        expected.iloc[-3:] = np.nan
        tm.assert_frame_equal(result, expected)

        result = df[-2:].reindex(index)
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = result.fillna(method="backfill", limit=5)

        with tm.assert_produces_warning(FutureWarning, match=msg):
            expected = df[-2:].reindex(index).fillna(method="backfill")
        expected.iloc[:3] = np.nan
        tm.assert_frame_equal(result, expected)

    def test_fillna_skip_certain_blocks(self):
        """Filling an all-integer frame returns the same values."""
        # don't try to fill boolean, int blocks

        df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)).astype(int))

        # an integer frame holds no missing values, so nothing may change
        result = df.fillna(np.nan)
        tm.assert_frame_equal(result, df)

    @pytest.mark.parametrize("dtype", [int, float])
    def test_fillna_positive_limit(self, dtype):
        """A negative ``limit`` raises ``ValueError``."""
        df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))).astype(dtype)

        msg = "Limit must be greater than 0"
        with pytest.raises(ValueError, match=msg):
            df.fillna(0, limit=-5)

    @pytest.mark.parametrize("dtype", [int, float])
    def test_fillna_integer_limit(self, dtype):
        """A non-integer ``limit`` raises ``ValueError``."""
        df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))).astype(dtype)

        msg = "Limit must be an integer"
        with pytest.raises(ValueError, match=msg):
            df.fillna(0, limit=0.5)

    def test_fillna_inplace(self):
        """``inplace=True`` returns None and matches the non-inplace result."""
        df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
        df.loc[:4, 1] = np.nan
        # Positional slice for the last four rows: ``.loc[-4:]`` is label-based
        # and, on the default integer index, would select every row instead.
        df.iloc[-4:, 3] = np.nan

        expected = df.fillna(value=0)
        assert expected is not df

        df.fillna(value=0, inplace=True)
        tm.assert_frame_equal(df, expected)

        expected = df.fillna(value={0: 0}, inplace=True)
        assert expected is None

        df.loc[:4, 1] = np.nan
        df.iloc[-4:, 3] = np.nan
        msg = "DataFrame.fillna with 'method' is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            expected = df.fillna(method="ffill")
        assert expected is not df

        with tm.assert_produces_warning(FutureWarning, match=msg):
            df.fillna(method="ffill", inplace=True)
        tm.assert_frame_equal(df, expected)

    def test_fillna_dict_series(self):
        """Dict and Series fill values are applied column by column."""
        df = DataFrame(
            {
                "a": [np.nan, 1, 2, np.nan, np.nan],
                "b": [1, 2, 3, np.nan, np.nan],
                "c": [np.nan, 1, 2, 3, 4],
            }
        )

        result = df.fillna({"a": 0, "b": 5})

        expected = df.copy()
        expected["a"] = expected["a"].fillna(0)
        expected["b"] = expected["b"].fillna(5)
        tm.assert_frame_equal(result, expected)

        # a key that is not a column ("d") does not change the outcome
        result = df.fillna({"a": 0, "b": 5, "d": 7})
        tm.assert_frame_equal(result, expected)

        # Series treated same as dict
        result = df.fillna(df.max())
        expected = df.fillna(df.max().to_dict())
        tm.assert_frame_equal(result, expected)

        # disable this for now
        with pytest.raises(NotImplementedError, match="column by column"):
            df.fillna(df.max(1), axis=1)

    def test_fillna_dataframe(self):
        """Fill from another frame whose index and columns only partly overlap."""
        # GH#8377
        df = DataFrame(
            {
                "a": [np.nan, 1, 2, np.nan, np.nan],
                "b": [1, 2, 3, np.nan, np.nan],
                "c": [np.nan, 1, 2, 3, 4],
            },
            index=list("VWXYZ"),
        )

        # df2 may have different index and columns
        df2 = DataFrame(
            {
                "a": [np.nan, 10, 20, 30, 40],
                "b": [50, 60, 70, 80, 90],
                "foo": ["bar"] * 5,
            },
            index=list("VWXuZ"),
        )

        result = df.fillna(df2)

        # only those columns and indices which are shared get filled
        expected = DataFrame(
            {
                "a": [np.nan, 1, 2, np.nan, 40],
                "b": [1, 2, 3, np.nan, 90],
                "c": [np.nan, 1, 2, 3, 4],
            },
            index=list("VWXYZ"),
        )

        tm.assert_frame_equal(result, expected)

    def test_fillna_columns(self):
        """Forward-fill along ``axis=1``."""
        arr = np.random.default_rng(2).standard_normal((10, 10))
        arr[:, ::2] = np.nan
        df = DataFrame(arr)

        msg = "DataFrame.fillna with 'method' is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = df.fillna(method="ffill", axis=1)
        with tm.assert_produces_warning(FutureWarning, match=msg):
            expected = df.T.fillna(method="pad").T
        tm.assert_frame_equal(result, expected)

        # add an integer column among the float ones
        df.insert(6, "foo", 5)
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = df.fillna(method="ffill", axis=1)
        with tm.assert_produces_warning(FutureWarning, match=msg):
            expected = df.astype(float).fillna(method="ffill", axis=1)
        tm.assert_frame_equal(result, expected)

    def test_fillna_invalid_method(self, float_frame):
        """An unknown ``method`` raises ``ValueError`` naming the bad value."""
        with pytest.raises(ValueError, match="ffil"):
            float_frame.fillna(method="ffil")

    def test_fillna_invalid_value(self, float_frame):
        """List, tuple and (for a Series) DataFrame values raise ``TypeError``."""
        # list
        msg = '"value" parameter must be a scalar or dict, but you passed a "{}"'
        with pytest.raises(TypeError, match=msg.format("list")):
            float_frame.fillna([1, 2])
        # tuple
        with pytest.raises(TypeError, match=msg.format("tuple")):
            float_frame.fillna((1, 2))
        # frame with series
        msg = (
            '"value" parameter must be a scalar, dict or Series, but you '
            'passed a "DataFrame"'
        )
        with pytest.raises(TypeError, match=msg):
            float_frame.iloc[:, 0].fillna(float_frame)

    def test_fillna_col_reordering(self):
        """Forward-filling keeps the original column order."""
        cols = ["COL." + str(i) for i in range(5, 0, -1)]
        data = np.random.default_rng(2).random((20, 5))
        df = DataFrame(index=range(20), columns=cols, data=data)
        msg = "DataFrame.fillna with 'method' is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            filled = df.fillna(method="ffill")
        assert df.columns.tolist() == filled.columns.tolist()

    @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string")
    def test_fill_corner(self, float_frame, float_string_frame):
        """Fill the "foo" column with 0, and fill a frame with no columns."""
        mf = float_string_frame
        mf.loc[mf.index[5:20], "foo"] = np.nan
        mf.loc[mf.index[-10:], "A"] = np.nan

        filled = float_string_frame.fillna(value=0)
        assert (filled.loc[filled.index[5:20], "foo"] == 0).all()
        del float_string_frame["foo"]

        float_frame.reindex(columns=[]).fillna(value=0)

    def test_fillna_downcast_dict(self):
        """A dict ``downcast`` is applied per column (and warns)."""
        # GH#40809
        df = DataFrame({"col1": [1, np.nan]})

        msg = "The 'downcast' keyword in fillna"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = df.fillna({"col1": 2}, downcast={"col1": "int64"})
        expected = DataFrame({"col1": [1, 2]})
        tm.assert_frame_equal(result, expected)

    def test_fillna_with_columns_and_limit(self):
        """With ``axis=1`` the ``limit`` counts fills within each row."""
        # GH40989
        df = DataFrame(
            [
                [np.nan, 2, np.nan, 0],
                [3, 4, np.nan, 1],
                [np.nan, np.nan, np.nan, 5],
                [np.nan, 3, np.nan, 4],
            ],
            columns=list("ABCD"),
        )
        result = df.fillna(axis=1, value=100, limit=1)
        result2 = df.fillna(axis=1, value=100, limit=2)

        expected = DataFrame(
            {
                "A": Series([100, 3, 100, 100], dtype="float64"),
                "B": [2, 4, np.nan, 3],
                "C": [np.nan, 100, np.nan, np.nan],
                "D": Series([0, 1, 5, 4], dtype="float64"),
            },
            index=[0, 1, 2, 3],
        )
        expected2 = DataFrame(
            {
                "A": Series([100, 3, 100, 100], dtype="float64"),
                "B": Series([2, 4, 100, 3], dtype="float64"),
                "C": [100, 100, np.nan, 100],
                "D": Series([0, 1, 5, 4], dtype="float64"),
            },
            index=[0, 1, 2, 3],
        )

        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(result2, expected2)

    def test_fillna_datetime_inplace(self):
        """Filling datetime columns with NaN in place leaves them unchanged."""
        # GH#48863
        df = DataFrame(
            {
                "date1": to_datetime(["2018-05-30", None]),
                "date2": to_datetime(["2018-09-30", None]),
            }
        )
        expected = df.copy()
        df.fillna(np.nan, inplace=True)
        tm.assert_frame_equal(df, expected)

    def test_fillna_inplace_with_columns_limit_and_value(self):
        """In-place ``axis=1`` fill with a limit matches the non-inplace result."""
        # GH40989
        df = DataFrame(
            [
                [np.nan, 2, np.nan, 0],
                [3, 4, np.nan, 1],
                [np.nan, np.nan, np.nan, 5],
                [np.nan, 3, np.nan, 4],
            ],
            columns=list("ABCD"),
        )

        expected = df.fillna(axis=1, value=100, limit=1)
        assert expected is not df

        df.fillna(axis=1, value=100, limit=1, inplace=True)
        tm.assert_frame_equal(df, expected)

    @td.skip_array_manager_invalid_test
    @pytest.mark.parametrize("val", [-1, {"x": -1, "y": -1}])
    def test_inplace_dict_update_view(
        self, val, using_copy_on_write, warn_copy_on_write
    ):
        """Check what a ``df[:]`` slice holds after an in-place fill of ``df``."""
        # GH#47188
        df = DataFrame({"x": [np.nan, 2], "y": [np.nan, 2]})
        df_orig = df.copy()
        result_view = df[:]
        with tm.assert_cow_warning(warn_copy_on_write):
            df.fillna(val, inplace=True)
        expected = DataFrame({"x": [-1, 2.0], "y": [-1.0, 2]})
        tm.assert_frame_equal(df, expected)
        if using_copy_on_write:
            tm.assert_frame_equal(result_view, df_orig)
        else:
            tm.assert_frame_equal(result_view, expected)

    def test_single_block_df_with_horizontal_axis(self):
        """Scalar fill with ``limit=1, axis=1`` on an all-float frame."""
        # GH 47713
        df = DataFrame(
            {
                "col1": [5, 0, np.nan, 10, np.nan],
                "col2": [7, np.nan, np.nan, 5, 3],
                "col3": [12, np.nan, 1, 2, 0],
                "col4": [np.nan, 1, 1, np.nan, 18],
            }
        )
        result = df.fillna(50, limit=1, axis=1)
        expected = DataFrame(
            [
                [5.0, 7.0, 12.0, 50.0],
                [0.0, 50.0, np.nan, 1.0],
                [50.0, np.nan, 1.0, 1.0],
                [10.0, 5.0, 2.0, 50.0],
                [50.0, 3.0, 0.0, 18.0],
            ],
            columns=["col1", "col2", "col3", "col4"],
        )
        tm.assert_frame_equal(result, expected)

    def test_fillna_with_multi_index_frame(self):
        """Dict fill keyed by a top-level label and/or a full column tuple."""
        # GH 47649
        pdf = DataFrame(
            {
                ("x", "a"): [np.nan, 2.0, 3.0],
                ("x", "b"): [1.0, 2.0, np.nan],
                ("y", "c"): [1.0, 2.0, np.nan],
            }
        )
        expected = DataFrame(
            {
                ("x", "a"): [-1.0, 2.0, 3.0],
                ("x", "b"): [1.0, 2.0, -1.0],
                ("y", "c"): [1.0, 2.0, np.nan],
            }
        )
        tm.assert_frame_equal(pdf.fillna({"x": -1}), expected)
        # "x" comes first in the dict, so ("x", "b") is already filled with -1
        tm.assert_frame_equal(pdf.fillna({"x": -1, ("x", "b"): -2}), expected)

        expected = DataFrame(
            {
                ("x", "a"): [-1.0, 2.0, 3.0],
                ("x", "b"): [1.0, 2.0, -2.0],
                ("y", "c"): [1.0, 2.0, np.nan],
            }
        )
        tm.assert_frame_equal(pdf.fillna({("x", "b"): -2, "x": -1}), expected)
|
||
|
|
||
|
|
||
|
def test_fillna_nonconsolidated_frame():
    """Pivot a small frame and check that ``fillna(0)`` leaves nothing missing.

    The test name suggests the pivoted frame is non-consolidated; that
    property is not asserted here.
    """
    # https://github.com/pandas-dev/pandas/issues/36495
    rows = [[n, n, n, float(n)] for n in (1, 2, 3)]
    frame = DataFrame(rows, columns=["i1", "i2", "i3", "f1"])
    pivoted = frame.pivot(index="i1", columns="i2")
    filled = pivoted.fillna(0)
    missing_per_column = filled.isna().sum()
    assert missing_per_column.sum() == 0
|
||
|
|
||
|
|
||
|
def test_fillna_nones_inplace():
|
||
|
# GH 48480
|
||
|
df = DataFrame(
|
||
|
[[None, None], [None, None]],
|
||
|
columns=["A", "B"],
|
||
|
)
|
||
|
msg = "Downcasting object dtype arrays"
|
||
|
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||
|
df.fillna(value={"A": 1, "B": 2}, inplace=True)
|
||
|
|
||
|
expected = DataFrame([[1, 2], [1, 2]], columns=["A", "B"])
|
||
|
tm.assert_frame_equal(df, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("func", ["pad", "backfill"])
def test_pad_backfill_deprecated(func):
    """Calling ``DataFrame.pad`` / ``DataFrame.backfill`` emits a FutureWarning."""
    # GH#33396
    frame = DataFrame({"a": [1, 2, 3]})
    deprecated_method = getattr(frame, func)
    with tm.assert_produces_warning(FutureWarning):
        deprecated_method()
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "data, expected_data, method, kwargs",
    [
        (
            [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
            [np.nan, np.nan, 3.0, 3.0, 3.0, 3.0, 7.0, np.nan, np.nan],
            "ffill",
            {"limit_area": "inside"},
        ),
        (
            [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
            [np.nan, np.nan, 3.0, 3.0, np.nan, np.nan, 7.0, np.nan, np.nan],
            "ffill",
            {"limit_area": "inside", "limit": 1},
        ),
        (
            [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
            [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, 7.0],
            "ffill",
            {"limit_area": "outside"},
        ),
        (
            [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
            [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan],
            "ffill",
            {"limit_area": "outside", "limit": 1},
        ),
        (
            [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
            [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
            "ffill",
            {"limit_area": "outside", "limit": 1},
        ),
        (
            range(5),
            range(5),
            "ffill",
            {"limit_area": "outside", "limit": 1},
        ),
        (
            [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
            [np.nan, np.nan, 3.0, 7.0, 7.0, 7.0, 7.0, np.nan, np.nan],
            "bfill",
            {"limit_area": "inside"},
        ),
        (
            [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
            [np.nan, np.nan, 3.0, np.nan, np.nan, 7.0, 7.0, np.nan, np.nan],
            "bfill",
            {"limit_area": "inside", "limit": 1},
        ),
        (
            [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
            [3.0, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan],
            "bfill",
            {"limit_area": "outside"},
        ),
        (
            [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
            [np.nan, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan],
            "bfill",
            {"limit_area": "outside", "limit": 1},
        ),
    ],
)
def test_ffill_bfill_limit_area(data, expected_data, method, kwargs):
    """Call ``ffill``/``bfill`` with ``limit_area`` (and optionally ``limit``).

    ``method`` names the DataFrame method to call and ``kwargs`` holds the
    keyword arguments passed to it; the result must equal a frame built from
    ``expected_data``.
    """
    # GH#56492
    frame = DataFrame(data)
    fill = getattr(frame, method)
    wanted = DataFrame(expected_data)
    tm.assert_frame_equal(fill(**kwargs), wanted)
|