You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

547 lines
20 KiB

6 months ago
import re
import numpy as np
import pytest
from pandas.errors import PerformanceWarning
import pandas as pd
from pandas import (
DataFrame,
DatetimeIndex,
Index,
MultiIndex,
Series,
Timestamp,
)
import pandas._testing as tm
@pytest.mark.parametrize(
    "msg,labels,level",
    [
        (r"labels \[4\] not found in level", 4, "a"),
        (r"labels \[7\] not found in level", 7, "b"),
    ],
)
def test_drop_raise_exception_if_labels_not_in_level(msg, labels, level):
    """Dropping a label that is absent from the given level raises ``KeyError``."""
    # GH 8594
    two_level_index = MultiIndex.from_arrays(
        [[1, 2, 3], [4, 5, 6]], names=["a", "b"]
    )
    values = [10, 20, 30]
    # Same check for both container types: Series first, then DataFrame.
    containers = (
        Series(values, index=two_level_index),
        DataFrame(values, index=two_level_index),
    )
    for obj in containers:
        with pytest.raises(KeyError, match=msg):
            obj.drop(labels, level=level)
@pytest.mark.parametrize("labels,level", [(4, "a"), (7, "b")])
def test_drop_errors_ignore(labels, level):
    """With ``errors="ignore"``, dropping an absent level label is a no-op."""
    # GH 8594
    two_level_index = MultiIndex.from_arrays(
        [[1, 2, 3], [4, 5, 6]], names=["a", "b"]
    )
    values = [10, 20, 30]

    ser = Series(values, index=two_level_index)
    ser_after_drop = ser.drop(labels, level=level, errors="ignore")
    tm.assert_series_equal(ser, ser_after_drop)

    frame = DataFrame(values, index=two_level_index)
    frame_after_drop = frame.drop(labels, level=level, errors="ignore")
    tm.assert_frame_equal(frame, frame_after_drop)
def test_drop_with_non_unique_datetime_index_and_invalid_keys():
    """Unknown labels raise ``KeyError`` when the datetime index has duplicates."""
    # GH 30399
    # Start from a frame whose hourly datetime index is unique ...
    hourly_index = pd.date_range("2012", freq="h", periods=5)
    values = np.random.default_rng(2).standard_normal((5, 3))
    unique_df = DataFrame(values, columns=["a", "b", "c"], index=hourly_index)

    # ... then repeat position 2 so the index becomes non-unique.
    duplicated_df = unique_df.iloc[[0, 2, 2, 3]].copy()

    with pytest.raises(KeyError, match="not found in axis"):
        # "a" and "b" are column names here, not labels of the index.
        duplicated_df.drop(["a", "b"])
class TestDataFrameDrop:
    """Tests for ``DataFrame.drop`` on plain, non-unique and MultiIndex axes."""

    def test_drop_names(self):
        """Axis names survive ``drop`` (copy and inplace) and ``errors="ignore"``."""
        df = DataFrame(
            [[1, 2, 3], [3, 4, 5], [5, 6, 7]],
            index=["a", "b", "c"],
            columns=["d", "e", "f"],
        )
        df.index.name, df.columns.name = "first", "second"
        df_dropped_b = df.drop("b")
        df_dropped_e = df.drop("e", axis=1)
        df_inplace_b, df_inplace_e = df.copy(), df.copy()
        # inplace=True mutates the frame and returns None.
        return_value = df_inplace_b.drop("b", inplace=True)
        assert return_value is None
        return_value = df_inplace_e.drop("e", axis=1, inplace=True)
        assert return_value is None
        for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e):
            assert obj.index.name == "first"
            assert obj.columns.name == "second"
        # The source frame itself must be untouched by all of the above.
        assert list(df.columns) == ["d", "e", "f"]

        msg = r"\['g'\] not found in axis"
        with pytest.raises(KeyError, match=msg):
            df.drop(["g"])
        with pytest.raises(KeyError, match=msg):
            df.drop(["g"], axis=1)

        # errors = 'ignore'
        dropped = df.drop(["g"], errors="ignore")
        expected = Index(["a", "b", "c"], name="first")
        tm.assert_index_equal(dropped.index, expected)

        dropped = df.drop(["b", "g"], errors="ignore")
        expected = Index(["a", "c"], name="first")
        tm.assert_index_equal(dropped.index, expected)

        dropped = df.drop(["g"], axis=1, errors="ignore")
        expected = Index(["d", "e", "f"], name="second")
        tm.assert_index_equal(dropped.columns, expected)

        dropped = df.drop(["d", "g"], axis=1, errors="ignore")
        expected = Index(["e", "f"], name="second")
        tm.assert_index_equal(dropped.columns, expected)

        # GH 16398
        dropped = df.drop([], errors="ignore")
        expected = Index(["a", "b", "c"], name="first")
        tm.assert_index_equal(dropped.index, expected)

    def test_drop(self):
        """Basic row/column drops, missing-label errors and non-unique axes."""
        simple = DataFrame({"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]})
        tm.assert_frame_equal(simple.drop("A", axis=1), simple[["B"]])
        tm.assert_frame_equal(simple.drop(["A", "B"], axis="columns"), simple[[]])
        tm.assert_frame_equal(simple.drop([0, 1, 3], axis=0), simple.loc[[2], :])
        tm.assert_frame_equal(simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :])

        with pytest.raises(KeyError, match=r"\[5\] not found in axis"):
            simple.drop(5)
        with pytest.raises(KeyError, match=r"\['C'\] not found in axis"):
            simple.drop("C", axis=1)
        with pytest.raises(KeyError, match=r"\[5\] not found in axis"):
            simple.drop([1, 5])
        with pytest.raises(KeyError, match=r"\['C'\] not found in axis"):
            simple.drop(["A", "C"], axis=1)

        # GH 42881
        with pytest.raises(KeyError, match=r"\['C', 'D', 'F'\] not found in axis"):
            simple.drop(["C", "D", "F"], axis=1)

        # errors = 'ignore'
        tm.assert_frame_equal(simple.drop(5, errors="ignore"), simple)
        tm.assert_frame_equal(
            simple.drop([0, 5], errors="ignore"), simple.loc[[1, 2, 3], :]
        )
        tm.assert_frame_equal(simple.drop("C", axis=1, errors="ignore"), simple)
        tm.assert_frame_equal(
            simple.drop(["A", "C"], axis=1, errors="ignore"), simple[["B"]]
        )

        # non-unique - wheee!
        # zip() stops at the shortest input, so this yields three rows.
        nu_df = DataFrame(
            list(zip(range(3), range(-3, 1), list("abc"))), columns=["a", "a", "b"]
        )
        tm.assert_frame_equal(nu_df.drop("a", axis=1), nu_df[["b"]])
        tm.assert_frame_equal(nu_df.drop("b", axis="columns"), nu_df["a"])
        tm.assert_frame_equal(nu_df.drop([]), nu_df)  # GH 16398

        nu_df = nu_df.set_index(Index(["X", "Y", "X"]))
        nu_df.columns = list("abc")
        tm.assert_frame_equal(nu_df.drop("X", axis="rows"), nu_df.loc[["Y"], :])
        tm.assert_frame_equal(nu_df.drop(["X", "Y"], axis=0), nu_df.loc[[], :])

        # inplace cache issue
        # GH#5628
        df = DataFrame(
            np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc")
        )
        expected = df[~(df.b > 0)]
        return_value = df.drop(labels=df[df.b > 0].index, inplace=True)
        assert return_value is None
        tm.assert_frame_equal(df, expected)

    def test_drop_multiindex_not_lexsorted(self):
        """Dropping from non-lexsorted columns warns but matches the sorted result."""
        # GH#11640

        # define the lexsorted version
        lexsorted_mi = MultiIndex.from_tuples(
            [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"]
        )
        lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
        assert lexsorted_df.columns._is_lexsorted()

        # define the non-lexsorted version
        not_lexsorted_df = DataFrame(
            columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]]
        )
        not_lexsorted_df = not_lexsorted_df.pivot_table(
            index="a", columns=["b", "c"], values="d"
        )
        not_lexsorted_df = not_lexsorted_df.reset_index()
        assert not not_lexsorted_df.columns._is_lexsorted()

        expected = lexsorted_df.drop("a", axis=1).astype(float)
        with tm.assert_produces_warning(PerformanceWarning):
            result = not_lexsorted_df.drop("a", axis=1)

        tm.assert_frame_equal(result, expected)

    def test_drop_api_equivalence(self):
        """``labels``/``axis`` and ``index``/``columns`` spellings give equal results."""
        # equivalence of the labels/axis and index/columns API's (GH#12392)
        df = DataFrame(
            [[1, 2, 3], [3, 4, 5], [5, 6, 7]],
            index=["a", "b", "c"],
            columns=["d", "e", "f"],
        )

        res1 = df.drop("a")
        res2 = df.drop(index="a")
        tm.assert_frame_equal(res1, res2)

        res1 = df.drop("d", axis=1)
        res2 = df.drop(columns="d")
        tm.assert_frame_equal(res1, res2)

        res1 = df.drop(labels="e", axis=1)
        res2 = df.drop(columns="e")
        tm.assert_frame_equal(res1, res2)

        res1 = df.drop(["a"], axis=0)
        res2 = df.drop(index=["a"])
        tm.assert_frame_equal(res1, res2)

        res1 = df.drop(["a"], axis=0).drop(["d"], axis=1)
        res2 = df.drop(index=["a"], columns=["d"])
        tm.assert_frame_equal(res1, res2)

        # Mixing the two spellings, or giving neither, is rejected.
        msg = "Cannot specify both 'labels' and 'index'/'columns'"
        with pytest.raises(ValueError, match=msg):
            df.drop(labels="a", index="b")

        with pytest.raises(ValueError, match=msg):
            df.drop(labels="a", columns="b")

        msg = "Need to specify at least one of 'labels', 'index' or 'columns'"
        with pytest.raises(ValueError, match=msg):
            df.drop(axis=1)

    # Shared payload for the parametrized frames of the next test.
    data = [[1, 2, 3], [1, 2, 3]]

    @pytest.mark.parametrize(
        "actual",
        [
            DataFrame(data=data, index=["a", "a"]),
            DataFrame(data=data, index=["a", "b"]),
            DataFrame(data=data, index=["a", "b"]).set_index([0, 1]),
            DataFrame(data=data, index=["a", "a"]).set_index([0, 1]),
        ],
    )
    def test_raise_on_drop_duplicate_index(self, actual):
        """A missing label raises on either axis unless ``errors="ignore"``."""
        # GH#19186
        # MultiIndex cases drop by level 0; flat-index cases pass no level.
        level = 0 if isinstance(actual.index, MultiIndex) else None
        msg = re.escape("\"['c'] not found in axis\"")
        with pytest.raises(KeyError, match=msg):
            actual.drop("c", level=level, axis=0)
        with pytest.raises(KeyError, match=msg):
            actual.T.drop("c", level=level, axis=1)
        expected_no_err = actual.drop("c", axis=0, level=level, errors="ignore")
        tm.assert_frame_equal(expected_no_err, actual)
        expected_no_err = actual.T.drop("c", axis=1, level=level, errors="ignore")
        tm.assert_frame_equal(expected_no_err.T, actual)

    @pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 2]])
    @pytest.mark.parametrize("drop_labels", [[], [1], [2]])
    def test_drop_empty_list(self, index, drop_labels):
        """Dropping existing labels (or none) removes exactly those index entries."""
        # GH#21494
        expected_index = [i for i in index if i not in drop_labels]
        frame = DataFrame(index=index).drop(drop_labels)
        tm.assert_frame_equal(frame, DataFrame(index=expected_index))

    @pytest.mark.parametrize("index", [[1, 2, 3], [1, 2, 2]])
    @pytest.mark.parametrize("drop_labels", [[1, 4], [4, 5]])
    def test_drop_non_empty_list(self, index, drop_labels):
        """A label list containing any absent label raises ``KeyError``."""
        # GH# 21494
        with pytest.raises(KeyError, match="not found in axis"):
            DataFrame(index=index).drop(drop_labels)

    @pytest.mark.parametrize(
        "empty_listlike",
        [
            [],
            {},
            np.array([]),
            Series([], dtype="datetime64[ns]"),
            Index([]),
            DatetimeIndex([]),
        ],
    )
    def test_drop_empty_listlike_non_unique_datetime_index(self, empty_listlike):
        """An empty list-like leaves a frame with duplicate timestamps unchanged."""
        # GH#27994
        data = {"column_a": [5, 10], "column_b": ["one", "two"]}
        index = [Timestamp("2021-01-01"), Timestamp("2021-01-01")]
        df = DataFrame(data, index=index)

        # Passing empty list-like should return the same DataFrame.
        expected = df.copy()
        result = df.drop(empty_listlike)
        tm.assert_frame_equal(result, expected)

    def test_mixed_depth_drop(self):
        """Scalar, list and full-tuple labels on three-level columns."""
        arrays = [
            ["a", "top", "top", "routine1", "routine1", "routine2"],
            ["", "OD", "OD", "result1", "result2", "result1"],
            ["", "wx", "wy", "", "", ""],
        ]

        tuples = sorted(zip(*arrays))
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(np.random.default_rng(2).standard_normal((4, 6)), columns=index)

        result = df.drop("a", axis=1)
        expected = df.drop([("a", "", "")], axis=1)
        tm.assert_frame_equal(expected, result)

        result = df.drop(["top"], axis=1)
        expected = df.drop([("top", "OD", "wx")], axis=1)
        expected = expected.drop([("top", "OD", "wy")], axis=1)
        tm.assert_frame_equal(expected, result)

        # The scalar first-level label must match the one-element list above.
        # (Previously this drop was computed but never compared to anything.)
        result = df.drop("top", axis=1)
        tm.assert_frame_equal(expected, result)

        result = df.drop(("top", "OD", "wx"), axis=1)
        expected = df.drop([("top", "OD", "wx")], axis=1)
        tm.assert_frame_equal(expected, result)

        result = df.drop("result1", level=1, axis=1)
        expected = df.drop(
            [("routine1", "result1", ""), ("routine2", "result1", "")], axis=1
        )
        tm.assert_frame_equal(expected, result)

    def test_drop_multiindex_other_level_nan(self):
        """Dropping on level ``C`` keeps rows whose level ``B`` value is NaN."""
        # GH#12754
        df = (
            DataFrame(
                {
                    "A": ["one", "one", "two", "two"],
                    "B": [np.nan, 0.0, 1.0, 2.0],
                    "C": ["a", "b", "c", "c"],
                    "D": [1, 2, 3, 4],
                }
            )
            .set_index(["A", "B", "C"])
            .sort_index()
        )
        result = df.drop("c", level="C")
        expected = DataFrame(
            [2, 1],
            columns=["D"],
            index=MultiIndex.from_tuples(
                [("one", 0.0, "b"), ("one", np.nan, "a")], names=["A", "B", "C"]
            ),
        )
        tm.assert_frame_equal(result, expected)

    def test_drop_nonunique(self):
        """Level-0 drop on a non-unique MultiIndex matches boolean-mask filtering."""
        df = DataFrame(
            [
                ["x-a", "x", "a", 1.5],
                ["x-a", "x", "a", 1.2],
                ["z-c", "z", "c", 3.1],
                ["x-a", "x", "a", 4.1],
                ["x-b", "x", "b", 5.1],
                ["x-b", "x", "b", 4.1],
                ["x-b", "x", "b", 2.2],
                ["y-a", "y", "a", 1.2],
                ["z-b", "z", "b", 2.1],
            ],
            columns=["var1", "var2", "var3", "var4"],
        )

        # Groups of size one are the labels to drop.
        grp_size = df.groupby("var1").size()
        drop_idx = grp_size.loc[grp_size == 1]

        idf = df.set_index(["var1", "var2", "var3"])

        # it works! GH#2101
        result = idf.drop(drop_idx.index, level=0).reset_index()
        # `~` is the idiomatic negation of a boolean mask (was unary `-`).
        expected = df[~df.var1.isin(drop_idx.index)]

        # reset_index() renumbered the rows; align before comparing.
        result.index = expected.index

        tm.assert_frame_equal(result, expected)

    def test_drop_level(self, multiindex_dataframe_random_data):
        """Drop by named level on the rows and, transposed, on the columns."""
        frame = multiindex_dataframe_random_data

        result = frame.drop(["bar", "qux"], level="first")
        expected = frame.iloc[[0, 1, 2, 5, 6]]
        tm.assert_frame_equal(result, expected)

        result = frame.drop(["two"], level="second")
        expected = frame.iloc[[0, 2, 3, 6, 7, 9]]
        tm.assert_frame_equal(result, expected)

        result = frame.T.drop(["bar", "qux"], axis=1, level="first")
        expected = frame.iloc[[0, 1, 2, 5, 6]].T
        tm.assert_frame_equal(result, expected)

        result = frame.T.drop(["two"], axis=1, level="second")
        expected = frame.iloc[[0, 2, 3, 6, 7, 9]].T
        tm.assert_frame_equal(result, expected)

    def test_drop_level_nonunique_datetime(self):
        """Dropping a timestamp on a level removes every duplicated row for it."""
        # GH#12701
        idx = Index([2, 3, 4, 4, 5], name="id")
        idxdt = pd.to_datetime(
            [
                "2016-03-23 14:00",
                "2016-03-23 15:00",
                "2016-03-23 16:00",
                "2016-03-23 16:00",
                "2016-03-23 17:00",
            ]
        )
        df = DataFrame(np.arange(10).reshape(5, 2), columns=list("ab"), index=idx)
        df["tstamp"] = idxdt
        df = df.set_index("tstamp", append=True)
        ts = Timestamp("201603231600")
        assert df.index.is_unique is False

        result = df.drop(ts, level="tstamp")
        # Both rows with id == 4 carry the 16:00 timestamp being dropped.
        expected = df.loc[idx != 4]
        tm.assert_frame_equal(result, expected)

    def test_drop_tz_aware_timestamp_across_dst(self, frame_or_series):
        """Dropping the first tz-aware timestamp leaves the rest of the range."""
        # GH#21761
        start = Timestamp("2017-10-29", tz="Europe/Berlin")
        end = Timestamp("2017-10-29 04:00:00", tz="Europe/Berlin")
        index = pd.date_range(start, end, freq="15min")
        data = frame_or_series(data=[1] * len(index), index=index)
        result = data.drop(start)
        expected_start = Timestamp("2017-10-29 00:15:00", tz="Europe/Berlin")
        expected_idx = pd.date_range(expected_start, end, freq="15min")
        expected = frame_or_series(data=[1] * len(expected_idx), index=expected_idx)
        tm.assert_equal(result, expected)

    def test_drop_preserve_names(self):
        """MultiIndex level names are kept after dropping a tuple label."""
        index = MultiIndex.from_arrays(
            [[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]], names=["one", "two"]
        )

        df = DataFrame(np.random.default_rng(2).standard_normal((6, 3)), index=index)

        result = df.drop([(0, 2)])
        assert result.index.names == ("one", "two")

    @pytest.mark.parametrize(
        "operation", ["__iadd__", "__isub__", "__imul__", "__ipow__"]
    )
    @pytest.mark.parametrize("inplace", [False, True])
    def test_inplace_drop_and_operation(self, operation, inplace):
        """In-place ops on an extracted column do not affect the frame it left."""
        # GH#30484
        df = DataFrame({"x": range(5)})
        expected = df.copy()
        df["y"] = range(5)
        y = df["y"]

        with tm.assert_produces_warning(None):
            if inplace:
                df.drop("y", axis=1, inplace=inplace)
            else:
                df = df.drop("y", axis=1, inplace=inplace)

            # Perform operation and check result
            getattr(y, operation)(1)
            tm.assert_frame_equal(df, expected)

    def test_drop_with_non_unique_multiindex(self):
        """A first-level string label drops all matching rows of a duplicated index."""
        # GH#36293
        mi = MultiIndex.from_arrays([["x", "y", "x"], ["i", "j", "i"]])
        df = DataFrame([1, 2, 3], index=mi)
        result = df.drop(index="x")
        expected = DataFrame([2], index=MultiIndex.from_arrays([["y"], ["j"]]))
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("indexer", [("a", "a"), [("a", "a")]])
    def test_drop_tuple_with_non_unique_multiindex(self, indexer):
        """A tuple, bare or wrapped in a list, drops all matching duplicated rows."""
        # GH#42771
        idx = MultiIndex.from_product([["a", "b"], ["a", "a"]])
        df = DataFrame({"x": range(len(idx))}, index=idx)
        # Use the parametrized indexer so the bare-tuple case is exercised too;
        # previously the list form was hard-coded and `indexer` was ignored.
        result = df.drop(index=indexer)
        expected = DataFrame(
            {"x": [2, 3]}, index=MultiIndex.from_tuples([("b", "a"), ("b", "a")])
        )
        tm.assert_frame_equal(result, expected)

    def test_drop_with_duplicate_columns(self):
        """Dropping a duplicated column label removes every column with that name."""
        df = DataFrame(
            [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
        )
        result = df.drop(["a"], axis=1)
        expected = DataFrame([[1], [1], [1]], columns=["bar"])
        tm.assert_frame_equal(result, expected)
        result = df.drop("a", axis=1)
        tm.assert_frame_equal(result, expected)

    def test_drop_with_duplicate_columns2(self):
        """Dropping ``"C"`` from a frame with repeated columns keeps the others."""
        # drop buggy GH#6240
        df = DataFrame(
            {
                "A": np.random.default_rng(2).standard_normal(5),
                "B": np.random.default_rng(2).standard_normal(5),
                "C": np.random.default_rng(2).standard_normal(5),
                "D": ["a", "b", "c", "d", "e"],
            }
        )

        # df2's column order is C, A, B, C, B; without C that is A, B, B.
        expected = df.take([0, 1, 1], axis=1)
        df2 = df.take([2, 0, 1, 2, 1], axis=1)
        result = df2.drop("C", axis=1)
        tm.assert_frame_equal(result, expected)

    def test_drop_inplace_no_leftover_column_reference(self):
        """Mutating a column taken before an inplace drop does not re-add it."""
        # GH 13934
        df = DataFrame({"a": [1, 2, 3]}, columns=Index(["a"], dtype="object"))
        a = df.a
        df.drop(["a"], axis=1, inplace=True)
        tm.assert_index_equal(df.columns, Index([], dtype="object"))
        a -= a.mean()
        tm.assert_index_equal(df.columns, Index([], dtype="object"))

    def test_drop_level_missing_label_multiindex(self):
        """A label absent from the requested level raises ``KeyError``."""
        # GH 18561
        df = DataFrame(index=MultiIndex.from_product([range(3), range(3)]))

        with pytest.raises(KeyError, match="labels \\[5\\] not found in level"):
            df.drop(5, level=0)

    @pytest.mark.parametrize("idx, level", [(["a", "b"], 0), (["a"], None)])
    def test_drop_index_ea_dtype(self, any_numeric_ea_dtype, idx, level):
        """Values and ``pd.NA`` can be dropped from an extension-dtype index."""
        # GH#45860
        df = DataFrame(
            {"a": [1, 2, 2, pd.NA], "b": 100}, dtype=any_numeric_ea_dtype
        ).set_index(idx)
        result = df.drop(Index([2, pd.NA]), level=level)
        expected = DataFrame(
            {"a": [1], "b": 100}, dtype=any_numeric_ea_dtype
        ).set_index(idx)
        tm.assert_frame_equal(result, expected)

    def test_drop_parse_strings_datetime_index(self):
        """A date string label is matched against a ``Timestamp`` index."""
        # GH #5355
        df = DataFrame(
            {"a": [1, 2], "b": [1, 2]},
            index=[Timestamp("2000-01-03"), Timestamp("2000-01-04")],
        )
        result = df.drop("2000-01-03", axis=0)
        expected = DataFrame({"a": [2], "b": [2]}, index=[Timestamp("2000-01-04")])
        tm.assert_frame_equal(result, expected)