You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

547 lines
20 KiB

import re
import numpy as np
import pytest
from pandas.errors import PerformanceWarning
import pandas as pd
from pandas import (
DataFrame,
DatetimeIndex,
Index,
MultiIndex,
Series,
Timestamp,
)
import pandas._testing as tm
@pytest.mark.parametrize(
"msg,labels,level",
[
(r"labels \[4\] not found in level", 4, "a"),
(r"labels \[7\] not found in level", 7, "b"),
],
)
def test_drop_raise_exception_if_labels_not_in_level(msg, labels, level):
# GH 8594
mi = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"])
s = Series([10, 20, 30], index=mi)
df = DataFrame([10, 20, 30], index=mi)
with pytest.raises(KeyError, match=msg):
s.drop(labels, level=level)
with pytest.raises(KeyError, match=msg):
df.drop(labels, level=level)
@pytest.mark.parametrize("labels,level", [(4, "a"), (7, "b")])
def test_drop_errors_ignore(labels, level):
# GH 8594
mi = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"])
s = Series([10, 20, 30], index=mi)
df = DataFrame([10, 20, 30], index=mi)
expected_s = s.drop(labels, level=level, errors="ignore")
tm.assert_series_equal(s, expected_s)
expected_df = df.drop(labels, level=level, errors="ignore")
tm.assert_frame_equal(df, expected_df)
def test_drop_with_non_unique_datetime_index_and_invalid_keys():
# GH 30399
# define dataframe with unique datetime index
df = DataFrame(
np.random.default_rng(2).standard_normal((5, 3)),
columns=["a", "b", "c"],
index=pd.date_range("2012", freq="h", periods=5),
)
# create dataframe with non-unique datetime index
df = df.iloc[[0, 2, 2, 3]].copy()
with pytest.raises(KeyError, match="not found in axis"):
df.drop(["a", "b"]) # Dropping with labels not exist in the index
class TestDataFrameDrop:
def test_drop_names(self):
df = DataFrame(
[[1, 2, 3], [3, 4, 5], [5, 6, 7]],
index=["a", "b", "c"],
columns=["d", "e", "f"],
)
df.index.name, df.columns.name = "first", "second"
df_dropped_b = df.drop("b")
df_dropped_e = df.drop("e", axis=1)
df_inplace_b, df_inplace_e = df.copy(), df.copy()
return_value = df_inplace_b.drop("b", inplace=True)
assert return_value is None
return_value = df_inplace_e.drop("e", axis=1, inplace=True)
assert return_value is None
for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e):
assert obj.index.name == "first"
assert obj.columns.name == "second"
assert list(df.columns) == ["d", "e", "f"]
msg = r"\['g'\] not found in axis"
with pytest.raises(KeyError, match=msg):
df.drop(["g"])
with pytest.raises(KeyError, match=msg):
df.drop(["g"], axis=1)
# errors = 'ignore'
dropped = df.drop(["g"], errors="ignore")
expected = Index(["a", "b", "c"], name="first")
tm.assert_index_equal(dropped.index, expected)
dropped = df.drop(["b", "g"], errors="ignore")
expected = Index(["a", "c"], name="first")
tm.assert_index_equal(dropped.index, expected)
dropped = df.drop(["g"], axis=1, errors="ignore")
expected = Index(["d", "e", "f"], name="second")
tm.assert_index_equal(dropped.columns, expected)
dropped = df.drop(["d", "g"], axis=1, errors="ignore")
expected = Index(["e", "f"], name="second")
tm.assert_index_equal(dropped.columns, expected)
# GH 16398
dropped = df.drop([], errors="ignore")
expected = Index(["a", "b", "c"], name="first")
tm.assert_index_equal(dropped.index, expected)
def test_drop(self):
simple = DataFrame({"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]})
tm.assert_frame_equal(simple.drop("A", axis=1), simple[["B"]])
tm.assert_frame_equal(simple.drop(["A", "B"], axis="columns"), simple[[]])
tm.assert_frame_equal(simple.drop([0, 1, 3], axis=0), simple.loc[[2], :])
tm.assert_frame_equal(simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :])
with pytest.raises(KeyError, match=r"\[5\] not found in axis"):
simple.drop(5)
with pytest.raises(KeyError, match=r"\['C'\] not found in axis"):
simple.drop("C", axis=1)
with pytest.raises(KeyError, match=r"\[5\] not found in axis"):
simple.drop([1, 5])
with pytest.raises(KeyError, match=r"\['C'\] not found in axis"):
simple.drop(["A", "C"], axis=1)
# GH 42881
with pytest.raises(KeyError, match=r"\['C', 'D', 'F'\] not found in axis"):
simple.drop(["C", "D", "F"], axis=1)
# errors = 'ignore'
tm.assert_frame_equal(simple.drop(5, errors="ignore"), simple)
tm.assert_frame_equal(
simple.drop([0, 5], errors="ignore"), simple.loc[[1, 2, 3], :]
)
tm.assert_frame_equal(simple.drop("C", axis=1, errors="ignore"), simple)
tm.assert_frame_equal(
simple.drop(["A", "C"], axis=1, errors="ignore"), simple[["B"]]
)
# non-unique - wheee!
nu_df = DataFrame(
list(zip(range(3), range(-3, 1), list("abc"))), columns=["a", "a", "b"]
)
tm.assert_frame_equal(nu_df.drop("a", axis=1), nu_df[["b"]])
tm.assert_frame_equal(nu_df.drop("b", axis="columns"), nu_df["a"])
tm.assert_frame_equal(nu_df.drop([]), nu_df) # GH 16398
nu_df = nu_df.set_index(Index(["X", "Y", "X"]))
nu_df.columns = list("abc")
tm.assert_frame_equal(nu_df.drop("X", axis="rows"), nu_df.loc[["Y"], :])
tm.assert_frame_equal(nu_df.drop(["X", "Y"], axis=0), nu_df.loc[[], :])
# inplace cache issue
# GH#5628
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc")
)
expected = df[~(df.b > 0)]
return_value = df.drop(labels=df[df.b > 0].index, inplace=True)
assert return_value is None
tm.assert_frame_equal(df, expected)
def test_drop_multiindex_not_lexsorted(self):
# GH#11640
# define the lexsorted version
lexsorted_mi = MultiIndex.from_tuples(
[("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"]
)
lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
assert lexsorted_df.columns._is_lexsorted()
# define the non-lexsorted version
not_lexsorted_df = DataFrame(
columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]]
)
not_lexsorted_df = not_lexsorted_df.pivot_table(
index="a", columns=["b", "c"], values="d"
)
not_lexsorted_df = not_lexsorted_df.reset_index()
assert not not_lexsorted_df.columns._is_lexsorted()
expected = lexsorted_df.drop("a", axis=1).astype(float)
with tm.assert_produces_warning(PerformanceWarning):
result = not_lexsorted_df.drop("a", axis=1)
tm.assert_frame_equal(result, expected)
def test_drop_api_equivalence(self):
# equivalence of the labels/axis and index/columns API's (GH#12392)
df = DataFrame(
[[1, 2, 3], [3, 4, 5], [5, 6, 7]],
index=["a", "b", "c"],
columns=["d", "e", "f"],
)
res1 = df.drop("a")
res2 = df.drop(index="a")
tm.assert_frame_equal(res1, res2)
res1 = df.drop("d", axis=1)
res2 = df.drop(columns="d")
tm.assert_frame_equal(res1, res2)
res1 = df.drop(labels="e", axis=1)
res2 = df.drop(columns="e")
tm.assert_frame_equal(res1, res2)
res1 = df.drop(["a"], axis=0)
res2 = df.drop(index=["a"])
tm.assert_frame_equal(res1, res2)
res1 = df.drop(["a"], axis=0).drop(["d"], axis=1)
res2 = df.drop(index=["a"], columns=["d"])
tm.assert_frame_equal(res1, res2)
msg = "Cannot specify both 'labels' and 'index'/'columns'"
with pytest.raises(ValueError, match=msg):
df.drop(labels="a", index="b")
with pytest.raises(ValueError, match=msg):
df.drop(labels="a", columns="b")
msg = "Need to specify at least one of 'labels', 'index' or 'columns'"
with pytest.raises(ValueError, match=msg):
df.drop(axis=1)
data = [[1, 2, 3], [1, 2, 3]]
@pytest.mark.parametrize(
"actual",
[
DataFrame(data=data, index=["a", "a"]),
DataFrame(data=data, index=["a", "b"]),
DataFrame(data=data, index=["a", "b"]).set_index([0, 1]),
DataFrame(data=data, index=["a", "a"]).set_index([0, 1]),
],
)
def test_raise_on_drop_duplicate_index(self, actual):
# GH#19186
level = 0 if isinstance(actual.index, MultiIndex) else None
msg = re.escape("\"['c'] not found in axis\"")
with pytest.raises(KeyError, match=msg):
actual.drop("c", level=level, axis=0)
with pytest.raises(KeyError, match=msg):
actual.T.drop("c", level=level, axis=1)
expected_no_err = actual.drop("c", axis=0, level=level, errors="ignore")
tm.assert_frame_equal(expected_no_err, actual)
expected_no_err = actual.T.drop("c", axis=1, level=level, errors="ignore")
tm.assert_frame_equal(expected_no_err.T, actual)
@pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 2]])
@pytest.mark.parametrize("drop_labels", [[], [1], [2]])
def test_drop_empty_list(self, index, drop_labels):
# GH#21494
expected_index = [i for i in index if i not in drop_labels]
frame = DataFrame(index=index).drop(drop_labels)
tm.assert_frame_equal(frame, DataFrame(index=expected_index))
@pytest.mark.parametrize("index", [[1, 2, 3], [1, 2, 2]])
@pytest.mark.parametrize("drop_labels", [[1, 4], [4, 5]])
def test_drop_non_empty_list(self, index, drop_labels):
# GH# 21494
with pytest.raises(KeyError, match="not found in axis"):
DataFrame(index=index).drop(drop_labels)
@pytest.mark.parametrize(
"empty_listlike",
[
[],
{},
np.array([]),
Series([], dtype="datetime64[ns]"),
Index([]),
DatetimeIndex([]),
],
)
def test_drop_empty_listlike_non_unique_datetime_index(self, empty_listlike):
# GH#27994
data = {"column_a": [5, 10], "column_b": ["one", "two"]}
index = [Timestamp("2021-01-01"), Timestamp("2021-01-01")]
df = DataFrame(data, index=index)
# Passing empty list-like should return the same DataFrame.
expected = df.copy()
result = df.drop(empty_listlike)
tm.assert_frame_equal(result, expected)
def test_mixed_depth_drop(self):
arrays = [
["a", "top", "top", "routine1", "routine1", "routine2"],
["", "OD", "OD", "result1", "result2", "result1"],
["", "wx", "wy", "", "", ""],
]
tuples = sorted(zip(*arrays))
index = MultiIndex.from_tuples(tuples)
df = DataFrame(np.random.default_rng(2).standard_normal((4, 6)), columns=index)
result = df.drop("a", axis=1)
expected = df.drop([("a", "", "")], axis=1)
tm.assert_frame_equal(expected, result)
result = df.drop(["top"], axis=1)
expected = df.drop([("top", "OD", "wx")], axis=1)
expected = expected.drop([("top", "OD", "wy")], axis=1)
tm.assert_frame_equal(expected, result)
result = df.drop(("top", "OD", "wx"), axis=1)
expected = df.drop([("top", "OD", "wx")], axis=1)
tm.assert_frame_equal(expected, result)
expected = df.drop([("top", "OD", "wy")], axis=1)
expected = df.drop("top", axis=1)
result = df.drop("result1", level=1, axis=1)
expected = df.drop(
[("routine1", "result1", ""), ("routine2", "result1", "")], axis=1
)
tm.assert_frame_equal(expected, result)
def test_drop_multiindex_other_level_nan(self):
# GH#12754
df = (
DataFrame(
{
"A": ["one", "one", "two", "two"],
"B": [np.nan, 0.0, 1.0, 2.0],
"C": ["a", "b", "c", "c"],
"D": [1, 2, 3, 4],
}
)
.set_index(["A", "B", "C"])
.sort_index()
)
result = df.drop("c", level="C")
expected = DataFrame(
[2, 1],
columns=["D"],
index=MultiIndex.from_tuples(
[("one", 0.0, "b"), ("one", np.nan, "a")], names=["A", "B", "C"]
),
)
tm.assert_frame_equal(result, expected)
def test_drop_nonunique(self):
df = DataFrame(
[
["x-a", "x", "a", 1.5],
["x-a", "x", "a", 1.2],
["z-c", "z", "c", 3.1],
["x-a", "x", "a", 4.1],
["x-b", "x", "b", 5.1],
["x-b", "x", "b", 4.1],
["x-b", "x", "b", 2.2],
["y-a", "y", "a", 1.2],
["z-b", "z", "b", 2.1],
],
columns=["var1", "var2", "var3", "var4"],
)
grp_size = df.groupby("var1").size()
drop_idx = grp_size.loc[grp_size == 1]
idf = df.set_index(["var1", "var2", "var3"])
# it works! GH#2101
result = idf.drop(drop_idx.index, level=0).reset_index()
expected = df[-df.var1.isin(drop_idx.index)]
result.index = expected.index
tm.assert_frame_equal(result, expected)
def test_drop_level(self, multiindex_dataframe_random_data):
frame = multiindex_dataframe_random_data
result = frame.drop(["bar", "qux"], level="first")
expected = frame.iloc[[0, 1, 2, 5, 6]]
tm.assert_frame_equal(result, expected)
result = frame.drop(["two"], level="second")
expected = frame.iloc[[0, 2, 3, 6, 7, 9]]
tm.assert_frame_equal(result, expected)
result = frame.T.drop(["bar", "qux"], axis=1, level="first")
expected = frame.iloc[[0, 1, 2, 5, 6]].T
tm.assert_frame_equal(result, expected)
result = frame.T.drop(["two"], axis=1, level="second")
expected = frame.iloc[[0, 2, 3, 6, 7, 9]].T
tm.assert_frame_equal(result, expected)
def test_drop_level_nonunique_datetime(self):
# GH#12701
idx = Index([2, 3, 4, 4, 5], name="id")
idxdt = pd.to_datetime(
[
"2016-03-23 14:00",
"2016-03-23 15:00",
"2016-03-23 16:00",
"2016-03-23 16:00",
"2016-03-23 17:00",
]
)
df = DataFrame(np.arange(10).reshape(5, 2), columns=list("ab"), index=idx)
df["tstamp"] = idxdt
df = df.set_index("tstamp", append=True)
ts = Timestamp("201603231600")
assert df.index.is_unique is False
result = df.drop(ts, level="tstamp")
expected = df.loc[idx != 4]
tm.assert_frame_equal(result, expected)
def test_drop_tz_aware_timestamp_across_dst(self, frame_or_series):
# GH#21761
start = Timestamp("2017-10-29", tz="Europe/Berlin")
end = Timestamp("2017-10-29 04:00:00", tz="Europe/Berlin")
index = pd.date_range(start, end, freq="15min")
data = frame_or_series(data=[1] * len(index), index=index)
result = data.drop(start)
expected_start = Timestamp("2017-10-29 00:15:00", tz="Europe/Berlin")
expected_idx = pd.date_range(expected_start, end, freq="15min")
expected = frame_or_series(data=[1] * len(expected_idx), index=expected_idx)
tm.assert_equal(result, expected)
def test_drop_preserve_names(self):
index = MultiIndex.from_arrays(
[[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]], names=["one", "two"]
)
df = DataFrame(np.random.default_rng(2).standard_normal((6, 3)), index=index)
result = df.drop([(0, 2)])
assert result.index.names == ("one", "two")
@pytest.mark.parametrize(
"operation", ["__iadd__", "__isub__", "__imul__", "__ipow__"]
)
@pytest.mark.parametrize("inplace", [False, True])
def test_inplace_drop_and_operation(self, operation, inplace):
# GH#30484
df = DataFrame({"x": range(5)})
expected = df.copy()
df["y"] = range(5)
y = df["y"]
with tm.assert_produces_warning(None):
if inplace:
df.drop("y", axis=1, inplace=inplace)
else:
df = df.drop("y", axis=1, inplace=inplace)
# Perform operation and check result
getattr(y, operation)(1)
tm.assert_frame_equal(df, expected)
def test_drop_with_non_unique_multiindex(self):
# GH#36293
mi = MultiIndex.from_arrays([["x", "y", "x"], ["i", "j", "i"]])
df = DataFrame([1, 2, 3], index=mi)
result = df.drop(index="x")
expected = DataFrame([2], index=MultiIndex.from_arrays([["y"], ["j"]]))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("indexer", [("a", "a"), [("a", "a")]])
def test_drop_tuple_with_non_unique_multiindex(self, indexer):
# GH#42771
idx = MultiIndex.from_product([["a", "b"], ["a", "a"]])
df = DataFrame({"x": range(len(idx))}, index=idx)
result = df.drop(index=[("a", "a")])
expected = DataFrame(
{"x": [2, 3]}, index=MultiIndex.from_tuples([("b", "a"), ("b", "a")])
)
tm.assert_frame_equal(result, expected)
def test_drop_with_duplicate_columns(self):
df = DataFrame(
[[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
)
result = df.drop(["a"], axis=1)
expected = DataFrame([[1], [1], [1]], columns=["bar"])
tm.assert_frame_equal(result, expected)
result = df.drop("a", axis=1)
tm.assert_frame_equal(result, expected)
def test_drop_with_duplicate_columns2(self):
# drop buggy GH#6240
df = DataFrame(
{
"A": np.random.default_rng(2).standard_normal(5),
"B": np.random.default_rng(2).standard_normal(5),
"C": np.random.default_rng(2).standard_normal(5),
"D": ["a", "b", "c", "d", "e"],
}
)
expected = df.take([0, 1, 1], axis=1)
df2 = df.take([2, 0, 1, 2, 1], axis=1)
result = df2.drop("C", axis=1)
tm.assert_frame_equal(result, expected)
def test_drop_inplace_no_leftover_column_reference(self):
# GH 13934
df = DataFrame({"a": [1, 2, 3]}, columns=Index(["a"], dtype="object"))
a = df.a
df.drop(["a"], axis=1, inplace=True)
tm.assert_index_equal(df.columns, Index([], dtype="object"))
a -= a.mean()
tm.assert_index_equal(df.columns, Index([], dtype="object"))
def test_drop_level_missing_label_multiindex(self):
# GH 18561
df = DataFrame(index=MultiIndex.from_product([range(3), range(3)]))
with pytest.raises(KeyError, match="labels \\[5\\] not found in level"):
df.drop(5, level=0)
@pytest.mark.parametrize("idx, level", [(["a", "b"], 0), (["a"], None)])
def test_drop_index_ea_dtype(self, any_numeric_ea_dtype, idx, level):
# GH#45860
df = DataFrame(
{"a": [1, 2, 2, pd.NA], "b": 100}, dtype=any_numeric_ea_dtype
).set_index(idx)
result = df.drop(Index([2, pd.NA]), level=level)
expected = DataFrame(
{"a": [1], "b": 100}, dtype=any_numeric_ea_dtype
).set_index(idx)
tm.assert_frame_equal(result, expected)
def test_drop_parse_strings_datetime_index(self):
# GH #5355
df = DataFrame(
{"a": [1, 2], "b": [1, 2]},
index=[Timestamp("2000-01-03"), Timestamp("2000-01-04")],
)
result = df.drop("2000-01-03", axis=0)
expected = DataFrame({"a": [2], "b": [2]}, index=[Timestamp("2000-01-04")])
tm.assert_frame_equal(result, expected)