import numpy as np
import pytest

from pandas.compat.pyarrow import pa_version_under10p1

from pandas.core.dtypes.missing import na_value_for_dtype

import pandas as pd
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args
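
# These tests exercise the ``dropna`` keyword of ``groupby``. As a minimal
# sketch (assuming default pandas behavior): dropna=True (the default)
# excludes NA group keys, while dropna=False keeps them as their own group:
#
#     >>> pd.DataFrame({"a": ["x", None, "x"], "b": [1, 2, 3]}).groupby(
#     ...     "a", dropna=False
#     ... )["b"].sum()
#     a
#     x      4
#     NaN    2
#     Name: b, dtype: int64
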
@pytest.mark.parametrize(
    "dropna, tuples, outputs",
    [
        (
            True,
            [["A", "B"], ["B", "A"]],
            {"c": [13.0, 123.23], "d": [13.0, 123.0], "e": [13.0, 1.0]},
        ),
        (
            False,
            [["A", "B"], ["A", np.nan], ["B", "A"]],
            {
                "c": [13.0, 12.3, 123.23],
                "d": [13.0, 233.0, 123.0],
                "e": [13.0, 12.0, 1.0],
            },
        ),
    ],
)
def test_groupby_dropna_multi_index_dataframe_nan_in_one_group(
    dropna, tuples, outputs, nulls_fixture
):
    # GH 3729: check the case where all NA values fall in a single group
    df_list = [
        ["A", "B", 12, 12, 12],
        ["A", nulls_fixture, 12.3, 233.0, 12],
        ["B", "A", 123.23, 123, 1],
        ["A", "B", 1, 1, 1.0],
    ]
    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
    grouped = df.groupby(["a", "b"], dropna=dropna).sum()

    mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))

    # By default the MultiIndex.from_* constructors drop NA from the levels,
    # so we need to add the NA level back manually afterwards.
    if not dropna:
        mi = mi.set_levels(["A", "B", np.nan], level="b")
    expected = pd.DataFrame(outputs, index=mi)

    tm.assert_frame_equal(grouped, expected)

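
# Note: ``nulls_fixture``/``nulls_fixture2`` are pandas conftest fixtures that
# parametrize over the different NA representations (None, np.nan, pd.NaT,
# pd.NA, ...), so each of these tests runs once per null type.
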
@pytest.mark.parametrize(
    "dropna, tuples, outputs",
    [
        (
            True,
            [["A", "B"], ["B", "A"]],
            {"c": [12.0, 123.23], "d": [12.0, 123.0], "e": [12.0, 1.0]},
        ),
        (
            False,
            [["A", "B"], ["A", np.nan], ["B", "A"], [np.nan, "B"]],
            {
                "c": [12.0, 13.3, 123.23, 1.0],
                "d": [12.0, 234.0, 123.0, 1.0],
                "e": [12.0, 13.0, 1.0, 1.0],
            },
        ),
    ],
)
def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
    dropna, tuples, outputs, nulls_fixture, nulls_fixture2
):
    # GH 3729: check the case where NA values appear in different groups,
    # possibly with different representations
    df_list = [
        ["A", "B", 12, 12, 12],
        ["A", nulls_fixture, 12.3, 233.0, 12],
        ["B", "A", 123.23, 123, 1],
        [nulls_fixture2, "B", 1, 1, 1.0],
        ["A", nulls_fixture2, 1, 1, 1.0],
    ]
    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
    grouped = df.groupby(["a", "b"], dropna=dropna).sum()

    mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))

    # By default the MultiIndex.from_* constructors drop NA from the levels,
    # so we need to add the NA level back manually afterwards.
    if not dropna:
        mi = mi.set_levels([["A", "B", np.nan], ["A", "B", np.nan]])
    expected = pd.DataFrame(outputs, index=mi)

    tm.assert_frame_equal(grouped, expected)

@pytest.mark.parametrize(
    "dropna, idx, outputs",
    [
        (True, ["A", "B"], {"b": [123.23, 13.0], "c": [123.0, 13.0], "d": [1.0, 13.0]}),
        (
            False,
            ["A", "B", np.nan],
            {
                "b": [123.23, 13.0, 12.3],
                "c": [123.0, 13.0, 233.0],
                "d": [1.0, 13.0, 12.0],
            },
        ),
    ],
)
def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs):
    # GH 3729
    df_list = [
        ["B", 12, 12, 12],
        [None, 12.3, 233.0, 12],
        ["A", 123.23, 123, 1],
        ["B", 1, 1, 1.0],
    ]
    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"])
    grouped = df.groupby("a", dropna=dropna).sum()

    expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a"))

    tm.assert_frame_equal(grouped, expected)

@pytest.mark.parametrize(
    "dropna, idx, expected",
    [
        (True, ["a", "a", "b", np.nan], pd.Series([3, 3], index=["a", "b"])),
        (
            False,
            ["a", "a", "b", np.nan],
            pd.Series([3, 3, 3], index=["a", "b", np.nan]),
        ),
    ],
)
def test_groupby_dropna_series_level(dropna, idx, expected):
    ser = pd.Series([1, 2, 3, 3], index=idx)

    result = ser.groupby(level=0, dropna=dropna).sum()
    tm.assert_series_equal(result, expected)

@pytest.mark.parametrize(
    "dropna, expected",
    [
        (True, pd.Series([210.0, 350.0], index=["a", "b"], name="Max Speed")),
        (
            False,
            pd.Series([210.0, 350.0, 20.0], index=["a", "b", np.nan], name="Max Speed"),
        ),
    ],
)
def test_groupby_dropna_series_by(dropna, expected):
    ser = pd.Series(
        [390.0, 350.0, 30.0, 20.0],
        index=["Falcon", "Falcon", "Parrot", "Parrot"],
        name="Max Speed",
    )

    result = ser.groupby(["a", "b", "a", np.nan], dropna=dropna).mean()
    tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("dropna", (False, True))
|
||
|
def test_grouper_dropna_propagation(dropna):
|
||
|
# GH 36604
|
||
|
df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]})
|
||
|
gb = df.groupby("A", dropna=dropna)
|
||
|
assert gb._grouper.dropna == dropna
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "index",
    [
        pd.RangeIndex(0, 4),
        list("abcd"),
        pd.MultiIndex.from_product([(1, 2), ("R", "B")], names=["num", "col"]),
    ],
)
def test_groupby_dataframe_slice_then_transform(dropna, index):
    # GH35014 & GH35612
    expected_data = {"B": [2, 2, 1, np.nan if dropna else 1]}

    df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=index)
    gb = df.groupby("A", dropna=dropna)

    result = gb.transform(len)
    expected = pd.DataFrame(expected_data, index=index)
    tm.assert_frame_equal(result, expected)

    result = gb[["B"]].transform(len)
    expected = pd.DataFrame(expected_data, index=index)
    tm.assert_frame_equal(result, expected)

    result = gb["B"].transform(len)
    expected = pd.Series(expected_data["B"], index=index, name="B")
    tm.assert_series_equal(result, expected)

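
# For ``transform`` the result is aligned back to the original rows, so with
# dropna=True the rows whose key is NA belong to no group and come back as
# NaN -- hence the ``np.nan if dropna else 1`` in the expectation above.
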
@pytest.mark.parametrize(
    "dropna, tuples, outputs",
    [
        (
            True,
            [["A", "B"], ["B", "A"]],
            {"c": [13.0, 123.23], "d": [12.0, 123.0], "e": [1.0, 1.0]},
        ),
        (
            False,
            [["A", "B"], ["A", np.nan], ["B", "A"]],
            {
                "c": [13.0, 12.3, 123.23],
                "d": [12.0, 233.0, 123.0],
                "e": [1.0, 12.0, 1.0],
            },
        ),
    ],
)
def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs):
    # GH 3729
    df_list = [
        ["A", "B", 12, 12, 12],
        ["A", None, 12.3, 233.0, 12],
        ["B", "A", 123.23, 123, 1],
        ["A", "B", 1, 1, 1.0],
    ]
    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
    agg_dict = {"c": "sum", "d": "max", "e": "min"}
    grouped = df.groupby(["a", "b"], dropna=dropna).agg(agg_dict)

    mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))

    # By default the MultiIndex.from_* constructors drop NA from the levels,
    # so we need to add the NA level back manually afterwards.
    if not dropna:
        mi = mi.set_levels(["A", "B", np.nan], level="b")
    expected = pd.DataFrame(outputs, index=mi)

    tm.assert_frame_equal(grouped, expected)

@pytest.mark.arm_slow
@pytest.mark.parametrize(
    "datetime1, datetime2",
    [
        (pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")),
        (pd.Timedelta("-2 days"), pd.Timedelta("-1 days")),
        (pd.Period("2020-01-01"), pd.Period("2020-02-01")),
    ],
)
@pytest.mark.parametrize("dropna, values", [(True, [12, 3]), (False, [12, 3, 6])])
def test_groupby_dropna_datetime_like_data(
    dropna, values, datetime1, datetime2, unique_nulls_fixture, unique_nulls_fixture2
):
    # GH 3729
    df = pd.DataFrame(
        {
            "values": [1, 2, 3, 4, 5, 6],
            "dt": [
                datetime1,
                unique_nulls_fixture,
                datetime2,
                unique_nulls_fixture2,
                datetime1,
                datetime1,
            ],
        }
    )

    if dropna:
        indexes = [datetime1, datetime2]
    else:
        indexes = [datetime1, datetime2, np.nan]

    grouped = df.groupby("dt", dropna=dropna).agg({"values": "sum"})
    expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt"))

    tm.assert_frame_equal(grouped, expected)

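
# Note: ``unique_nulls_fixture``/``unique_nulls_fixture2`` supply two distinct
# null objects, so the "dt" column above mixes NA representations in one run.
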
@pytest.mark.parametrize(
    "dropna, data, selected_data, levels",
    [
        pytest.param(
            False,
            {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            ["a", "b", np.nan],
            id="dropna_false_has_nan",
        ),
        pytest.param(
            True,
            {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0]},
            None,
            id="dropna_true_has_nan",
        ),
        pytest.param(
            # no nan in "groups"; dropna=True|False should give the same result
            False,
            {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            None,
            id="dropna_false_no_nan",
        ),
        pytest.param(
            # no nan in "groups"; dropna=True|False should give the same result
            True,
            {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            None,
            id="dropna_true_no_nan",
        ),
    ],
)
def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, levels):
    # GH 35889

    df = pd.DataFrame(data)
    gb = df.groupby("groups", dropna=dropna)
    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(DeprecationWarning, match=msg):
        result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))}))

    mi_tuples = tuple(zip(data["groups"], selected_data["values"]))
    mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None])
    # By default the MultiIndex.from_* constructors drop NA from the levels,
    # so we need to add the NA level back manually afterwards.
    if not dropna and levels:
        mi = mi.set_levels(levels, level="groups")

    expected = pd.DataFrame(selected_data, index=mi)
    tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("input_index", [None, ["a"], ["a", "b"]])
|
||
|
@pytest.mark.parametrize("keys", [["a"], ["a", "b"]])
|
||
|
@pytest.mark.parametrize("series", [True, False])
|
||
|
def test_groupby_dropna_with_multiindex_input(input_index, keys, series):
|
||
|
# GH#46783
|
||
|
obj = pd.DataFrame(
|
||
|
{
|
||
|
"a": [1, np.nan],
|
||
|
"b": [1, 1],
|
||
|
"c": [2, 3],
|
||
|
}
|
||
|
)
|
||
|
|
||
|
expected = obj.set_index(keys)
|
||
|
if series:
|
||
|
expected = expected["c"]
|
||
|
elif input_index == ["a", "b"] and keys == ["a"]:
|
||
|
# Column b should not be aggregated
|
||
|
expected = expected[["c"]]
|
||
|
|
||
|
if input_index is not None:
|
||
|
obj = obj.set_index(input_index)
|
||
|
gb = obj.groupby(keys, dropna=False)
|
||
|
if series:
|
||
|
gb = gb["c"]
|
||
|
result = gb.sum()
|
||
|
|
||
|
tm.assert_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_groupby_nan_included():
    # GH 35646
    data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
    df = pd.DataFrame(data)
    grouped = df.groupby("group", dropna=False)
    result = grouped.indices
    dtype = np.intp
    expected = {
        "g1": np.array([0, 2], dtype=dtype),
        "g2": np.array([3], dtype=dtype),
        np.nan: np.array([1, 4], dtype=dtype),
    }
    for result_values, expected_values in zip(result.values(), expected.values()):
        tm.assert_numpy_array_equal(result_values, expected_values)
    assert np.isnan(list(result.keys())[2])
    assert list(result.keys())[0:2] == ["g1", "g2"]

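
# ``GroupBy.indices`` maps each group key to the integer positions of its
# rows; with dropna=False the NaN key is included and, as asserted above,
# comes after the non-null keys.
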
def test_groupby_drop_nan_with_multi_index():
    # GH 39895
    df = pd.DataFrame([[np.nan, 0, 1]], columns=["a", "b", "c"])
    df = df.set_index(["a", "b"])
    result = df.groupby(["a", "b"], dropna=False).first()
    expected = df
    tm.assert_frame_equal(result, expected)

# sequence_index enumerates all length-4 strings made up of x, y, and z
@pytest.mark.parametrize("sequence_index", range(3**4))
@pytest.mark.parametrize(
    "dtype",
    [
        None,
        "UInt8",
        "Int8",
        "UInt16",
        "Int16",
        "UInt32",
        "Int32",
        "UInt64",
        "Int64",
        "Float32",
        "Float64",
        "category",
        "string",
        pytest.param(
            "string[pyarrow]",
            marks=pytest.mark.skipif(
                pa_version_under10p1, reason="pyarrow is not installed"
            ),
        ),
        "datetime64[ns]",
        "period[d]",
        "Sparse[float]",
    ],
)
@pytest.mark.parametrize("test_series", [True, False])
def test_no_sort_keep_na(sequence_index, dtype, test_series, as_index):
    # GH#46584, GH#48794

    # Convert sequence_index into a string sequence by reading off its base-3
    # digits, least significant first; e.g. 5 (0012 in base 3) becomes "zyxx".
    # This sequence is used for the grouper.
    sequence = "".join(
        [{0: "x", 1: "y", 2: "z"}[sequence_index // (3**k) % 3] for k in range(4)]
    )

    # Unique values to use for grouper, depends on dtype
    if dtype in ("string", "string[pyarrow]"):
        uniques = {"x": "x", "y": "y", "z": pd.NA}
    elif dtype in ("datetime64[ns]", "period[d]"):
        uniques = {"x": "2016-01-01", "y": "2017-01-01", "z": pd.NA}
    else:
        uniques = {"x": 1, "y": 2, "z": np.nan}

    df = pd.DataFrame(
        {
            "key": pd.Series([uniques[label] for label in sequence], dtype=dtype),
            "a": [0, 1, 2, 3],
        }
    )
    gb = df.groupby("key", dropna=False, sort=False, as_index=as_index, observed=False)
    if test_series:
        gb = gb["a"]
    result = gb.sum()

    # Manually compute the groupby sum, use the labels "x", "y", and "z" to avoid
    # issues with hashing np.nan
    summed = {}
    for idx, label in enumerate(sequence):
        summed[label] = summed.get(label, 0) + idx
    if dtype == "category":
        index = pd.CategoricalIndex(
            [uniques[e] for e in summed],
            df["key"].cat.categories,
            name="key",
        )
    elif isinstance(dtype, str) and dtype.startswith("Sparse"):
        index = pd.Index(
            pd.array([uniques[label] for label in summed], dtype=dtype), name="key"
        )
    else:
        index = pd.Index([uniques[label] for label in summed], dtype=dtype, name="key")
    expected = pd.Series(summed.values(), index=index, name="a", dtype=None)
    if not test_series:
        expected = expected.to_frame()
    if not as_index:
        expected = expected.reset_index()
        if dtype is not None and dtype.startswith("Sparse"):
            expected["key"] = expected["key"].astype(dtype)

    tm.assert_equal(result, expected)

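
# Illustrative sketch (assuming default pandas behavior): with sort=False,
# groups appear in order of first occurrence, and dropna=False keeps the NA
# key in that positional order instead of forcing it to the end, e.g.
#
#     >>> pd.Series([0, 1, 2], index=[np.nan, 1.0, np.nan]).groupby(
#     ...     level=0, dropna=False, sort=False
#     ... ).sum()
#     NaN    2
#     1.0    1
#     dtype: int64
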
@pytest.mark.parametrize("test_series", [True, False])
|
||
|
@pytest.mark.parametrize("dtype", [object, None])
|
||
|
def test_null_is_null_for_dtype(
|
||
|
sort, dtype, nulls_fixture, nulls_fixture2, test_series
|
||
|
):
|
||
|
# GH#48506 - groups should always result in using the null for the dtype
|
||
|
df = pd.DataFrame({"a": [1, 2]})
|
||
|
groups = pd.Series([nulls_fixture, nulls_fixture2], dtype=dtype)
|
||
|
obj = df["a"] if test_series else df
|
||
|
gb = obj.groupby(groups, dropna=False, sort=sort)
|
||
|
result = gb.sum()
|
||
|
index = pd.Index([na_value_for_dtype(groups.dtype)])
|
||
|
expected = pd.DataFrame({"a": [3]}, index=index)
|
||
|
if test_series:
|
||
|
tm.assert_series_equal(result, expected["a"])
|
||
|
else:
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
|
||
|
def test_categorical_reducers(reduction_func, observed, sort, as_index, index_kind):
|
||
|
# Ensure there is at least one null value by appending to the end
|
||
|
values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None)
|
||
|
df = pd.DataFrame(
|
||
|
{"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(20)}
|
||
|
)
|
||
|
|
||
|
# Strategy: Compare to dropna=True by filling null values with a new code
|
||
|
df_filled = df.copy()
|
||
|
df_filled["x"] = pd.Categorical(values, categories=[1, 2, 3, 4]).fillna(4)
|
||
|
|
||
|
if index_kind == "range":
|
||
|
keys = ["x"]
|
||
|
elif index_kind == "single":
|
||
|
keys = ["x"]
|
||
|
df = df.set_index("x")
|
||
|
df_filled = df_filled.set_index("x")
|
||
|
else:
|
||
|
keys = ["x", "x2"]
|
||
|
df["x2"] = df["x"]
|
||
|
df = df.set_index(["x", "x2"])
|
||
|
df_filled["x2"] = df_filled["x"]
|
||
|
df_filled = df_filled.set_index(["x", "x2"])
|
||
|
args = get_groupby_method_args(reduction_func, df)
|
||
|
args_filled = get_groupby_method_args(reduction_func, df_filled)
|
||
|
if reduction_func == "corrwith" and index_kind == "range":
|
||
|
# Don't include the grouping columns so we can call reset_index
|
||
|
args = (args[0].drop(columns=keys),)
|
||
|
args_filled = (args_filled[0].drop(columns=keys),)
|
||
|
|
||
|
gb_keepna = df.groupby(
|
||
|
keys, dropna=False, observed=observed, sort=sort, as_index=as_index
|
||
|
)
|
||
|
|
||
|
if not observed and reduction_func in ["idxmin", "idxmax"]:
|
||
|
with pytest.raises(
|
||
|
ValueError, match="empty group due to unobserved categories"
|
||
|
):
|
||
|
getattr(gb_keepna, reduction_func)(*args)
|
||
|
return
|
||
|
|
||
|
gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True)
|
||
|
expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index()
|
||
|
expected["x"] = expected["x"].cat.remove_categories([4])
|
||
|
if index_kind == "multi":
|
||
|
expected["x2"] = expected["x2"].cat.remove_categories([4])
|
||
|
if as_index:
|
||
|
if index_kind == "multi":
|
||
|
expected = expected.set_index(["x", "x2"])
|
||
|
else:
|
||
|
expected = expected.set_index("x")
|
||
|
elif index_kind != "range" and reduction_func != "size":
|
||
|
# size, unlike other methods, has the desired behavior in GH#49519
|
||
|
expected = expected.drop(columns="x")
|
||
|
if index_kind == "multi":
|
||
|
expected = expected.drop(columns="x2")
|
||
|
if reduction_func in ("idxmax", "idxmin") and index_kind != "range":
|
||
|
# expected was computed with a RangeIndex; need to translate to index values
|
||
|
values = expected["y"].values.tolist()
|
||
|
if index_kind == "single":
|
||
|
values = [np.nan if e == 4 else e for e in values]
|
||
|
expected["y"] = pd.Categorical(values, categories=[1, 2, 3])
|
||
|
else:
|
||
|
values = [(np.nan, np.nan) if e == (4, 4) else e for e in values]
|
||
|
expected["y"] = values
|
||
|
if reduction_func == "size":
|
||
|
# size, unlike other methods, has the desired behavior in GH#49519
|
||
|
expected = expected.rename(columns={0: "size"})
|
||
|
if as_index:
|
||
|
expected = expected["size"].rename(None)
|
||
|
|
||
|
if as_index or index_kind == "range" or reduction_func == "size":
|
||
|
warn = None
|
||
|
else:
|
||
|
warn = FutureWarning
|
||
|
msg = "A grouping .* was excluded from the result"
|
||
|
with tm.assert_produces_warning(warn, match=msg):
|
||
|
result = getattr(gb_keepna, reduction_func)(*args)
|
||
|
|
||
|
# size will return a Series, others are DataFrame
|
||
|
tm.assert_equal(result, expected)
|
||
|
|
||
|
|
||
|
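
# ``Categorical.remove_categories`` turns removed codes into NaN, so dropping
# the filler category 4 above converts those entries back into the NA group
# key that dropna=False is expected to produce.
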
def test_categorical_transformers(
    request, transformation_func, observed, sort, as_index
):
    # GH#36327
    if transformation_func == "fillna":
        msg = "GH#49651 fillna may incorrectly reorder results when dropna=False"
        request.applymarker(pytest.mark.xfail(reason=msg, strict=False))

    values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None)
    df = pd.DataFrame(
        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(20)}
    )
    args = get_groupby_method_args(transformation_func, df)

    # Compute result for null group
    null_group_values = df[df["x"].isnull()]["y"]
    if transformation_func == "cumcount":
        null_group_data = list(range(len(null_group_values)))
    elif transformation_func == "ngroup":
        if sort:
            if observed:
                na_group = df["x"].nunique(dropna=False) - 1
            else:
                # TODO: Should this be 3?
                na_group = df["x"].nunique(dropna=False) - 1
        else:
            na_group = df.iloc[: null_group_values.index[0]]["x"].nunique()
        null_group_data = len(null_group_values) * [na_group]
    else:
        null_group_data = getattr(null_group_values, transformation_func)(*args)
    null_group_result = pd.DataFrame({"y": null_group_data})

    gb_keepna = df.groupby(
        "x", dropna=False, observed=observed, sort=sort, as_index=as_index
    )
    gb_dropna = df.groupby("x", dropna=True, observed=observed, sort=sort)

    msg = "The default fill_method='ffill' in DataFrameGroupBy.pct_change is deprecated"
    if transformation_func == "pct_change":
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = getattr(gb_keepna, "pct_change")(*args)
    else:
        result = getattr(gb_keepna, transformation_func)(*args)
    expected = getattr(gb_dropna, transformation_func)(*args)

    for iloc, value in zip(
        df[df["x"].isnull()].index.tolist(), null_group_result.values.ravel()
    ):
        if expected.ndim == 1:
            expected.iloc[iloc] = value
        else:
            expected.iloc[iloc, 0] = value
    if transformation_func == "ngroup":
        expected[df["x"].notnull() & expected.ge(na_group)] += 1
    if transformation_func not in ("rank", "diff", "pct_change", "shift"):
        expected = expected.astype("int64")

    tm.assert_equal(result, expected)

@pytest.mark.parametrize("method", ["head", "tail"])
|
||
|
def test_categorical_head_tail(method, observed, sort, as_index):
|
||
|
# GH#36327
|
||
|
values = np.random.default_rng(2).choice([1, 2, None], 30)
|
||
|
df = pd.DataFrame(
|
||
|
{"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
|
||
|
)
|
||
|
gb = df.groupby("x", dropna=False, observed=observed, sort=sort, as_index=as_index)
|
||
|
result = getattr(gb, method)()
|
||
|
|
||
|
if method == "tail":
|
||
|
values = values[::-1]
|
||
|
# Take the top 5 values from each group
|
||
|
mask = (
|
||
|
((values == 1) & ((values == 1).cumsum() <= 5))
|
||
|
| ((values == 2) & ((values == 2).cumsum() <= 5))
|
||
|
# flake8 doesn't like the vectorized check for None, thinks we should use `is`
|
||
|
| ((values == None) & ((values == None).cumsum() <= 5)) # noqa: E711
|
||
|
)
|
||
|
if method == "tail":
|
||
|
mask = mask[::-1]
|
||
|
expected = df[mask]
|
||
|
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_categorical_agg():
    # GH#36327
    values = np.random.default_rng(2).choice([1, 2, None], 30)
    df = pd.DataFrame(
        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
    )
    gb = df.groupby("x", dropna=False, observed=False)
    result = gb.agg(lambda x: x.sum())
    expected = gb.sum()
    tm.assert_frame_equal(result, expected)

def test_categorical_transform():
    # GH#36327
    values = np.random.default_rng(2).choice([1, 2, None], 30)
    df = pd.DataFrame(
        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
    )
    gb = df.groupby("x", dropna=False, observed=False)
    result = gb.transform(lambda x: x.sum())
    expected = gb.transform("sum")
    tm.assert_frame_equal(result, expected)