# NOTE: the following lines are web-page residue captured with the file, not code.
# You can not select more than 25 topics
# Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
# 2170 lines
# 72 KiB
# 2170 lines
# 72 KiB
from datetime import datetime

import numpy as np
import pytest

import pandas as pd
from pandas import (
    Categorical,
    CategoricalIndex,
    DataFrame,
    Index,
    MultiIndex,
    Series,
    qcut,
)
import pandas._testing as tm
from pandas.api.typing import SeriesGroupBy
from pandas.tests.groupby import get_groupby_method_args
def cartesian_product_for_groupers(result, args, names, fill_value=np.nan):
    """Reindex to a cartesian production for the groupers,
    preserving the nature (Categorical) of each grouper

    Parameters
    ----------
    result : DataFrame or Series
        Object to reindex.
    args : iterable
        One entry per grouper; Categorical / CategoricalIndex entries are
        expanded to all of their categories.
    names : list
        Level names for the product MultiIndex.
    fill_value : scalar, default np.nan
        Value used for rows introduced by the reindex.

    Returns
    -------
    The reindexed object, sorted by index.
    """

    def f(a):
        # Replace a categorical grouper by one value per category so that
        # unobserved categories take part in the product.
        if isinstance(a, (CategoricalIndex, Categorical)):
            categories = a.categories
            a = Categorical.from_codes(
                np.arange(len(categories)), categories=categories, ordered=a.ordered
            )
        return a

    index = MultiIndex.from_product(map(f, args), names=names)
    return result.reindex(index, fill_value=fill_value).sort_index()
_results_for_groupbys_with_missing_categories = {
# This maps the builtin groupby functions to their expected outputs for
# missing categories when they are called on a categorical grouper with
# observed=False. Some functions are expected to return NaN, some zero.
# These expected values can be used across several tests (i.e. they are
# the same for SeriesGroupBy and DataFrameGroupBy) but they should only be
# hardcoded in one place.
"all": np.nan,
"any": np.nan,
"count": 0,
"corrwith": np.nan,
"first": np.nan,
"idxmax": np.nan,
"idxmin": np.nan,
"last": np.nan,
"max": np.nan,
"mean": np.nan,
"median": np.nan,
"min": np.nan,
"nth": np.nan,
"nunique": 0,
"prod": np.nan,
"quantile": np.nan,
"sem": np.nan,
"size": 0,
"skew": np.nan,
"std": np.nan,
"sum": 0,
"var": np.nan,
def test_apply_use_categorical_name(df):
    """The result of apply on a qcut grouper keeps the grouper's name ("C")."""
    cats = qcut(df.C, 4)

    def get_stats(group):
        return {
            "min": group.min(),
            "max": group.max(),
            "count": group.count(),
            "mean": group.mean(),
        }

    result = df.groupby(cats, observed=False).D.apply(get_stats)
    assert result.index.names[0] == "C"
def test_basic(using_infer_string):  # TODO: split this test
    """Exercise mean/sum/transform/filter/apply/describe with categorical groupers."""
    cats = Categorical(
        ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
        categories=["a", "b", "c", "d"],
        ordered=True,
    )
    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    exp_index = CategoricalIndex(list("abcd"), name="b", ordered=True)
    expected = DataFrame({"a": [1, 2, 4, np.nan]}, index=exp_index)
    result = data.groupby("b", observed=False).mean()
    tm.assert_frame_equal(result, expected)

    cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})

    # single grouper
    gb = df.groupby("A", observed=False)
    exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True)
    expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)})
    result = gb.sum(numeric_only=True)
    tm.assert_frame_equal(result, expected)

    # GH 8623
    x = DataFrame(
        [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. Doe"]],
        columns=["person_id", "person_name"],
    )
    x["person_name"] = Categorical(x.person_name)

    g = x.groupby(["person_id"], observed=False)
    result = g.transform(lambda x: x)
    tm.assert_frame_equal(result, x[["person_name"]])

    result = x.drop_duplicates("person_name")
    expected = x.iloc[[0, 1]]
    tm.assert_frame_equal(result, expected)

    def f(x):
        return x.drop_duplicates("person_name").iloc[0]

    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(DeprecationWarning, match=msg):
        result = g.apply(f)
    expected = x.iloc[[0, 1]].copy()
    expected.index = Index([1, 2], name="person_id")
    dtype = "string[pyarrow_numpy]" if using_infer_string else object
    expected["person_name"] = expected["person_name"].astype(dtype)
    tm.assert_frame_equal(result, expected)

    # GH 9921
    # Monotonic
    df = DataFrame({"a": [5, 15, 25]})
    c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])

    msg = "using SeriesGroupBy.sum"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#53425
        result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df["a"])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"]
    )
    msg = "using DataFrameGroupBy.sum"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#53425
        result = df.groupby(c, observed=False).transform(sum)
    expected = df[["a"]]
    tm.assert_frame_equal(result, expected)

    gbc = df.groupby(c, observed=False)
    result = gbc.transform(lambda xs: np.max(xs, axis=0))
    tm.assert_frame_equal(result, df[["a"]])

    result2 = gbc.transform(lambda xs: np.max(xs, axis=0))
    msg = "using DataFrameGroupBy.max"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#53425
        result3 = gbc.transform(max)
    result4 = gbc.transform(np.maximum.reduce)
    result5 = gbc.transform(lambda xs: np.maximum.reduce(xs))
    tm.assert_frame_equal(result2, df[["a"]], check_dtype=False)
    tm.assert_frame_equal(result3, df[["a"]], check_dtype=False)
    tm.assert_frame_equal(result4, df[["a"]])
    tm.assert_frame_equal(result5, df[["a"]])

    # Filter
    tm.assert_series_equal(df.a.groupby(c, observed=False).filter(np.all), df["a"])
    tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df)

    # Non-monotonic
    df = DataFrame({"a": [5, 15, 25, -5]})
    c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])

    msg = "using SeriesGroupBy.sum"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#53425
        result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df["a"])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"]
    )
    msg = "using DataFrameGroupBy.sum"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#53425
        result = df.groupby(c, observed=False).transform(sum)
    expected = df[["a"]]
    tm.assert_frame_equal(result, expected)

    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df[["a"]]
    )

    # GH 9603
    df = DataFrame({"a": [1, 0, 0, 0]})
    c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list("abcd")))
    result = df.groupby(c, observed=False).apply(len)

    exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered)
    expected = Series([1, 0, 0, 0], index=exp_index)
    expected.index.name = "a"
    tm.assert_series_equal(result, expected)

    # more basic
    levels = ["foo", "bar", "baz", "qux"]
    codes = np.random.default_rng(2).integers(0, 4, size=100)

    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.default_rng(2).standard_normal((100, 4)))

    result = data.groupby(cats, observed=False).mean()

    expected = data.groupby(np.asarray(cats), observed=False).mean()
    exp_idx = CategoricalIndex(levels, categories=cats.categories, ordered=True)
    expected = expected.reindex(exp_idx)

    tm.assert_frame_equal(result, expected)

    grouped = data.groupby(cats, observed=False)
    desc_result = grouped.describe()

    # Order the rows by category code so the unsorted groupby below yields
    # groups in category order.
    idx = cats.codes.argsort()
    ord_labels = np.asarray(cats).take(idx)
    ord_data = data.take(idx)

    exp_cats = Categorical(
        ord_labels, ordered=True, categories=["foo", "bar", "baz", "qux"]
    )
    expected = ord_data.groupby(exp_cats, sort=False, observed=False).describe()
    tm.assert_frame_equal(desc_result, expected)

    # GH 10460
    expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
    exp = CategoricalIndex(expc)
    tm.assert_index_equal(
        (desc_result.stack(future_stack=True).index.get_level_values(0)), exp
    )
    exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4)
    tm.assert_index_equal(
        (desc_result.stack(future_stack=True).index.get_level_values(1)), exp
    )
def test_level_get_group(observed):
    """get_group on a categorical index level returns only that level's rows."""
    # GH15155
    df = DataFrame(
        data=np.arange(2, 22, 2),
        index=MultiIndex(
            levels=[CategoricalIndex(["a", "b"]), range(10)],
            codes=[[0] * 5 + [1] * 5, range(10)],
            names=["Index1", "Index2"],
        ),
    )
    g = df.groupby(level=["Index1"], observed=observed)

    # expected should equal test.loc[["a"]]
    # GH15166
    expected = DataFrame(
        data=np.arange(2, 12, 2),
        index=MultiIndex(
            levels=[CategoricalIndex(["a", "b"]), range(5)],
            codes=[[0] * 5, range(5)],
            names=["Index1", "Index2"],
        ),
    )
    msg = "you will need to pass a length-1 tuple"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#25971 - warn when not passing a length-1 tuple
        result = g.get_group("a")

    tm.assert_frame_equal(result, expected)
def test_sorting_with_different_categoricals():
    """value_counts per group sorts an ordered categorical by category order."""
    # GH 24271
    df = DataFrame(
        {
            "group": ["A"] * 6 + ["B"] * 6,
            "dose": ["high", "med", "low"] * 4,
            "outcomes": np.arange(12.0),
        }
    )

    df.dose = Categorical(df.dose, categories=["low", "med", "high"], ordered=True)

    result = df.groupby("group")["dose"].value_counts()
    result = result.sort_index(level=0, sort_remaining=True)
    index = ["low", "med", "high", "low", "med", "high"]
    index = Categorical(index, categories=["low", "med", "high"], ordered=True)
    index = [["A", "A", "A", "B", "B", "B"], CategoricalIndex(index)]
    index = MultiIndex.from_arrays(index, names=["group", "dose"])
    expected = Series([2] * 6, index=index, name="count")
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("ordered", [True, False])
def test_apply(ordered):
    """apply/mean/agg on two categorical groupers with observed=True."""
    # GH 10138

    dense = Categorical(list("abc"), ordered=ordered)

    # 'b' is in the categories but not in the list
    missing = Categorical(list("aaa"), categories=["a", "b"], ordered=ordered)
    values = np.arange(len(dense))
    df = DataFrame({"missing": missing, "dense": dense, "values": values})
    grouped = df.groupby(["missing", "dense"], observed=True)

    # missing category 'b' should still exist in the output index
    idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"])
    expected = DataFrame([0, 1, 2.0], index=idx, columns=["values"])

    result = grouped.apply(lambda x: np.mean(x, axis=0))
    tm.assert_frame_equal(result, expected)

    result = grouped.mean()
    tm.assert_frame_equal(result, expected)

    msg = "using DataFrameGroupBy.mean"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#53425
        result = grouped.agg(np.mean)
    tm.assert_frame_equal(result, expected)

    # but for transform we should still get back the original index
    idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"])
    expected = Series(1, index=idx)
    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(DeprecationWarning, match=msg):
        result = grouped.apply(lambda x: 1)
    tm.assert_series_equal(result, expected)
def test_observed(observed):
    """Results for observed=True/False with one or several categorical groupers."""
    # multiple groupers, don't re-expand the output space
    # of the grouper
    # gh-14942 (implement)
    # gh-10132 (back-compat)
    # gh-8138 (back-compat)
    # gh-8869

    cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
    df["C"] = ["foo", "bar"] * 2

    # multiple groupers with a non-cat
    gb = df.groupby(["A", "B", "C"], observed=observed)
    exp_index = MultiIndex.from_arrays(
        [cat1, cat2, ["foo", "bar"] * 2], names=["A", "B", "C"]
    )
    expected = DataFrame({"values": Series([1, 2, 3, 4], index=exp_index)}).sort_index()
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [cat1, cat2, ["foo", "bar"]], list("ABC"), fill_value=0
        )

    tm.assert_frame_equal(result, expected)

    gb = df.groupby(["A", "B"], observed=observed)
    exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"])
    expected = DataFrame(
        {"values": [1, 2, 3, 4], "C": ["foo", "bar", "foo", "bar"]}, index=exp_index
    )
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [cat1, cat2], list("AB"), fill_value=0
        )

    tm.assert_frame_equal(result, expected)

    d = {
        "cat": Categorical(
            ["a", "b", "a", "b"], categories=["a", "b", "c"], ordered=True
        ),
        "ints": [1, 1, 2, 2],
        "val": [10, 20, 30, 40],
    }
    df = DataFrame(d)

    # Grouping on a single column
    groups_single_key = df.groupby("cat", observed=observed)
    result = groups_single_key.mean()

    exp_index = CategoricalIndex(
        list("ab"), name="cat", categories=list("abc"), ordered=True
    )
    expected = DataFrame({"ints": [1.5, 1.5], "val": [20.0, 30]}, index=exp_index)
    if not observed:
        index = CategoricalIndex(
            list("abc"), name="cat", categories=list("abc"), ordered=True
        )
        expected = expected.reindex(index)

    tm.assert_frame_equal(result, expected)

    # Grouping on two columns
    groups_double_key = df.groupby(["cat", "ints"], observed=observed)
    result = groups_double_key.agg("mean")
    expected = DataFrame(
        {
            "val": [10.0, 30.0, 20.0, 40.0],
            "cat": Categorical(
                ["a", "a", "b", "b"], categories=["a", "b", "c"], ordered=True
            ),
            "ints": [1, 2, 1, 2],
        }
    ).set_index(["cat", "ints"])
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [df.cat.values, [1, 2]], ["cat", "ints"]
        )

    tm.assert_frame_equal(result, expected)

    # GH 10132
    for key in [("a", 1), ("b", 2), ("b", 1), ("a", 2)]:
        c, i = key
        result = groups_double_key.get_group(key)
        expected = df[(df.cat == c) & (df.ints == i)]
        tm.assert_frame_equal(result, expected)

    # gh-8869
    # with as_index
    d = {
        "foo": [10, 8, 4, 8, 4, 1, 1],
        "bar": [10, 20, 30, 40, 50, 60, 70],
        "baz": ["d", "c", "e", "a", "a", "d", "c"],
    }
    df = DataFrame(d)
    cat = pd.cut(df["foo"], np.linspace(0, 10, 3))
    df["range"] = cat
    groups = df.groupby(["range", "baz"], as_index=False, observed=observed)
    result = groups.agg("mean")

    groups2 = df.groupby(["range", "baz"], as_index=True, observed=observed)
    expected = groups2.agg("mean").reset_index()
    tm.assert_frame_equal(result, expected)
def test_observed_codes_remap(observed):
    """Group by a pd.cut result plus a column and compare the mean per group."""
    d = {"C1": [3, 3, 4, 5], "C2": [1, 2, 3, 4], "C3": [10, 100, 200, 34]}
    df = DataFrame(d)
    values = pd.cut(df["C1"], [1, 2, 3, 6])
    values.name = "cat"
    groups_double_key = df.groupby([values, "C2"], observed=observed)

    idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]], names=["cat", "C2"])
    expected = DataFrame(
        {"C1": [3.0, 3.0, 4.0, 5.0], "C3": [10.0, 100.0, 200.0, 34.0]}, index=idx
    )
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [values.values, [1, 2, 3, 4]], ["cat", "C2"]
        )

    result = groups_double_key.agg("mean")
    tm.assert_frame_equal(result, expected)
def test_observed_perf():
    """With observed=True each result level holds exactly the observed values."""
    # we create a cartesian product, so this is
    # non-performant if we don't use observed values
    # gh-14942
    df = DataFrame(
        {
            "cat": np.random.default_rng(2).integers(0, 255, size=30000),
            "int_id": np.random.default_rng(2).integers(0, 255, size=30000),
            "other_id": np.random.default_rng(2).integers(0, 10000, size=30000),
            "foo": 0,
        }
    )
    df["cat"] = df.cat.astype(str).astype("category")

    grouped = df.groupby(["cat", "int_id", "other_id"], observed=True)
    result = grouped.count()
    assert result.index.levels[0].nunique() == df.cat.nunique()
    assert result.index.levels[1].nunique() == df.int_id.nunique()
    assert result.index.levels[2].nunique() == df.other_id.nunique()
def test_observed_groups(observed):
    """GroupBy.groups lists unobserved categories only when observed=False."""
    # gh-20583
    # test that we have the appropriate groups

    cat = Categorical(["a", "c", "a"], categories=["a", "b", "c"])
    df = DataFrame({"cat": cat, "vals": [1, 2, 3]})
    g = df.groupby("cat", observed=observed)

    result = g.groups
    if observed:
        expected = {"a": Index([0, 2], dtype="int64"), "c": Index([1], dtype="int64")}
    else:
        expected = {
            "a": Index([0, 2], dtype="int64"),
            "b": Index([], dtype="int64"),
            "c": Index([1], dtype="int64"),
        }

    tm.assert_dict_equal(result, expected)
"keys, expected_values, expected_index_levels",
("a", [15, 9, 0], CategoricalIndex([1, 2, 3], name="a")),
["a", "b"],
[7, 8, 0, 0, 0, 9, 0, 0, 0],
[CategoricalIndex([1, 2, 3], name="a"), Index([4, 5, 6])],
["a", "a2"],
[15, 0, 0, 0, 9, 0, 0, 0, 0],
CategoricalIndex([1, 2, 3], name="a"),
CategoricalIndex([1, 2, 3], name="a"),
@pytest.mark.parametrize("test_series", [True, False])
def test_unobserved_in_index(keys, expected_values, expected_index_levels, test_series):
# GH#49354 - ensure unobserved cats occur when grouping by index levels
df = DataFrame(
"a": Categorical([1, 1, 2], categories=[1, 2, 3]),
"a2": Categorical([1, 1, 2], categories=[1, 2, 3]),
"b": [4, 5, 6],
"c": [7, 8, 9],
).set_index(["a", "a2"])
if "b" not in keys:
# Only keep b when it is used for grouping for consistent columns in the result
df = df.drop(columns="b")
gb = df.groupby(keys, observed=False)
if test_series:
gb = gb["c"]
result = gb.sum()
if len(keys) == 1:
index = expected_index_levels
codes = [[0, 0, 0, 1, 1, 1, 2, 2, 2], 3 * [0, 1, 2]]
index = MultiIndex(
expected = DataFrame({"c": expected_values}, index=index)
if test_series:
expected = expected["c"]
tm.assert_equal(result, expected)
def test_observed_groups_with_nan(observed):
    """GroupBy.groups for a categorical key that contains NaN."""
    # GH 24740
    df = DataFrame(
        {
            "cat": Categorical(["a", np.nan, "a"], categories=["a", "b", "d"]),
            "vals": [1, 2, 3],
        }
    )
    g = df.groupby("cat", observed=observed)
    result = g.groups
    if observed:
        expected = {"a": Index([0, 2], dtype="int64")}
    else:
        expected = {
            "a": Index([0, 2], dtype="int64"),
            "b": Index([], dtype="int64"),
            "d": Index([], dtype="int64"),
        }
    tm.assert_dict_equal(result, expected)
def test_observed_nth():
    """nth(0) on a categorical key with NaNs returns the first non-NaN-keyed row."""
    # GH 26385
    cat = Categorical(["a", np.nan, np.nan], categories=["a", "b", "c"])
    ser = Series([1, 2, 3])
    df = DataFrame({"cat": cat, "ser": ser})

    result = df.groupby("cat", observed=False)["ser"].nth(0)
    expected = df["ser"].iloc[[0]]
    tm.assert_series_equal(result, expected)
def test_dataframe_categorical_with_nan(observed):
    """first() grouped by a categorical column that contains NaN."""
    # GH 21151
    s1 = Categorical([np.nan, "a", np.nan, "a"], categories=["a", "b", "c"])
    s2 = Series([1, 2, 3, 4])
    df = DataFrame({"s1": s1, "s2": s2})
    result = df.groupby("s1", observed=observed).first().reset_index()
    if observed:
        expected = DataFrame(
            {"s1": Categorical(["a"], categories=["a", "b", "c"]), "s2": [2]}
        )
    else:
        expected = DataFrame(
            {
                "s1": Categorical(["a", "b", "c"], categories=["a", "b", "c"]),
                "s2": [2, np.nan, np.nan],
            }
        )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("ordered", [True, False])
@pytest.mark.parametrize("observed", [True, False])
@pytest.mark.parametrize("sort", [True, False])
def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
    """Index labels and "first" aggregates must stay aligned for every option mix."""
    # GH 25871: Fix groupby sorting on ordered Categoricals
    # GH 25167: Groupby with observed=True doesn't sort

    # Build a dataframe with cat having one unobserved category ('missing'),
    # and a Series with identical values
    label = Categorical(
        ["d", "a", "b", "a", "d", "b"],
        categories=["a", "b", "missing", "d"],
        ordered=ordered,
    )
    val = Series(["d", "a", "b", "a", "d", "b"])
    df = DataFrame({"label": label, "val": val})

    # aggregate on the Categorical
    result = df.groupby("label", observed=observed, sort=sort)["val"].aggregate("first")

    # If ordering works, we expect index labels equal to aggregation results,
    # except for 'observed=False': label 'missing' has aggregation None
    label = Series(result.index.array, dtype="object")
    aggr = Series(result.array)
    if not observed:
        aggr[aggr.isna()] = "missing"
    if not all(label == aggr):
        msg = (
            "Labels and aggregation results not consistently sorted\n"
            f"for (ordered={ordered}, observed={observed}, sort={sort})\n"
            f"Result:\n{result}"
        )
        assert False, msg
def test_datetime():
# GH9049: ensure backward compatibility
levels = pd.date_range("2014-01-01", periods=4)
codes = np.random.default_rng(2).integers(0, 4, size=100)
cats = Categorical.from_codes(codes, levels, ordered=True)
data = DataFrame(np.random.default_rng(2).standard_normal((100, 4)))
result = data.groupby(cats, observed=False).mean()
expected = data.groupby(np.asarray(cats), observed=False).mean()
expected = expected.reindex(levels)
expected.index = CategoricalIndex(
expected.index, categories=expected.index, ordered=True
tm.assert_frame_equal(result, expected)
grouped = data.groupby(cats, observed=False)
desc_result = grouped.describe()
idx =
ord_labels = cats.take(idx)
ord_data = data.take(idx)
expected = ord_data.groupby(ord_labels, observed=False).describe()
tm.assert_frame_equal(desc_result, expected)
tm.assert_index_equal(desc_result.index, expected.index)
desc_result.index.get_level_values(0), expected.index.get_level_values(0)
# GH 10460
expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
exp = CategoricalIndex(expc)
(desc_result.stack(future_stack=True).index.get_level_values(0)), exp
exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4)
(desc_result.stack(future_stack=True).index.get_level_values(1)), exp
def test_categorical_index():
    """Grouping by a categorical index level or column yields a CategoricalIndex."""
    s = np.random.default_rng(2)
    levels = ["foo", "bar", "baz", "qux"]
    codes = s.integers(0, 4, size=20)
    cats = Categorical.from_codes(codes, levels, ordered=True)
    df = DataFrame(np.repeat(np.arange(20), 4).reshape(-1, 4), columns=list("abcd"))
    df["cats"] = cats

    # with a cat index
    result = df.set_index("cats").groupby(level=0, observed=False).sum()
    expected = df[list("abcd")].groupby(cats.codes, observed=False).sum()
    expected.index = CategoricalIndex(
        Categorical.from_codes([0, 1, 2, 3], levels, ordered=True), name="cats"
    )
    tm.assert_frame_equal(result, expected)

    # with a cat column, should produce a cat index
    result = df.groupby("cats", observed=False).sum()
    expected = df[list("abcd")].groupby(cats.codes, observed=False).sum()
    expected.index = CategoricalIndex(
        Categorical.from_codes([0, 1, 2, 3], levels, ordered=True), name="cats"
    )
    tm.assert_frame_equal(result, expected)
def test_describe_categorical_columns():
# GH 11558
cats = CategoricalIndex(
["qux", "foo", "baz", "bar"],
categories=["foo", "bar", "baz", "qux"],
df = DataFrame(np.random.default_rng(2).standard_normal((20, 4)), columns=cats)
result = df.groupby([1, 2, 3, 4] * 5).describe()
tm.assert_index_equal(result.stack(future_stack=True).columns, cats)
result.stack(future_stack=True).columns.values, cats.values
def test_unstack_categorical():
    """Unstacking a categorical group level gives categorical columns."""
    # GH11558 (example is taken from the original issue)
    df = DataFrame(
        {"a": range(10), "medium": ["A", "B"] * 5, "artist": list("XYXXY") * 2}
    )
    df["medium"] = df["medium"].astype("category")

    gcat = df.groupby(["artist", "medium"], observed=False)["a"].count().unstack()
    result = gcat.describe()

    exp_columns = CategoricalIndex(["A", "B"], ordered=False, name="medium")
    tm.assert_index_equal(result.columns, exp_columns)
    tm.assert_categorical_equal(result.columns.values, exp_columns.values)

    result = gcat["A"] + gcat["B"]
    expected = Series([6, 4], index=Index(["X", "Y"], name="artist"))
    tm.assert_series_equal(result, expected)
def test_bins_unequal_len():
    """Grouping by bins shorter than the series raises ValueError."""
    # GH3011
    series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])
    bins = pd.cut(series.dropna().values, 4)

    # len(bins) != len(series) here
    with pytest.raises(ValueError, match="Grouper and axis must be same length"):
        series.groupby(bins).mean()
["series", "data"],
# Group a series with length and index equal to those of the grouper.
(Series(range(4)), {"A": [0, 3], "B": [1, 2]}),
# Group a series with length equal to that of the grouper and index unequal to
# that of the grouper.
(Series(range(4)).rename(lambda idx: idx + 1), {"A": [2], "B": [0, 1]}),
# GH44179: Group a series with length unequal to that of the grouper.
(Series(range(7)), {"A": [0, 3], "B": [1, 2]}),
def test_categorical_series(series, data):
# Group the given series by a series with categorical data type such that group A
# takes indices 0 and 3 and group B indices 1 and 2, obtaining the values mapped in
# the given data.
groupby = series.groupby(Series(list("ABBA"), dtype="category"), observed=False)
result = groupby.aggregate(list)
expected = Series(data, index=CategoricalIndex(data.keys()))
tm.assert_series_equal(result, expected)
def test_as_index():
# GH13204
df = DataFrame(
"cat": Categorical([1, 2, 2], [1, 2, 3]),
"A": [10, 11, 11],
"B": [101, 102, 103],
result = df.groupby(["cat", "A"], as_index=False, observed=True).sum()
expected = DataFrame(
"cat": Categorical([1, 2],,
"A": [10, 11],
"B": [101, 205],
columns=["cat", "A", "B"],
tm.assert_frame_equal(result, expected)
# function grouper
f = lambda r: df.loc[r, "A"]
msg = "A grouping .* was excluded from the result"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.groupby(["cat", f], as_index=False, observed=True).sum()
expected = DataFrame(
"cat": Categorical([1, 2],,
"A": [10, 22],
"B": [101, 205],
columns=["cat", "A", "B"],
tm.assert_frame_equal(result, expected)
# another not in-axis grouper (conflicting names in index)
s = Series(["a", "b", "b"], name="cat")
msg = "A grouping .* was excluded from the result"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.groupby(["cat", s], as_index=False, observed=True).sum()
tm.assert_frame_equal(result, expected)
# is original index dropped?
group_columns = ["cat", "A"]
expected = DataFrame(
"cat": Categorical([1, 2],,
"A": [10, 11],
"B": [101, 205],
columns=["cat", "A", "B"],
for name in [None, "X", "B"]:
df.index = Index(list("abc"), name=name)
result = df.groupby(group_columns, as_index=False, observed=True).sum()
tm.assert_frame_equal(result, expected)
def test_preserve_categories():
    """The result index keeps the categories; sort=False keeps appearance order."""
    # GH-13179
    categories = list("abc")

    # ordered=True
    df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=True)})
    sort_index = CategoricalIndex(categories, categories, ordered=True, name="A")
    nosort_index = CategoricalIndex(list("bac"), categories, ordered=True, name="A")
    tm.assert_index_equal(
        df.groupby("A", sort=True, observed=False).first().index, sort_index
    )
    # GH#42482 - don't sort result when sort=False, even when ordered=True
    tm.assert_index_equal(
        df.groupby("A", sort=False, observed=False).first().index, nosort_index
    )

    # ordered=False
    df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=False)})
    sort_index = CategoricalIndex(categories, categories, ordered=False, name="A")
    # GH#48749 - don't change order of categories
    # GH#42482 - don't sort result when sort=False, even when ordered=True
    nosort_index = CategoricalIndex(list("bac"), list("abc"), ordered=False, name="A")
    tm.assert_index_equal(
        df.groupby("A", sort=True, observed=False).first().index, sort_index
    )
    tm.assert_index_equal(
        df.groupby("A", sort=False, observed=False).first().index, nosort_index
    )
def test_preserve_categorical_dtype():
    """The grouping column stays categorical with as_index=False or reset_index."""
    # GH13743, GH13854
    df = DataFrame(
        {
            "A": [1, 2, 1, 1, 2],
            "B": [10, 16, 22, 28, 34],
            "C1": Categorical(list("abaab"), categories=list("bac"), ordered=False),
            "C2": Categorical(list("abaab"), categories=list("bac"), ordered=True),
        }
    )
    # single grouper
    exp_full = DataFrame(
        {
            "A": [2.0, 1.0, np.nan],
            "B": [25.0, 20.0, np.nan],
            "C1": Categorical(list("bac"), categories=list("bac"), ordered=False),
            "C2": Categorical(list("bac"), categories=list("bac"), ordered=True),
        }
    )
    for col in ["C1", "C2"]:
        result1 = df.groupby(by=col, as_index=False, observed=False).mean(
            numeric_only=True
        )
        result2 = (
            df.groupby(by=col, as_index=True, observed=False)
            .mean(numeric_only=True)
            .reset_index()
        )
        expected = exp_full.reindex(columns=result1.columns)
        tm.assert_frame_equal(result1, expected)
        tm.assert_frame_equal(result2, expected)
"func, values",
("first", ["second", "first"]),
("last", ["fourth", "third"]),
("min", ["fourth", "first"]),
("max", ["second", "third"]),
def test_preserve_on_ordered_ops(func, values):
# gh-18502
# preserve the categoricals on ops
c = Categorical(["first", "second", "third", "fourth"], ordered=True)
df = DataFrame({"payload": [-1, -2, -1, -2], "col": c})
g = df.groupby("payload")
result = getattr(g, func)()
expected = DataFrame(
{"payload": [-2, -1], "col": Series(values, dtype=c.dtype)}
tm.assert_frame_equal(result, expected)
# we should also preserve categorical for SeriesGroupBy
sgb = df.groupby("payload")["col"]
result = getattr(sgb, func)()
expected = expected["col"]
tm.assert_series_equal(result, expected)
def test_categorical_no_compress():
    """Unobserved categories are kept (not compressed away) in the result."""
    data = Series(np.random.default_rng(2).standard_normal(9))

    codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True)

    result = data.groupby(cats, observed=False).mean()
    exp = data.groupby(codes, observed=False).mean()

    exp.index = CategoricalIndex(
        exp.index, categories=cats.categories, ordered=cats.ordered
    )
    tm.assert_series_equal(result, exp)

    codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
    cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True)

    result = data.groupby(cats, observed=False).mean()
    exp = data.groupby(codes, observed=False).mean().reindex(cats.categories)
    exp.index = CategoricalIndex(
        exp.index, categories=cats.categories, ordered=cats.ordered
    )
    tm.assert_series_equal(result, exp)

    cats = Categorical(
        ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
        categories=["a", "b", "c", "d"],
        ordered=True,
    )
    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    result = data.groupby("b", observed=False).mean()
    result = result["a"].values
    exp = np.array([1, 2, 4, np.nan])
    tm.assert_numpy_array_equal(result, exp)
def test_groupby_empty_with_category():
    """An empty groupby result keeps the categorical dtype of the values."""
    # GH-9614
    # test fix for when group by on None resulted in
    # coercion of dtype categorical -> float
    df = DataFrame({"A": [None] * 3, "B": Categorical(["train", "train", "test"])})
    result = df.groupby("A").first()["B"]
    expected = Series(
        Categorical([], categories=["test", "train"]),
        index=Series([], dtype="object", name="A"),
        name="B",
    )
    tm.assert_series_equal(result, expected)
def test_sort():
    """Counts grouped by a pd.cut categorical come back in category order."""
    # categorical-labels-after-groupby
    # This should result in a properly sorted Series so that the plot
    # has a sorted x axis

    df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 100)})
    labels = [f"{i} - {i+499}" for i in range(0, 10000, 500)]
    cat_labels = Categorical(labels, labels)

    df = df.sort_values(by=["value"], ascending=True)
    df["value_group"] = pd.cut(
        df.value, range(0, 10500, 500), right=False, labels=cat_labels
    )

    res = df.groupby(["value_group"], observed=False)["value_group"].count()
    # Expected order: numeric order of each label's lower bound.
    exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))]
    exp.index = CategoricalIndex(exp.index, name=exp.index.name)
    tm.assert_series_equal(res, exp)
@pytest.mark.parametrize("ordered", [True, False])
def test_sort2(sort, ordered):
    """first() honours the sort flag for a categorical key of interval-like labels."""
    # dataframe groupby sort was being ignored # GH 8868
    # GH#48749 - don't change order of categories
    # GH#42482 - don't sort result when sort=False, even when ordered=True
    df = DataFrame(
        [
            ["(7.5, 10]", 10, 10],
            ["(7.5, 10]", 8, 20],
            ["(2.5, 5]", 5, 30],
            ["(5, 7.5]", 6, 40],
            ["(2.5, 5]", 4, 50],
            ["(0, 2.5]", 1, 60],
            ["(5, 7.5]", 7, 70],
        ],
        columns=["range", "foo", "bar"],
    )
    df["range"] = Categorical(df["range"], ordered=ordered)
    result = df.groupby("range", sort=sort, observed=False).first()

    if sort:
        data_values = [[1, 60], [5, 30], [6, 40], [10, 10]]
        index_values = ["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"]
    else:
        data_values = [[10, 10], [5, 30], [6, 40], [1, 60]]
        index_values = ["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"]
    expected = DataFrame(
        data_values,
        columns=["foo", "bar"],
        index=CategoricalIndex(index_values, name="range", ordered=ordered),
    )

    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("ordered", [True, False])
def test_sort_datetimelike(sort, ordered):
    """first() honours the sort flag for a categorical key of datetimes."""
    # GH10505
    # GH#42482 - don't sort result when sort=False, even when ordered=True

    # use same data as test_groupby_sort_categorical, which category is
    # corresponding to datetime.month
    df = DataFrame(
        {
            "dt": [
                datetime(2011, 7, 1),
                datetime(2011, 7, 1),
                datetime(2011, 2, 1),
                datetime(2011, 5, 1),
                datetime(2011, 2, 1),
                datetime(2011, 1, 1),
                datetime(2011, 5, 1),
            ],
            "foo": [10, 8, 5, 6, 4, 1, 7],
            "bar": [10, 20, 30, 40, 50, 60, 70],
        },
        columns=["dt", "foo", "bar"],
    )

    # ordered=True
    df["dt"] = Categorical(df["dt"], ordered=ordered)
    if sort:
        data_values = [[1, 60], [5, 30], [6, 40], [10, 10]]
        index_values = [
            datetime(2011, 1, 1),
            datetime(2011, 2, 1),
            datetime(2011, 5, 1),
            datetime(2011, 7, 1),
        ]
    else:
        data_values = [[10, 10], [5, 30], [6, 40], [1, 60]]
        index_values = [
            datetime(2011, 7, 1),
            datetime(2011, 2, 1),
            datetime(2011, 5, 1),
            datetime(2011, 1, 1),
        ]
    expected = DataFrame(
        data_values,
        columns=["foo", "bar"],
        index=CategoricalIndex(index_values, name="dt", ordered=ordered),
    )
    result = df.groupby("dt", sort=sort, observed=False).first()
    tm.assert_frame_equal(result, expected)
def test_empty_sum():
    """sum() for an unobserved category under different min_count values."""
    df = DataFrame(
        {"A": Categorical(["a", "a", "b"], categories=["a", "b", "c"]), "B": [1, 2, 1]}
    )
    expected_idx = CategoricalIndex(["a", "b", "c"], name="A")

    # 0 by default
    result = df.groupby("A", observed=False).B.sum()
    expected = Series([3, 1, 0], expected_idx, name="B")
    tm.assert_series_equal(result, expected)

    # min_count=0
    result = df.groupby("A", observed=False).B.sum(min_count=0)
    expected = Series([3, 1, 0], expected_idx, name="B")
    tm.assert_series_equal(result, expected)

    # min_count=1
    result = df.groupby("A", observed=False).B.sum(min_count=1)
    expected = Series([3, 1, np.nan], expected_idx, name="B")
    tm.assert_series_equal(result, expected)

    # min_count>1
    result = df.groupby("A", observed=False).B.sum(min_count=2)
    expected = Series([3, np.nan, np.nan], expected_idx, name="B")
    tm.assert_series_equal(result, expected)
def test_empty_prod():
    """Product over an unobserved category is 1 unless ``min_count`` forces NaN."""
    df = DataFrame(
        {"A": Categorical(["a", "a", "b"], categories=["a", "b", "c"]), "B": [1, 2, 1]}
    )
    expected_idx = CategoricalIndex(["a", "b", "c"], name="A")

    # 1 by default
    result = df.groupby("A", observed=False).B.prod()
    expected = Series([2, 1, 1], expected_idx, name="B")
    tm.assert_series_equal(result, expected)

    # min_count=0
    result = df.groupby("A", observed=False).B.prod(min_count=0)
    expected = Series([2, 1, 1], expected_idx, name="B")
    tm.assert_series_equal(result, expected)

    # min_count=1
    result = df.groupby("A", observed=False).B.prod(min_count=1)
    expected = Series([2, 1, np.nan], expected_idx, name="B")
    tm.assert_series_equal(result, expected)
def test_groupby_multiindex_categorical_datetime():
    """Group by a string categorical and a datetime categorical with observed=False."""
    df = DataFrame(
        {
            "key1": Categorical(list("abcbabcba")),
            "key2": Categorical(
                list(pd.date_range("2018-06-01 00", freq="1min", periods=3)) * 3
            ),
            "values": np.arange(9),
        }
    )
    result = df.groupby(["key1", "key2"], observed=False).mean()

    idx = MultiIndex.from_product(
        [
            Categorical(["a", "b", "c"]),
            Categorical(pd.date_range("2018-06-01 00", freq="1min", periods=3)),
        ],
        names=["key1", "key2"],
    )
    # the ("c", second timestamp) combination never occurs, hence the NaN
    expected = DataFrame({"values": [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "as_index, expected",
    [
        (
            True,
            Series(
                index=MultiIndex.from_arrays(
                    [Series([1, 1, 2], dtype="category"), [1, 2, 2]], names=["a", "b"]
                ),
                data=[1, 2, 3],
                name="x",
            ),
        ),
        (
            False,
            DataFrame(
                {
                    "a": Series([1, 1, 2], dtype="category"),
                    "b": [1, 2, 2],
                    "x": [1, 2, 3],
                }
            ),
        ),
    ],
)
def test_groupby_agg_observed_true_single_column(as_index, expected):
    """Sum a single selected column with observed=True for both as_index settings."""
    # GH-23970
    df = DataFrame(
        {"a": Series([1, 1, 2], dtype="category"), "b": [1, 2, 2], "x": [1, 2, 3]}
    )

    result = df.groupby(["a", "b"], as_index=as_index, observed=True)["x"].sum()

    tm.assert_equal(result, expected)
@pytest.mark.parametrize("fill_value", [None, np.nan, pd.NaT])
def test_shift(fill_value):
    """Shifting a Categorical by one fills the first slot with a missing value."""
    ct = Categorical(
        ["a", "b", "c", "d"], categories=["a", "b", "c", "d"], ordered=False
    )
    expected = Categorical(
        [None, "a", "b", "c"], categories=["a", "b", "c", "d"], ordered=False
    )
    res = ct.shift(1, fill_value=fill_value)
    tm.assert_equal(res, expected)
@pytest.fixture
def df_cat(df):
    """
    DataFrame with multiple categorical columns and a column of integers.
    Shortened so as not to contain all possible combinations of categories.
    Useful for testing `observed` kwarg functionality on GroupBy objects.

    Parameters
    ----------
    df: DataFrame
        Non-categorical, longer DataFrame from another fixture, used to derive
        this one

    Returns
    -------
    df_cat: DataFrame
    """
    df_cat = df.copy()[:4]  # leave out some groups
    df_cat["A"] = df_cat["A"].astype("category")
    df_cat["B"] = df_cat["B"].astype("category")
    df_cat["C"] = Series([1, 2, 3, 4])
    df_cat = df_cat.drop(["D"], axis=1)
    return df_cat
@pytest.mark.parametrize("operation", ["agg", "apply"])
def test_seriesgroupby_observed_true(df_cat, operation):
    """With observed=True only the category combinations present are returned."""
    # GH#24880
    # GH#49223 - order of results was wrong when grouping by index levels
    lev_a = Index(["bar", "bar", "foo", "foo"], dtype=df_cat["A"].dtype, name="A")
    lev_b = Index(["one", "three", "one", "two"], dtype=df_cat["B"].dtype, name="B")
    index = MultiIndex.from_arrays([lev_a, lev_b])
    expected = Series(data=[2, 4, 1, 3], index=index, name="C").sort_index()

    grouped = df_cat.groupby(["A", "B"], observed=True)["C"]
    msg = "using np.sum" if operation == "apply" else "using SeriesGroupBy.sum"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#53425
        result = getattr(grouped, operation)(sum)
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("operation", ["agg", "apply"])
@pytest.mark.parametrize("observed", [False, None])
def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation):
    """With observed False/None every category combination appears in the result."""
    # GH 24880
    # GH#49223 - order of results was wrong when grouping by index levels
    # sortlevel() returns (sorted_index, indexer); only the index is needed
    index, _ = MultiIndex.from_product(
        [
            CategoricalIndex(["bar", "foo"], ordered=False),
            CategoricalIndex(["one", "three", "two"], ordered=False),
        ],
        names=["A", "B"],
    ).sortlevel()

    expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C")
    if operation == "agg":
        msg = "The 'downcast' keyword in fillna is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            expected = expected.fillna(0, downcast="infer")
    grouped = df_cat.groupby(["A", "B"], observed=observed)["C"]
    msg = "using SeriesGroupBy.sum" if operation == "agg" else "using np.sum"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#53425
        result = getattr(grouped, operation)(sum)
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "observed, index, data",
    [
        (
            True,
            MultiIndex.from_arrays(
                [
                    Index(["bar"] * 4 + ["foo"] * 4, dtype="category", name="A"),
                    Index(
                        ["one", "one", "three", "three", "one", "one", "two", "two"],
                        dtype="category",
                        name="B",
                    ),
                    Index(["min", "max"] * 4),
                ]
            ),
            [2, 2, 4, 4, 1, 1, 3, 3],
        ),
        (
            False,
            MultiIndex.from_product(
                [
                    CategoricalIndex(["bar", "foo"], ordered=False),
                    CategoricalIndex(["one", "three", "two"], ordered=False),
                    Index(["min", "max"]),
                ],
                names=["A", "B", None],
            ),
            [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3],
        ),
        (
            None,
            MultiIndex.from_product(
                [
                    CategoricalIndex(["bar", "foo"], ordered=False),
                    CategoricalIndex(["one", "three", "two"], ordered=False),
                    Index(["min", "max"]),
                ],
                names=["A", "B", None],
            ),
            [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3],
        ),
    ],
)
def test_seriesgroupby_observed_apply_dict(df_cat, observed, index, data):
    """Apply a dict-returning function and compare against the expected Series."""
    # GH 24880
    expected = Series(data=data, index=index, name="C")
    result = df_cat.groupby(["A", "B"], observed=observed)["C"].apply(
        lambda x: {"min": x.min(), "max": x.max()}
    )
    tm.assert_series_equal(result, expected)
def test_groupby_categorical_series_dataframe_consistent(df_cat):
    """Selecting "C" before or after ``mean()`` must give the same Series."""
    # GH 20416
    expected = df_cat.groupby(["A", "B"], observed=False)["C"].mean()
    result = df_cat.groupby(["A", "B"], observed=False).mean()["C"]
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("code", [([1, 0, 0]), ([0, 0, 0])])
def test_groupby_categorical_axis_1(code):
# GH 13420
df = DataFrame({"a": [1, 2, 3, 4], "b": [-1, -2, -3, -4], "c": [5, 6, 7, 8]})
cat = Categorical.from_codes(code, categories=list("abc"))
msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
gb = df.groupby(cat, axis=1, observed=False)
result = gb.mean()
msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
gb2 = df.T.groupby(cat, axis=0, observed=False)
expected = gb2.mean().T
tm.assert_frame_equal(result, expected)
def test_groupby_cat_preserves_structure(observed, ordered):
    """Group, aggregate and reset the index; the frame must round-trip unchanged."""
    # GH 28787
    df = DataFrame(
        {"Name": Categorical(["Bob", "Greg"], ordered=ordered), "Item": [1, 2]},
        columns=["Name", "Item"],
    )
    expected = df.copy()

    # reset_index() moves "Name" back to a column so it can equal the input
    result = (
        df.groupby("Name", observed=observed)
        .agg(DataFrame.sum, skipna=True)
        .reset_index()
    )

    tm.assert_frame_equal(result, expected)
def test_get_nonexistent_category():
    """Looking up a missing label inside ``apply`` must raise ``KeyError``."""
    # Accessing a Category that is not in the dataframe
    df = DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)})
    with pytest.raises(KeyError, match="'vau'"):
        df.groupby("var").apply(
            lambda rows: DataFrame(
                {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]}
            )
        )
def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed):
    """Result length is 4 when observed, else the full 4x4 category product."""
    # GH 17605
    if reduction_func == "ngroup":
        pytest.skip("ngroup is not truly a reduction")

    df = DataFrame(
        {
            "cat_1": Categorical(list("AABB"), categories=list("ABCD")),
            "cat_2": Categorical(list("AB") * 2, categories=list("ABCD")),
            "value": [0.1] * 4,
        }
    )
    args = get_groupby_method_args(reduction_func, df)

    expected_length = 4 if observed else 16

    series_groupby = df.groupby(["cat_1", "cat_2"], observed=observed)["value"]

    if reduction_func == "corrwith":
        # TODO: implemented SeriesGroupBy.corrwith. See GH 32293
        assert not hasattr(series_groupby, reduction_func)
        return

    agg = getattr(series_groupby, reduction_func)

    if not observed and reduction_func in ["idxmin", "idxmax"]:
        # idxmin and idxmax are designed to fail on empty inputs
        with pytest.raises(
            ValueError, match="empty group due to unobserved categories"
        ):
            agg(*args)
        return

    result = agg(*args)

    assert len(result) == expected_length
def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
    reduction_func, request
):
    """Unobserved category pairs hold the 0 or NaN listed for each reduction."""
    # GH 17605
    # Tests whether the unobserved categories in the result contain 0 or NaN

    if reduction_func == "ngroup":
        pytest.skip("ngroup is not truly a reduction")

    if reduction_func == "corrwith":  # GH 32293
        mark = pytest.mark.xfail(
            reason="TODO: implemented SeriesGroupBy.corrwith. See GH 32293"
        )
        request.applymarker(mark)

    df = DataFrame(
        {
            "cat_1": Categorical(list("AABB"), categories=list("ABC")),
            "cat_2": Categorical(list("AB") * 2, categories=list("ABC")),
            "value": [0.1] * 4,
        }
    )
    unobserved = [tuple("AC"), tuple("BC"), tuple("CA"), tuple("CB"), tuple("CC")]
    args = get_groupby_method_args(reduction_func, df)

    series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"]
    agg = getattr(series_groupby, reduction_func)

    if reduction_func in ["idxmin", "idxmax"]:
        # idxmin and idxmax are designed to fail on empty inputs
        with pytest.raises(
            ValueError, match="empty group due to unobserved categories"
        ):
            agg(*args)
        return

    result = agg(*args)

    zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func]

    for idx in unobserved:
        val = result.loc[idx]
        assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan)

    # If we expect unobserved values to be zero, we also expect the dtype to be int.
    # Except for .sum(). If the observed categories sum to dtype=float (i.e. their
    # sums have decimals), then the zeros for the missing categories should also be
    # floats.
    if zero_or_nan == 0 and reduction_func != "sum":
        assert np.issubdtype(result.dtype, np.integer)
def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func):
    """With observed=True, unobserved category pairs are absent from the index."""
    # GH 23865
    # GH 27075
    # Ensure that df.groupby, when 'by' is two Categorical variables,
    # does not return the categories that are not in df when observed=True
    if reduction_func == "ngroup":
        pytest.skip("ngroup does not return the Categories on the index")

    df = DataFrame(
        {
            "cat_1": Categorical(list("AABB"), categories=list("ABC")),
            "cat_2": Categorical(list("1111"), categories=list("12")),
            "value": [0.1, 0.1, 0.1, 0.1],
        }
    )
    unobserved_cats = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")]

    df_grp = df.groupby(["cat_1", "cat_2"], observed=True)

    args = get_groupby_method_args(reduction_func, df)
    res = getattr(df_grp, reduction_func)(*args)

    for cat in unobserved_cats:
        assert cat not in res.index
@pytest.mark.parametrize("observed", [False, None])
def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
    reduction_func, observed
):
    """With observed False/None, unobserved pairs appear holding 0 or NaN."""
    # GH 23865
    # GH 27075
    # Ensure that df.groupby, when 'by' is two Categorical variables,
    # returns the categories that are not in df when observed=False/None

    if reduction_func == "ngroup":
        pytest.skip("ngroup does not return the Categories on the index")

    df = DataFrame(
        {
            "cat_1": Categorical(list("AABB"), categories=list("ABC")),
            "cat_2": Categorical(list("1111"), categories=list("12")),
            "value": [0.1, 0.1, 0.1, 0.1],
        }
    )
    unobserved_cats = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")]

    df_grp = df.groupby(["cat_1", "cat_2"], observed=observed)

    args = get_groupby_method_args(reduction_func, df)

    if not observed and reduction_func in ["idxmin", "idxmax"]:
        # idxmin and idxmax are designed to fail on empty inputs
        with pytest.raises(
            ValueError, match="empty group due to unobserved categories"
        ):
            getattr(df_grp, reduction_func)(*args)
        return

    res = getattr(df_grp, reduction_func)(*args)

    expected = _results_for_groupbys_with_missing_categories[reduction_func]

    if expected is np.nan:
        assert res.loc[unobserved_cats].isnull().all().all()
    else:
        assert (res.loc[unobserved_cats] == expected).all().all()
def test_series_groupby_categorical_aggregation_getitem():
    """Selecting "foo" before or after ``agg("mean")`` must give the same Series."""
    # GH 8870
    d = {"foo": [10, 8, 4, 1], "bar": [10, 20, 30, 40], "baz": ["d", "c", "d", "c"]}
    df = DataFrame(d)
    cat = pd.cut(df["foo"], np.linspace(0, 20, 5))
    df["range"] = cat
    groups = df.groupby(["range", "baz"], as_index=True, sort=True, observed=False)
    result = groups["foo"].agg("mean")
    expected = groups.agg("mean")["foo"]
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "func, expected_values",
    [(Series.nunique, [1, 1, 2]), (Series.count, [1, 2, 2])],
)
def test_groupby_agg_categorical_columns(func, expected_values):
    """Aggregate a categorical value column with a Series method passed to agg."""
    # 31256
    # "id" becomes the index so that only "value" is left to aggregate
    df = DataFrame(
        {
            "id": [0, 1, 2, 3, 4],
            "groups": [0, 1, 1, 2, 2],
            "value": Categorical([0, 0, 0, 0, 1]),
        }
    ).set_index("id")
    result = df.groupby("groups").agg(func)

    expected = DataFrame(
        {"value": expected_values}, index=Index([0, 1, 2], name="groups")
    )
    tm.assert_frame_equal(result, expected)
def test_groupby_agg_non_numeric():
    """``agg(Series.nunique)`` and ``nunique()`` agree on a categorical column."""
    df = DataFrame({"A": Categorical(["a", "a", "b"], categories=["a", "b", "c"])})
    expected = DataFrame({"A": [2, 1]}, index=np.array([1, 2]))

    result = df.groupby([1, 2, 1]).agg(Series.nunique)
    tm.assert_frame_equal(result, expected)

    result = df.groupby([1, 2, 1]).nunique()
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("func", ["first", "last"])
def test_groupby_first_returned_categorical_instead_of_dataframe(func):
    """first/last on an ordered categorical column keeps the index and dtype."""
    # GH 28641: groupby drops index, when grouping over categorical column with
    # first/last. Returned Categorical instead of DataFrame previously.
    df = DataFrame({"A": [1997], "B": Series(["b"], dtype="category").cat.as_ordered()})
    df_grouped = df.groupby("A")["B"]
    result = getattr(df_grouped, func)()

    # ordered categorical dtype should be preserved
    expected = Series(
        ["b"], index=Index([1997], name="A"), name="B", dtype=df["B"].dtype
    )
    tm.assert_series_equal(result, expected)
def test_read_only_category_no_sort():
    """Group with sort=False when the categories come from a read-only array."""
    # GH33410
    cats = np.array([1, 2])
    cats.flags.writeable = False
    df = DataFrame(
        {"a": [1, 3, 5, 7], "b": Categorical([1, 1, 2, 2], categories=Index(cats))}
    )
    expected = DataFrame(data={"a": [2.0, 6.0]}, index=CategoricalIndex(cats, name="b"))
    result = df.groupby("b", sort=False, observed=False).mean()
    tm.assert_frame_equal(result, expected)
def test_sorted_missing_category_values():
    """size().unstack() keeps an unused ordered category as an all-zero column."""
    # GH 28597
    # NOTE(review): the "foo" values were reconstructed so that the per-"bar"
    # counts match the expected frame below - confirm against upstream
    df = DataFrame(
        {
            "foo": [
                "small",
                "large",
                "large",
                "large",
                "medium",
                "large",
                "large",
                "medium",
            ],
            "bar": ["C", "A", "A", "C", "A", "C", "A", "C"],
        }
    )
    df["foo"] = (
        df["foo"]
        .astype("category")
        .cat.set_categories(["tiny", "small", "medium", "large"], ordered=True)
    )

    expected = DataFrame(
        {
            "tiny": {"A": 0, "C": 0},
            "small": {"A": 0, "C": 1},
            "medium": {"A": 1, "C": 1},
            "large": {"A": 3, "C": 2},
        }
    )
    expected = expected.rename_axis("bar", axis="index")
    expected.columns = CategoricalIndex(
        ["tiny", "small", "medium", "large"],
        categories=["tiny", "small", "medium", "large"],
        ordered=True,
        name="foo",
    )

    result = df.groupby(["bar", "foo"], observed=False).size().unstack()

    tm.assert_frame_equal(result, expected)
def test_agg_cython_category_not_implemented_fallback():
    """``first`` on a categorical column works via the method and via ``agg``."""
    df = DataFrame({"col_num": [1, 1, 2, 3]})
    df["col_cat"] = df["col_num"].astype("category")

    result = df.groupby("col_num").col_cat.first()

    # ordered categorical dtype should definitely be preserved;
    #  this is unordered, so is less-clear case (if anything, it should raise)
    expected = Series(
        [1, 2, 3],
        index=Index([1, 2, 3], name="col_num"),
        name="col_cat",
        dtype=df["col_cat"].dtype,
    )
    tm.assert_series_equal(result, expected)

    result = df.groupby("col_num").agg({"col_cat": "first"})
    expected = expected.to_frame()
    tm.assert_frame_equal(result, expected)
def test_aggregate_categorical_with_isnan():
    """Count missing values per group across numeric, object and categorical columns."""
    # GH 29837
    df = DataFrame(
        {
            "A": [1, 1, 1, 1],
            "B": [1, 2, 1, 2],
            "numerical_col": [0.1, 0.2, np.nan, 0.3],
            "object_col": ["foo", "bar", "foo", "fee"],
            "categorical_col": ["foo", "bar", "foo", "fee"],
        }
    )

    df = df.astype({"categorical_col": "category"})

    result = df.groupby(["A", "B"]).agg(lambda df: df.isna().sum())
    index = MultiIndex.from_arrays([[1, 1], [1, 2]], names=("A", "B"))
    expected = DataFrame(
        data={
            "numerical_col": [1, 0],
            "object_col": [0, 0],
            "categorical_col": [0, 0],
        },
        index=index,
    )
    tm.assert_frame_equal(result, expected)
def test_categorical_transform():
# GH 29037
df = DataFrame(
"package_id": [1, 1, 1, 2, 2, 3],
"status": [
delivery_status_type = pd.CategoricalDtype(
categories=["Waiting", "OnTheWay", "Delivered"], ordered=True
df["status"] = df["status"].astype(delivery_status_type)
msg = "using SeriesGroupBy.max"
with tm.assert_produces_warning(FutureWarning, match=msg):
# GH#53425
df["last_status"] = df.groupby("package_id")["status"].transform(max)
result = df.copy()
expected = DataFrame(
"package_id": [1, 1, 1, 2, 2, 3],
"status": [
"last_status": [
expected["status"] = expected["status"].astype(delivery_status_type)
# .transform(max) should preserve ordered categoricals
expected["last_status"] = expected["last_status"].astype(delivery_status_type)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("func", ["first", "last"])
def test_series_groupby_first_on_categorical_col_grouped_on_2_categoricals(
    func: str, observed: bool
):
    """SeriesGroupBy first/last on two categorical keys, observed or not."""
    # GH 34951
    cat = Categorical([0, 0, 1, 1])
    val = [0, 1, 1, 0]
    df = DataFrame({"a": cat, "b": cat, "c": val})

    cat2 = Categorical([0, 1])
    idx = MultiIndex.from_product([cat2, cat2], names=["a", "b"])
    expected_dict = {
        "first": Series([0, np.nan, np.nan, 1], idx, name="c"),
        "last": Series([1, np.nan, np.nan, 0], idx, name="c"),
    }

    expected = expected_dict[func]
    if observed:
        # only the observed pairs remain, so the values stay integer
        expected = expected.dropna().astype(np.int64)

    srs_grp = df.groupby(["a", "b"], observed=observed)["c"]
    result = getattr(srs_grp, func)()
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("func", ["first", "last"])
def test_df_groupby_first_on_categorical_col_grouped_on_2_categoricals(
    func: str, observed: bool
):
    """DataFrameGroupBy first/last on two categorical keys, observed or not."""
    # GH 34951
    cat = Categorical([0, 0, 1, 1])
    val = [0, 1, 1, 0]
    df = DataFrame({"a": cat, "b": cat, "c": val})

    cat2 = Categorical([0, 1])
    idx = MultiIndex.from_product([cat2, cat2], names=["a", "b"])
    expected_dict = {
        "first": Series([0, np.nan, np.nan, 1], idx, name="c"),
        "last": Series([1, np.nan, np.nan, 0], idx, name="c"),
    }

    expected = expected_dict[func].to_frame()
    if observed:
        # only the observed pairs remain, so the values stay integer
        expected = expected.dropna().astype(np.int64)

    df_grp = df.groupby(["a", "b"], observed=observed)
    result = getattr(df_grp, func)()
    tm.assert_frame_equal(result, expected)
def test_groupby_categorical_indices_unused_categories():
    """``GroupBy.indices`` has an empty array for a category with no rows."""
    # GH#38642
    df = DataFrame(
        {
            "key": Categorical(["b", "b", "a"], categories=["a", "b", "c"]),
            "col": range(3),
        }
    )
    grouped = df.groupby("key", sort=False, observed=False)
    result = grouped.indices
    expected = {
        "b": np.array([0, 1], dtype="intp"),
        "a": np.array([2], dtype="intp"),
        "c": np.array([], dtype="intp"),
    }
    assert result.keys() == expected.keys()
    for key in result.keys():
        tm.assert_numpy_array_equal(result[key], expected[key])
@pytest.mark.parametrize("func", ["first", "last"])
def test_groupby_last_first_preserve_categoricaldtype(func):
    """first/last on a categorical column returns a categorical Series."""
    # GH#33090
    df = DataFrame({"a": [1, 2, 3]})
    df["b"] = df["a"].astype("category")
    result = getattr(df.groupby("a")["b"], func)()
    expected = Series(
        Categorical([1, 2, 3]), name="b", index=Index([1, 2, 3], name="a")
    )
    tm.assert_series_equal(expected, result)
def test_groupby_categorical_observed_nunique():
    """``nunique`` with observed=True returns only the observed category pairs."""
    # GH#45128
    df = DataFrame({"a": [1, 2], "b": [1, 2], "c": [10, 11]})
    df = df.astype(dtype={"a": "category", "b": "category"})
    result = df.groupby(["a", "b"], observed=True).nunique()["c"]
    expected = Series(
        [1, 1],
        index=MultiIndex.from_arrays(
            [CategoricalIndex([1, 2], name="a"), CategoricalIndex([1, 2], name="b")]
        ),
        name="c",
    )
    tm.assert_series_equal(result, expected)
def test_groupby_categorical_aggregate_functions():
    """``max`` on an ordered categorical column follows the category order."""
    # GH#37275
    dtype = pd.CategoricalDtype(categories=["small", "big"], ordered=True)
    df = DataFrame(
        [[1, "small"], [1, "big"], [2, "small"]], columns=["grp", "description"]
    ).astype({"description": dtype})

    result = df.groupby("grp")["description"].max()
    expected = Series(
        ["big", "small"],
        index=Index([1, 2], name="grp"),
        name="description",
        dtype=pd.CategoricalDtype(categories=["small", "big"], ordered=True),
    )

    tm.assert_series_equal(result, expected)
def test_groupby_categorical_dropna(observed, dropna):
    """Sum by a categorical key without NA values for each observed/dropna setting."""
    # GH#48645 - dropna should have no impact on the result when there are no NA values
    cat = Categorical([1, 2], categories=[1, 2, 3])
    df = DataFrame({"x": Categorical([1, 2], categories=[1, 2, 3]), "y": [3, 4]})
    gb = df.groupby("x", observed=observed, dropna=dropna)
    result = gb.sum()

    if observed:
        expected = DataFrame({"y": [3, 4]}, index=cat)
    else:
        # the unobserved category 3 is present with a sum of 0
        index = CategoricalIndex([1, 2, 3], [1, 2, 3])
        expected = DataFrame({"y": [3, 4, 0]}, index=index)
    expected.index.name = "x"

    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
@pytest.mark.parametrize("ordered", [True, False])
def test_category_order_reducer(
    request, as_index, sort, observed, reduction_func, index_kind, ordered
):
    """Reductions must keep the grouping categories in their original order."""
    # GH#48749
    if reduction_func == "corrwith" and not as_index:
        msg = "GH#49950 - corrwith with as_index=False may not have grouping column"
        request.applymarker(pytest.mark.xfail(reason=msg))
    elif index_kind != "range" and not as_index:
        pytest.skip(reason="Result doesn't have categories, nothing to test")
    df = DataFrame(
        {
            "a": Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered),
            "b": range(4),
        }
    )
    if index_kind == "range":
        keys = ["a"]
    elif index_kind == "single":
        keys = ["a"]
        df = df.set_index(keys)
    elif index_kind == "multi":
        keys = ["a", "a2"]
        df["a2"] = df["a"]
        df = df.set_index(keys)
    args = get_groupby_method_args(reduction_func, df)
    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)

    if not observed and reduction_func in ["idxmin", "idxmax"]:
        # idxmin and idxmax are designed to fail on empty inputs
        with pytest.raises(
            ValueError, match="empty group due to unobserved categories"
        ):
            getattr(gb, reduction_func)(*args)
        return

    op_result = getattr(gb, reduction_func)(*args)
    if as_index:
        result = op_result.index.get_level_values("a").categories
    else:
        result = op_result["a"].cat.categories
    expected = Index([1, 4, 3, 2])
    tm.assert_index_equal(result, expected)

    if index_kind == "multi":
        result = op_result.index.get_level_values("a2").categories
        tm.assert_index_equal(result, expected)
@pytest.mark.parametrize("index_kind", ["single", "multi"])
@pytest.mark.parametrize("ordered", [True, False])
def test_category_order_transformer(
    as_index, sort, observed, transformation_func, index_kind, ordered
):
    """Transformations must keep the grouping categories in their original order."""
    # GH#48749
    df = DataFrame(
        {
            "a": Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered),
            "b": range(4),
        }
    )
    if index_kind == "single":
        keys = ["a"]
        df = df.set_index(keys)
    elif index_kind == "multi":
        keys = ["a", "a2"]
        df["a2"] = df["a"]
        df = df.set_index(keys)
    args = get_groupby_method_args(transformation_func, df)
    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
    warn = FutureWarning if transformation_func == "fillna" else None
    msg = "DataFrameGroupBy.fillna is deprecated"
    with tm.assert_produces_warning(warn, match=msg):
        op_result = getattr(gb, transformation_func)(*args)
    result = op_result.index.get_level_values("a").categories
    expected = Index([1, 4, 3, 2])
    tm.assert_index_equal(result, expected)

    if index_kind == "multi":
        result = op_result.index.get_level_values("a2").categories
        tm.assert_index_equal(result, expected)
@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
@pytest.mark.parametrize("method", ["head", "tail"])
@pytest.mark.parametrize("ordered", [True, False])
def test_category_order_head_tail(
    as_index, sort, observed, method, index_kind, ordered
):
    """head/tail must keep the grouping categories in their original order."""
    # GH#48749
    df = DataFrame(
        {
            "a": Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered),
            "b": range(4),
        }
    )
    if index_kind == "range":
        keys = ["a"]
    elif index_kind == "single":
        keys = ["a"]
        df = df.set_index(keys)
    elif index_kind == "multi":
        keys = ["a", "a2"]
        df["a2"] = df["a"]
        df = df.set_index(keys)
    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
    op_result = getattr(gb, method)()
    if index_kind == "range":
        # the key is still a column when the frame has a plain range index
        result = op_result["a"].cat.categories
    else:
        result = op_result.index.get_level_values("a").categories
    expected = Index([1, 4, 3, 2])
    tm.assert_index_equal(result, expected)

    if index_kind == "multi":
        result = op_result.index.get_level_values("a2").categories
        tm.assert_index_equal(result, expected)
@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
@pytest.mark.parametrize("method", ["apply", "agg", "transform"])
@pytest.mark.parametrize("ordered", [True, False])
def test_category_order_apply(as_index, sort, observed, method, index_kind, ordered):
    """apply/agg/transform with a UDF must keep the category order unchanged."""
    # GH#48749
    if (method == "transform" and index_kind == "range") or (
        not as_index and index_kind != "range"
    ):
        pytest.skip("No categories in result, nothing to test")
    df = DataFrame(
        {
            "a": Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered),
            "b": range(4),
        }
    )
    if index_kind == "range":
        keys = ["a"]
    elif index_kind == "single":
        keys = ["a"]
        df = df.set_index(keys)
    elif index_kind == "multi":
        keys = ["a", "a2"]
        df["a2"] = df["a"]
        df = df.set_index(keys)
    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
    warn = DeprecationWarning if method == "apply" and index_kind == "range" else None
    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(warn, match=msg):
        op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True))
    if (method == "transform" or not as_index) and index_kind == "range":
        result = op_result["a"].cat.categories
    else:
        result = op_result.index.get_level_values("a").categories
    expected = Index([1, 4, 3, 2])
    tm.assert_index_equal(result, expected)

    if index_kind == "multi":
        result = op_result.index.get_level_values("a2").categories
        tm.assert_index_equal(result, expected)
@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
def test_many_categories(as_index, sort, index_kind, ordered):
    """Sum with observed=True when the grouper has 10000 categories."""
    # GH#48749 - Test when the grouper has many categories
    if index_kind != "range" and not as_index:
        pytest.skip(reason="Result doesn't have categories, nothing to test")
    categories = np.arange(9999, -1, -1)
    grouper = Categorical([2, 1, 2, 3], categories=categories, ordered=ordered)
    df = DataFrame({"a": grouper, "b": range(4)})
    if index_kind == "range":
        keys = ["a"]
    elif index_kind == "single":
        keys = ["a"]
        df = df.set_index(keys)
    elif index_kind == "multi":
        keys = ["a", "a2"]
        df["a2"] = df["a"]
        df = df.set_index(keys)
    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=True)
    result = gb.sum()

    # Test is setup so that data and index are the same values
    data = [3, 2, 1] if sort else [2, 1, 3]

    index = CategoricalIndex(
        data, categories=grouper.categories, ordered=ordered, name="a"
    )
    if as_index:
        expected = DataFrame({"b": data})
        if index_kind == "multi":
            expected.index = MultiIndex.from_frame(DataFrame({"a": index, "a2": index}))
        else:
            expected.index = index
    elif index_kind == "multi":
        expected = DataFrame({"a": Series(index), "a2": Series(index), "b": data})
    else:
        expected = DataFrame({"a": Series(index), "b": data})

    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("cat_columns", ["a", "b", ["a", "b"]])
@pytest.mark.parametrize("keys", ["a", "b", ["a", "b"]])
def test_groupby_default_depr(cat_columns, keys):
# GH#43999
df = DataFrame({"a": [1, 1, 2, 3], "b": [4, 5, 6, 7]})
df[cat_columns] = df[cat_columns].astype("category")
msg = "The default of observed=False is deprecated"
klass = FutureWarning if set(cat_columns) & set(keys) else None
with tm.assert_produces_warning(klass, match=msg):
@pytest.mark.parametrize("test_series", [True, False])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_agg_list(request, as_index, observed, reduction_func, test_series, keys):
    """``agg([func])`` must match calling the reduction method directly."""
    # GH#52760
    if test_series and reduction_func == "corrwith":
        assert not hasattr(SeriesGroupBy, "corrwith")
        pytest.skip("corrwith not implemented for SeriesGroupBy")
    elif reduction_func == "corrwith":
        msg = "GH#32293: attempts to call SeriesGroupBy.corrwith"
        request.applymarker(pytest.mark.xfail(reason=msg))
    elif (
        reduction_func == "nunique"
        and not test_series
        and len(keys) != 1
        and not observed
        and not as_index
    ):
        msg = "GH#52848 - raises a ValueError"
        request.applymarker(pytest.mark.xfail(reason=msg))

    df = DataFrame({"a1": [0, 0, 1], "a2": [2, 3, 3], "b": [4, 5, 6]})
    df = df.astype({"a1": "category", "a2": "category"})
    if "a2" not in keys:
        df = df.drop(columns="a2")
    gb = df.groupby(by=keys, as_index=as_index, observed=observed)
    if test_series:
        gb = gb["b"]
    args = get_groupby_method_args(reduction_func, df)

    if not observed and reduction_func in ["idxmin", "idxmax"] and keys == ["a1", "a2"]:
        with pytest.raises(
            ValueError, match="empty group due to unobserved categories"
        ):
            gb.agg([reduction_func], *args)
        return

    result = gb.agg([reduction_func], *args)
    expected = getattr(gb, reduction_func)(*args)

    # reshape the direct result so its labels match what agg([...]) returns
    if as_index and (test_series or reduction_func == "size"):
        expected = expected.to_frame(reduction_func)
    if not test_series:
        expected.columns = MultiIndex.from_tuples(
            [(ind, "") for ind in expected.columns[:-1]] + [("b", reduction_func)]
        )
    elif not as_index:
        expected.columns = keys + [reduction_func]

    tm.assert_equal(result, expected)