from datetime import datetime
import decimal
from decimal import Decimal
import re
import numpy as np
import pytest
from pandas.errors import (
PerformanceWarning,
SpecificationError,
)
import pandas.util._test_decorators as td
from pandas.core.dtypes.common import is_string_dtype
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Grouper,
Index,
Interval,
MultiIndex,
RangeIndex,
Series,
Timedelta,
Timestamp,
date_range,
to_datetime,
)
import pandas._testing as tm
from pandas.core.arrays import BooleanArray
import pandas.core.common as com
pytestmark = pytest.mark.filterwarnings("ignore:Mean of empty slice:RuntimeWarning")
def test_repr():
# GH18203
result = repr(Grouper(key="A", level="B"))
expected = "Grouper(key='A', level='B', axis=0, sort=False, dropna=True)"
assert result == expected
def test_groupby_std_datetimelike(warn_copy_on_write):
# GH#48481
tdi = pd.timedelta_range("1 Day", periods=10000)
ser = Series(tdi)
ser[::5] *= 2 # get different std for different groups
df = ser.to_frame("A").copy()
df["B"] = ser + Timestamp(0)
df["C"] = ser + Timestamp(0, tz="UTC")
df.iloc[-1] = pd.NaT # last group includes NaTs
gb = df.groupby(list(range(5)) * 2000)
result = gb.std()
# Note: this does not _exactly_ match what we would get if we did
# [gb.get_group(i).std() for i in gb.groups]
# but it _does_ match the floating point error we get doing the
# same operation on int64 data xref GH#51332
td1 = Timedelta("2887 days 11:21:02.326710176")
td4 = Timedelta("2886 days 00:42:34.664668096")
exp_ser = Series([td1 * 2, td1, td1, td1, td4], index=np.arange(5))
expected = DataFrame({"A": exp_ser, "B": exp_ser, "C": exp_ser})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype", ["int64", "int32", "float64", "float32"])
def test_basic_aggregations(dtype):
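    # smoke-test the core SeriesGroupBy paths (aggregate, agg, transform,
    # apply) across integer and float dtypes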
data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)
index = np.arange(9)
np.random.default_rng(2).shuffle(index)
data = data.reindex(index)
grouped = data.groupby(lambda x: x // 3, group_keys=False)
for k, v in grouped:
assert len(v) == 3
msg = "using SeriesGroupBy.mean"
with tm.assert_produces_warning(FutureWarning, match=msg):
agged = grouped.aggregate(np.mean)
assert agged[1] == 1
msg = "using SeriesGroupBy.mean"
with tm.assert_produces_warning(FutureWarning, match=msg):
expected = grouped.agg(np.mean)
tm.assert_series_equal(agged, expected) # shorthand
tm.assert_series_equal(agged, grouped.mean())
result = grouped.sum()
msg = "using SeriesGroupBy.sum"
with tm.assert_produces_warning(FutureWarning, match=msg):
expected = grouped.agg(np.sum)
tm.assert_series_equal(result, expected)
expected = grouped.apply(lambda x: x * x.sum())
transformed = grouped.transform(lambda x: x * x.sum())
assert transformed[7] == 12
tm.assert_series_equal(transformed, expected)
value_grouped = data.groupby(data)
msg = "using SeriesGroupBy.mean"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = value_grouped.aggregate(np.mean)
tm.assert_series_equal(result, agged, check_index_type=False)
# complex agg
msg = "using SeriesGroupBy.[mean|std]"
with tm.assert_produces_warning(FutureWarning, match=msg):
agged = grouped.aggregate([np.mean, np.std])
msg = r"nested renamer is not supported"
with pytest.raises(SpecificationError, match=msg):
grouped.aggregate({"one": np.mean, "two": np.std})
group_constants = {0: 10, 1: 20, 2: 30}
msg = (
"Pinning the groupby key to each group in SeriesGroupBy.agg is deprecated, "
"and cases that relied on it will raise in a future version"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
# GH#41090
agged = grouped.agg(lambda x: group_constants[x.name] + x.mean())
assert agged[1] == 21
# corner cases
msg = "Must produce aggregated value"
# exception raised is type Exception
with pytest.raises(Exception, match=msg):
grouped.aggregate(lambda x: x * 2)
def test_groupby_nonobject_dtype(multiindex_dataframe_random_data):
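    # grouping by integer codes should give the same result as grouping by
    # the equivalent object-dtype key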
key = multiindex_dataframe_random_data.index.codes[0]
grouped = multiindex_dataframe_random_data.groupby(key)
result = grouped.sum()
expected = multiindex_dataframe_random_data.groupby(key.astype("O")).sum()
assert result.index.dtype == np.int8
assert expected.index.dtype == np.int64
tm.assert_frame_equal(result, expected, check_index_type=False)
def test_groupby_nonobject_dtype_mixed():
# GH 3911, mixed frame non-conversion
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "three", "two", "two", "one", "three"],
"C": np.random.default_rng(2).standard_normal(8),
"D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"),
}
)
df["value"] = range(len(df))
def max_value(group):
return group.loc[group["value"].idxmax()]
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
applied = df.groupby("A").apply(max_value)
result = applied.dtypes
expected = df.dtypes
tm.assert_series_equal(result, expected)
def test_inconsistent_return_type():
# GH5592
# inconsistent return type
df = DataFrame(
{
"A": ["Tiger", "Tiger", "Tiger", "Lamb", "Lamb", "Pony", "Pony"],
"B": Series(np.arange(7), dtype="int64"),
"C": date_range("20130101", periods=7),
}
)
def f_0(grp):
return grp.iloc[0]
expected = df.groupby("A").first()[["B"]]
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
result = df.groupby("A").apply(f_0)[["B"]]
tm.assert_frame_equal(result, expected)
def f_1(grp):
if grp.name == "Tiger":
return None
return grp.iloc[0]
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
result = df.groupby("A").apply(f_1)[["B"]]
e = expected.copy()
e.loc["Tiger"] = np.nan
tm.assert_frame_equal(result, e)
def f_2(grp):
if grp.name == "Pony":
return None
return grp.iloc[0]
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
result = df.groupby("A").apply(f_2)[["B"]]
e = expected.copy()
e.loc["Pony"] = np.nan
tm.assert_frame_equal(result, e)
# 5592 revisited, with datetimes
def f_3(grp):
if grp.name == "Pony":
return None
return grp.iloc[0]
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
result = df.groupby("A").apply(f_3)[["C"]]
e = df.groupby("A").first()[["C"]]
e.loc["Pony"] = pd.NaT
tm.assert_frame_equal(result, e)
# scalar outputs
def f_4(grp):
if grp.name == "Pony":
return None
return grp.iloc[0].loc["C"]
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
result = df.groupby("A").apply(f_4)
e = df.groupby("A").first()["C"].copy()
e.loc["Pony"] = np.nan
e.name = None
tm.assert_series_equal(result, e)
def test_pass_args_kwargs(ts, tsframe):
def f(x, q=None, axis=0):
return np.percentile(x, q, axis=axis)
g = lambda x: np.percentile(x, 80, axis=0)
# Series
ts_grouped = ts.groupby(lambda x: x.month)
agg_result = ts_grouped.agg(np.percentile, 80, axis=0)
apply_result = ts_grouped.apply(np.percentile, 80, axis=0)
trans_result = ts_grouped.transform(np.percentile, 80, axis=0)
agg_expected = ts_grouped.quantile(0.8)
trans_expected = ts_grouped.transform(g)
tm.assert_series_equal(apply_result, agg_expected)
tm.assert_series_equal(agg_result, agg_expected)
tm.assert_series_equal(trans_result, trans_expected)
agg_result = ts_grouped.agg(f, q=80)
apply_result = ts_grouped.apply(f, q=80)
trans_result = ts_grouped.transform(f, q=80)
tm.assert_series_equal(agg_result, agg_expected)
tm.assert_series_equal(apply_result, agg_expected)
tm.assert_series_equal(trans_result, trans_expected)
# DataFrame
for as_index in [True, False]:
df_grouped = tsframe.groupby(lambda x: x.month, as_index=as_index)
warn = None if as_index else FutureWarning
msg = "A grouping .* was excluded from the result"
with tm.assert_produces_warning(warn, match=msg):
agg_result = df_grouped.agg(np.percentile, 80, axis=0)
with tm.assert_produces_warning(warn, match=msg):
apply_result = df_grouped.apply(DataFrame.quantile, 0.8)
with tm.assert_produces_warning(warn, match=msg):
expected = df_grouped.quantile(0.8)
tm.assert_frame_equal(apply_result, expected, check_names=False)
tm.assert_frame_equal(agg_result, expected)
apply_result = df_grouped.apply(DataFrame.quantile, [0.4, 0.8])
with tm.assert_produces_warning(warn, match=msg):
expected_seq = df_grouped.quantile([0.4, 0.8])
tm.assert_frame_equal(apply_result, expected_seq, check_names=False)
with tm.assert_produces_warning(warn, match=msg):
agg_result = df_grouped.agg(f, q=80)
with tm.assert_produces_warning(warn, match=msg):
apply_result = df_grouped.apply(DataFrame.quantile, q=0.8)
tm.assert_frame_equal(agg_result, expected)
tm.assert_frame_equal(apply_result, expected, check_names=False)
@pytest.mark.parametrize("as_index", [True, False])
def test_pass_args_kwargs_duplicate_columns(tsframe, as_index):
# go through _aggregate_frame with self.axis == 0 and duplicate columns
tsframe.columns = ["A", "B", "A", "C"]
gb = tsframe.groupby(lambda x: x.month, as_index=as_index)
warn = None if as_index else FutureWarning
msg = "A grouping .* was excluded from the result"
with tm.assert_produces_warning(warn, match=msg):
res = gb.agg(np.percentile, 80, axis=0)
ex_data = {
1: tsframe[tsframe.index.month == 1].quantile(0.8),
2: tsframe[tsframe.index.month == 2].quantile(0.8),
}
expected = DataFrame(ex_data).T
if not as_index:
# TODO: try to get this more consistent?
expected.index = Index(range(2))
tm.assert_frame_equal(res, expected)
def test_len():
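    # len(gb) should equal the number of distinct groups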
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=10, freq="B"),
)
grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day])
assert len(grouped) == len(df)
grouped = df.groupby([lambda x: x.year, lambda x: x.month])
expected = len({(x.year, x.month) for x in df.index})
assert len(grouped) == expected
def test_len_nan_group():
# issue 11016
df = DataFrame({"a": [np.nan] * 3, "b": [1, 2, 3]})
assert len(df.groupby("a")) == 0
assert len(df.groupby("b")) == 3
assert len(df.groupby(["a", "b"])) == 3
def test_basic_regression():
# regression
result = Series([1.0 * x for x in list(range(1, 10)) * 10])
data = np.random.default_rng(2).random(1100) * 10.0
groupings = Series(data)
grouped = result.groupby(groupings)
grouped.mean()
@pytest.mark.parametrize(
"dtype", ["float64", "float32", "int64", "int32", "int16", "int8"]
)
def test_with_na_groups(dtype):
index = Index(np.arange(10))
values = Series(np.ones(10), index, dtype=dtype)
labels = Series(
[np.nan, "foo", "bar", "bar", np.nan, np.nan, "bar", "bar", np.nan, "foo"],
index=index,
)
    # the len aggregation SHOULD produce an integer dtype; it currently does
    # not, hence check_dtype=False below
grouped = values.groupby(labels)
agged = grouped.agg(len)
expected = Series([4, 2], index=["bar", "foo"])
tm.assert_series_equal(agged, expected, check_dtype=False)
# assert issubclass(agged.dtype.type, np.integer)
# explicitly return a float from my function
def f(x):
return float(len(x))
agged = grouped.agg(f)
expected = Series([4.0, 2.0], index=["bar", "foo"])
tm.assert_series_equal(agged, expected)
def test_indices_concatenation_order():
# GH 2808
def f1(x):
y = x[(x.b % 2) == 1] ** 2
if y.empty:
multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2, names=["b", "c"])
res = DataFrame(columns=["a"], index=multiindex)
return res
else:
y = y.set_index(["b", "c"])
return y
def f2(x):
y = x[(x.b % 2) == 1] ** 2
if y.empty:
return DataFrame()
else:
y = y.set_index(["b", "c"])
return y
def f3(x):
y = x[(x.b % 2) == 1] ** 2
if y.empty:
multiindex = MultiIndex(
levels=[[]] * 2, codes=[[]] * 2, names=["foo", "bar"]
)
res = DataFrame(columns=["a", "b"], index=multiindex)
return res
else:
return y
df = DataFrame({"a": [1, 2, 2, 2], "b": range(4), "c": range(5, 9)})
df2 = DataFrame({"a": [3, 2, 2, 2], "b": range(4), "c": range(5, 9)})
depr_msg = "The behavior of array concatenation with empty entries is deprecated"
# correct result
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
result1 = df.groupby("a").apply(f1)
with tm.assert_produces_warning(DeprecationWarning, match=msg):
result2 = df2.groupby("a").apply(f1)
tm.assert_frame_equal(result1, result2)
# should fail (not the same number of levels)
msg = "Cannot concat indices that do not have the same number of levels"
with pytest.raises(AssertionError, match=msg):
df.groupby("a").apply(f2)
with pytest.raises(AssertionError, match=msg):
df2.groupby("a").apply(f2)
# should fail (incorrect shape)
with pytest.raises(AssertionError, match=msg):
df.groupby("a").apply(f3)
with pytest.raises(AssertionError, match=msg):
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
df2.groupby("a").apply(f3)
def test_attr_wrapper(ts):
grouped = ts.groupby(lambda x: x.weekday())
result = grouped.std()
expected = grouped.agg(lambda x: np.std(x, ddof=1))
tm.assert_series_equal(result, expected)
# this is pretty cool
result = grouped.describe()
expected = {name: gp.describe() for name, gp in grouped}
expected = DataFrame(expected).T
tm.assert_frame_equal(result, expected)
# get attribute
result = grouped.dtype
expected = grouped.agg(lambda x: x.dtype)
tm.assert_series_equal(result, expected)
# make sure raises error
msg = "'SeriesGroupBy' object has no attribute 'foo'"
with pytest.raises(AttributeError, match=msg):
getattr(grouped, "foo")
def test_frame_groupby(tsframe):
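    # end-to-end DataFrameGroupBy check: aggregate, group by string key,
    # transform, iteration, and the groups/indices mappings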
grouped = tsframe.groupby(lambda x: x.weekday())
# aggregate
aggregated = grouped.aggregate("mean")
assert len(aggregated) == 5
assert len(aggregated.columns) == 4
# by string
tscopy = tsframe.copy()
tscopy["weekday"] = [x.weekday() for x in tscopy.index]
stragged = tscopy.groupby("weekday").aggregate("mean")
tm.assert_frame_equal(stragged, aggregated, check_names=False)
# transform
grouped = tsframe.head(30).groupby(lambda x: x.weekday())
transformed = grouped.transform(lambda x: x - x.mean())
assert len(transformed) == 30
assert len(transformed.columns) == 4
# transform propagate
transformed = grouped.transform(lambda x: x.mean())
for name, group in grouped:
mean = group.mean()
for idx in group.index:
tm.assert_series_equal(transformed.xs(idx), mean, check_names=False)
# iterate
for weekday, group in grouped:
assert group.index[0].weekday() == weekday
# groups / group_indices
groups = grouped.groups
indices = grouped.indices
for k, v in groups.items():
samething = tsframe.index.take(indices[k])
assert (samething == v).all()
def test_frame_groupby_columns(tsframe):
mapping = {"A": 0, "B": 0, "C": 1, "D": 1}
msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
grouped = tsframe.groupby(mapping, axis=1)
# aggregate
aggregated = grouped.aggregate("mean")
assert len(aggregated) == len(tsframe)
assert len(aggregated.columns) == 2
# transform
tf = lambda x: x - x.mean()
msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
groupedT = tsframe.T.groupby(mapping, axis=0)
tm.assert_frame_equal(groupedT.transform(tf).T, grouped.transform(tf))
# iterate
for k, v in grouped:
assert len(v.columns) == 2
def test_frame_set_name_single(df):
grouped = df.groupby("A")
result = grouped.mean(numeric_only=True)
assert result.index.name == "A"
result = df.groupby("A", as_index=False).mean(numeric_only=True)
assert result.index.name != "A"
result = grouped[["C", "D"]].agg("mean")
assert result.index.name == "A"
result = grouped.agg({"C": "mean", "D": "std"})
assert result.index.name == "A"
result = grouped["C"].mean()
assert result.index.name == "A"
result = grouped["C"].agg("mean")
assert result.index.name == "A"
result = grouped["C"].agg(["mean", "std"])
assert result.index.name == "A"
msg = r"nested renamer is not supported"
with pytest.raises(SpecificationError, match=msg):
grouped["C"].agg({"foo": "mean", "bar": "std"})
def test_multi_func(df):
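    # grouping by Series.get lookups should match grouping by the columns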
col1 = df["A"]
col2 = df["B"]
grouped = df.groupby([col1.get, col2.get])
agged = grouped.mean(numeric_only=True)
expected = df.groupby(["A", "B"]).mean()
# TODO groupby get drops names
tm.assert_frame_equal(
agged.loc[:, ["C", "D"]], expected.loc[:, ["C", "D"]], check_names=False
)
# some "groups" with no data
df = DataFrame(
{
"v1": np.random.default_rng(2).standard_normal(6),
"v2": np.random.default_rng(2).standard_normal(6),
"k1": np.array(["b", "b", "b", "a", "a", "a"]),
"k2": np.array(["1", "1", "1", "2", "2", "2"]),
},
index=["one", "two", "three", "four", "five", "six"],
)
# only verify that it works for now
grouped = df.groupby(["k1", "k2"])
grouped.agg("sum")
def test_multi_key_multiple_functions(df):
grouped = df.groupby(["A", "B"])["C"]
agged = grouped.agg(["mean", "std"])
expected = DataFrame({"mean": grouped.agg("mean"), "std": grouped.agg("std")})
tm.assert_frame_equal(agged, expected)
def test_frame_multi_key_function_list():
data = DataFrame(
{
"A": [
"foo",
"foo",
"foo",
"foo",
"bar",
"bar",
"bar",
"bar",
"foo",
"foo",
"foo",
],
"B": [
"one",
"one",
"one",
"two",
"one",
"one",
"one",
"two",
"two",
"two",
"one",
],
"D": np.random.default_rng(2).standard_normal(11),
"E": np.random.default_rng(2).standard_normal(11),
"F": np.random.default_rng(2).standard_normal(11),
}
)
grouped = data.groupby(["A", "B"])
funcs = ["mean", "std"]
agged = grouped.agg(funcs)
expected = pd.concat(
[grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)],
keys=["D", "E", "F"],
axis=1,
)
assert isinstance(agged.index, MultiIndex)
assert isinstance(expected.index, MultiIndex)
tm.assert_frame_equal(agged, expected)
def test_frame_multi_key_function_list_partial_failure():
data = DataFrame(
{
"A": [
"foo",
"foo",
"foo",
"foo",
"bar",
"bar",
"bar",
"bar",
"foo",
"foo",
"foo",
],
"B": [
"one",
"one",
"one",
"two",
"one",
"one",
"one",
"two",
"two",
"two",
"one",
],
"C": [
"dull",
"dull",
"shiny",
"dull",
"dull",
"shiny",
"shiny",
"dull",
"shiny",
"shiny",
"shiny",
],
"D": np.random.default_rng(2).standard_normal(11),
"E": np.random.default_rng(2).standard_normal(11),
"F": np.random.default_rng(2).standard_normal(11),
}
)
grouped = data.groupby(["A", "B"])
funcs = ["mean", "std"]
msg = re.escape("agg function failed [how->mean,dtype->")
with pytest.raises(TypeError, match=msg):
grouped.agg(funcs)
@pytest.mark.parametrize("op", [lambda x: x.sum(), lambda x: x.mean()])
def test_groupby_multiple_columns(df, op):
data = df
grouped = data.groupby(["A", "B"])
result1 = op(grouped)
keys = []
values = []
for n1, gp1 in data.groupby("A"):
for n2, gp2 in gp1.groupby("B"):
keys.append((n1, n2))
values.append(op(gp2.loc[:, ["C", "D"]]))
mi = MultiIndex.from_tuples(keys, names=["A", "B"])
expected = pd.concat(values, axis=1).T
expected.index = mi
# a little bit crude
for col in ["C", "D"]:
result_col = op(grouped[col])
pivoted = result1[col]
exp = expected[col]
tm.assert_series_equal(result_col, exp)
tm.assert_series_equal(pivoted, exp)
# test single series works the same
result = data["C"].groupby([data["A"], data["B"]]).mean()
expected = data.groupby(["A", "B"]).mean()["C"]
tm.assert_series_equal(result, expected)
def test_as_index_select_column():
# GH 5764
df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
result = df.groupby("A", as_index=False)["B"].get_group(1)
expected = Series([2, 4], name="B")
tm.assert_series_equal(result, expected)
result = df.groupby("A", as_index=False, group_keys=True)["B"].apply(
lambda x: x.cumsum()
)
expected = Series(
[2, 6, 6], name="B", index=MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2)])
)
tm.assert_series_equal(result, expected)
def test_obj_arg_get_group_deprecated():
depr_msg = "obj is deprecated"
df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
expected = df.iloc[df.groupby("b").indices.get(4)]
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
result = df.groupby("b").get_group(4, obj=df)
tm.assert_frame_equal(result, expected)
def test_groupby_as_index_select_column_sum_empty_df():
# GH 35246
df = DataFrame(columns=Index(["A", "B", "C"], name="alpha"))
left = df.groupby(by="A", as_index=False)["B"].sum(numeric_only=False)
expected = DataFrame(columns=df.columns[:2], index=range(0))
# GH#50744 - Columns after selection shouldn't retain names
expected.columns.names = [None]
tm.assert_frame_equal(left, expected)
def test_groupby_as_index_agg(df):
grouped = df.groupby("A", as_index=False)
# single-key
result = grouped[["C", "D"]].agg("mean")
expected = grouped.mean(numeric_only=True)
tm.assert_frame_equal(result, expected)
result2 = grouped.agg({"C": "mean", "D": "sum"})
expected2 = grouped.mean(numeric_only=True)
expected2["D"] = grouped.sum()["D"]
tm.assert_frame_equal(result2, expected2)
grouped = df.groupby("A", as_index=True)
msg = r"nested renamer is not supported"
with pytest.raises(SpecificationError, match=msg):
grouped["C"].agg({"Q": "sum"})
# multi-key
grouped = df.groupby(["A", "B"], as_index=False)
result = grouped.agg("mean")
expected = grouped.mean()
tm.assert_frame_equal(result, expected)
result2 = grouped.agg({"C": "mean", "D": "sum"})
expected2 = grouped.mean()
expected2["D"] = grouped.sum()["D"]
tm.assert_frame_equal(result2, expected2)
expected3 = grouped["C"].sum()
expected3 = DataFrame(expected3).rename(columns={"C": "Q"})
msg = "Passing a dictionary to SeriesGroupBy.agg is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result3 = grouped["C"].agg({"Q": "sum"})
tm.assert_frame_equal(result3, expected3)
# GH7115 & GH8112 & GH8582
df = DataFrame(
np.random.default_rng(2).integers(0, 100, (50, 3)),
columns=["jim", "joe", "jolie"],
)
ts = Series(np.random.default_rng(2).integers(5, 10, 50), name="jim")
gr = df.groupby(ts)
gr.nth(0) # invokes set_selection_from_grouper internally
msg = "The behavior of DataFrame.sum with axis=None is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
res = gr.apply(sum)
with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
alt = df.groupby(ts).apply(sum)
tm.assert_frame_equal(res, alt)
for attr in ["mean", "max", "count", "idxmax", "cumsum", "all"]:
gr = df.groupby(ts, as_index=False)
left = getattr(gr, attr)()
gr = df.groupby(ts.values, as_index=True)
right = getattr(gr, attr)().reset_index(drop=True)
tm.assert_frame_equal(left, right)
def test_ops_not_as_index(reduction_func):
# GH 10355, 21090
# Using as_index=False should not modify grouped column
if reduction_func in ("corrwith", "nth", "ngroup"):
pytest.skip(f"GH 5755: Test not applicable for {reduction_func}")
df = DataFrame(
np.random.default_rng(2).integers(0, 5, size=(100, 2)), columns=["a", "b"]
)
expected = getattr(df.groupby("a"), reduction_func)()
if reduction_func == "size":
expected = expected.rename("size")
expected = expected.reset_index()
if reduction_func != "size":
# 32 bit compat -> groupby preserves dtype whereas reset_index casts to int64
expected["a"] = expected["a"].astype(df["a"].dtype)
g = df.groupby("a", as_index=False)
result = getattr(g, reduction_func)()
tm.assert_frame_equal(result, expected)
result = g.agg(reduction_func)
tm.assert_frame_equal(result, expected)
result = getattr(g["b"], reduction_func)()
tm.assert_frame_equal(result, expected)
result = g["b"].agg(reduction_func)
tm.assert_frame_equal(result, expected)
def test_as_index_series_return_frame(df):
grouped = df.groupby("A", as_index=False)
grouped2 = df.groupby(["A", "B"], as_index=False)
result = grouped["C"].agg("sum")
expected = grouped.agg("sum").loc[:, ["A", "C"]]
assert isinstance(result, DataFrame)
tm.assert_frame_equal(result, expected)
result2 = grouped2["C"].agg("sum")
expected2 = grouped2.agg("sum").loc[:, ["A", "B", "C"]]
assert isinstance(result2, DataFrame)
tm.assert_frame_equal(result2, expected2)
result = grouped["C"].sum()
expected = grouped.sum().loc[:, ["A", "C"]]
assert isinstance(result, DataFrame)
tm.assert_frame_equal(result, expected)
result2 = grouped2["C"].sum()
expected2 = grouped2.sum().loc[:, ["A", "B", "C"]]
assert isinstance(result2, DataFrame)
tm.assert_frame_equal(result2, expected2)
def test_as_index_series_column_slice_raises(df):
# GH15072
grouped = df.groupby("A", as_index=False)
msg = r"Column\(s\) C already selected"
with pytest.raises(IndexError, match=msg):
grouped["C"].__getitem__("D")
def test_groupby_as_index_cython(df):
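    # as_index=False results should match as_index=True results with the
    # group keys reinserted as leading columns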
data = df
# single-key
grouped = data.groupby("A", as_index=False)
result = grouped.mean(numeric_only=True)
expected = data.groupby(["A"]).mean(numeric_only=True)
expected.insert(0, "A", expected.index)
expected.index = RangeIndex(len(expected))
tm.assert_frame_equal(result, expected)
# multi-key
grouped = data.groupby(["A", "B"], as_index=False)
result = grouped.mean()
expected = data.groupby(["A", "B"]).mean()
arrays = list(zip(*expected.index.values))
expected.insert(0, "A", arrays[0])
expected.insert(1, "B", arrays[1])
expected.index = RangeIndex(len(expected))
tm.assert_frame_equal(result, expected)
def test_groupby_as_index_series_scalar(df):
grouped = df.groupby(["A", "B"], as_index=False)
# GH #421
result = grouped["C"].agg(len)
expected = grouped.agg(len).loc[:, ["A", "B", "C"]]
tm.assert_frame_equal(result, expected)
def test_groupby_as_index_corner(df, ts):
msg = "as_index=False only valid with DataFrame"
with pytest.raises(TypeError, match=msg):
ts.groupby(lambda x: x.weekday(), as_index=False)
msg = "as_index=False only valid for axis=0"
depr_msg = "DataFrame.groupby with axis=1 is deprecated"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
df.groupby(lambda x: x.lower(), as_index=False, axis=1)
def test_groupby_multiple_key():
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=10, freq="B"),
)
grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day])
agged = grouped.sum()
tm.assert_almost_equal(df.values, agged.values)
depr_msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
grouped = df.T.groupby(
[lambda x: x.year, lambda x: x.month, lambda x: x.day], axis=1
)
agged = grouped.agg(lambda x: x.sum())
tm.assert_index_equal(agged.index, df.columns)
tm.assert_almost_equal(df.T.values, agged.values)
agged = grouped.agg(lambda x: x.sum())
tm.assert_almost_equal(df.T.values, agged.values)
def test_groupby_multi_corner(df):
# test that having an all-NA column doesn't mess you up
df = df.copy()
df["bad"] = np.nan
agged = df.groupby(["A", "B"]).mean()
expected = df.groupby(["A", "B"]).mean()
expected["bad"] = np.nan
tm.assert_frame_equal(agged, expected)
def test_raises_on_nuisance(df):
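    # non-numeric (nuisance) columns should raise rather than being
    # silently dropped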
grouped = df.groupby("A")
msg = re.escape("agg function failed [how->mean,dtype->")
with pytest.raises(TypeError, match=msg):
grouped.agg("mean")
with pytest.raises(TypeError, match=msg):
grouped.mean()
df = df.loc[:, ["A", "C", "D"]]
df["E"] = datetime.now()
grouped = df.groupby("A")
msg = "datetime64 type does not support sum operations"
with pytest.raises(TypeError, match=msg):
grouped.agg("sum")
with pytest.raises(TypeError, match=msg):
grouped.sum()
# won't work with axis = 1
depr_msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1)
msg = "does not support reduction 'sum'"
with pytest.raises(TypeError, match=msg):
grouped.agg(lambda x: x.sum(0, numeric_only=False))
@pytest.mark.parametrize(
"agg_function",
["max", "min"],
)
def test_keep_nuisance_agg(df, agg_function):
# GH 38815
grouped = df.groupby("A")
result = getattr(grouped, agg_function)()
expected = result.copy()
expected.loc["bar", "B"] = getattr(df.loc[df["A"] == "bar", "B"], agg_function)()
expected.loc["foo", "B"] = getattr(df.loc[df["A"] == "foo", "B"], agg_function)()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"agg_function",
["sum", "mean", "prod", "std", "var", "sem", "median"],
)
@pytest.mark.parametrize("numeric_only", [True, False])
def test_omit_nuisance_agg(df, agg_function, numeric_only):
# GH 38774, GH 38815
grouped = df.groupby("A")
no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median")
if agg_function in no_drop_nuisance and not numeric_only:
# Added numeric_only as part of GH#46560; these do not drop nuisance
# columns when numeric_only is False
if agg_function in ("std", "sem"):
klass = ValueError
msg = "could not convert string to float: 'one'"
else:
klass = TypeError
msg = re.escape(f"agg function failed [how->{agg_function},dtype->")
with pytest.raises(klass, match=msg):
getattr(grouped, agg_function)(numeric_only=numeric_only)
else:
result = getattr(grouped, agg_function)(numeric_only=numeric_only)
if not numeric_only and agg_function == "sum":
# sum is successful on column B
columns = ["A", "B", "C", "D"]
else:
columns = ["A", "C", "D"]
expected = getattr(df.loc[:, columns].groupby("A"), agg_function)(
numeric_only=numeric_only
)
tm.assert_frame_equal(result, expected)
def test_raise_on_nuisance_python_single(df):
# GH 38815
grouped = df.groupby("A")
with pytest.raises(ValueError, match="could not convert"):
grouped.skew()
def test_raise_on_nuisance_python_multiple(three_group):
grouped = three_group.groupby(["A", "B"])
msg = re.escape("agg function failed [how->mean,dtype->")
with pytest.raises(TypeError, match=msg):
grouped.agg("mean")
with pytest.raises(TypeError, match=msg):
grouped.mean()
def test_empty_groups_corner(multiindex_dataframe_random_data):
# handle empty groups
df = DataFrame(
{
"k1": np.array(["b", "b", "b", "a", "a", "a"]),
"k2": np.array(["1", "1", "1", "2", "2", "2"]),
"k3": ["foo", "bar"] * 3,
"v1": np.random.default_rng(2).standard_normal(6),
"v2": np.random.default_rng(2).standard_normal(6),
}
)
grouped = df.groupby(["k1", "k2"])
result = grouped[["v1", "v2"]].agg("mean")
expected = grouped.mean(numeric_only=True)
tm.assert_frame_equal(result, expected)
grouped = multiindex_dataframe_random_data[3:5].groupby(level=0)
agged = grouped.apply(lambda x: x.mean())
agged_A = grouped["A"].apply("mean")
tm.assert_series_equal(agged["A"], agged_A)
assert agged.index.name == "first"
def test_nonsense_func():
df = DataFrame([0])
msg = r"unsupported operand type\(s\) for \+: 'int' and 'str'"
with pytest.raises(TypeError, match=msg):
df.groupby(lambda x: x + "foo")
def test_wrap_aggregated_output_multindex(multiindex_dataframe_random_data):
df = multiindex_dataframe_random_data.T
df["baz", "two"] = "peekaboo"
keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
msg = re.escape("agg function failed [how->mean,dtype->")
with pytest.raises(TypeError, match=msg):
df.groupby(keys).agg("mean")
agged = df.drop(columns=("baz", "two")).groupby(keys).agg("mean")
assert isinstance(agged.columns, MultiIndex)
def aggfun(ser):
if ser.name == ("foo", "one"):
raise TypeError("Test error message")
return ser.sum()
with pytest.raises(TypeError, match="Test error message"):
df.groupby(keys).aggregate(aggfun)
def test_groupby_level_apply(multiindex_dataframe_random_data):
result = multiindex_dataframe_random_data.groupby(level=0).count()
assert result.index.name == "first"
result = multiindex_dataframe_random_data.groupby(level=1).count()
assert result.index.name == "second"
result = multiindex_dataframe_random_data["A"].groupby(level=0).count()
assert result.index.name == "first"
def test_groupby_level_mapper(multiindex_dataframe_random_data):
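    # grouping with a mapper dict on a MultiIndex level should match
    # grouping by the explicitly mapped codes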
deleveled = multiindex_dataframe_random_data.reset_index()
mapper0 = {"foo": 0, "bar": 0, "baz": 1, "qux": 1}
mapper1 = {"one": 0, "two": 0, "three": 1}
result0 = multiindex_dataframe_random_data.groupby(mapper0, level=0).sum()
result1 = multiindex_dataframe_random_data.groupby(mapper1, level=1).sum()
mapped_level0 = np.array(
[mapper0.get(x) for x in deleveled["first"]], dtype=np.int64
)
mapped_level1 = np.array(
[mapper1.get(x) for x in deleveled["second"]], dtype=np.int64
)
expected0 = multiindex_dataframe_random_data.groupby(mapped_level0).sum()
expected1 = multiindex_dataframe_random_data.groupby(mapped_level1).sum()
expected0.index.name, expected1.index.name = "first", "second"
tm.assert_frame_equal(result0, expected0)
tm.assert_frame_equal(result1, expected1)
def test_groupby_level_nonmulti():
# GH 1313, GH 13901
s = Series([1, 2, 3, 10, 4, 5, 20, 6], Index([1, 2, 3, 1, 4, 5, 2, 6], name="foo"))
expected = Series([11, 22, 3, 4, 5, 6], Index(range(1, 7), name="foo"))
result = s.groupby(level=0).sum()
tm.assert_series_equal(result, expected)
result = s.groupby(level=[0]).sum()
tm.assert_series_equal(result, expected)
result = s.groupby(level=-1).sum()
tm.assert_series_equal(result, expected)
result = s.groupby(level=[-1]).sum()
tm.assert_series_equal(result, expected)
msg = "level > 0 or level < -1 only valid with MultiIndex"
with pytest.raises(ValueError, match=msg):
s.groupby(level=1)
with pytest.raises(ValueError, match=msg):
s.groupby(level=-2)
msg = "No group keys passed!"
with pytest.raises(ValueError, match=msg):
s.groupby(level=[])
msg = "multiple levels only valid with MultiIndex"
with pytest.raises(ValueError, match=msg):
s.groupby(level=[0, 0])
with pytest.raises(ValueError, match=msg):
s.groupby(level=[0, 1])
msg = "level > 0 or level < -1 only valid with MultiIndex"
with pytest.raises(ValueError, match=msg):
s.groupby(level=[1])
def test_groupby_complex():
# GH 12902
a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1])
expected = Series((1 + 2j, 5 + 10j))
result = a.groupby(level=0).sum()
tm.assert_series_equal(result, expected)
def test_groupby_complex_mean():
# GH 26475
df = DataFrame(
[
{"a": 2, "b": 1 + 2j},
{"a": 1, "b": 1 + 1j},
{"a": 1, "b": 1 + 2j},
]
)
result = df.groupby("b").mean()
expected = DataFrame(
[[1.0], [1.5]],
index=Index([(1 + 1j), (1 + 2j)], name="b"),
columns=Index(["a"]),
)
tm.assert_frame_equal(result, expected)
def test_groupby_complex_numbers(using_infer_string):
# GH 17927
df = DataFrame(
[
{"a": 1, "b": 1 + 1j},
{"a": 1, "b": 1 + 2j},
{"a": 4, "b": 1},
]
)
dtype = "string[pyarrow_numpy]" if using_infer_string else object
expected = DataFrame(
np.array([1, 1, 1], dtype=np.int64),
index=Index([(1 + 1j), (1 + 2j), (1 + 0j)], name="b"),
columns=Index(["a"], dtype=dtype),
)
result = df.groupby("b", sort=False).count()
tm.assert_frame_equal(result, expected)
# Sorted by the magnitude of the complex numbers
expected.index = Index([(1 + 0j), (1 + 1j), (1 + 2j)], name="b")
result = df.groupby("b", sort=True).count()
tm.assert_frame_equal(result, expected)
def test_groupby_series_indexed_differently():
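    # the grouping Series is realigned to the grouped Series' index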
s1 = Series(
[5.0, -9.0, 4.0, 100.0, -5.0, 55.0, 6.7],
index=Index(["a", "b", "c", "d", "e", "f", "g"]),
)
s2 = Series(
[1.0, 1.0, 4.0, 5.0, 5.0, 7.0], index=Index(["a", "b", "d", "f", "g", "h"])
)
grouped = s1.groupby(s2)
agged = grouped.mean()
exp = s1.groupby(s2.reindex(s1.index).get).mean()
tm.assert_series_equal(agged, exp)
def test_groupby_with_hier_columns():
tuples = list(
zip(
*[
["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
["one", "two", "one", "two", "one", "two", "one", "two"],
]
)
)
index = MultiIndex.from_tuples(tuples)
columns = MultiIndex.from_tuples(
[("A", "cat"), ("B", "dog"), ("B", "cat"), ("A", "dog")]
)
df = DataFrame(
np.random.default_rng(2).standard_normal((8, 4)), index=index, columns=columns
)
result = df.groupby(level=0).mean()
tm.assert_index_equal(result.columns, columns)
depr_msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
gb = df.groupby(level=0, axis=1)
result = gb.mean()
tm.assert_index_equal(result.index, df.index)
result = df.groupby(level=0).agg("mean")
tm.assert_index_equal(result.columns, columns)
result = df.groupby(level=0).apply(lambda x: x.mean())
tm.assert_index_equal(result.columns, columns)
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
gb = df.groupby(level=0, axis=1)
result = gb.agg(lambda x: x.mean(1))
tm.assert_index_equal(result.columns, Index(["A", "B"]))
tm.assert_index_equal(result.index, df.index)
# add a nuisance column
sorted_columns, _ = columns.sortlevel(0)
df["A", "foo"] = "bar"
result = df.groupby(level=0).mean(numeric_only=True)
tm.assert_index_equal(result.columns, df.columns[:-1])
def test_grouping_ndarray(df):
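    # grouping by a raw ndarray behaves like grouping by an unnamed Series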
grouped = df.groupby(df["A"].values)
result = grouped.sum()
expected = df.groupby(df["A"].rename(None)).sum()
tm.assert_frame_equal(result, expected)
def test_groupby_wrong_multi_labels():
index = Index([0, 1, 2, 3, 4], name="index")
data = DataFrame(
{
"foo": ["foo1", "foo1", "foo2", "foo1", "foo3"],
"bar": ["bar1", "bar2", "bar2", "bar1", "bar1"],
"baz": ["baz1", "baz1", "baz1", "baz2", "baz2"],
"spam": ["spam2", "spam3", "spam2", "spam1", "spam1"],
"data": [20, 30, 40, 50, 60],
},
index=index,
)
grouped = data.groupby(["foo", "bar", "baz", "spam"])
result = grouped.agg("mean")
expected = grouped.mean()
tm.assert_frame_equal(result, expected)
def test_groupby_series_with_name(df):
result = df.groupby(df["A"]).mean(numeric_only=True)
result2 = df.groupby(df["A"], as_index=False).mean(numeric_only=True)
assert result.index.name == "A"
assert "A" in result2
result = df.groupby([df["A"], df["B"]]).mean()
result2 = df.groupby([df["A"], df["B"]], as_index=False).mean()
assert result.index.names == ("A", "B")
assert "A" in result2
assert "B" in result2
def test_seriesgroupby_name_attr(df):
# GH 6265
result = df.groupby("A")["C"]
assert result.count().name == "C"
assert result.mean().name == "C"
testFunc = lambda x: np.sum(x) * 2
assert result.agg(testFunc).name == "C"
def test_consistency_name():
# GH 12363
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
"C": np.random.default_rng(2).standard_normal(8) + 1.0,
"D": np.arange(8),
}
)
expected = df.groupby(["A"]).B.count()
result = df.B.groupby(df.A).count()
tm.assert_series_equal(result, expected)
def test_groupby_name_propagation(df):
# GH 6124
def summarize(df, name=None):
return Series({"count": 1, "mean": 2, "omissions": 3}, name=name)
def summarize_random_name(df):
# Provide a different name for each Series. In this case, groupby
# should not attempt to propagate the Series name since they are
# inconsistent.
return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"])
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
metrics = df.groupby("A").apply(summarize)
assert metrics.columns.name is None
with tm.assert_produces_warning(DeprecationWarning, match=msg):
metrics = df.groupby("A").apply(summarize, "metrics")
assert metrics.columns.name == "metrics"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
metrics = df.groupby("A").apply(summarize_random_name)
assert metrics.columns.name is None
def test_groupby_nonstring_columns():
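    # an integer column label is a valid groupby key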
df = DataFrame([np.arange(10) for x in range(10)])
grouped = df.groupby(0)
result = grouped.mean()
expected = df.groupby(df[0]).mean()
tm.assert_frame_equal(result, expected)
def test_groupby_mixed_type_columns():
# GH 13432, unorderable types in py3
df = DataFrame([[0, 1, 2]], columns=["A", "B", 0])
expected = DataFrame([[1, 2]], columns=["B", 0], index=Index([0], name="A"))
result = df.groupby("A").first()
tm.assert_frame_equal(result, expected)
result = df.groupby("A").sum()
tm.assert_frame_equal(result, expected)
def test_cython_grouper_series_bug_noncontig():
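    # regression test: the Cython grouper must handle non-contiguous values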
arr = np.empty((100, 100))
arr.fill(np.nan)
obj = Series(arr[:, 0])
inds = np.tile(range(10), 10)
result = obj.groupby(inds).agg(Series.median)
assert result.isna().all()
def test_series_grouper_noncontig_index():
index = Index(["a" * 10] * 100)
values = Series(np.random.default_rng(2).standard_normal(50), index=index[::2])
labels = np.random.default_rng(2).integers(0, 5, 50)
# it works!
grouped = values.groupby(labels)
    # accessing the index elements used to cause a segfault here
f = lambda x: len(set(map(id, x.index)))
grouped.agg(f)
def test_convert_objects_leave_decimal_alone():
s = Series(range(5))
labels = np.array(["a", "b", "c", "d", "e"], dtype="O")
def convert_fast(x):
return Decimal(str(x.mean()))
def convert_force_pure(x):
        # x.values is a view on a larger array (non-empty base), which
        # forces the pure-Python aggregation path
        assert len(x.values.base) > 0
return Decimal(str(x.mean()))
grouped = s.groupby(labels)
result = grouped.agg(convert_fast)
assert result.dtype == np.object_
assert isinstance(result.iloc[0], Decimal)
result = grouped.agg(convert_force_pure)
assert result.dtype == np.object_
assert isinstance(result.iloc[0], Decimal)
def test_groupby_dtype_inference_empty():
# GH 6733
df = DataFrame({"x": [], "range": np.arange(0, dtype="int64")})
assert df["x"].dtype == np.float64
result = df.groupby("x").first()
exp_index = Index([], name="x", dtype=np.float64)
expected = DataFrame({"range": Series([], index=exp_index, dtype="int64")})
tm.assert_frame_equal(result, expected, by_blocks=True)
def test_groupby_uint64_float_conversion():
    # GH 30859: groupby used to convert uint64 to float, losing precision
df = DataFrame({"first": [1], "second": [1], "value": [16148277970000000000]})
result = df.groupby(["first", "second"])["value"].max()
expected = Series(
[16148277970000000000],
MultiIndex.from_product([[1], [1]], names=["first", "second"]),
name="value",
)
tm.assert_series_equal(result, expected)
def test_groupby_list_infer_array_like(df):
result = df.groupby(list(df["A"])).mean(numeric_only=True)
expected = df.groupby(df["A"]).mean(numeric_only=True)
tm.assert_frame_equal(result, expected, check_names=False)
with pytest.raises(KeyError, match=r"^'foo'$"):
df.groupby(list(df["A"][:-1]))
# pathological case of ambiguity
df = DataFrame(
{
"foo": [0, 1],
"bar": [3, 4],
"val": np.random.default_rng(2).standard_normal(2),
}
)
    result = df.groupby(["foo", "bar"]).mean()
    expected = df.groupby([df["foo"], df["bar"]]).mean()[["val"]]
    tm.assert_frame_equal(result, expected)
def test_groupby_keys_same_size_as_index():
# GH 11185
freq = "s"
index = date_range(
start=Timestamp("2015-09-29T11:34:44-0700"), periods=2, freq=freq
)
df = DataFrame([["A", 10], ["B", 15]], columns=["metric", "values"], index=index)
result = df.groupby([Grouper(level=0, freq=freq), "metric"]).mean()
expected = df.set_index([df.index, "metric"]).astype(float)
tm.assert_frame_equal(result, expected)
def test_groupby_one_row():
# GH 11741
msg = r"^'Z'$"
df1 = DataFrame(
np.random.default_rng(2).standard_normal((1, 4)), columns=list("ABCD")
)
with pytest.raises(KeyError, match=msg):
df1.groupby("Z")
df2 = DataFrame(
np.random.default_rng(2).standard_normal((2, 4)), columns=list("ABCD")
)
with pytest.raises(KeyError, match=msg):
df2.groupby("Z")
def test_groupby_nat_exclude():
# GH 6992
df = DataFrame(
{
"values": np.random.default_rng(2).standard_normal(8),
"dt": [
np.nan,
Timestamp("2013-01-01"),
np.nan,
Timestamp("2013-02-01"),
np.nan,
Timestamp("2013-02-01"),
np.nan,
Timestamp("2013-01-01"),
],
"str": [np.nan, "a", np.nan, "a", np.nan, "a", np.nan, "b"],
}
)
grouped = df.groupby("dt")
expected = [Index([1, 7]), Index([3, 5])]
keys = sorted(grouped.groups.keys())
assert len(keys) == 2
for k, e in zip(keys, expected):
# grouped.groups keys are np.datetime64 with system tz
# not to be affected by tz, only compare values
tm.assert_index_equal(grouped.groups[k], e)
# confirm obj is not filtered
tm.assert_frame_equal(grouped._grouper.groupings[0].obj, df)
assert grouped.ngroups == 2
expected = {
Timestamp("2013-01-01 00:00:00"): np.array([1, 7], dtype=np.intp),
Timestamp("2013-02-01 00:00:00"): np.array([3, 5], dtype=np.intp),
}
for k in grouped.indices:
tm.assert_numpy_array_equal(grouped.indices[k], expected[k])
tm.assert_frame_equal(grouped.get_group(Timestamp("2013-01-01")), df.iloc[[1, 7]])
tm.assert_frame_equal(grouped.get_group(Timestamp("2013-02-01")), df.iloc[[3, 5]])
with pytest.raises(KeyError, match=r"^NaT$"):
grouped.get_group(pd.NaT)
nan_df = DataFrame(
{"nan": [np.nan, np.nan, np.nan], "nat": [pd.NaT, pd.NaT, pd.NaT]}
)
assert nan_df["nan"].dtype == "float64"
assert nan_df["nat"].dtype == "datetime64[ns]"
for key in ["nan", "nat"]:
grouped = nan_df.groupby(key)
assert grouped.groups == {}
assert grouped.ngroups == 0
assert grouped.indices == {}
with pytest.raises(KeyError, match=r"^nan$"):
grouped.get_group(np.nan)
with pytest.raises(KeyError, match=r"^NaT$"):
grouped.get_group(pd.NaT)
def test_groupby_two_group_keys_all_nan():
# GH #36842: Grouping over two group keys shouldn't raise an error
df = DataFrame({"a": [np.nan, np.nan], "b": [np.nan, np.nan], "c": [1, 2]})
result = df.groupby(["a", "b"]).indices
assert result == {}
def test_groupby_2d_malformed():
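    # a frame built column-by-column with mixed dtypes should still
    # aggregate cleanly with numeric_only=True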
d = DataFrame(index=range(2))
d["group"] = ["g1", "g2"]
d["zeros"] = [0, 0]
d["ones"] = [1, 1]
d["label"] = ["l1", "l2"]
tmp = d.groupby(["group"]).mean(numeric_only=True)
res_values = np.array([[0.0, 1.0], [0.0, 1.0]])
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"]))
tm.assert_numpy_array_equal(tmp.values, res_values)
def test_int32_overflow():
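    # group code computation must not overflow int32 with many distinct keys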
B = np.concatenate((np.arange(10000), np.arange(10000), np.arange(5000)))
A = np.arange(25000)
df = DataFrame(
{
"A": A,
"B": B,
"C": A,
"D": B,
"E": np.random.default_rng(2).standard_normal(25000),
}
)
left = df.groupby(["A", "B", "C", "D"]).sum()
right = df.groupby(["D", "C", "B", "A"]).sum()
assert len(left) == len(right)
def test_groupby_sort_multi():
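    # sort=True should order the result index lexicographically by key tuple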
df = DataFrame(
{
"a": ["foo", "bar", "baz"],
"b": [3, 2, 1],
"c": [0, 1, 2],
"d": np.random.default_rng(2).standard_normal(3),
}
)
tups = [tuple(row) for row in df[["a", "b", "c"]].values]
tups = com.asarray_tuplesafe(tups)
result = df.groupby(["a", "b", "c"], sort=True).sum()
tm.assert_numpy_array_equal(result.index.values, tups[[1, 2, 0]])
tups = [tuple(row) for row in df[["c", "a", "b"]].values]
tups = com.asarray_tuplesafe(tups)
result = df.groupby(["c", "a", "b"], sort=True).sum()
tm.assert_numpy_array_equal(result.index.values, tups)
tups = [tuple(x) for x in df[["b", "c", "a"]].values]
tups = com.asarray_tuplesafe(tups)
result = df.groupby(["b", "c", "a"], sort=True).sum()
tm.assert_numpy_array_equal(result.index.values, tups[[2, 1, 0]])
df = DataFrame(
{
"a": [0, 1, 2, 0, 1, 2],
"b": [0, 0, 0, 1, 1, 1],
"d": np.random.default_rng(2).standard_normal(6),
}
)
grouped = df.groupby(["a", "b"])["d"]
result = grouped.sum()
def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
tups = [tuple(row) for row in df[keys].values]
tups = com.asarray_tuplesafe(tups)
expected = f(df.groupby(tups)[field])
for k, v in expected.items():
assert result[k] == v
_check_groupby(df, result, ["a", "b"], "d")
def test_dont_clobber_name_column():
df = DataFrame(
{"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2}
)
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
result = df.groupby("key", group_keys=False).apply(lambda x: x)
tm.assert_frame_equal(result, df)
def test_skip_group_keys():
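    # group_keys=False should keep apply results on the original index
    # instead of prepending the group keys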
tsf = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=10, freq="B"),
)
grouped = tsf.groupby(lambda x: x.month, group_keys=False)
result = grouped.apply(lambda x: x.sort_values(by="A")[:3])
pieces = [group.sort_values(by="A")[:3] for key, group in grouped]
expected = pd.concat(pieces)
tm.assert_frame_equal(result, expected)
grouped = tsf["A"].groupby(lambda x: x.month, group_keys=False)
result = grouped.apply(lambda x: x.sort_values()[:3])
pieces = [group.sort_values()[:3] for key, group in grouped]
expected = pd.concat(pieces)
tm.assert_series_equal(result, expected)
def test_no_nonsense_name(float_frame):
# GH #995
s = float_frame["C"].copy()
s.name = None
result = s.groupby(float_frame["A"]).agg("sum")
assert result.name is None
def test_multifunc_sum_bug():
# GH #1065
x = DataFrame(np.arange(9).reshape(3, 3))
x["test"] = 0
x["fl"] = [1.3, 1.5, 1.6]
grouped = x.groupby("test")
result = grouped.agg({"fl": "sum", 2: "size"})
assert result["fl"].dtype == np.float64
def test_handle_dict_return_value(df):
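    # a dict returned from apply should be treated like the equivalent Series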
def f(group):
return {"max": group.max(), "min": group.min()}
def g(group):
return Series({"max": group.max(), "min": group.min()})
result = df.groupby("A")["C"].apply(f)
expected = df.groupby("A")["C"].apply(g)
assert isinstance(result, Series)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("grouper", ["A", ["A", "B"]])
def test_set_group_name(df, grouper, using_infer_string):
def f(group):
assert group.name is not None
return group
def freduce(group):
assert group.name is not None
if using_infer_string and grouper == "A" and is_string_dtype(group.dtype):
with pytest.raises(TypeError, match="does not support"):
group.sum()
else:
return group.sum()
def freducex(x):
return freduce(x)
grouped = df.groupby(grouper, group_keys=False)
# make sure all these work
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
grouped.apply(f)
grouped.aggregate(freduce)
grouped.aggregate({"C": freduce, "D": freduce})
grouped.transform(f)
grouped["C"].apply(f)
grouped["C"].aggregate(freduce)
grouped["C"].aggregate([freduce, freducex])
grouped["C"].transform(f)
def test_group_name_available_in_inference_pass():
# gh-15062
df = DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": np.arange(6)})
names = []
def f(group):
names.append(group.name)
return group.copy()
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
df.groupby("a", sort=False, group_keys=False).apply(f)
expected_names = [0, 1, 2]
assert names == expected_names
def test_no_dummy_key_names(df):
# see gh-1291
result = df.groupby(df["A"].values).sum()
assert result.index.name is None
result = df.groupby([df["A"].values, df["B"].values]).sum()
assert result.index.names == (None, None)
def test_groupby_sort_multiindex_series():
# series multiindex groupby sort argument was not being passed through
# _compress_group_index
# GH 9444
index = MultiIndex(
levels=[[1, 2], [1, 2]],
codes=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]],
names=["a", "b"],
)
mseries = Series([0, 1, 2, 3, 4, 5], index=index)
index = MultiIndex(
levels=[[1, 2], [1, 2]], codes=[[0, 0, 1], [1, 0, 0]], names=["a", "b"]
)
mseries_result = Series([0, 2, 4], index=index)
result = mseries.groupby(level=["a", "b"], sort=False).first()
tm.assert_series_equal(result, mseries_result)
result = mseries.groupby(level=["a", "b"], sort=True).first()
tm.assert_series_equal(result, mseries_result.sort_index())
def test_groupby_reindex_inside_function():
periods = 1000
ind = date_range(start="2012/1/1", freq="5min", periods=periods)
df = DataFrame({"high": np.arange(periods), "low": np.arange(periods)}, index=ind)
def agg_before(func, fix=False):
"""
Run an aggregate func on the subset of data.
"""
def _func(data):
d = data.loc[data.index.map(lambda x: x.hour < 11)].dropna()
if fix:
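                # accessing the original frame here exercises the workaround
                # path (see the test name); both paths should agree below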
data[data.index[0]]
if len(d) == 0:
return None
return func(d)
return _func
grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
closure_bad = grouped.agg({"high": agg_before(np.max)})
closure_good = grouped.agg({"high": agg_before(np.max, True)})
tm.assert_frame_equal(closure_bad, closure_good)
def test_groupby_multiindex_missing_pair():
# GH9049
df = DataFrame(
{
"group1": ["a", "a", "a", "b"],
"group2": ["c", "c", "d", "c"],
"value": [1, 1, 1, 5],
}
)
df = df.set_index(["group1", "group2"])
df_grouped = df.groupby(level=["group1", "group2"], sort=True)
res = df_grouped.agg("sum")
idx = MultiIndex.from_tuples(
[("a", "c"), ("a", "d"), ("b", "c")], names=["group1", "group2"]
)
exp = DataFrame([[2], [1], [5]], index=idx, columns=["value"])
tm.assert_frame_equal(res, exp)
def test_groupby_multiindex_not_lexsorted():
# GH 11640
# define the lexsorted version
lexsorted_mi = MultiIndex.from_tuples(
[("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"]
)
lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
assert lexsorted_df.columns._is_lexsorted()
# define the non-lexsorted version
not_lexsorted_df = DataFrame(
columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]]
)
not_lexsorted_df = not_lexsorted_df.pivot_table(
index="a", columns=["b", "c"], values="d"
)
not_lexsorted_df = not_lexsorted_df.reset_index()
assert not not_lexsorted_df.columns._is_lexsorted()
expected = lexsorted_df.groupby("a").mean()
with tm.assert_produces_warning(PerformanceWarning):
result = not_lexsorted_df.groupby("a").mean()
tm.assert_frame_equal(expected, result)
# a transforming function should work regardless of sort
# GH 14776
df = DataFrame(
{"x": ["a", "a", "b", "a"], "y": [1, 1, 2, 2], "z": [1, 2, 3, 4]}
).set_index(["x", "y"])
assert not df.index._is_lexsorted()
for level in [0, 1, [0, 1]]:
for sort in [False, True]:
result = df.groupby(level=level, sort=sort, group_keys=False).apply(
DataFrame.drop_duplicates
)
expected = df
tm.assert_frame_equal(expected, result)
result = (
df.sort_index()
.groupby(level=level, sort=sort, group_keys=False)
.apply(DataFrame.drop_duplicates)
)
expected = df.sort_index()
tm.assert_frame_equal(expected, result)
def test_index_label_overlaps_location():
# checking we don't have any label/location confusion in the
# wake of GH5375
df = DataFrame(list("ABCDE"), index=[2, 0, 2, 1, 1])
g = df.groupby(list("ababb"))
actual = g.filter(lambda x: len(x) > 2)
expected = df.iloc[[1, 3, 4]]
tm.assert_frame_equal(actual, expected)
ser = df[0]
g = ser.groupby(list("ababb"))
actual = g.filter(lambda x: len(x) > 2)
expected = ser.take([1, 3, 4])
tm.assert_series_equal(actual, expected)
# and again, with a generic Index of floats
df.index = df.index.astype(float)
g = df.groupby(list("ababb"))
actual = g.filter(lambda x: len(x) > 2)
expected = df.iloc[[1, 3, 4]]
tm.assert_frame_equal(actual, expected)
ser = df[0]
g = ser.groupby(list("ababb"))
actual = g.filter(lambda x: len(x) > 2)
expected = ser.take([1, 3, 4])
tm.assert_series_equal(actual, expected)
def test_transform_doesnt_clobber_ints():
# GH 7972
n = 6
x = np.arange(n)
df = DataFrame({"a": x // 2, "b": 2.0 * x, "c": 3.0 * x})
df2 = DataFrame({"a": x // 2 * 1.0, "b": 2.0 * x, "c": 3.0 * x})
gb = df.groupby("a")
result = gb.transform("mean")
gb2 = df2.groupby("a")
expected = gb2.transform("mean")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"sort_column",
["ints", "floats", "strings", ["ints", "floats"], ["ints", "strings"]],
)
@pytest.mark.parametrize(
"group_column", ["int_groups", "string_groups", ["int_groups", "string_groups"]]
)
def test_groupby_preserves_sort(sort_column, group_column):
# Test to ensure that groupby always preserves sort order of original
# object. Issue #8588 and #9651
df = DataFrame(
{
"int_groups": [3, 1, 0, 1, 0, 3, 3, 3],
"string_groups": ["z", "a", "z", "a", "a", "g", "g", "g"],
"ints": [8, 7, 4, 5, 2, 9, 1, 1],
"floats": [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5],
"strings": ["z", "d", "a", "e", "word", "word2", "42", "47"],
}
)
# Try sorting on different types and with different group types
df = df.sort_values(by=sort_column)
g = df.groupby(group_column)
def test_sort(x):
tm.assert_frame_equal(x, x.sort_values(by=sort_column))
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
g.apply(test_sort)
def test_pivot_table_values_key_error():
# This test is designed to replicate the error in issue #14938
df = DataFrame(
{
"eventDate": date_range(datetime.today(), periods=20, freq="ME").tolist(),
"thename": range(20),
}
)
df["year"] = df.set_index("eventDate").index.year
df["month"] = df.set_index("eventDate").index.month
with pytest.raises(KeyError, match="'badname'"):
df.reset_index().pivot_table(
index="year", columns="month", values="badname", aggfunc="count"
)
@pytest.mark.parametrize("columns", ["C", ["C"]])
@pytest.mark.parametrize("keys", [["A"], ["A", "B"]])
@pytest.mark.parametrize(
"values",
[
[True],
[0],
[0.0],
["a"],
Categorical([0]),
[to_datetime(0)],
date_range(0, 1, 1, tz="US/Eastern"),
pd.period_range("2016-01-01", periods=3, freq="D"),
pd.array([0], dtype="Int64"),
pd.array([0], dtype="Float64"),
pd.array([False], dtype="boolean"),
],
ids=[
"bool",
"int",
"float",
"str",
"cat",
"dt64",
"dt64tz",
"period",
"Int64",
"Float64",
"boolean",
],
)
@pytest.mark.parametrize("method", ["attr", "agg", "apply"])
@pytest.mark.parametrize(
"op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"]
)
def test_empty_groupby(
columns, keys, values, method, op, using_array_manager, dropna, using_infer_string
):
# GH8093 & GH26411
override_dtype = None
if isinstance(values, BooleanArray) and op in ["sum", "prod"]:
# We expect to get Int64 back for these
override_dtype = "Int64"
if isinstance(values[0], bool) and op in ("prod", "sum"):
# sum/product of bools is an integer
override_dtype = "int64"
df = DataFrame({"A": values, "B": values, "C": values}, columns=list("ABC"))
if hasattr(values, "dtype"):
# check that we did the construction right
assert (df.dtypes == values.dtype).all()
df = df.iloc[:0]
gb = df.groupby(keys, group_keys=False, dropna=dropna, observed=False)[columns]

    def get_result(**kwargs):
if method == "attr":
return getattr(gb, op)(**kwargs)
else:
return getattr(gb, method)(op, **kwargs)

    def get_categorical_invalid_expected():
        # Categorical is special: without 'observed=True' we get a NaN entry
        # corresponding to the unobserved group. If we passed observed=True
        # to groupby, expected would just be 'df.set_index(keys)[columns]'
        # as below.
lev = Categorical([0], dtype=values.dtype)
if len(keys) != 1:
idx = MultiIndex.from_product([lev, lev], names=keys)
else:
# all columns are dropped, but we end up with one row
# Categorical is special without 'observed=True'
idx = Index(lev, name=keys[0])
if using_infer_string:
columns = Index([], dtype="string[pyarrow_numpy]")
else:
columns = []
expected = DataFrame([], columns=columns, index=idx)
return expected

    is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype)
is_dt64 = df.dtypes.iloc[0].kind == "M"
is_cat = isinstance(values, Categorical)
if (
isinstance(values, Categorical)
and not values.ordered
and op in ["min", "max", "idxmin", "idxmax"]
):
if op in ["min", "max"]:
msg = f"Cannot perform {op} with non-ordered Categorical"
klass = TypeError
else:
msg = f"Can't get {op} of an empty group due to unobserved categories"
klass = ValueError
with pytest.raises(klass, match=msg):
get_result()
if op in ["min", "max", "idxmin", "idxmax"] and isinstance(columns, list):
        # i.e. DataFrameGroupBy, not SeriesGroupBy
result = get_result(numeric_only=True)
expected = get_categorical_invalid_expected()
tm.assert_equal(result, expected)
return
if op in ["prod", "sum", "skew"]:
# ops that require more than just ordered-ness
if is_dt64 or is_cat or is_per:
# GH#41291
# datetime64 -> prod and sum are invalid
if is_dt64:
msg = "datetime64 type does not support"
elif is_per:
msg = "Period type does not support"
else:
msg = "category type does not support"
if op == "skew":
msg = "|".join([msg, "does not support reduction 'skew'"])
with pytest.raises(TypeError, match=msg):
get_result()
if not isinstance(columns, list):
# i.e. SeriesGroupBy
return
elif op == "skew":
# TODO: test the numeric_only=True case
return
else:
            # i.e. op in ["prod", "sum"]
# i.e. DataFrameGroupBy
# ops that require more than just ordered-ness
# GH#41291
result = get_result(numeric_only=True)
# with numeric_only=True, these are dropped, and we get
# an empty DataFrame back
expected = df.set_index(keys)[[]]
if is_cat:
expected = get_categorical_invalid_expected()
tm.assert_equal(result, expected)
return
result = get_result()
expected = df.set_index(keys)[columns]
if op in ["idxmax", "idxmin"]:
expected = expected.astype(df.index.dtype)
if override_dtype is not None:
expected = expected.astype(override_dtype)
if len(keys) == 1:
expected.index.name = keys[0]
tm.assert_equal(result, expected)


def test_empty_groupby_apply_nonunique_columns():
# GH#44417
df = DataFrame(np.random.default_rng(2).standard_normal((0, 4)))
df[3] = df[3].astype(np.int64)
df.columns = [0, 1, 2, 0]
gb = df.groupby(df[1], group_keys=False)
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
res = gb.apply(lambda x: x)
assert (res.dtypes == df.dtypes).all()


def test_tuple_as_grouping():
# https://github.com/pandas-dev/pandas/issues/18314
df = DataFrame(
{
("a", "b"): [1, 1, 1, 1],
"a": [2, 2, 2, 2],
"b": [2, 2, 2, 2],
"c": [1, 1, 1, 1],
}
)
with pytest.raises(KeyError, match=r"('a', 'b')"):
df[["a", "b", "c"]].groupby(("a", "b"))
result = df.groupby(("a", "b"))["c"].sum()
expected = Series([4], name="c", index=Index([1], name=("a", "b")))
tm.assert_series_equal(result, expected)


def test_tuple_correct_keyerror():
# https://github.com/pandas-dev/pandas/issues/18798
df = DataFrame(1, index=range(3), columns=MultiIndex.from_product([[1, 2], [3, 4]]))
with pytest.raises(KeyError, match=r"^\(7, 8\)$"):
df.groupby((7, 8)).mean()


def test_groupby_agg_ohlc_non_first():
# GH 21716
df = DataFrame(
[[1], [1]],
columns=Index(["foo"], name="mycols"),
index=date_range("2018-01-01", periods=2, freq="D", name="dti"),
)
expected = DataFrame(
[[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]],
columns=MultiIndex.from_tuples(
(
("foo", "sum", "foo"),
("foo", "ohlc", "open"),
("foo", "ohlc", "high"),
("foo", "ohlc", "low"),
("foo", "ohlc", "close"),
),
names=["mycols", None, None],
),
index=date_range("2018-01-01", periods=2, freq="D", name="dti"),
)
result = df.groupby(Grouper(freq="D")).agg(["sum", "ohlc"])
tm.assert_frame_equal(result, expected)


def test_groupby_multiindex_nat():
# GH 9236
values = [
(pd.NaT, "a"),
(datetime(2012, 1, 2), "a"),
(datetime(2012, 1, 2), "b"),
(datetime(2012, 1, 3), "a"),
]
mi = MultiIndex.from_tuples(values, names=["date", None])
ser = Series([3, 2, 2.5, 4], index=mi)
result = ser.groupby(level=1).mean()
expected = Series([3.0, 2.5], index=["a", "b"])
tm.assert_series_equal(result, expected)


def test_groupby_empty_list_raises():
# GH 5289
values = zip(range(10), range(10))
df = DataFrame(values, columns=["apple", "b"])
msg = "Grouper and axis must be same length"
with pytest.raises(ValueError, match=msg):
df.groupby([[]])


def test_groupby_multiindex_series_keys_len_equal_group_axis():
# GH 25704
index_array = [["x", "x"], ["a", "b"], ["k", "k"]]
index_names = ["first", "second", "third"]
ri = MultiIndex.from_arrays(index_array, names=index_names)
s = Series(data=[1, 2], index=ri)
result = s.groupby(["first", "third"]).sum()
index_array = [["x"], ["k"]]
index_names = ["first", "third"]
ei = MultiIndex.from_arrays(index_array, names=index_names)
expected = Series([3], index=ei)
tm.assert_series_equal(result, expected)


def test_groupby_groups_in_BaseGrouper():
# GH 26326
# Test if DataFrame grouped with a pandas.Grouper has correct groups
mi = MultiIndex.from_product([["A", "B"], ["C", "D"]], names=["alpha", "beta"])
df = DataFrame({"foo": [1, 2, 1, 2], "bar": [1, 2, 3, 4]}, index=mi)
result = df.groupby([Grouper(level="alpha"), "beta"])
expected = df.groupby(["alpha", "beta"])
assert result.groups == expected.groups
result = df.groupby(["beta", Grouper(level="alpha")])
expected = df.groupby(["beta", "alpha"])
assert result.groups == expected.groups


@pytest.mark.parametrize("group_name", ["x", ["x"]])
def test_groupby_axis_1(group_name):
# GH 27614
df = DataFrame(
np.arange(12).reshape(3, 4), index=[0, 1, 0], columns=[10, 20, 10, 20]
)
df.index.name = "y"
df.columns.name = "x"
depr_msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
gb = df.groupby(group_name, axis=1)
results = gb.sum()
expected = df.T.groupby(group_name).sum().T
tm.assert_frame_equal(results, expected)
# test on MI column
iterables = [["bar", "baz", "foo"], ["one", "two"]]
mi = MultiIndex.from_product(iterables=iterables, names=["x", "x1"])
df = DataFrame(np.arange(18).reshape(3, 6), index=[0, 1, 0], columns=mi)
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
gb = df.groupby(group_name, axis=1)
results = gb.sum()
expected = df.T.groupby(group_name).sum().T
tm.assert_frame_equal(results, expected)


@pytest.mark.parametrize(
"op, expected",
[
(
"shift",
{
"time": [
None,
None,
Timestamp("2019-01-01 12:00:00"),
Timestamp("2019-01-01 12:30:00"),
None,
None,
]
},
),
(
"bfill",
{
"time": [
Timestamp("2019-01-01 12:00:00"),
Timestamp("2019-01-01 12:30:00"),
Timestamp("2019-01-01 14:00:00"),
Timestamp("2019-01-01 14:30:00"),
Timestamp("2019-01-01 14:00:00"),
Timestamp("2019-01-01 14:30:00"),
]
},
),
(
"ffill",
{
"time": [
Timestamp("2019-01-01 12:00:00"),
Timestamp("2019-01-01 12:30:00"),
Timestamp("2019-01-01 12:00:00"),
Timestamp("2019-01-01 12:30:00"),
Timestamp("2019-01-01 14:00:00"),
Timestamp("2019-01-01 14:30:00"),
]
},
),
],
)
def test_shift_bfill_ffill_tz(tz_naive_fixture, op, expected):
    # GH19995, GH27992: check that the timezone is not dropped by
    # shift, bfill, and ffill
tz = tz_naive_fixture
data = {
"id": ["A", "B", "A", "B", "A", "B"],
"time": [
Timestamp("2019-01-01 12:00:00"),
Timestamp("2019-01-01 12:30:00"),
None,
None,
Timestamp("2019-01-01 14:00:00"),
Timestamp("2019-01-01 14:30:00"),
],
}
df = DataFrame(data).assign(time=lambda x: x.time.dt.tz_localize(tz))
grouped = df.groupby("id")
result = getattr(grouped, op)()
expected = DataFrame(expected).assign(time=lambda x: x.time.dt.tz_localize(tz))
tm.assert_frame_equal(result, expected)


def test_groupby_only_none_group():
# see GH21624
# this was crashing with "ValueError: Length of passed values is 1, index implies 0"
df = DataFrame({"g": [None], "x": 1})
actual = df.groupby("g")["x"].transform("sum")
expected = Series([np.nan], name="x")
tm.assert_series_equal(actual, expected)


def test_groupby_duplicate_index():
# GH#29189 the groupby call here used to raise
ser = Series([2, 5, 6, 8], index=[2.0, 4.0, 4.0, 5.0])
gb = ser.groupby(level=0)
result = gb.mean()
expected = Series([2, 5.5, 8], index=[2.0, 4.0, 5.0])
tm.assert_series_equal(result, expected)


def test_group_on_empty_multiindex(transformation_func, request):
# GH 47787
# With one row, those are transforms so the schema should be the same
df = DataFrame(
data=[[1, Timestamp("today"), 3, 4]],
columns=["col_1", "col_2", "col_3", "col_4"],
)
df["col_3"] = df["col_3"].astype(int)
df["col_4"] = df["col_4"].astype(int)
df = df.set_index(["col_1", "col_2"])
if transformation_func == "fillna":
args = ("ffill",)
else:
args = ()
warn = FutureWarning if transformation_func == "fillna" else None
warn_msg = "DataFrameGroupBy.fillna is deprecated"
with tm.assert_produces_warning(warn, match=warn_msg):
result = df.iloc[:0].groupby(["col_1"]).transform(transformation_func, *args)
with tm.assert_produces_warning(warn, match=warn_msg):
expected = df.groupby(["col_1"]).transform(transformation_func, *args).iloc[:0]
if transformation_func in ("diff", "shift"):
expected = expected.astype(int)
tm.assert_equal(result, expected)
warn_msg = "SeriesGroupBy.fillna is deprecated"
with tm.assert_produces_warning(warn, match=warn_msg):
result = (
df["col_3"]
.iloc[:0]
.groupby(["col_1"])
.transform(transformation_func, *args)
)
warn_msg = "SeriesGroupBy.fillna is deprecated"
with tm.assert_produces_warning(warn, match=warn_msg):
expected = (
df["col_3"]
.groupby(["col_1"])
.transform(transformation_func, *args)
.iloc[:0]
)
if transformation_func in ("diff", "shift"):
expected = expected.astype(int)
tm.assert_equal(result, expected)


def test_groupby_crash_on_nunique(axis):
    # Fix following GH#30253
dti = date_range("2016-01-01", periods=2, name="foo")
df = DataFrame({("A", "B"): [1, 2], ("A", "C"): [1, 3], ("D", "B"): [0, 0]})
df.columns.names = ("bar", "baz")
df.index = dti
axis_number = df._get_axis_number(axis)
if not axis_number:
df = df.T
msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
else:
msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
gb = df.groupby(axis=axis_number, level=0)
result = gb.nunique()
expected = DataFrame({"A": [1, 2], "D": [1, 1]}, index=dti)
expected.columns.name = "bar"
if not axis_number:
expected = expected.T
tm.assert_frame_equal(result, expected)
if axis_number == 0:
# same thing, but empty columns
with tm.assert_produces_warning(FutureWarning, match=msg):
gb2 = df[[]].groupby(axis=axis_number, level=0)
exp = expected[[]]
else:
# same thing, but empty rows
with tm.assert_produces_warning(FutureWarning, match=msg):
gb2 = df.loc[[]].groupby(axis=axis_number, level=0)
        # for empty input, the default dtype when none can be inferred is float64
exp = expected.loc[[]].astype(np.float64)
res = gb2.nunique()
tm.assert_frame_equal(res, exp)


def test_groupby_list_level():
# GH 9790
expected = DataFrame(np.arange(0, 9).reshape(3, 3), dtype=float)
result = expected.groupby(level=[0]).mean()
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"max_seq_items, expected",
[
(5, "{0: [0], 1: [1], 2: [2], 3: [3], 4: [4]}"),
(4, "{0: [0], 1: [1], 2: [2], 3: [3], ...}"),
(1, "{0: [0], ...}"),
],
)
def test_groups_repr_truncates(max_seq_items, expected):
# GH 1135
df = DataFrame(np.random.default_rng(2).standard_normal((5, 1)))
df["a"] = df.index
with pd.option_context("display.max_seq_items", max_seq_items):
result = df.groupby("a").groups.__repr__()
assert result == expected
result = df.groupby(np.array(df.a)).groups.__repr__()
assert result == expected


def test_group_on_two_row_multiindex_returns_one_tuple_key():
# GH 18451
df = DataFrame([{"a": 1, "b": 2, "c": 99}, {"a": 1, "b": 2, "c": 88}])
df = df.set_index(["a", "b"])
grp = df.groupby(["a", "b"])
result = grp.indices
expected = {(1, 2): np.array([0, 1], dtype=np.int64)}
assert len(result) == 1
key = (1, 2)
assert (result[key] == expected[key]).all()


@pytest.mark.parametrize(
"klass, attr, value",
[
(DataFrame, "level", "a"),
(DataFrame, "as_index", False),
(DataFrame, "sort", False),
(DataFrame, "group_keys", False),
(DataFrame, "observed", True),
(DataFrame, "dropna", False),
(Series, "level", "a"),
(Series, "as_index", False),
(Series, "sort", False),
(Series, "group_keys", False),
(Series, "observed", True),
(Series, "dropna", False),
],
)
def test_subsetting_columns_keeps_attrs(klass, attr, value):
# GH 9959 - When subsetting columns, don't drop attributes
df = DataFrame({"a": [1], "b": [2], "c": [3]})
if attr != "axis":
df = df.set_index("a")
expected = df.groupby("a", **{attr: value})
result = expected[["b"]] if klass is DataFrame else expected["b"]
assert getattr(result, attr) == getattr(expected, attr)


def test_subsetting_columns_axis_1():
# GH 37725
df = DataFrame({"A": [1], "B": [2], "C": [3]})
msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
g = df.groupby([0, 0, 1], axis=1)
match = "Cannot subset columns when using axis=1"
with pytest.raises(ValueError, match=match):
g[["A", "B"]].sum()


@pytest.mark.parametrize("func", ["sum", "any", "shift"])
def test_groupby_column_index_name_lost(func):
    # GH 29764: groupby sometimes loses the column index name
expected = Index(["a"], name="idx")
df = DataFrame([[1]], columns=expected)
df_grouped = df.groupby([1])
result = getattr(df_grouped, func)().columns
tm.assert_index_equal(result, expected)


@pytest.mark.parametrize(
"infer_string",
[
False,
pytest.param(True, marks=td.skip_if_no("pyarrow")),
],
)
def test_groupby_duplicate_columns(infer_string):
# GH: 31735
if infer_string:
pytest.importorskip("pyarrow")
df = DataFrame(
{"A": ["f", "e", "g", "h"], "B": ["a", "b", "c", "d"], "C": [1, 2, 3, 4]}
).astype(object)
df.columns = ["A", "B", "B"]
with pd.option_context("future.infer_string", infer_string):
result = df.groupby([0, 0, 0, 0]).min()
expected = DataFrame(
[["e", "a", 1]], index=np.array([0]), columns=["A", "B", "B"], dtype=object
)
tm.assert_frame_equal(result, expected)


def test_groupby_series_with_tuple_name():
# GH 37755
ser = Series([1, 2, 3, 4], index=[1, 1, 2, 2], name=("a", "a"))
ser.index.name = ("b", "b")
result = ser.groupby(level=0).last()
expected = Series([2, 4], index=[1, 2], name=("a", "a"))
expected.index.name = ("b", "b")
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
"func, values", [("sum", [97.0, 98.0]), ("mean", [24.25, 24.5])]
)
def test_groupby_numerical_stability_sum_mean(func, values):
# GH#38778
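    # naive left-to-right float64 accumulation would lose the small addends
    # (97/98) next to 1e16; the grouped ops use compensated (Kahan) summation,
    # so the exact sums/means below are recoverable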
data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15]
df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})
result = getattr(df.groupby("group"), func)()
expected = DataFrame({"a": values, "b": values}, index=Index([1, 2], name="group"))
tm.assert_frame_equal(result, expected)


def test_groupby_numerical_stability_cumsum():
# GH#38934
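    # group_cumsum likewise uses Kahan summation, which is why check_exact=True
    # below is expected to pass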
data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15]
df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})
result = df.groupby("group").cumsum()
exp_data = (
[1e16] * 2 + [1e16 + 96, 1e16 + 98] + [5e15 + 97, 5e15 + 98] + [97.0, 98.0]
)
expected = DataFrame({"a": exp_data, "b": exp_data})
tm.assert_frame_equal(result, expected, check_exact=True)


def test_groupby_cumsum_skipna_false():
# GH#46216 don't propagate np.nan above the diagonal
arr = np.random.default_rng(2).standard_normal((5, 5))
df = DataFrame(arr)
for i in range(5):
df.iloc[i, i] = np.nan
df["A"] = 1
gb = df.groupby("A")
res = gb.cumsum(skipna=False)
expected = df[[0, 1, 2, 3, 4]].cumsum(skipna=False)
tm.assert_frame_equal(res, expected)


def test_groupby_cumsum_timedelta64():
# GH#46216 don't ignore is_datetimelike in libgroupby.group_cumsum
dti = date_range("2016-01-01", periods=5)
ser = Series(dti) - dti[0]
ser[2] = pd.NaT
df = DataFrame({"A": 1, "B": ser})
gb = df.groupby("A")
res = gb.cumsum(numeric_only=False, skipna=True)
exp = DataFrame({"B": [ser[0], ser[1], pd.NaT, ser[4], ser[4] * 2]})
tm.assert_frame_equal(res, exp)
res = gb.cumsum(numeric_only=False, skipna=False)
exp = DataFrame({"B": [ser[0], ser[1], pd.NaT, pd.NaT, pd.NaT]})
tm.assert_frame_equal(res, exp)


def test_groupby_mean_duplicate_index(rand_series_with_duplicate_datetimeindex):
dups = rand_series_with_duplicate_datetimeindex
result = dups.groupby(level=0).mean()
expected = dups.groupby(dups.index).mean()
tm.assert_series_equal(result, expected)


def test_groupby_all_nan_groups_drop():
# GH 15036
s = Series([1, 2, 3], [np.nan, np.nan, np.nan])
result = s.groupby(s.index).sum()
expected = Series([], index=Index([], dtype=np.float64), dtype=np.int64)
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("numeric_only", [True, False])
def test_groupby_empty_multi_column(as_index, numeric_only):
# GH 15106 & GH 41998
df = DataFrame(data=[], columns=["A", "B", "C"])
gb = df.groupby(["A", "B"], as_index=as_index)
result = gb.sum(numeric_only=numeric_only)
if as_index:
index = MultiIndex([[], []], [[], []], names=["A", "B"])
columns = ["C"] if not numeric_only else []
else:
index = RangeIndex(0)
columns = ["A", "B", "C"] if not numeric_only else ["A", "B"]
expected = DataFrame([], columns=columns, index=index)
tm.assert_frame_equal(result, expected)


def test_groupby_aggregation_non_numeric_dtype():
# GH #43108
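    # object-dtype sum falls back to Python "+", so summing list values
    # concatenates them within each group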
df = DataFrame(
[["M", [1]], ["M", [1]], ["W", [10]], ["W", [20]]], columns=["MW", "v"]
)
expected = DataFrame(
{
"v": [[1, 1], [10, 20]],
},
index=Index(["M", "W"], dtype="object", name="MW"),
)
gb = df.groupby(by=["MW"])
result = gb.sum()
tm.assert_frame_equal(result, expected)


def test_groupby_aggregation_multi_non_numeric_dtype():
# GH #42395
df = DataFrame(
{
"x": [1, 0, 1, 1, 0],
"y": [Timedelta(i, "days") for i in range(1, 6)],
"z": [Timedelta(i * 10, "days") for i in range(1, 6)],
}
)
expected = DataFrame(
{
"y": [Timedelta(i, "days") for i in range(7, 9)],
"z": [Timedelta(i * 10, "days") for i in range(7, 9)],
},
index=Index([0, 1], dtype="int64", name="x"),
)
gb = df.groupby(by=["x"])
result = gb.sum()
tm.assert_frame_equal(result, expected)


def test_groupby_aggregation_numeric_with_non_numeric_dtype():
# GH #43108
df = DataFrame(
{
"x": [1, 0, 1, 1, 0],
"y": [Timedelta(i, "days") for i in range(1, 6)],
"z": list(range(1, 6)),
}
)
expected = DataFrame(
{"y": [Timedelta(7, "days"), Timedelta(8, "days")], "z": [7, 8]},
index=Index([0, 1], dtype="int64", name="x"),
)
gb = df.groupby(by=["x"])
result = gb.sum()
tm.assert_frame_equal(result, expected)


def test_groupby_filtered_df_std():
# GH 16174
dicts = [
{"filter_col": False, "groupby_col": True, "bool_col": True, "float_col": 10.5},
{"filter_col": True, "groupby_col": True, "bool_col": True, "float_col": 20.5},
{"filter_col": True, "groupby_col": True, "bool_col": True, "float_col": 30.5},
]
df = DataFrame(dicts)
df_filter = df[df["filter_col"] == True] # noqa: E712
dfgb = df_filter.groupby("groupby_col")
result = dfgb.std()
expected = DataFrame(
[[0.0, 0.0, 7.071068]],
columns=["filter_col", "bool_col", "float_col"],
index=Index([True], name="groupby_col"),
)
tm.assert_frame_equal(result, expected)


def test_datetime_categorical_multikey_groupby_indices():
# GH 26859
df = DataFrame(
{
"a": Series(list("abc")),
"b": Series(
to_datetime(["2018-01-01", "2018-02-01", "2018-03-01"]),
dtype="category",
),
"c": Categorical.from_codes([-1, 0, 1], categories=[0, 1]),
}
)
result = df.groupby(["a", "b"], observed=False).indices
expected = {
("a", Timestamp("2018-01-01 00:00:00")): np.array([0]),
("b", Timestamp("2018-02-01 00:00:00")): np.array([1]),
("c", Timestamp("2018-03-01 00:00:00")): np.array([2]),
}
assert result == expected


def test_rolling_wrong_param_min_period():
# GH34037
name_l = ["Alice"] * 5 + ["Bob"] * 5
val_l = [np.nan, np.nan, 1, 2, 3] + [np.nan, 1, 2, 3, 4]
test_df = DataFrame([name_l, val_l]).T
test_df.columns = ["name", "val"]
result_error_msg = r"__init__\(\) got an unexpected keyword argument 'min_period'"
with pytest.raises(TypeError, match=result_error_msg):
test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum()


@pytest.mark.parametrize(
"dtype",
[
object,
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
],
)
def test_by_column_values_with_same_starting_value(dtype):
# GH29635
df = DataFrame(
{
"Name": ["Thomas", "Thomas", "Thomas John"],
"Credit": [1200, 1300, 900],
"Mood": Series(["sad", "happy", "happy"], dtype=dtype),
}
)
aggregate_details = {"Mood": Series.mode, "Credit": "sum"}
result = df.groupby(["Name"]).agg(aggregate_details)
expected_result = DataFrame(
{
"Mood": [["happy", "sad"], "happy"],
"Credit": [2500, 900],
"Name": ["Thomas", "Thomas John"],
}
).set_index("Name")
tm.assert_frame_equal(result, expected_result)


def test_groupby_none_in_first_mi_level():
# GH#47348
arr = [[None, 1, 0, 1], [2, 3, 2, 3]]
ser = Series(1, index=MultiIndex.from_arrays(arr, names=["a", "b"]))
result = ser.groupby(level=[0, 1]).sum()
expected = Series(
[1, 2], MultiIndex.from_tuples([(0.0, 2), (1.0, 3)], names=["a", "b"])
)
tm.assert_series_equal(result, expected)


def test_groupby_none_column_name():
# GH#47348
df = DataFrame({None: [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [4, 5, 6, 7]})
result = df.groupby(by=[None]).sum()
expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=None))
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("selection", [None, "a", ["a"]])
def test_single_element_list_grouping(selection):
# GH#42795, GH#53500
df = DataFrame({"a": [1, 2], "b": [np.nan, 5], "c": [np.nan, 2]}, index=["x", "y"])
grouped = df.groupby(["a"]) if selection is None else df.groupby(["a"])[selection]
result = [key for key, _ in grouped]
expected = [(1,), (2,)]
assert result == expected


def test_groupby_string_dtype():
# GH 40148
df = DataFrame({"str_col": ["a", "b", "c", "a"], "num_col": [1, 2, 3, 2]})
df["str_col"] = df["str_col"].astype("string")
expected = DataFrame(
{
"str_col": [
"a",
"b",
"c",
],
"num_col": [1.5, 2.0, 3.0],
}
)
expected["str_col"] = expected["str_col"].astype("string")
grouped = df.groupby("str_col", as_index=False)
result = grouped.mean()
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"level_arg, multiindex", [([0], False), ((0,), False), ([0], True), ((0,), True)]
)
def test_single_element_listlike_level_grouping_deprecation(level_arg, multiindex):
# GH 51583
df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"])
if multiindex:
df = df.set_index(["a", "b"])
depr_msg = (
"Creating a Groupby object with a length-1 list-like "
"level parameter will yield indexes as tuples in a future version. "
"To keep indexes as scalars, create Groupby objects with "
"a scalar level parameter instead."
)
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
[key for key, _ in df.groupby(level=level_arg)]


@pytest.mark.parametrize("func", ["sum", "cumsum", "cumprod", "prod"])
def test_groupby_avoid_casting_to_float(func):
# GH#37493
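    # val exceeds 2**53, so a round trip through float64 would not be exact;
    # subtracting val recovers 0 only if the op stayed in integer space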
val = 922337203685477580
df = DataFrame({"a": 1, "b": [val]})
result = getattr(df.groupby("a"), func)() - val
expected = DataFrame({"b": [0]}, index=Index([1], name="a"))
if func in ["cumsum", "cumprod"]:
expected = expected.reset_index(drop=True)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("func, val", [("sum", 3), ("prod", 2)])
def test_groupby_sum_support_mask(any_numeric_ea_dtype, func, val):
# GH#37493
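    # pd.NA is skipped by default (min_count=0), and the result should keep
    # the nullable (masked) dtype rather than fall back to numpy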
df = DataFrame({"a": 1, "b": [1, 2, pd.NA]}, dtype=any_numeric_ea_dtype)
result = getattr(df.groupby("a"), func)()
expected = DataFrame(
{"b": [val]},
index=Index([1], name="a", dtype=any_numeric_ea_dtype),
dtype=any_numeric_ea_dtype,
)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("val, dtype", [(111, "int"), (222, "uint")])
def test_groupby_overflow(val, dtype):
# GH#37493
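    # 8-bit inputs must accumulate in 64 bits: val * 2 and val * val overflow
    # int8/uint8 but fit comfortably in the 64-bit result dtype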
df = DataFrame({"a": 1, "b": [val, val]}, dtype=f"{dtype}8")
result = df.groupby("a").sum()
expected = DataFrame(
{"b": [val * 2]},
index=Index([1], name="a", dtype=f"{dtype}8"),
dtype=f"{dtype}64",
)
tm.assert_frame_equal(result, expected)
result = df.groupby("a").cumsum()
expected = DataFrame({"b": [val, val * 2]}, dtype=f"{dtype}64")
tm.assert_frame_equal(result, expected)
result = df.groupby("a").prod()
expected = DataFrame(
{"b": [val * val]},
index=Index([1], name="a", dtype=f"{dtype}8"),
dtype=f"{dtype}64",
)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("skipna, val", [(True, 3), (False, pd.NA)])
def test_groupby_cumsum_mask(any_numeric_ea_dtype, skipna, val):
# GH#37493
df = DataFrame({"a": 1, "b": [1, pd.NA, 2]}, dtype=any_numeric_ea_dtype)
result = df.groupby("a").cumsum(skipna=skipna)
expected = DataFrame(
{"b": [1, pd.NA, val]},
dtype=any_numeric_ea_dtype,
)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"val_in, index, val_out",
[
(
[1.0, 2.0, 3.0, 4.0, 5.0],
["foo", "foo", "bar", "baz", "blah"],
[3.0, 4.0, 5.0, 3.0],
),
(
[1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
["foo", "foo", "bar", "baz", "blah", "blah"],
[3.0, 4.0, 11.0, 3.0],
),
],
)
def test_groupby_index_name_in_index_content(val_in, index, val_out):
# GH 48567
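    # the index is *named* "blah" and also *contains* the label "blah";
    # groupby("blah") must resolve to the index name, not the value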
series = Series(data=val_in, name="values", index=Index(index, name="blah"))
result = series.groupby("blah").sum()
expected = Series(
data=val_out,
name="values",
index=Index(["bar", "baz", "blah", "foo"], name="blah"),
)
tm.assert_series_equal(result, expected)
result = series.to_frame().groupby("blah").sum()
expected = expected.to_frame()
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("n", [1, 10, 32, 100, 1000])
def test_sum_of_booleans(n):
# GH 50347
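    # a sum of booleans should come back as the integer count of True values
    # for every n, not saturate or wrap at any internal chunk size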
df = DataFrame({"groupby_col": 1, "bool": [True] * n})
df["bool"] = df["bool"].eq(True)
result = df.groupby("groupby_col").sum()
expected = DataFrame({"bool": [n]}, index=Index([1], name="groupby_col"))
tm.assert_frame_equal(result, expected)


@pytest.mark.filterwarnings(
"ignore:invalid value encountered in remainder:RuntimeWarning"
)
@pytest.mark.parametrize("method", ["head", "tail", "nth", "first", "last"])
def test_groupby_method_drop_na(method):
# GH 21755
df = DataFrame({"A": ["a", np.nan, "b", np.nan, "c"], "B": range(5)})
if method == "nth":
result = getattr(df.groupby("A"), method)(n=0)
else:
result = getattr(df.groupby("A"), method)()
if method in ["first", "last"]:
expected = DataFrame({"B": [0, 2, 4]}).set_index(
Series(["a", "b", "c"], name="A")
)
else:
expected = DataFrame({"A": ["a", "b", "c"], "B": [0, 2, 4]}, index=[0, 2, 4])
tm.assert_frame_equal(result, expected)


def test_groupby_reduce_period():
# GH#51040
pi = pd.period_range("2016-01-01", periods=100, freq="D")
grps = list(range(10)) * 10
ser = pi.to_series()
gb = ser.groupby(grps)
with pytest.raises(TypeError, match="Period type does not support sum operations"):
gb.sum()
with pytest.raises(
TypeError, match="Period type does not support cumsum operations"
):
gb.cumsum()
with pytest.raises(TypeError, match="Period type does not support prod operations"):
gb.prod()
with pytest.raises(
TypeError, match="Period type does not support cumprod operations"
):
gb.cumprod()
res = gb.max()
expected = ser[-10:]
expected.index = Index(range(10), dtype=int)
tm.assert_series_equal(res, expected)
res = gb.min()
expected = ser[:10]
expected.index = Index(range(10), dtype=int)
tm.assert_series_equal(res, expected)


def test_obj_with_exclusions_duplicate_columns():
# GH#50806
df = DataFrame([[0, 1, 2, 3]])
df.columns = [0, 1, 2, 0]
gb = df.groupby(df[1])
result = gb._obj_with_exclusions
expected = df.take([0, 2, 3], axis=1)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("numeric_only", [True, False])
def test_groupby_numeric_only_std_no_result(numeric_only):
# GH 51080
dicts_non_numeric = [{"a": "foo", "b": "bar"}, {"a": "car", "b": "dar"}]
df = DataFrame(dicts_non_numeric)
dfgb = df.groupby("a", as_index=False, sort=False)
if numeric_only:
result = dfgb.std(numeric_only=True)
expected_df = DataFrame(["foo", "car"], columns=["a"])
tm.assert_frame_equal(result, expected_df)
else:
with pytest.raises(
ValueError, match="could not convert string to float: 'bar'"
):
dfgb.std(numeric_only=numeric_only)


def test_grouping_with_categorical_interval_columns():
# GH#34164
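    # with observed=False the result reindexes to the full cartesian product
    # of interval categories x group labels; unobserved combos become NaN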
df = DataFrame({"x": [0.1, 0.2, 0.3, -0.4, 0.5], "w": ["a", "b", "a", "c", "a"]})
qq = pd.qcut(df["x"], q=np.linspace(0, 1, 5))
result = df.groupby([qq, "w"], observed=False)["x"].agg("mean")
categorical_index_level_1 = Categorical(
[
Interval(-0.401, 0.1, closed="right"),
Interval(0.1, 0.2, closed="right"),
Interval(0.2, 0.3, closed="right"),
Interval(0.3, 0.5, closed="right"),
],
ordered=True,
)
index_level_2 = ["a", "b", "c"]
mi = MultiIndex.from_product(
[categorical_index_level_1, index_level_2], names=["x", "w"]
)
expected = Series(
np.array(
[
0.1,
np.nan,
-0.4,
np.nan,
0.2,
np.nan,
0.3,
np.nan,
np.nan,
0.5,
np.nan,
np.nan,
]
),
index=mi,
name="x",
)
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("bug_var", [1, "a"])
def test_groupby_sum_on_nan_should_return_nan(bug_var):
# GH 24196
df = DataFrame({"A": [bug_var, bug_var, bug_var, np.nan]})
dfgb = df.groupby(lambda x: x)
result = dfgb.sum(min_count=1)
expected_df = DataFrame([bug_var, bug_var, bug_var, None], columns=["A"])
tm.assert_frame_equal(result, expected_df)


@pytest.mark.parametrize(
"method",
[
"count",
"corr",
"cummax",
"cummin",
"cumprod",
"describe",
"rank",
"quantile",
"diff",
"shift",
"all",
"any",
"idxmin",
"idxmax",
"ffill",
"bfill",
"pct_change",
],
)
def test_groupby_selection_with_methods(df, method):
# some methods which require DatetimeIndex
rng = date_range("2014", periods=len(df))
df.index = rng
g = df.groupby(["A"])[["C"]]
g_exp = df[["C"]].groupby(df["A"])
# TODO check groupby with > 1 col ?
res = getattr(g, method)()
exp = getattr(g_exp, method)()
# should always be frames!
tm.assert_frame_equal(res, exp)


def test_groupby_selection_other_methods(df):
# some methods which require DatetimeIndex
rng = date_range("2014", periods=len(df))
df.columns.name = "foo"
df.index = rng
g = df.groupby(["A"])[["C"]]
g_exp = df[["C"]].groupby(df["A"])
# methods which aren't just .foo()
warn_msg = "DataFrameGroupBy.fillna is deprecated"
with tm.assert_produces_warning(FutureWarning, match=warn_msg):
tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0))
msg = "DataFrameGroupBy.dtypes is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
tm.assert_frame_equal(g.dtypes, g_exp.dtypes)
tm.assert_frame_equal(g.apply(lambda x: x.sum()), g_exp.apply(lambda x: x.sum()))
tm.assert_frame_equal(g.resample("D").mean(), g_exp.resample("D").mean())
tm.assert_frame_equal(g.resample("D").ohlc(), g_exp.resample("D").ohlc())
tm.assert_frame_equal(
g.filter(lambda x: len(x) == 3), g_exp.filter(lambda x: len(x) == 3)
)


def test_groupby_with_Time_Grouper(unit):
idx2 = to_datetime(
[
"2016-08-31 22:08:12.000",
"2016-08-31 22:09:12.200",
"2016-08-31 22:20:12.400",
]
).as_unit(unit)
test_data = DataFrame(
{"quant": [1.0, 1.0, 3.0], "quant2": [1.0, 1.0, 3.0], "time2": idx2}
)
time2 = date_range("2016-08-31 22:08:00", periods=13, freq="1min", unit=unit)
expected_output = DataFrame(
{
"time2": time2,
"quant": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
"quant2": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
}
)
gb = test_data.groupby(Grouper(key="time2", freq="1min"))
result = gb.count().reset_index()
tm.assert_frame_equal(result, expected_output)


def test_groupby_series_with_datetimeindex_month_name():
# GH 48509
s = Series([0, 1, 0], index=date_range("2022-01-01", periods=3), name="jan")
result = s.groupby(s).count()
expected = Series([2, 1], name="jan")
expected.index.name = "jan"
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("test_series", [True, False])
@pytest.mark.parametrize(
"kwarg, value, name, warn",
[
("by", "a", 1, None),
("by", ["a"], 1, FutureWarning),
("by", ["a"], (1,), None),
("level", 0, 1, None),
("level", [0], 1, FutureWarning),
("level", [0], (1,), None),
],
)
def test_depr_get_group_len_1_list_likes(test_series, kwarg, value, name, warn):
# GH#25971
obj = DataFrame({"b": [3, 4, 5]}, index=Index([1, 1, 2], name="a"))
if test_series:
obj = obj["b"]
gb = obj.groupby(**{kwarg: value})
msg = "you will need to pass a length-1 tuple"
with tm.assert_produces_warning(warn, match=msg):
result = gb.get_group(name)
if test_series:
expected = Series([3, 4], index=Index([1, 1], name="a"), name="b")
else:
expected = DataFrame({"b": [3, 4]}, index=Index([1, 1], name="a"))
tm.assert_equal(result, expected)


def test_groupby_ngroup_with_nan():
# GH#50100
df = DataFrame({"a": Categorical([np.nan]), "b": [1]})
result = df.groupby(["a", "b"], dropna=False, observed=False).ngroup()
expected = Series([0])
tm.assert_series_equal(result, expected)


def test_get_group_axis_1():
# GH#54858
df = DataFrame(
{
"col1": [0, 3, 2, 3],
"col2": [4, 1, 6, 7],
"col3": [3, 8, 2, 10],
"col4": [1, 13, 6, 15],
"col5": [-4, 5, 6, -7],
}
)
with tm.assert_produces_warning(FutureWarning, match="deprecated"):
grouped = df.groupby(axis=1, by=[1, 2, 3, 2, 1])
result = grouped.get_group(1)
expected = DataFrame(
{
"col1": [0, 3, 2, 3],
"col5": [-4, 5, 6, -7],
}
)
tm.assert_frame_equal(result, expected)


def test_groupby_ffill_with_duplicated_index():
# GH#43412
df = DataFrame({"a": [1, 2, 3, 4, np.nan, np.nan]}, index=[0, 1, 2, 0, 1, 2])
result = df.groupby(level=0).ffill()
expected = DataFrame({"a": [1, 2, 3, 4, 2, 3]}, index=[0, 1, 2, 0, 1, 2])
tm.assert_frame_equal(result, expected, check_dtype=False)


@pytest.mark.parametrize("test_series", [True, False])
def test_decimal_na_sort(test_series):
# GH#54847
# We catch both TypeError and decimal.InvalidOperation exceptions in safe_sort.
# If this next assert raises, we can just catch TypeError
    assert not issubclass(decimal.InvalidOperation, TypeError)
df = DataFrame(
{
"key": [Decimal(1), Decimal(1), None, None],
"value": [Decimal(2), Decimal(3), Decimal(4), Decimal(5)],
}
)
gb = df.groupby("key", dropna=False)
if test_series:
gb = gb["value"]
result = gb._grouper.result_index
expected = Index([Decimal(1), None], name="key")
tm.assert_index_equal(result, expected)