You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
522 lines
18 KiB
522 lines
18 KiB
6 months ago
|
import re
|
||
|
|
||
|
import numpy as np
|
||
|
import pytest
|
||
|
|
||
|
from pandas._libs import lib
|
||
|
|
||
|
import pandas as pd
|
||
|
from pandas import (
|
||
|
DataFrame,
|
||
|
Index,
|
||
|
Series,
|
||
|
Timestamp,
|
||
|
date_range,
|
||
|
)
|
||
|
import pandas._testing as tm
|
||
|
from pandas.tests.groupby import get_groupby_method_args
|
||
|
|
||
|
|
||
|
class TestNumericOnly:
    """Check that ``numeric_only`` is passed through to the groupby kernels."""

    @pytest.fixture
    def df(self):
        """Frame with a ``group`` key plus one column per dtype under test."""
        # GH3668
        # GH5724
        data = {
            "group": [1, 1, 2],
            "int": [1, 2, 3],
            "float": [4.0, 5.0, 6.0],
            "string": list("abc"),
            "category_string": Series(list("abc")).astype("category"),
            "category_int": [7, 8, 9],
            "datetime": date_range("20130101", periods=3),
            "datetimetz": date_range("20130101", periods=3, tz="US/Eastern"),
            "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
        }
        # Column order is passed explicitly and follows the insertion order above
        return DataFrame(data, columns=list(data))

    @pytest.mark.parametrize("method", ["mean", "median"])
    def test_averages(self, df, method):
        """mean / median give the expected per-group values with numeric_only=True."""
        numeric_columns = Index(["int", "float", "category_int"])

        # Only the entries named in ``columns`` end up in the expected frame
        per_column = {
            "int": [1.5, 3],
            "float": [4.5, 6.0],
            "category_int": [7.5, 9],
            "timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")],
            "datetime": [
                Timestamp("2013-01-01 12:00:00"),
                Timestamp("2013-01-03 00:00:00"),
            ],
            "datetimetz": [
                Timestamp("2013-01-01 12:00:00", tz="US/Eastern"),
                Timestamp("2013-01-03 00:00:00", tz="US/Eastern"),
            ],
        }
        expected = DataFrame(
            per_column,
            index=Index([1, 2], name="group"),
            columns=["int", "float", "category_int"],
        )

        grouped = df.groupby("group")
        result = getattr(grouped, method)(numeric_only=True)
        tm.assert_frame_equal(result.reindex_like(expected), expected)

        self._check(df, method, expected.columns, numeric_columns)

    @pytest.mark.parametrize("method", ["min", "max"])
    def test_extrema(self, df, method):
        """min / max: the same column set is expected for both call styles."""
        # TODO: min, max *should* handle
        # categorical (ordered) dtype
        columns = Index(
            [
                "int",
                "float",
                "string",
                "category_int",
                "datetime",
                "datetimetz",
                "timedelta",
            ]
        )

        self._check(df, method, columns, columns)

    @pytest.mark.parametrize("method", ["first", "last"])
    def test_first_last(self, df, method):
        """first / last: every non-key column is expected in the result."""
        columns = Index(
            [
                "int",
                "float",
                "string",
                "category_string",
                "category_int",
                "datetime",
                "datetimetz",
                "timedelta",
            ]
        )

        self._check(df, method, columns, columns)

    @pytest.mark.parametrize("method", ["sum", "cumsum"])
    def test_sum_cumsum(self, df, method):
        """sum / cumsum: expected columns differ only by the string column."""
        numeric_columns = Index(["int", "float", "category_int"])
        if method == "cumsum":
            # cumsum loses string
            all_columns = Index(["int", "float", "category_int", "timedelta"])
        else:
            all_columns = Index(
                ["int", "float", "string", "category_int", "timedelta"]
            )

        self._check(df, method, all_columns, numeric_columns)

    @pytest.mark.parametrize("method", ["prod", "cumprod"])
    def test_prod_cumprod(self, df, method):
        """prod / cumprod: the same column set is expected for both call styles."""
        columns = Index(["int", "float", "category_int"])

        self._check(df, method, columns, columns)

    @pytest.mark.parametrize("method", ["cummin", "cummax"])
    def test_cummin_cummax(self, df, method):
        """cummin / cummax: like min / max, but without the string column."""
        # like min, max, but don't include strings
        # GH#15561: numeric_only=False set by default like min/max
        columns = Index(
            ["int", "float", "category_int", "datetime", "datetimetz", "timedelta"]
        )

        self._check(df, method, columns, columns)

    def _check(self, df, method, expected_columns, expected_columns_numeric):
        """
        Call ``method`` on ``df.groupby("group")`` twice and check each outcome.

        The first call uses the method's defaults, the second passes
        ``numeric_only=False``.  Depending on ``method``, each call is expected
        either to raise with one of the known messages or to return a result
        whose columns equal ``expected_columns_numeric`` (default call) or
        ``expected_columns`` (``numeric_only=False`` call).
        """
        op = getattr(df.groupby("group"), method)

        # object dtypes for transformations are not implemented in Cython and
        # have no Python fallback
        if method.startswith("cum"):
            exception = NotImplementedError
        else:
            exception = TypeError

        # Message fragments shared by the patterns below
        failed_object = re.escape(f"agg function failed [how->{method},dtype->object]")
        failed_string = re.escape(f"agg function failed [how->{method},dtype->string]")
        unordered = f"Cannot perform {method} with non-ordered Categorical"
        # cumsum/cummin/cummax/cumprod
        not_implemented = "function is not implemented for this dtype"

        # --- call with default arguments ---
        if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"):
            # The methods default to numeric_only=False and raise TypeError
            default_pattern = "|".join(
                [
                    "Categorical is not ordered",
                    unordered,
                    failed_object,
                    not_implemented,
                ]
            )
        elif method in ("sum", "mean", "median", "prod"):
            default_pattern = "|".join(
                [
                    "category type does not support sum operations",
                    failed_object,
                    failed_string,
                ]
            )
        else:
            default_pattern = None

        if default_pattern is None:
            result = op()
            tm.assert_index_equal(result.columns, expected_columns_numeric)
        else:
            with pytest.raises(exception, match=default_pattern):
                op()

        # --- call with numeric_only=False ---
        if method in ("first", "last"):
            result = op(numeric_only=False)
            tm.assert_index_equal(result.columns, expected_columns)
        else:
            explicit_pattern = "|".join(
                [
                    "Categorical is not ordered",
                    "category type does not support",
                    not_implemented,
                    unordered,
                    failed_object,
                    failed_string,
                ]
            )
            with pytest.raises(exception, match=explicit_pattern):
                op(numeric_only=False)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("numeric_only", [True, False, None])
def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_string):
    """
    Call each groupby kernel with ``axis=1`` and every ``numeric_only`` choice.

    The frame has four float columns and one string column.  Depending on the
    kernel, the call is expected to reject the ``numeric_only`` keyword, reject
    the ``axis`` keyword, fail on the string column, or succeed and match the
    same operation applied to the transposed frame.
    """
    if groupby_func in ("idxmax", "idxmin"):
        pytest.skip("idxmax and idx_min tested in test_idxmin_idxmax_axis1")
    if groupby_func in ("corrwith", "skew"):
        reason = "GH#47723 groupby.corrwith and skew do not correctly implement axis=1"
        request.applymarker(pytest.mark.xfail(reason=reason))

    values = np.random.default_rng(2).standard_normal((10, 4))
    df = DataFrame(values, columns=["A", "B", "C", "D"])
    df["E"] = "x"
    grouped = df.groupby([1, 2, 3, 1, 2, 3, 1, 2, 3, 4])
    method = getattr(grouped, groupby_func)
    args = get_groupby_method_args(groupby_func, df)

    # when numeric_only is None we don't pass any argument
    if numeric_only is None:
        kwargs = {"axis": 1}
    else:
        kwargs = {"axis": 1, "numeric_only": numeric_only}

    # Functions without numeric_only and axis args
    no_args = ("cumprod", "cumsum", "diff", "fillna", "pct_change", "rank", "shift")
    # Functions with axis args
    has_axis = (
        "cumprod",
        "cumsum",
        "diff",
        "pct_change",
        "rank",
        "shift",
        "cummax",
        "cummin",
        "idxmin",
        "idxmax",
        "fillna",
    )
    warn_msg = f"DataFrameGroupBy.{groupby_func} with axis=1 is deprecated"

    def axis_deprecation():
        # Fresh context manager expecting the axis=1 deprecation warning
        return tm.assert_produces_warning(FutureWarning, match=warn_msg)

    if numeric_only is not None and groupby_func in no_args:
        msg = "got an unexpected keyword argument 'numeric_only'"
        if groupby_func in ["cumprod", "cumsum"]:
            # These two are also expected to emit the axis=1 warning
            with pytest.raises(TypeError, match=msg), axis_deprecation():
                method(*args, **kwargs)
        else:
            with pytest.raises(TypeError, match=msg):
                method(*args, **kwargs)
        return

    if groupby_func not in has_axis:
        msg = "got an unexpected keyword argument 'axis'"
        with pytest.raises(TypeError, match=msg):
            method(*args, **kwargs)
        return

    # fillna and shift are successful even on object dtypes
    if not numeric_only and groupby_func not in ("fillna", "shift"):
        msgs = (
            # cummax, cummin, rank
            "not supported between instances of",
            # cumprod
            "can't multiply sequence by non-int of type 'float'",
            # cumsum, diff, pct_change
            "unsupported operand type",
            "has no kernel",
        )
        if using_infer_string:
            import pyarrow as pa

            errs = (TypeError, pa.lib.ArrowNotImplementedError)
        else:
            errs = TypeError
        pattern = "(" + "|".join(msgs) + ")"
        with pytest.raises(errs, match=pattern), axis_deprecation():
            method(*args, **kwargs)
        return

    with axis_deprecation():
        result = method(*args, **kwargs)

    # Expected value: the same op on the transposed frame, transposed back
    if numeric_only:
        transposed = df.drop(columns="E").T
    else:
        transposed = df.T
    expected = getattr(transposed, groupby_func)(*args).T
    if groupby_func == "shift" and not numeric_only:
        # shift with axis=1 leaves the leftmost column as numeric
        # but transposing for expected gives us object dtype
        expected = expected.astype(float)

    tm.assert_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "kernel, has_arg",
    [
        ("all", False),
        ("any", False),
        ("bfill", False),
        ("corr", True),
        ("corrwith", True),
        ("cov", True),
        ("cummax", True),
        ("cummin", True),
        ("cumprod", True),
        ("cumsum", True),
        ("diff", False),
        ("ffill", False),
        ("fillna", False),
        ("first", True),
        ("idxmax", True),
        ("idxmin", True),
        ("last", True),
        ("max", True),
        ("mean", True),
        ("median", True),
        ("min", True),
        ("nth", False),
        ("nunique", False),
        ("pct_change", False),
        ("prod", True),
        ("quantile", True),
        ("sem", True),
        ("skew", True),
        ("std", True),
        ("sum", True),
        ("var", True),
    ],
)
@pytest.mark.parametrize("numeric_only", [True, False, lib.no_default])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_numeric_only(kernel, has_arg, numeric_only, keys):
    """
    Check how each kernel treats the object column ``b`` (GH#46072).

    ``has_arg`` states whether the kernel has a ``numeric_only`` argument.
    ``numeric_only=lib.no_default`` means the keyword is not passed at all.
    """
    df = DataFrame({"a1": [1, 1], "a2": [2, 2], "a3": [5, 6], "b": [object, object]})

    args = get_groupby_method_args(kernel, df)
    if numeric_only is lib.no_default:
        kwargs = {}
    else:
        kwargs = {"numeric_only": numeric_only}

    method = getattr(df.groupby(keys), kernel)

    if has_arg and numeric_only is True:
        # Cases where b does not appear in the result
        result = method(*args, **kwargs)
        assert "b" not in result.columns
        return

    # kernels that work on any dtype and have numeric_only arg
    any_dtype_with_arg = kernel in ("first", "last")
    # kernels that work on any dtype and don't have numeric_only arg
    any_dtype_without_arg = (
        kernel in ("any", "all", "bfill", "ffill", "fillna", "nth", "nunique")
        and numeric_only is lib.no_default
    )
    if any_dtype_with_arg or any_dtype_without_arg:
        warn = FutureWarning if kernel == "fillna" else None
        with tm.assert_produces_warning(
            warn, match="DataFrameGroupBy.fillna is deprecated"
        ):
            result = method(*args, **kwargs)
        assert "b" in result.columns
        return

    if has_arg:
        assert numeric_only is not True
        # kernels that are successful on any dtype were above; this will fail

        # object dtypes for transformations are not implemented in Cython and
        # have no Python fallback
        if kernel.startswith("cum"):
            exception = NotImplementedError
        else:
            exception = TypeError

        generic_msg = "|".join(
            [
                "not allowed for this dtype",
                "cannot be performed against 'object' dtypes",
                # On PY39 message is "a number"; on PY310 and after is "a real number"
                "must be a string or a.* number",
                "unsupported operand type",
                "function is not implemented for this dtype",
                re.escape(f"agg function failed [how->{kernel},dtype->object]"),
            ]
        )
        # idxmin / idxmax are matched against their own comparison-error message
        specific_msgs = {
            "idxmin": "'<' not supported between instances of 'type' and 'type'",
            "idxmax": "'>' not supported between instances of 'type' and 'type'",
        }
        with pytest.raises(exception, match=specific_msgs.get(kernel, generic_msg)):
            method(*args, **kwargs)
        return

    # From here on the kernel has no numeric_only argument
    if numeric_only is not lib.no_default:
        msg = "got an unexpected keyword argument 'numeric_only'"
        with pytest.raises(TypeError, match=msg):
            method(*args, **kwargs)
        return

    assert kernel in ("diff", "pct_change")
    assert numeric_only is lib.no_default
    # Doesn't have numeric_only argument and fails on nuisance columns
    with pytest.raises(TypeError, match=r"unsupported operand type"):
        method(*args, **kwargs)
|
||
|
|
||
|
|
||
|
@pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning")
@pytest.mark.parametrize("dtype", [bool, int, float, object])
def test_deprecate_numeric_only_series(dtype, groupby_func, request):
    """
    Check SeriesGroupBy kernels on a small Series of the given ``dtype`` (GH#46560).

    Two things are verified for every kernel:

    1. Default call: for object dtype the kernel either raises ``TypeError``
       (kernels listed in ``fails_on_numeric_object``) or matches the result
       obtained from the same data with its default dtype.
    2. ``numeric_only=True``: kernels without that argument reject the keyword,
       object dtype raises ``TypeError``, and otherwise the result equals the
       ``numeric_only=False`` result.
    """
    grouper = [0, 0, 1]

    ser = Series([1, 0, 0], dtype=dtype)
    gb = ser.groupby(grouper)

    if groupby_func == "corrwith":
        # corrwith is not implemented on SeriesGroupBy
        assert not hasattr(gb, groupby_func)
        return

    method = getattr(gb, groupby_func)

    # Same data without the dtype override, used as the reference result
    expected_ser = Series([1, 0, 0])
    expected_gb = expected_ser.groupby(grouper)
    expected_method = getattr(expected_gb, groupby_func)

    args = get_groupby_method_args(groupby_func, ser)

    fails_on_numeric_object = (
        "corr",
        "cov",
        "cummax",
        "cummin",
        "cumprod",
        "cumsum",
        "quantile",
    )
    # ops that give an object result on object input
    obj_result = (
        "first",
        "last",
        "nth",
        "bfill",
        "ffill",
        "shift",
        "sum",
        "diff",
        "pct_change",
        "var",
        "mean",
        "median",
        "min",
        "max",
        "prod",
        "skew",
    )

    # ``gb`` is a SeriesGroupBy, so the fillna deprecation message names that class.
    # For every other kernel ``warn`` is None and no warning is expected.
    warn = FutureWarning if groupby_func == "fillna" else None
    warn_msg = "SeriesGroupBy.fillna is deprecated"

    # Test default behavior; kernels that fail may be enabled in the future but kernels
    # that succeed should not be allowed to fail (without deprecation, at least)
    if groupby_func in fails_on_numeric_object and dtype is object:
        if groupby_func == "quantile":
            msg = "cannot be performed against 'object' dtypes"
        else:
            msg = "is not supported for object dtype"
        with tm.assert_produces_warning(warn, match=warn_msg):
            with pytest.raises(TypeError, match=msg):
                method(*args)
    elif dtype is object:
        with tm.assert_produces_warning(warn, match=warn_msg):
            result = method(*args)
        with tm.assert_produces_warning(warn, match=warn_msg):
            expected = expected_method(*args)
        if groupby_func in obj_result:
            expected = expected.astype(object)
        tm.assert_series_equal(result, expected)

    has_numeric_only = (
        "first",
        "last",
        "max",
        "mean",
        "median",
        "min",
        "prod",
        "quantile",
        "sem",
        "skew",
        "std",
        "sum",
        "var",
        "cummax",
        "cummin",
        "cumprod",
        "cumsum",
    )
    if groupby_func not in has_numeric_only:
        msg = "got an unexpected keyword argument 'numeric_only'"
        with pytest.raises(TypeError, match=msg):
            method(*args, numeric_only=True)
    elif dtype is object:
        msg = "|".join(
            [
                "SeriesGroupBy.sem called with numeric_only=True and dtype object",
                "Series.skew does not allow numeric_only=True with non-numeric",
                "cum(sum|prod|min|max) is not supported for object dtype",
                r"Cannot use numeric_only=True with SeriesGroupBy\..* and non-numeric",
            ]
        )
        with pytest.raises(TypeError, match=msg):
            method(*args, numeric_only=True)
    elif dtype is bool and groupby_func == "quantile":
        msg = "Allowing bool dtype in SeriesGroupBy.quantile"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            # GH#51424
            result = method(*args, numeric_only=True)
            expected = method(*args, numeric_only=False)
        tm.assert_series_equal(result, expected)
    else:
        result = method(*args, numeric_only=True)
        expected = method(*args, numeric_only=False)
        tm.assert_series_equal(result, expected)
|