You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

357 lines
12 KiB

7 months ago
import collections
from datetime import timedelta
import numpy as np
import pytest
import pandas as pd
from pandas import (
DatetimeIndex,
Index,
Interval,
IntervalIndex,
MultiIndex,
Series,
Timedelta,
TimedeltaIndex,
array,
)
import pandas._testing as tm
from pandas.tests.base.common import allow_na_ops
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_value_counts(index_or_series_obj):
    """
    Compare ``value_counts`` with a ``collections.Counter`` tally.

    The fixture object is expanded so that the element at position ``i``
    appears ``i + 1`` times, which gives the counts some variety before
    ``value_counts`` is compared with the Counter-based expectation.
    """
    obj = index_or_series_obj
    # Repeat the element at position i exactly i + 1 times.
    obj = np.repeat(obj, range(1, len(obj) + 1))
    result = obj.value_counts()

    counter = collections.Counter(obj)
    expected = Series(dict(counter.most_common()), dtype=np.int64, name="count")

    if obj.dtype != np.float16:
        expected.index = expected.index.astype(obj.dtype)
    else:
        # The expected index cannot be built for float16; assert that the
        # cast raises and stop here.
        with pytest.raises(NotImplementedError, match="float16 indexes are not "):
            expected.index.astype(obj.dtype)
        return

    if isinstance(expected.index, MultiIndex):
        expected.index.names = obj.names
    else:
        expected.index.name = obj.name

    if not isinstance(result.dtype, np.dtype):
        # The result holds an extension dtype, so cast the expected counts
        # to the matching extension integer dtype.
        if getattr(obj.dtype, "storage", "") == "pyarrow":
            expected = expected.astype("int64[pyarrow]")
        else:
            # i.e IntegerDtype
            expected = expected.astype("Int64")

    # TODO(GH#32514): Order of entries with the same count is inconsistent
    #  on CI (gh-32449)
    if obj.duplicated().any():
        result = result.sort_index()
        expected = expected.sort_index()
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("null_obj", [np.nan, None])
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_value_counts_null(null_obj, index_or_series_obj):
    """
    Check ``value_counts`` with and without ``dropna`` when nulls are present.

    The first two values of a copy of the fixture object are overwritten
    with ``null_obj``, the values are repeated with increasing multiplicity,
    and the result is compared with a ``collections.Counter`` tally.
    """
    orig = index_or_series_obj
    obj = orig.copy()

    if not allow_na_ops(obj):
        pytest.skip("type doesn't allow for NA operations")
    elif len(obj) < 1:
        pytest.skip("Test doesn't make sense on empty data")
    elif isinstance(orig, MultiIndex):
        pytest.skip(f"MultiIndex can't hold '{null_obj}'")

    values = obj._values
    values[0:2] = null_obj

    klass = type(obj)
    # Repeat the value at position i exactly i + 1 times.
    repeated_values = np.repeat(values, range(1, len(values) + 1))
    obj = klass(repeated_values, dtype=obj.dtype)

    # because np.nan == np.nan is False, but None == None is True
    # np.nan would be duplicated, whereas None wouldn't
    counter = collections.Counter(obj.dropna())
    expected = Series(dict(counter.most_common()), dtype=np.int64, name="count")

    if obj.dtype != np.float16:
        expected.index = expected.index.astype(obj.dtype)
    else:
        # The expected index cannot be built for float16; assert that the
        # cast raises and stop here.
        with pytest.raises(NotImplementedError, match="float16 indexes are not "):
            expected.index.astype(obj.dtype)
        return
    expected.index.name = obj.name

    result = obj.value_counts()
    if obj.duplicated().any():
        # TODO(GH#32514):
        #  Order of entries with the same count is inconsistent on CI (gh-32449)
        expected = expected.sort_index()
        result = result.sort_index()

    if not isinstance(result.dtype, np.dtype):
        if getattr(obj.dtype, "storage", "") == "pyarrow":
            expected = expected.astype("int64[pyarrow]")
        else:
            # i.e IntegerDtype
            expected = expected.astype("Int64")
    tm.assert_series_equal(result, expected)

    # Positions 0 and 1 were nulled and repeated 1 and 2 times respectively,
    # giving 3 nulls in total (assumes the fixture has at least two elements).
    expected[null_obj] = 3

    result = obj.value_counts(dropna=False)
    if obj.duplicated().any():
        # TODO(GH#32514):
        #  Order of entries with the same count is inconsistent on CI (gh-32449)
        expected = expected.sort_index()
        result = result.sort_index()
    tm.assert_series_equal(result, expected)
def test_value_counts_inferred(index_or_series, using_infer_string):
klass = index_or_series
s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
s = klass(s_values)
expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"], name="count")
tm.assert_series_equal(s.value_counts(), expected)
if isinstance(s, Index):
exp = Index(np.unique(np.array(s_values, dtype=np.object_)))
tm.assert_index_equal(s.unique(), exp)
else:
exp = np.unique(np.array(s_values, dtype=np.object_))
if using_infer_string:
exp = array(exp)
tm.assert_equal(s.unique(), exp)
assert s.nunique() == 4
# don't sort, have to sort after the fact as not sorting is
# platform-dep
hist = s.value_counts(sort=False).sort_values()
expected = Series([3, 1, 4, 2], index=list("acbd"), name="count").sort_values()
tm.assert_series_equal(hist, expected)
# sort ascending
hist = s.value_counts(ascending=True)
expected = Series([1, 2, 3, 4], index=list("cdab"), name="count")
tm.assert_series_equal(hist, expected)
# relative histogram.
hist = s.value_counts(normalize=True)
expected = Series(
[0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"], name="proportion"
)
tm.assert_series_equal(hist, expected)
def test_value_counts_bins(index_or_series, using_infer_string):
    """
    Check the ``bins`` argument of ``value_counts`` plus NA and empty input.

    ``bins`` must raise ``TypeError`` on string data and produce interval
    counts on numeric data. The remaining sections check string data that
    contains NaN and an empty object-dtype container.
    """
    klass = index_or_series
    s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
    s = klass(s_values)

    # bins
    msg = "bins argument only works with numeric data"
    with pytest.raises(TypeError, match=msg):
        s.value_counts(bins=1)

    s1 = Series([1, 1, 2, 3])
    res1 = s1.value_counts(bins=1)
    exp1 = Series({Interval(0.997, 3.0): 4}, name="count")
    tm.assert_series_equal(res1, exp1)
    res1n = s1.value_counts(bins=1, normalize=True)
    exp1n = Series({Interval(0.997, 3.0): 1.0}, name="proportion")
    tm.assert_series_equal(res1n, exp1n)

    # s1 is always a Series, so unique() is compared as a NumPy array.
    exp = np.array([1, 2, 3], dtype=np.int64)
    tm.assert_numpy_array_equal(s1.unique(), exp)

    assert s1.nunique() == 3

    # these return the same
    intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
    exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]), name="count")

    res4 = s1.value_counts(bins=4, dropna=True)
    tm.assert_series_equal(res4, exp4)

    res4 = s1.value_counts(bins=4, dropna=False)
    tm.assert_series_equal(res4, exp4)

    res4n = s1.value_counts(bins=4, normalize=True)
    exp4n = Series(
        [0.5, 0.25, 0.25, 0], index=intervals.take([0, 1, 3, 2]), name="proportion"
    )
    tm.assert_series_equal(res4n, exp4n)

    # handle NA's properly
    s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"]
    s = klass(s_values)
    expected = Series([4, 3, 2], index=["b", "a", "d"], name="count")
    tm.assert_series_equal(s.value_counts(), expected)

    if isinstance(s, Index):
        exp = Index(["a", "b", np.nan, "d"])
        tm.assert_index_equal(s.unique(), exp)
    else:
        exp = np.array(["a", "b", np.nan, "d"], dtype=object)
        if using_infer_string:
            exp = array(exp)
        tm.assert_equal(s.unique(), exp)
    assert s.nunique() == 3

    s = klass({}) if klass is dict else klass({}, dtype=object)
    expected = Series([], dtype=np.int64, name="count")
    tm.assert_series_equal(s.value_counts(), expected, check_index_type=False)
    # returned dtype differs depending on original
    if isinstance(s, Index):
        tm.assert_index_equal(s.unique(), Index([]), exact=False)
    else:
        tm.assert_numpy_array_equal(s.unique(), np.array([]), check_dtype=False)

    assert s.nunique() == 0
def test_value_counts_datetime64(index_or_series, unit):
klass = index_or_series
# GH 3002, datetime64[ns]
# don't test names though
df = pd.DataFrame(
{
"person_id": ["xxyyzz", "xxyyzz", "xxyyzz", "xxyyww", "foofoo", "foofoo"],
"dt": pd.to_datetime(
[
"2010-01-01",
"2010-01-01",
"2010-01-01",
"2009-01-01",
"2008-09-09",
"2008-09-09",
]
).as_unit(unit),
"food": ["PIE", "GUM", "EGG", "EGG", "PIE", "GUM"],
}
)
s = klass(df["dt"].copy())
s.name = None
idx = pd.to_datetime(
["2010-01-01 00:00:00", "2008-09-09 00:00:00", "2009-01-01 00:00:00"]
).as_unit(unit)
expected_s = Series([3, 2, 1], index=idx, name="count")
tm.assert_series_equal(s.value_counts(), expected_s)
expected = array(
np.array(
["2010-01-01 00:00:00", "2009-01-01 00:00:00", "2008-09-09 00:00:00"],
dtype=f"datetime64[{unit}]",
)
)
result = s.unique()
if isinstance(s, Index):
tm.assert_index_equal(result, DatetimeIndex(expected))
else:
tm.assert_extension_array_equal(result, expected)
assert s.nunique() == 3
# with NaT
s = df["dt"].copy()
s = klass(list(s.values) + [pd.NaT] * 4)
if klass is Series:
s = s.dt.as_unit(unit)
else:
s = s.as_unit(unit)
result = s.value_counts()
assert result.index.dtype == f"datetime64[{unit}]"
tm.assert_series_equal(result, expected_s)
result = s.value_counts(dropna=False)
expected_s = pd.concat(
[
Series([4], index=DatetimeIndex([pd.NaT]).as_unit(unit), name="count"),
expected_s,
]
)
tm.assert_series_equal(result, expected_s)
assert s.dtype == f"datetime64[{unit}]"
unique = s.unique()
assert unique.dtype == f"datetime64[{unit}]"
# numpy_array_equal cannot compare pd.NaT
if isinstance(s, Index):
exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT]).as_unit(unit)
tm.assert_index_equal(unique, exp_idx)
else:
tm.assert_extension_array_equal(unique[:3], expected)
assert pd.isna(unique[3])
assert s.nunique() == 3
assert s.nunique(dropna=False) == 4
def test_value_counts_timedelta64(index_or_series, unit):
# timedelta64[ns]
klass = index_or_series
day = Timedelta(timedelta(1)).as_unit(unit)
tdi = TimedeltaIndex([day], name="dt").as_unit(unit)
tdvals = np.zeros(6, dtype=f"m8[{unit}]") + day
td = klass(tdvals, name="dt")
result = td.value_counts()
expected_s = Series([6], index=tdi, name="count")
tm.assert_series_equal(result, expected_s)
expected = tdi
result = td.unique()
if isinstance(td, Index):
tm.assert_index_equal(result, expected)
else:
tm.assert_extension_array_equal(result, expected._values)
td2 = day + np.zeros(6, dtype=f"m8[{unit}]")
td2 = klass(td2, name="dt")
result2 = td2.value_counts()
tm.assert_series_equal(result2, expected_s)
@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts_with_nan(dropna, index_or_series):
    """
    Check ``value_counts`` on data mixing ``True``, ``pd.NA`` and ``np.nan``.

    With ``dropna=True`` only ``True`` is counted; with ``dropna=False``
    ``pd.NA`` and ``np.nan`` are each expected as their own entry.
    """
    # GH31944
    klass = index_or_series
    values = [True, pd.NA, np.nan]
    obj = klass(values)
    res = obj.value_counts(dropna=dropna)
    if dropna is True:
        expected = Series([1], index=Index([True], dtype=obj.dtype), name="count")
    else:
        expected = Series([1, 1, 1], index=[True, pd.NA, np.nan], name="count")
    tm.assert_series_equal(res, expected)
def test_value_counts_object_inference_deprecated():
# GH#56161
dti = pd.date_range("2016-01-01", periods=3, tz="UTC")
idx = dti.astype(object)
msg = "The behavior of value_counts with object-dtype is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
res = idx.value_counts()
exp = dti.value_counts()
tm.assert_series_equal(res, exp)