"""Tests for value_counts, unique, and nunique on Index and Series objects."""
import collections
from datetime import timedelta
import numpy as np
import pytest
import pandas as pd
from pandas import (
DatetimeIndex,
Index,
Interval,
IntervalIndex,
MultiIndex,
Series,
Timedelta,
TimedeltaIndex,
array,
)
import pandas._testing as tm
from pandas.tests.base.common import allow_na_ops
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_value_counts(index_or_series_obj):
obj = index_or_series_obj
obj = np.repeat(obj, range(1, len(obj) + 1))
result = obj.value_counts()
counter = collections.Counter(obj)
expected = Series(dict(counter.most_common()), dtype=np.int64, name="count")
if obj.dtype != np.float16:
expected.index = expected.index.astype(obj.dtype)
else:
with pytest.raises(NotImplementedError, match="float16 indexes are not "):
expected.index.astype(obj.dtype)
return
if isinstance(expected.index, MultiIndex):
expected.index.names = obj.names
else:
expected.index.name = obj.name
if not isinstance(result.dtype, np.dtype):
if getattr(obj.dtype, "storage", "") == "pyarrow":
expected = expected.astype("int64[pyarrow]")
else:
# i.e IntegerDtype
expected = expected.astype("Int64")
# TODO(GH#32514): Order of entries with the same count is inconsistent
# on CI (gh-32449)
if obj.duplicated().any():
result = result.sort_index()
expected = expected.sort_index()
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("null_obj", [np.nan, None])
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_value_counts_null(null_obj, index_or_series_obj):
orig = index_or_series_obj
obj = orig.copy()
if not allow_na_ops(obj):
pytest.skip("type doesn't allow for NA operations")
elif len(obj) < 1:
pytest.skip("Test doesn't make sense on empty data")
elif isinstance(orig, MultiIndex):
pytest.skip(f"MultiIndex can't hold '{null_obj}'")
values = obj._values
values[0:2] = null_obj
klass = type(obj)
repeated_values = np.repeat(values, range(1, len(values) + 1))
obj = klass(repeated_values, dtype=obj.dtype)
# because np.nan == np.nan is False, but None == None is True
# np.nan would be duplicated, whereas None wouldn't
counter = collections.Counter(obj.dropna())
expected = Series(dict(counter.most_common()), dtype=np.int64, name="count")
if obj.dtype != np.float16:
expected.index = expected.index.astype(obj.dtype)
else:
with pytest.raises(NotImplementedError, match="float16 indexes are not "):
expected.index.astype(obj.dtype)
return
expected.index.name = obj.name
result = obj.value_counts()
if obj.duplicated().any():
# TODO(GH#32514):
# Order of entries with the same count is inconsistent on CI (gh-32449)
expected = expected.sort_index()
result = result.sort_index()
if not isinstance(result.dtype, np.dtype):
if getattr(obj.dtype, "storage", "") == "pyarrow":
expected = expected.astype("int64[pyarrow]")
else:
# i.e IntegerDtype
expected = expected.astype("Int64")
tm.assert_series_equal(result, expected)
expected[null_obj] = 3
result = obj.value_counts(dropna=False)
if obj.duplicated().any():
# TODO(GH#32514):
# Order of entries with the same count is inconsistent on CI (gh-32449)
expected = expected.sort_index()
result = result.sort_index()
tm.assert_series_equal(result, expected)
def test_value_counts_inferred(index_or_series, using_infer_string):
klass = index_or_series
s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
s = klass(s_values)
expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"], name="count")
tm.assert_series_equal(s.value_counts(), expected)
if isinstance(s, Index):
exp = Index(np.unique(np.array(s_values, dtype=np.object_)))
tm.assert_index_equal(s.unique(), exp)
else:
exp = np.unique(np.array(s_values, dtype=np.object_))
if using_infer_string:
exp = array(exp)
tm.assert_equal(s.unique(), exp)
assert s.nunique() == 4
# don't sort, have to sort after the fact as not sorting is
# platform-dep
hist = s.value_counts(sort=False).sort_values()
expected = Series([3, 1, 4, 2], index=list("acbd"), name="count").sort_values()
tm.assert_series_equal(hist, expected)
# sort ascending
hist = s.value_counts(ascending=True)
expected = Series([1, 2, 3, 4], index=list("cdab"), name="count")
tm.assert_series_equal(hist, expected)
# relative histogram.
hist = s.value_counts(normalize=True)
expected = Series(
[0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"], name="proportion"
)
tm.assert_series_equal(hist, expected)
def test_value_counts_bins(index_or_series, using_infer_string):
klass = index_or_series
s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
s = klass(s_values)
# bins
msg = "bins argument only works with numeric data"
with pytest.raises(TypeError, match=msg):
s.value_counts(bins=1)
s1 = Series([1, 1, 2, 3])
res1 = s1.value_counts(bins=1)
exp1 = Series({Interval(0.997, 3.0): 4}, name="count")
tm.assert_series_equal(res1, exp1)
res1n = s1.value_counts(bins=1, normalize=True)
exp1n = Series({Interval(0.997, 3.0): 1.0}, name="proportion")
tm.assert_series_equal(res1n, exp1n)
if isinstance(s1, Index):
tm.assert_index_equal(s1.unique(), Index([1, 2, 3]))
else:
exp = np.array([1, 2, 3], dtype=np.int64)
tm.assert_numpy_array_equal(s1.unique(), exp)
assert s1.nunique() == 3
# these return the same
res4 = s1.value_counts(bins=4, dropna=True)
intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]), name="count")
tm.assert_series_equal(res4, exp4)
res4 = s1.value_counts(bins=4, dropna=False)
intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]), name="count")
tm.assert_series_equal(res4, exp4)
res4n = s1.value_counts(bins=4, normalize=True)
exp4n = Series(
[0.5, 0.25, 0.25, 0], index=intervals.take([0, 1, 3, 2]), name="proportion"
)
tm.assert_series_equal(res4n, exp4n)
# handle NA's properly
s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"]
s = klass(s_values)
expected = Series([4, 3, 2], index=["b", "a", "d"], name="count")
tm.assert_series_equal(s.value_counts(), expected)
if isinstance(s, Index):
exp = Index(["a", "b", np.nan, "d"])
tm.assert_index_equal(s.unique(), exp)
else:
exp = np.array(["a", "b", np.nan, "d"], dtype=object)
if using_infer_string:
exp = array(exp)
tm.assert_equal(s.unique(), exp)
assert s.nunique() == 3
s = klass({}) if klass is dict else klass({}, dtype=object)
expected = Series([], dtype=np.int64, name="count")
tm.assert_series_equal(s.value_counts(), expected, check_index_type=False)
# returned dtype differs depending on original
if isinstance(s, Index):
tm.assert_index_equal(s.unique(), Index([]), exact=False)
else:
tm.assert_numpy_array_equal(s.unique(), np.array([]), check_dtype=False)
assert s.nunique() == 0
def test_value_counts_datetime64(index_or_series, unit):
klass = index_or_series
# GH 3002, datetime64[ns]
# don't test names though
df = pd.DataFrame(
{
"person_id": ["xxyyzz", "xxyyzz", "xxyyzz", "xxyyww", "foofoo", "foofoo"],
"dt": pd.to_datetime(
[
"2010-01-01",
"2010-01-01",
"2010-01-01",
"2009-01-01",
"2008-09-09",
"2008-09-09",
]
).as_unit(unit),
"food": ["PIE", "GUM", "EGG", "EGG", "PIE", "GUM"],
}
)
s = klass(df["dt"].copy())
s.name = None
idx = pd.to_datetime(
["2010-01-01 00:00:00", "2008-09-09 00:00:00", "2009-01-01 00:00:00"]
).as_unit(unit)
expected_s = Series([3, 2, 1], index=idx, name="count")
tm.assert_series_equal(s.value_counts(), expected_s)
expected = array(
np.array(
["2010-01-01 00:00:00", "2009-01-01 00:00:00", "2008-09-09 00:00:00"],
dtype=f"datetime64[{unit}]",
)
)
result = s.unique()
if isinstance(s, Index):
tm.assert_index_equal(result, DatetimeIndex(expected))
else:
tm.assert_extension_array_equal(result, expected)
assert s.nunique() == 3
# with NaT
s = df["dt"].copy()
s = klass(list(s.values) + [pd.NaT] * 4)
if klass is Series:
s = s.dt.as_unit(unit)
else:
s = s.as_unit(unit)
result = s.value_counts()
assert result.index.dtype == f"datetime64[{unit}]"
tm.assert_series_equal(result, expected_s)
result = s.value_counts(dropna=False)
expected_s = pd.concat(
[
Series([4], index=DatetimeIndex([pd.NaT]).as_unit(unit), name="count"),
expected_s,
]
)
tm.assert_series_equal(result, expected_s)
assert s.dtype == f"datetime64[{unit}]"
unique = s.unique()
assert unique.dtype == f"datetime64[{unit}]"
# numpy_array_equal cannot compare pd.NaT
if isinstance(s, Index):
exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT]).as_unit(unit)
tm.assert_index_equal(unique, exp_idx)
else:
tm.assert_extension_array_equal(unique[:3], expected)
assert pd.isna(unique[3])
assert s.nunique() == 3
assert s.nunique(dropna=False) == 4
def test_value_counts_timedelta64(index_or_series, unit):
# timedelta64[ns]
klass = index_or_series
day = Timedelta(timedelta(1)).as_unit(unit)
tdi = TimedeltaIndex([day], name="dt").as_unit(unit)
tdvals = np.zeros(6, dtype=f"m8[{unit}]") + day
td = klass(tdvals, name="dt")
result = td.value_counts()
expected_s = Series([6], index=tdi, name="count")
tm.assert_series_equal(result, expected_s)
expected = tdi
result = td.unique()
if isinstance(td, Index):
tm.assert_index_equal(result, expected)
else:
tm.assert_extension_array_equal(result, expected._values)
td2 = day + np.zeros(6, dtype=f"m8[{unit}]")
td2 = klass(td2, name="dt")
result2 = td2.value_counts()
tm.assert_series_equal(result2, expected_s)
@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts_with_nan(dropna, index_or_series):
# GH31944
klass = index_or_series
values = [True, pd.NA, np.nan]
obj = klass(values)
res = obj.value_counts(dropna=dropna)
if dropna is True:
expected = Series([1], index=Index([True], dtype=obj.dtype), name="count")
else:
expected = Series([1, 1, 1], index=[True, pd.NA, np.nan], name="count")
tm.assert_series_equal(res, expected)
def test_value_counts_object_inference_deprecated():
# GH#56161
dti = pd.date_range("2016-01-01", periods=3, tz="UTC")
idx = dti.astype(object)
msg = "The behavior of value_counts with object-dtype is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
res = idx.value_counts()
exp = dti.value_counts()
tm.assert_series_equal(res, exp)