You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
306 lines
8.3 KiB
306 lines
8.3 KiB
7 months ago
|
import os
|
||
|
|
||
|
import numpy as np
|
||
|
import pytest
|
||
|
|
||
|
import pandas as pd
|
||
|
from pandas import (
|
||
|
Categorical,
|
||
|
DatetimeIndex,
|
||
|
Interval,
|
||
|
IntervalIndex,
|
||
|
NaT,
|
||
|
Series,
|
||
|
Timedelta,
|
||
|
TimedeltaIndex,
|
||
|
Timestamp,
|
||
|
cut,
|
||
|
date_range,
|
||
|
isna,
|
||
|
qcut,
|
||
|
timedelta_range,
|
||
|
)
|
||
|
import pandas._testing as tm
|
||
|
from pandas.api.types import CategoricalDtype
|
||
|
|
||
|
from pandas.tseries.offsets import Day
|
||
|
|
||
|
|
||
|
def test_qcut():
|
||
|
arr = np.random.default_rng(2).standard_normal(1000)
|
||
|
|
||
|
# We store the bins as Index that have been
|
||
|
# rounded to comparisons are a bit tricky.
|
||
|
labels, _ = qcut(arr, 4, retbins=True)
|
||
|
ex_bins = np.quantile(arr, [0, 0.25, 0.5, 0.75, 1.0])
|
||
|
|
||
|
result = labels.categories.left.values
|
||
|
assert np.allclose(result, ex_bins[:-1], atol=1e-2)
|
||
|
|
||
|
result = labels.categories.right.values
|
||
|
assert np.allclose(result, ex_bins[1:], atol=1e-2)
|
||
|
|
||
|
ex_levels = cut(arr, ex_bins, include_lowest=True)
|
||
|
tm.assert_categorical_equal(labels, ex_levels)
|
||
|
|
||
|
|
||
|
def test_qcut_bounds():
|
||
|
arr = np.random.default_rng(2).standard_normal(1000)
|
||
|
|
||
|
factor = qcut(arr, 10, labels=False)
|
||
|
assert len(np.unique(factor)) == 10
|
||
|
|
||
|
|
||
|
def test_qcut_specify_quantiles():
|
||
|
arr = np.random.default_rng(2).standard_normal(100)
|
||
|
factor = qcut(arr, [0, 0.25, 0.5, 0.75, 1.0])
|
||
|
|
||
|
expected = qcut(arr, 4)
|
||
|
tm.assert_categorical_equal(factor, expected)
|
||
|
|
||
|
|
||
|
def test_qcut_all_bins_same():
|
||
|
with pytest.raises(ValueError, match="edges.*unique"):
|
||
|
qcut([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3)
|
||
|
|
||
|
|
||
|
def test_qcut_include_lowest():
|
||
|
values = np.arange(10)
|
||
|
ii = qcut(values, 4)
|
||
|
|
||
|
ex_levels = IntervalIndex(
|
||
|
[
|
||
|
Interval(-0.001, 2.25),
|
||
|
Interval(2.25, 4.5),
|
||
|
Interval(4.5, 6.75),
|
||
|
Interval(6.75, 9),
|
||
|
]
|
||
|
)
|
||
|
tm.assert_index_equal(ii.categories, ex_levels)
|
||
|
|
||
|
|
||
|
def test_qcut_nas():
|
||
|
arr = np.random.default_rng(2).standard_normal(100)
|
||
|
arr[:20] = np.nan
|
||
|
|
||
|
result = qcut(arr, 4)
|
||
|
assert isna(result[:20]).all()
|
||
|
|
||
|
|
||
|
def test_qcut_index():
|
||
|
result = qcut([0, 2], 2)
|
||
|
intervals = [Interval(-0.001, 1), Interval(1, 2)]
|
||
|
|
||
|
expected = Categorical(intervals, ordered=True)
|
||
|
tm.assert_categorical_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_qcut_binning_issues(datapath):
|
||
|
# see gh-1978, gh-1979
|
||
|
cut_file = datapath(os.path.join("reshape", "data", "cut_data.csv"))
|
||
|
arr = np.loadtxt(cut_file)
|
||
|
result = qcut(arr, 20)
|
||
|
|
||
|
starts = []
|
||
|
ends = []
|
||
|
|
||
|
for lev in np.unique(result):
|
||
|
s = lev.left
|
||
|
e = lev.right
|
||
|
assert s != e
|
||
|
|
||
|
starts.append(float(s))
|
||
|
ends.append(float(e))
|
||
|
|
||
|
for (sp, sn), (ep, en) in zip(
|
||
|
zip(starts[:-1], starts[1:]), zip(ends[:-1], ends[1:])
|
||
|
):
|
||
|
assert sp < sn
|
||
|
assert ep < en
|
||
|
assert ep <= sn
|
||
|
|
||
|
|
||
|
def test_qcut_return_intervals():
|
||
|
ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
|
||
|
res = qcut(ser, [0, 0.333, 0.666, 1])
|
||
|
|
||
|
exp_levels = np.array(
|
||
|
[Interval(-0.001, 2.664), Interval(2.664, 5.328), Interval(5.328, 8)]
|
||
|
)
|
||
|
exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(
|
||
|
CategoricalDtype(ordered=True)
|
||
|
)
|
||
|
tm.assert_series_equal(res, exp)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("labels", ["foo", 1, True])
|
||
|
def test_qcut_incorrect_labels(labels):
|
||
|
# GH 13318
|
||
|
values = range(5)
|
||
|
msg = "Bin labels must either be False, None or passed in as a list-like argument"
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
qcut(values, 4, labels=labels)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("labels", [["a", "b", "c"], list(range(3))])
|
||
|
def test_qcut_wrong_length_labels(labels):
|
||
|
# GH 13318
|
||
|
values = range(10)
|
||
|
msg = "Bin labels must be one fewer than the number of bin edges"
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
qcut(values, 4, labels=labels)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"labels, expected",
|
||
|
[
|
||
|
(["a", "b", "c"], Categorical(["a", "b", "c"], ordered=True)),
|
||
|
(list(range(3)), Categorical([0, 1, 2], ordered=True)),
|
||
|
],
|
||
|
)
|
||
|
def test_qcut_list_like_labels(labels, expected):
|
||
|
# GH 13318
|
||
|
values = range(3)
|
||
|
result = qcut(values, 3, labels=labels)
|
||
|
tm.assert_categorical_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"kwargs,msg",
|
||
|
[
|
||
|
({"duplicates": "drop"}, None),
|
||
|
({}, "Bin edges must be unique"),
|
||
|
({"duplicates": "raise"}, "Bin edges must be unique"),
|
||
|
({"duplicates": "foo"}, "invalid value for 'duplicates' parameter"),
|
||
|
],
|
||
|
)
|
||
|
def test_qcut_duplicates_bin(kwargs, msg):
|
||
|
# see gh-7751
|
||
|
values = [0, 0, 0, 0, 1, 2, 3]
|
||
|
|
||
|
if msg is not None:
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
qcut(values, 3, **kwargs)
|
||
|
else:
|
||
|
result = qcut(values, 3, **kwargs)
|
||
|
expected = IntervalIndex([Interval(-0.001, 1), Interval(1, 3)])
|
||
|
tm.assert_index_equal(result.categories, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"data,start,end", [(9.0, 8.999, 9.0), (0.0, -0.001, 0.0), (-9.0, -9.001, -9.0)]
|
||
|
)
|
||
|
@pytest.mark.parametrize("length", [1, 2])
|
||
|
@pytest.mark.parametrize("labels", [None, False])
|
||
|
def test_single_quantile(data, start, end, length, labels):
|
||
|
# see gh-15431
|
||
|
ser = Series([data] * length)
|
||
|
result = qcut(ser, 1, labels=labels)
|
||
|
|
||
|
if labels is None:
|
||
|
intervals = IntervalIndex([Interval(start, end)] * length, closed="right")
|
||
|
expected = Series(intervals).astype(CategoricalDtype(ordered=True))
|
||
|
else:
|
||
|
expected = Series([0] * length, dtype=np.intp)
|
||
|
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"ser",
|
||
|
[
|
||
|
Series(DatetimeIndex(["20180101", NaT, "20180103"])),
|
||
|
Series(TimedeltaIndex(["0 days", NaT, "2 days"])),
|
||
|
],
|
||
|
ids=lambda x: str(x.dtype),
|
||
|
)
|
||
|
def test_qcut_nat(ser, unit):
|
||
|
# see gh-19768
|
||
|
ser = ser.dt.as_unit(unit)
|
||
|
td = Timedelta(1, unit=unit).as_unit(unit)
|
||
|
|
||
|
left = Series([ser[0] - td, np.nan, ser[2] - Day()], dtype=ser.dtype)
|
||
|
right = Series([ser[2] - Day(), np.nan, ser[2]], dtype=ser.dtype)
|
||
|
intervals = IntervalIndex.from_arrays(left, right)
|
||
|
expected = Series(Categorical(intervals, ordered=True))
|
||
|
|
||
|
result = qcut(ser, 2)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("bins", [3, np.linspace(0, 1, 4)])
|
||
|
def test_datetime_tz_qcut(bins):
|
||
|
# see gh-19872
|
||
|
tz = "US/Eastern"
|
||
|
ser = Series(date_range("20130101", periods=3, tz=tz))
|
||
|
|
||
|
result = qcut(ser, bins)
|
||
|
expected = Series(
|
||
|
IntervalIndex(
|
||
|
[
|
||
|
Interval(
|
||
|
Timestamp("2012-12-31 23:59:59.999999999", tz=tz),
|
||
|
Timestamp("2013-01-01 16:00:00", tz=tz),
|
||
|
),
|
||
|
Interval(
|
||
|
Timestamp("2013-01-01 16:00:00", tz=tz),
|
||
|
Timestamp("2013-01-02 08:00:00", tz=tz),
|
||
|
),
|
||
|
Interval(
|
||
|
Timestamp("2013-01-02 08:00:00", tz=tz),
|
||
|
Timestamp("2013-01-03 00:00:00", tz=tz),
|
||
|
),
|
||
|
]
|
||
|
)
|
||
|
).astype(CategoricalDtype(ordered=True))
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"arg,expected_bins",
|
||
|
[
|
||
|
[
|
||
|
timedelta_range("1day", periods=3),
|
||
|
TimedeltaIndex(["1 days", "2 days", "3 days"]),
|
||
|
],
|
||
|
[
|
||
|
date_range("20180101", periods=3),
|
||
|
DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"]),
|
||
|
],
|
||
|
],
|
||
|
)
|
||
|
def test_date_like_qcut_bins(arg, expected_bins):
|
||
|
# see gh-19891
|
||
|
ser = Series(arg)
|
||
|
result, result_bins = qcut(ser, 2, retbins=True)
|
||
|
tm.assert_index_equal(result_bins, expected_bins)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("bins", [6, 7])
|
||
|
@pytest.mark.parametrize(
|
||
|
"box, compare",
|
||
|
[
|
||
|
(Series, tm.assert_series_equal),
|
||
|
(np.array, tm.assert_categorical_equal),
|
||
|
(list, tm.assert_equal),
|
||
|
],
|
||
|
)
|
||
|
def test_qcut_bool_coercion_to_int(bins, box, compare):
|
||
|
# issue 20303
|
||
|
data_expected = box([0, 1, 1, 0, 1] * 10)
|
||
|
data_result = box([False, True, True, False, True] * 10)
|
||
|
expected = qcut(data_expected, bins, duplicates="drop")
|
||
|
result = qcut(data_result, bins, duplicates="drop")
|
||
|
compare(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("q", [2, 5, 10])
|
||
|
def test_qcut_nullable_integer(q, any_numeric_ea_dtype):
|
||
|
arr = pd.array(np.arange(100), dtype=any_numeric_ea_dtype)
|
||
|
arr[::2] = pd.NA
|
||
|
|
||
|
result = qcut(arr, q)
|
||
|
expected = qcut(arr.astype(float), q)
|
||
|
|
||
|
tm.assert_categorical_equal(result, expected)
|