You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
243 lines
8.0 KiB
243 lines
8.0 KiB
6 months ago
|
"""
|
||
|
This file contains a minimal set of tests for compliance with the extension
|
||
|
array interface test suite, and should contain no other tests.
|
||
|
The test suite for the full functionality of the array is located in
|
||
|
`pandas/tests/arrays/`.
|
||
|
|
||
|
The tests in this file are inherited from the BaseExtensionTests, and only
|
||
|
minimal tweaks should be applied to get the tests passing (by overwriting a
|
||
|
parent method).
|
||
|
|
||
|
Additional tests should either be added to one of the BaseExtensionTests
|
||
|
classes (if they are relevant for the extension interface for all dtypes), or
|
||
|
be added to the array-specific tests in `pandas/tests/arrays/`.
|
||
|
|
||
|
"""
|
||
|
from __future__ import annotations
|
||
|
|
||
|
import string
|
||
|
from typing import cast
|
||
|
|
||
|
import numpy as np
|
||
|
import pytest
|
||
|
|
||
|
import pandas as pd
|
||
|
import pandas._testing as tm
|
||
|
from pandas.api.types import is_string_dtype
|
||
|
from pandas.core.arrays import ArrowStringArray
|
||
|
from pandas.core.arrays.string_ import StringDtype
|
||
|
from pandas.tests.extension import base
|
||
|
|
||
|
|
||
|
def maybe_split_array(arr, chunked):
|
||
|
if not chunked:
|
||
|
return arr
|
||
|
elif arr.dtype.storage != "pyarrow":
|
||
|
return arr
|
||
|
|
||
|
pa = pytest.importorskip("pyarrow")
|
||
|
|
||
|
arrow_array = arr._pa_array
|
||
|
split = len(arrow_array) // 2
|
||
|
arrow_array = pa.chunked_array(
|
||
|
[*arrow_array[:split].chunks, *arrow_array[split:].chunks]
|
||
|
)
|
||
|
assert arrow_array.num_chunks == 2
|
||
|
return type(arr)(arrow_array)
|
||
|
|
||
|
|
||
|
@pytest.fixture(params=[True, False])
|
||
|
def chunked(request):
|
||
|
return request.param
|
||
|
|
||
|
|
||
|
@pytest.fixture
|
||
|
def dtype(string_storage):
|
||
|
return StringDtype(storage=string_storage)
|
||
|
|
||
|
|
||
|
@pytest.fixture
|
||
|
def data(dtype, chunked):
|
||
|
strings = np.random.default_rng(2).choice(list(string.ascii_letters), size=100)
|
||
|
while strings[0] == strings[1]:
|
||
|
strings = np.random.default_rng(2).choice(list(string.ascii_letters), size=100)
|
||
|
|
||
|
arr = dtype.construct_array_type()._from_sequence(strings, dtype=dtype)
|
||
|
return maybe_split_array(arr, chunked)
|
||
|
|
||
|
|
||
|
@pytest.fixture
|
||
|
def data_missing(dtype, chunked):
|
||
|
"""Length 2 array with [NA, Valid]"""
|
||
|
arr = dtype.construct_array_type()._from_sequence([pd.NA, "A"], dtype=dtype)
|
||
|
return maybe_split_array(arr, chunked)
|
||
|
|
||
|
|
||
|
@pytest.fixture
|
||
|
def data_for_sorting(dtype, chunked):
|
||
|
arr = dtype.construct_array_type()._from_sequence(["B", "C", "A"], dtype=dtype)
|
||
|
return maybe_split_array(arr, chunked)
|
||
|
|
||
|
|
||
|
@pytest.fixture
|
||
|
def data_missing_for_sorting(dtype, chunked):
|
||
|
arr = dtype.construct_array_type()._from_sequence(["B", pd.NA, "A"], dtype=dtype)
|
||
|
return maybe_split_array(arr, chunked)
|
||
|
|
||
|
|
||
|
@pytest.fixture
|
||
|
def data_for_grouping(dtype, chunked):
|
||
|
arr = dtype.construct_array_type()._from_sequence(
|
||
|
["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"], dtype=dtype
|
||
|
)
|
||
|
return maybe_split_array(arr, chunked)
|
||
|
|
||
|
|
||
|
class TestStringArray(base.ExtensionTests):
|
||
|
def test_eq_with_str(self, dtype):
|
||
|
assert dtype == f"string[{dtype.storage}]"
|
||
|
super().test_eq_with_str(dtype)
|
||
|
|
||
|
def test_is_not_string_type(self, dtype):
|
||
|
# Different from BaseDtypeTests.test_is_not_string_type
|
||
|
# because StringDtype is a string type
|
||
|
assert is_string_dtype(dtype)
|
||
|
|
||
|
def test_view(self, data, request, arrow_string_storage):
|
||
|
if data.dtype.storage in arrow_string_storage:
|
||
|
pytest.skip(reason="2D support not implemented for ArrowStringArray")
|
||
|
super().test_view(data)
|
||
|
|
||
|
def test_from_dtype(self, data):
|
||
|
# base test uses string representation of dtype
|
||
|
pass
|
||
|
|
||
|
def test_transpose(self, data, request, arrow_string_storage):
|
||
|
if data.dtype.storage in arrow_string_storage:
|
||
|
pytest.skip(reason="2D support not implemented for ArrowStringArray")
|
||
|
super().test_transpose(data)
|
||
|
|
||
|
def test_setitem_preserves_views(self, data, request, arrow_string_storage):
|
||
|
if data.dtype.storage in arrow_string_storage:
|
||
|
pytest.skip(reason="2D support not implemented for ArrowStringArray")
|
||
|
super().test_setitem_preserves_views(data)
|
||
|
|
||
|
def test_dropna_array(self, data_missing):
|
||
|
result = data_missing.dropna()
|
||
|
expected = data_missing[[1]]
|
||
|
tm.assert_extension_array_equal(result, expected)
|
||
|
|
||
|
def test_fillna_no_op_returns_copy(self, data):
|
||
|
data = data[~data.isna()]
|
||
|
|
||
|
valid = data[0]
|
||
|
result = data.fillna(valid)
|
||
|
assert result is not data
|
||
|
tm.assert_extension_array_equal(result, data)
|
||
|
|
||
|
result = data.fillna(method="backfill")
|
||
|
assert result is not data
|
||
|
tm.assert_extension_array_equal(result, data)
|
||
|
|
||
|
def _get_expected_exception(
|
||
|
self, op_name: str, obj, other
|
||
|
) -> type[Exception] | None:
|
||
|
if op_name in ["__divmod__", "__rdivmod__"]:
|
||
|
if isinstance(obj, pd.Series) and cast(
|
||
|
StringDtype, tm.get_dtype(obj)
|
||
|
).storage in [
|
||
|
"pyarrow",
|
||
|
"pyarrow_numpy",
|
||
|
]:
|
||
|
# TODO: re-raise as TypeError?
|
||
|
return NotImplementedError
|
||
|
elif isinstance(other, pd.Series) and cast(
|
||
|
StringDtype, tm.get_dtype(other)
|
||
|
).storage in [
|
||
|
"pyarrow",
|
||
|
"pyarrow_numpy",
|
||
|
]:
|
||
|
# TODO: re-raise as TypeError?
|
||
|
return NotImplementedError
|
||
|
return TypeError
|
||
|
elif op_name in ["__mod__", "__rmod__", "__pow__", "__rpow__"]:
|
||
|
if cast(StringDtype, tm.get_dtype(obj)).storage in [
|
||
|
"pyarrow",
|
||
|
"pyarrow_numpy",
|
||
|
]:
|
||
|
return NotImplementedError
|
||
|
return TypeError
|
||
|
elif op_name in ["__mul__", "__rmul__"]:
|
||
|
# Can only multiply strings by integers
|
||
|
return TypeError
|
||
|
elif op_name in [
|
||
|
"__truediv__",
|
||
|
"__rtruediv__",
|
||
|
"__floordiv__",
|
||
|
"__rfloordiv__",
|
||
|
"__sub__",
|
||
|
"__rsub__",
|
||
|
]:
|
||
|
if cast(StringDtype, tm.get_dtype(obj)).storage in [
|
||
|
"pyarrow",
|
||
|
"pyarrow_numpy",
|
||
|
]:
|
||
|
import pyarrow as pa
|
||
|
|
||
|
# TODO: better to re-raise as TypeError?
|
||
|
return pa.ArrowNotImplementedError
|
||
|
return TypeError
|
||
|
|
||
|
return None
|
||
|
|
||
|
def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
|
||
|
return (
|
||
|
op_name in ["min", "max"]
|
||
|
or ser.dtype.storage == "pyarrow_numpy" # type: ignore[union-attr]
|
||
|
and op_name in ("any", "all")
|
||
|
)
|
||
|
|
||
|
def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
|
||
|
dtype = cast(StringDtype, tm.get_dtype(obj))
|
||
|
if op_name in ["__add__", "__radd__"]:
|
||
|
cast_to = dtype
|
||
|
elif dtype.storage == "pyarrow":
|
||
|
cast_to = "boolean[pyarrow]" # type: ignore[assignment]
|
||
|
elif dtype.storage == "pyarrow_numpy":
|
||
|
cast_to = np.bool_ # type: ignore[assignment]
|
||
|
else:
|
||
|
cast_to = "boolean" # type: ignore[assignment]
|
||
|
return pointwise_result.astype(cast_to)
|
||
|
|
||
|
def test_compare_scalar(self, data, comparison_op):
|
||
|
ser = pd.Series(data)
|
||
|
self._compare_other(ser, data, comparison_op, "abc")
|
||
|
|
||
|
@pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning")
|
||
|
def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
|
||
|
super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op)
|
||
|
|
||
|
|
||
|
class Test2DCompat(base.Dim2CompatTests):
|
||
|
@pytest.fixture(autouse=True)
|
||
|
def arrow_not_supported(self, data):
|
||
|
if isinstance(data, ArrowStringArray):
|
||
|
pytest.skip(reason="2D support not implemented for ArrowStringArray")
|
||
|
|
||
|
|
||
|
def test_searchsorted_with_na_raises(data_for_sorting, as_series):
|
||
|
# GH50447
|
||
|
b, c, a = data_for_sorting
|
||
|
arr = data_for_sorting.take([2, 0, 1]) # to get [a, b, c]
|
||
|
arr[-1] = pd.NA
|
||
|
|
||
|
if as_series:
|
||
|
arr = pd.Series(arr)
|
||
|
|
||
|
msg = (
|
||
|
"searchsorted requires array to be sorted, "
|
||
|
"which is impossible with NAs present."
|
||
|
)
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
arr.searchsorted(b)
|