You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

479 lines
15 KiB

import datetime
import decimal
import re
import numpy as np
import pytest
import pytz
import pandas as pd
import pandas._testing as tm
from pandas.api.extensions import register_extension_dtype
from pandas.arrays import (
BooleanArray,
DatetimeArray,
FloatingArray,
IntegerArray,
IntervalArray,
SparseArray,
TimedeltaArray,
)
from pandas.core.arrays import (
NumpyExtensionArray,
period_array,
)
from pandas.tests.extension.decimal import (
DecimalArray,
DecimalDtype,
to_decimal,
)
@pytest.mark.parametrize("dtype_unit", ["M8[h]", "M8[m]", "m8[h]", "m8[m]"])
def test_dt64_array(dtype_unit):
    """
    Check that ``pd.array`` warns for unsupported datetime/timedelta units.

    Parameters
    ----------
    dtype_unit : str
        NumPy datetime64 (``M8``) or timedelta64 (``m8``) dtype string whose
        resolution (hours or minutes) is not one of 's', 'ms', 'us', 'ns'.
    """
    # PR 53817
    dtype_var = np.dtype(dtype_unit)
    msg = (
        r"datetime64 and timedelta64 dtype resolutions other than "
        r"'s', 'ms', 'us', and 'ns' are deprecated. "
        r"In future releases passing unsupported resolutions will "
        r"raise an exception."
    )
    # The message contains regex metacharacters ('.'), so it is escaped to be
    # matched literally.
    with tm.assert_produces_warning(FutureWarning, match=re.escape(msg)):
        pd.array([], dtype=dtype_var)
@pytest.mark.parametrize(
    "data, dtype, expected",
    [
        # Basic NumPy defaults.
        ([], None, FloatingArray._from_sequence([], dtype="Float64")),
        ([1, 2], None, IntegerArray._from_sequence([1, 2], dtype="Int64")),
        ([1, 2], object, NumpyExtensionArray(np.array([1, 2], dtype=object))),
        (
            [1, 2],
            np.dtype("float32"),
            NumpyExtensionArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))),
        ),
        (
            np.array([], dtype=object),
            None,
            NumpyExtensionArray(np.array([], dtype=object)),
        ),
        (
            np.array([1, 2], dtype="int64"),
            None,
            IntegerArray._from_sequence([1, 2], dtype="Int64"),
        ),
        (
            np.array([1.0, 2.0], dtype="float64"),
            None,
            FloatingArray._from_sequence([1.0, 2.0], dtype="Float64"),
        ),
        # String alias passes through to NumPy
        ([1, 2], "float32", NumpyExtensionArray(np.array([1, 2], dtype="float32"))),
        ([1, 2], "int64", NumpyExtensionArray(np.array([1, 2], dtype=np.int64))),
        # GH#44715 FloatingArray does not support float16, so fall
        # back to NumpyExtensionArray
        (
            np.array([1, 2], dtype=np.float16),
            None,
            NumpyExtensionArray(np.array([1, 2], dtype=np.float16)),
        ),
        # idempotency with e.g. pd.array(pd.array([1, 2], dtype="int64")); the
        # case below passes an int32 NumpyExtensionArray and expects an equal
        # int32 NumpyExtensionArray back
        (
            NumpyExtensionArray(np.array([1, 2], dtype=np.int32)),
            None,
            NumpyExtensionArray(np.array([1, 2], dtype=np.int32)),
        ),
        # Period alias
        (
            [pd.Period("2000", "D"), pd.Period("2001", "D")],
            "Period[D]",
            period_array(["2000", "2001"], freq="D"),
        ),
        # Period dtype
        (
            [pd.Period("2000", "D")],
            pd.PeriodDtype("D"),
            period_array(["2000"], freq="D"),
        ),
        # Datetime (naive)
        (
            [1, 2],
            np.dtype("datetime64[ns]"),
            DatetimeArray._from_sequence(
                np.array([1, 2], dtype="M8[ns]"), dtype="M8[ns]"
            ),
        ),
        (
            [1, 2],
            np.dtype("datetime64[s]"),
            DatetimeArray._from_sequence(
                np.array([1, 2], dtype="M8[s]"), dtype="M8[s]"
            ),
        ),
        (
            np.array([1, 2], dtype="datetime64[ns]"),
            None,
            DatetimeArray._from_sequence(
                np.array([1, 2], dtype="M8[ns]"), dtype="M8[ns]"
            ),
        ),
        (
            pd.DatetimeIndex(["2000", "2001"]),
            np.dtype("datetime64[ns]"),
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            pd.DatetimeIndex(["2000", "2001"]),
            None,
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            ["2000", "2001"],
            np.dtype("datetime64[ns]"),
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        # Datetime (tz-aware)
        (
            ["2000", "2001"],
            pd.DatetimeTZDtype(tz="CET"),
            DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET")
            ),
        ),
        # Timedelta
        (
            ["1h", "2h"],
            np.dtype("timedelta64[ns]"),
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            pd.TimedeltaIndex(["1h", "2h"]),
            np.dtype("timedelta64[ns]"),
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            np.array([1, 2], dtype="m8[s]"),
            np.dtype("timedelta64[s]"),
            TimedeltaArray._from_sequence(
                np.array([1, 2], dtype="m8[s]"), dtype="m8[s]"
            ),
        ),
        (
            pd.TimedeltaIndex(["1h", "2h"]),
            None,
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            # preserve non-nano, i.e. don't cast to NumpyExtensionArray
            # (dtype left as None)
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
            None,
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
        ),
        (
            # preserve non-nano, i.e. don't cast to NumpyExtensionArray
            # (matching m8[s] dtype passed explicitly)
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
            np.dtype("m8[s]"),
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
        ),
        # Category
        (["a", "b"], "category", pd.Categorical(["a", "b"])),
        (
            ["a", "b"],
            pd.CategoricalDtype(None, ordered=True),
            pd.Categorical(["a", "b"], ordered=True),
        ),
        # Interval
        (
            [pd.Interval(1, 2), pd.Interval(3, 4)],
            "interval",
            IntervalArray.from_tuples([(1, 2), (3, 4)]),
        ),
        # Sparse
        ([0, 1], "Sparse[int64]", SparseArray([0, 1], dtype="int64")),
        # IntegerNA
        ([1, None], "Int16", pd.array([1, None], dtype="Int16")),
        # Series input with dtype=None
        (
            pd.Series([1, 2]),
            None,
            NumpyExtensionArray(np.array([1, 2], dtype=np.int64)),
        ),
        # String
        (
            ["a", None],
            "string",
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", None], dtype=pd.StringDtype()),
        ),
        (
            ["a", None],
            pd.StringDtype(),
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", None], dtype=pd.StringDtype()),
        ),
        # Boolean
        (
            [True, None],
            "boolean",
            BooleanArray._from_sequence([True, None], dtype="boolean"),
        ),
        (
            [True, None],
            pd.BooleanDtype(),
            BooleanArray._from_sequence([True, None], dtype="boolean"),
        ),
        # Index
        (pd.Index([1, 2]), None, NumpyExtensionArray(np.array([1, 2], dtype=np.int64))),
        # Series[EA] returns the EA
        (
            pd.Series(pd.Categorical(["a", "b"], categories=["a", "b", "c"])),
            None,
            pd.Categorical(["a", "b"], categories=["a", "b", "c"]),
        ),
        # "3rd party" EAs work
        ([decimal.Decimal(0), decimal.Decimal(1)], "decimal", to_decimal([0, 1])),
        # pass an ExtensionArray, but a different dtype
        (
            period_array(["2000", "2001"], freq="D"),
            "category",
            pd.Categorical([pd.Period("2000", "D"), pd.Period("2001", "D")]),
        ),
    ],
)
def test_array(data, dtype, expected):
    """
    Check ``pd.array(data, dtype=dtype)`` against a hand-built expected array.

    Parameters
    ----------
    data : list-like
        Input handed to ``pd.array``.
    dtype : str, numpy.dtype, ExtensionDtype, type or None
        Value forwarded as the ``dtype`` keyword.
    expected : ExtensionArray
        Array the result is compared to with ``tm.assert_equal``.
    """
    result = pd.array(data, dtype=dtype)
    tm.assert_equal(result, expected)
def test_array_copy():
    """``pd.array`` copies a NumPy input unless ``copy=False`` is requested."""
    source = np.array([1, 2])

    # Leaving ``copy`` out and passing copy=True must both yield a new buffer.
    for kwargs in ({}, {"copy": True}):
        copied = pd.array(source, dtype=source.dtype, **kwargs)
        assert not tm.shares_memory(source, copied)

    # Only an explicit copy=False may alias the caller's data.
    aliased = pd.array(source, dtype=source.dtype, copy=False)
    assert tm.shares_memory(source, aliased)
# pytz "CET" timezone object; used as ``tzinfo``/``tz`` by the tz-aware
# datetime cases in the test_array_inference parametrization.
cet = pytz.timezone("CET")
@pytest.mark.parametrize(
    "data, expected",
    [
        # period
        (
            [pd.Period("2000", "D"), pd.Period("2001", "D")],
            period_array(["2000", "2001"], freq="D"),
        ),
        # interval
        ([pd.Interval(0, 1), pd.Interval(1, 2)], IntervalArray.from_breaks([0, 1, 2])),
        # datetime
        (
            [pd.Timestamp("2000"), pd.Timestamp("2001")],
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            [datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)],
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            np.array([1, 2], dtype="M8[ns]"),
            DatetimeArray._from_sequence(np.array([1, 2], dtype="M8[ns]")),
        ),
        # microsecond-resolution ndarray input is expected to keep M8[us]
        (
            np.array([1, 2], dtype="M8[us]"),
            DatetimeArray._simple_new(
                np.array([1, 2], dtype="M8[us]"), dtype=np.dtype("M8[us]")
            ),
        ),
        # datetimetz
        (
            [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")],
            DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET", unit="ns")
            ),
        ),
        # stdlib datetimes carrying the module-level pytz ``cet`` tzinfo
        (
            [
                datetime.datetime(2000, 1, 1, tzinfo=cet),
                datetime.datetime(2001, 1, 1, tzinfo=cet),
            ],
            DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet, unit="ns")
            ),
        ),
        # timedelta
        (
            [pd.Timedelta("1h"), pd.Timedelta("2h")],
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            np.array([1, 2], dtype="m8[ns]"),
            TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[ns]")),
        ),
        (
            np.array([1, 2], dtype="m8[us]"),
            TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[us]")),
        ),
        # integer
        ([1, 2], IntegerArray._from_sequence([1, 2], dtype="Int64")),
        ([1, None], IntegerArray._from_sequence([1, None], dtype="Int64")),
        ([1, pd.NA], IntegerArray._from_sequence([1, pd.NA], dtype="Int64")),
        ([1, np.nan], IntegerArray._from_sequence([1, np.nan], dtype="Int64")),
        # float
        ([0.1, 0.2], FloatingArray._from_sequence([0.1, 0.2], dtype="Float64")),
        ([0.1, None], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
        ([0.1, np.nan], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
        ([0.1, pd.NA], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
        # integer-like float
        ([1.0, 2.0], FloatingArray._from_sequence([1.0, 2.0], dtype="Float64")),
        ([1.0, None], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
        ([1.0, np.nan], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
        ([1.0, pd.NA], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
        # mixed-integer-float
        ([1, 2.0], FloatingArray._from_sequence([1.0, 2.0], dtype="Float64")),
        (
            [1, np.nan, 2.0],
            FloatingArray._from_sequence([1.0, None, 2.0], dtype="Float64"),
        ),
        # string
        (
            ["a", "b"],
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", "b"], dtype=pd.StringDtype()),
        ),
        (
            ["a", None],
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", None], dtype=pd.StringDtype()),
        ),
        # Boolean
        ([True, False], BooleanArray._from_sequence([True, False], dtype="boolean")),
        ([True, None], BooleanArray._from_sequence([True, None], dtype="boolean")),
    ],
)
def test_array_inference(data, expected):
    """
    Check the array type ``pd.array`` produces when no dtype is passed.

    Parameters
    ----------
    data : list or numpy.ndarray
        Input handed to ``pd.array`` without a ``dtype`` argument.
    expected : ExtensionArray
        Array the result is compared to with ``tm.assert_equal``.
    """
    result = pd.array(data)
    tm.assert_equal(result, expected)
@pytest.mark.parametrize(
    "data",
    [
        # mix of frequencies
        [pd.Period("2000", "D"), pd.Period("2001", "Y")],
        # mix of closed
        [pd.Interval(0, 1, closed="left"), pd.Interval(1, 2, closed="right")],
        # Mix of timezones
        [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000", tz="UTC")],
        # Mix of tz-aware and tz-naive
        [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000")],
        np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]),
    ],
)
def test_array_inference_fails(data):
    """
    Mixed inputs are expected to come back as an object-dtype array.

    Parameters
    ----------
    data : list or numpy.ndarray
        Elements that disagree on frequency, closed side or timezone.
    """
    inferred = pd.array(data)
    tm.assert_extension_array_equal(
        inferred, NumpyExtensionArray(np.array(data, dtype=object))
    )
@pytest.mark.parametrize("data", [np.array(0)])
def test_nd_raises(data):
    """``pd.array`` raises ValueError for the zero-dimensional ndarray input."""
    expected_msg = "NumpyExtensionArray must be 1-dimensional"
    with pytest.raises(ValueError, match=expected_msg):
        pd.array(data, dtype="int64")
def test_scalar_raises():
    """Passing the bare scalar ``1`` to ``pd.array`` raises ValueError."""
    scalar_msg = "Cannot pass scalar '1'"
    with pytest.raises(ValueError, match=scalar_msg):
        pd.array(1)
def test_dataframe_raises():
    """Passing a DataFrame to ``pd.array`` raises TypeError."""
    # GH#51167 don't accidentally cast to StringArray by doing inference on columns
    frame = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
    with pytest.raises(TypeError, match="Cannot pass DataFrame to 'pandas.array'"):
        pd.array(frame)
def test_bounds_check():
    """Building a ``UInt16`` array from data containing -1 raises TypeError."""
    # GH21796
    # The reported source dtype is int32 or int64, so the pattern accepts both.
    overflow_msg = r"cannot safely cast non-equivalent int(32|64) to uint16"
    with pytest.raises(TypeError, match=overflow_msg):
        pd.array([-1, 2, 3], dtype="UInt16")
# ---------------------------------------------------------------------------
# A couple of dummy classes to ensure that Series and Index objects are
# unboxed before getting to the EA classes.
@register_extension_dtype
class DecimalDtype2(DecimalDtype):
    """``DecimalDtype`` subclass registered as "decimal2", backed by ``DecimalArray2``."""

    name = "decimal2"

    @classmethod
    def construct_array_type(cls):
        """
        Return the array class that backs this dtype.

        Returns
        -------
        type
            The ``DecimalArray2`` class.
        """
        return DecimalArray2
class DecimalArray2(DecimalArray):
    """``DecimalArray`` variant whose ``_from_sequence`` refuses Series/Index input."""

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        """
        Build the array via the parent class, rejecting boxed input.

        Raises
        ------
        TypeError
            If ``scalars`` is a ``pd.Series`` or ``pd.Index``.
        """
        if not isinstance(scalars, (pd.Series, pd.Index)):
            return super()._from_sequence(scalars, dtype=dtype, copy=copy)
        raise TypeError("scalars should not be of type pd.Series or pd.Index")
def test_array_unboxes(index_or_series):
    """``pd.array`` hands the underlying values, not the Series/Index, to the EA."""
    decimals = [decimal.Decimal("1"), decimal.Decimal("2")]
    boxed = index_or_series(decimals)
    dtype = DecimalDtype2()

    # Sanity check: DecimalArray2._from_sequence rejects the boxed container
    # when called directly.
    with pytest.raises(
        TypeError, match="scalars should not be of type pd.Series or pd.Index"
    ):
        DecimalArray2._from_sequence(boxed, dtype=dtype)

    unboxed_result = pd.array(boxed, dtype="decimal2")
    reference = DecimalArray2._from_sequence(boxed.values, dtype=dtype)
    tm.assert_equal(unboxed_result, reference)
def test_array_to_numpy_na():
    """``to_numpy(na_value=True, dtype=bool)`` on ``[NA, 1]`` strings is all True."""
    # GH#40638
    strings = pd.array([pd.NA, 1], dtype="string[python]")
    as_bool = strings.to_numpy(na_value=True, dtype=bool)
    tm.assert_numpy_array_equal(as_bool, np.array([True, True]))