You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2025 lines
69 KiB

from collections import namedtuple
from datetime import (
datetime,
timedelta,
)
from decimal import Decimal
import re
import numpy as np
import pytest
from pandas._libs import iNaT
from pandas.errors import (
InvalidIndexError,
PerformanceWarning,
SettingWithCopyError,
)
import pandas.util._test_decorators as td
from pandas.core.dtypes.common import is_integer
import pandas as pd
from pandas import (
Categorical,
DataFrame,
DatetimeIndex,
Index,
MultiIndex,
Series,
Timestamp,
date_range,
isna,
notna,
to_datetime,
)
import pandas._testing as tm
# We pass through a TypeError raised by numpy
_slice_msg = "slice indices must be integers or None or have an __index__ method"
class TestDataFrameIndexing:
def test_getitem(self, float_frame):
# Slicing
sl = float_frame[:20]
assert len(sl.index) == 20
# Column access
for _, series in sl.items():
assert len(series.index) == 20
tm.assert_index_equal(series.index, sl.index)
for key, _ in float_frame._series.items():
assert float_frame[key] is not None
assert "random" not in float_frame
with pytest.raises(KeyError, match="random"):
float_frame["random"]
def test_getitem_numeric_should_not_fallback_to_positional(self, any_numeric_dtype):
# GH51053
dtype = any_numeric_dtype
idx = Index([1, 0, 1], dtype=dtype)
df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=idx)
result = df[1]
expected = DataFrame([[1, 3], [4, 6]], columns=Index([1, 1], dtype=dtype))
tm.assert_frame_equal(result, expected, check_exact=True)
def test_getitem2(self, float_frame):
df = float_frame.copy()
df["$10"] = np.random.default_rng(2).standard_normal(len(df))
ad = np.random.default_rng(2).standard_normal(len(df))
df["@awesome_domain"] = ad
with pytest.raises(KeyError, match=re.escape("'df[\"$10\"]'")):
df.__getitem__('df["$10"]')
res = df["@awesome_domain"]
tm.assert_numpy_array_equal(ad, res.values)
def test_setitem_numeric_should_not_fallback_to_positional(self, any_numeric_dtype):
# GH51053
dtype = any_numeric_dtype
idx = Index([1, 0, 1], dtype=dtype)
df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=idx)
df[1] = 10
expected = DataFrame([[10, 2, 10], [10, 5, 10]], columns=idx)
tm.assert_frame_equal(df, expected, check_exact=True)
def test_setitem_list(self, float_frame):
float_frame["E"] = "foo"
data = float_frame[["A", "B"]]
float_frame[["B", "A"]] = data
tm.assert_series_equal(float_frame["B"], data["A"], check_names=False)
tm.assert_series_equal(float_frame["A"], data["B"], check_names=False)
msg = "Columns must be same length as key"
with pytest.raises(ValueError, match=msg):
data[["A"]] = float_frame[["A", "B"]]
newcolumndata = range(len(data.index) - 1)
msg = (
rf"Length of values \({len(newcolumndata)}\) "
rf"does not match length of index \({len(data)}\)"
)
with pytest.raises(ValueError, match=msg):
data["A"] = newcolumndata
def test_setitem_list2(self):
df = DataFrame(0, index=range(3), columns=["tt1", "tt2"], dtype=int)
df.loc[1, ["tt1", "tt2"]] = [1, 2]
result = df.loc[df.index[1], ["tt1", "tt2"]]
expected = Series([1, 2], df.columns, dtype=int, name=1)
tm.assert_series_equal(result, expected)
df["tt1"] = df["tt2"] = "0"
df.loc[df.index[1], ["tt1", "tt2"]] = ["1", "2"]
result = df.loc[df.index[1], ["tt1", "tt2"]]
expected = Series(["1", "2"], df.columns, name=1)
tm.assert_series_equal(result, expected)
def test_getitem_boolean(self, mixed_float_frame, mixed_int_frame, datetime_frame):
# boolean indexing
d = datetime_frame.index[10]
indexer = datetime_frame.index > d
indexer_obj = indexer.astype(object)
subindex = datetime_frame.index[indexer]
subframe = datetime_frame[indexer]
tm.assert_index_equal(subindex, subframe.index)
with pytest.raises(ValueError, match="Item wrong length"):
datetime_frame[indexer[:-1]]
subframe_obj = datetime_frame[indexer_obj]
tm.assert_frame_equal(subframe_obj, subframe)
with pytest.raises(ValueError, match="Boolean array expected"):
datetime_frame[datetime_frame]
# test that Series work
indexer_obj = Series(indexer_obj, datetime_frame.index)
subframe_obj = datetime_frame[indexer_obj]
tm.assert_frame_equal(subframe_obj, subframe)
# test that Series indexers reindex
# we are producing a warning that since the passed boolean
# key is not the same as the given index, we will reindex
# not sure this is really necessary
with tm.assert_produces_warning(UserWarning):
indexer_obj = indexer_obj.reindex(datetime_frame.index[::-1])
subframe_obj = datetime_frame[indexer_obj]
tm.assert_frame_equal(subframe_obj, subframe)
# test df[df > 0]
for df in [
datetime_frame,
mixed_float_frame,
mixed_int_frame,
]:
data = df._get_numeric_data()
bif = df[df > 0]
bifw = DataFrame(
{c: np.where(data[c] > 0, data[c], np.nan) for c in data.columns},
index=data.index,
columns=data.columns,
)
# add back other columns to compare
for c in df.columns:
if c not in bifw:
bifw[c] = df[c]
bifw = bifw.reindex(columns=df.columns)
tm.assert_frame_equal(bif, bifw, check_dtype=False)
for c in df.columns:
if bif[c].dtype != bifw[c].dtype:
assert bif[c].dtype == df[c].dtype
def test_getitem_boolean_casting(self, datetime_frame):
# don't upcast if we don't need to
df = datetime_frame.copy()
df["E"] = 1
df["E"] = df["E"].astype("int32")
df["E1"] = df["E"].copy()
df["F"] = 1
df["F"] = df["F"].astype("int64")
df["F1"] = df["F"].copy()
casted = df[df > 0]
result = casted.dtypes
expected = Series(
[np.dtype("float64")] * 4
+ [np.dtype("int32")] * 2
+ [np.dtype("int64")] * 2,
index=["A", "B", "C", "D", "E", "E1", "F", "F1"],
)
tm.assert_series_equal(result, expected)
# int block splitting
df.loc[df.index[1:3], ["E1", "F1"]] = 0
casted = df[df > 0]
result = casted.dtypes
expected = Series(
[np.dtype("float64")] * 4
+ [np.dtype("int32")]
+ [np.dtype("float64")]
+ [np.dtype("int64")]
+ [np.dtype("float64")],
index=["A", "B", "C", "D", "E", "E1", "F", "F1"],
)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"lst", [[True, False, True], [True, True, True], [False, False, False]]
)
def test_getitem_boolean_list(self, lst):
df = DataFrame(np.arange(12).reshape(3, 4))
result = df[lst]
expected = df.loc[df.index[lst]]
tm.assert_frame_equal(result, expected)
def test_getitem_boolean_iadd(self):
arr = np.random.default_rng(2).standard_normal((5, 5))
df = DataFrame(arr.copy(), columns=["A", "B", "C", "D", "E"])
df[df < 0] += 1
arr[arr < 0] += 1
tm.assert_almost_equal(df.values, arr)
def test_boolean_index_empty_corner(self):
# #2096
blah = DataFrame(np.empty([0, 1]), columns=["A"], index=DatetimeIndex([]))
# both of these should succeed trivially
k = np.array([], bool)
blah[k]
blah[k] = 0
def test_getitem_ix_mixed_integer(self):
df = DataFrame(
np.random.default_rng(2).standard_normal((4, 3)),
index=[1, 10, "C", "E"],
columns=[1, 2, 3],
)
result = df.iloc[:-1]
expected = df.loc[df.index[:-1]]
tm.assert_frame_equal(result, expected)
result = df.loc[[1, 10]]
expected = df.loc[Index([1, 10])]
tm.assert_frame_equal(result, expected)
def test_getitem_ix_mixed_integer2(self):
# 11320
df = DataFrame(
{
"rna": (1.5, 2.2, 3.2, 4.5),
-1000: [11, 21, 36, 40],
0: [10, 22, 43, 34],
1000: [0, 10, 20, 30],
},
columns=["rna", -1000, 0, 1000],
)
result = df[[1000]]
expected = df.iloc[:, [3]]
tm.assert_frame_equal(result, expected)
result = df[[-1000]]
expected = df.iloc[:, [1]]
tm.assert_frame_equal(result, expected)
def test_getattr(self, float_frame):
tm.assert_series_equal(float_frame.A, float_frame["A"])
msg = "'DataFrame' object has no attribute 'NONEXISTENT_NAME'"
with pytest.raises(AttributeError, match=msg):
float_frame.NONEXISTENT_NAME
def test_setattr_column(self):
df = DataFrame({"foobar": 1}, index=range(10))
df.foobar = 5
assert (df.foobar == 5).all()
def test_setitem(
self, float_frame, using_copy_on_write, warn_copy_on_write, using_infer_string
):
# not sure what else to do here
series = float_frame["A"][::2]
float_frame["col5"] = series
assert "col5" in float_frame
assert len(series) == 15
assert len(float_frame) == 30
exp = np.ravel(np.column_stack((series.values, [np.nan] * 15)))
exp = Series(exp, index=float_frame.index, name="col5")
tm.assert_series_equal(float_frame["col5"], exp)
series = float_frame["A"]
float_frame["col6"] = series
tm.assert_series_equal(series, float_frame["col6"], check_names=False)
# set ndarray
arr = np.random.default_rng(2).standard_normal(len(float_frame))
float_frame["col9"] = arr
assert (float_frame["col9"] == arr).all()
float_frame["col7"] = 5
assert (float_frame["col7"] == 5).all()
float_frame["col0"] = 3.14
assert (float_frame["col0"] == 3.14).all()
float_frame["col8"] = "foo"
assert (float_frame["col8"] == "foo").all()
# this is partially a view (e.g. some blocks are view)
# so raise/warn
smaller = float_frame[:2]
msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
if using_copy_on_write or warn_copy_on_write:
# With CoW, adding a new column doesn't raise a warning
smaller["col10"] = ["1", "2"]
else:
with pytest.raises(SettingWithCopyError, match=msg):
smaller["col10"] = ["1", "2"]
if using_infer_string:
assert smaller["col10"].dtype == "string"
else:
assert smaller["col10"].dtype == np.object_
assert (smaller["col10"] == ["1", "2"]).all()
def test_setitem2(self):
# dtype changing GH4204
df = DataFrame([[0, 0]])
df.iloc[0] = np.nan
expected = DataFrame([[np.nan, np.nan]])
tm.assert_frame_equal(df, expected)
df = DataFrame([[0, 0]])
df.loc[0] = np.nan
tm.assert_frame_equal(df, expected)
def test_setitem_boolean(self, float_frame):
df = float_frame.copy()
values = float_frame.values.copy()
df[df["A"] > 0] = 4
values[values[:, 0] > 0] = 4
tm.assert_almost_equal(df.values, values)
# test that column reindexing works
series = df["A"] == 4
series = series.reindex(df.index[::-1])
df[series] = 1
values[values[:, 0] == 4] = 1
tm.assert_almost_equal(df.values, values)
df[df > 0] = 5
values[values > 0] = 5
tm.assert_almost_equal(df.values, values)
df[df == 5] = 0
values[values == 5] = 0
tm.assert_almost_equal(df.values, values)
# a df that needs alignment first
df[df[:-1] < 0] = 2
np.putmask(values[:-1], values[:-1] < 0, 2)
tm.assert_almost_equal(df.values, values)
# indexed with same shape but rows-reversed df
df[df[::-1] == 2] = 3
values[values == 2] = 3
tm.assert_almost_equal(df.values, values)
msg = "Must pass DataFrame or 2-d ndarray with boolean values only"
with pytest.raises(TypeError, match=msg):
df[df * 0] = 2
# index with DataFrame
df_orig = df.copy()
mask = df > np.abs(df)
df[df > np.abs(df)] = np.nan
values = df_orig.values.copy()
values[mask.values] = np.nan
expected = DataFrame(values, index=df_orig.index, columns=df_orig.columns)
tm.assert_frame_equal(df, expected)
# set from DataFrame
df[df > np.abs(df)] = df * 2
np.putmask(values, mask.values, df.values * 2)
expected = DataFrame(values, index=df_orig.index, columns=df_orig.columns)
tm.assert_frame_equal(df, expected)
def test_setitem_cast(self, float_frame):
float_frame["D"] = float_frame["D"].astype("i8")
assert float_frame["D"].dtype == np.int64
# #669, should not cast?
# this is now set to int64, which means a replacement of the column to
# the value dtype (and nothing to do with the existing dtype)
float_frame["B"] = 0
assert float_frame["B"].dtype == np.int64
# cast if pass array of course
float_frame["B"] = np.arange(len(float_frame))
assert issubclass(float_frame["B"].dtype.type, np.integer)
float_frame["foo"] = "bar"
float_frame["foo"] = 0
assert float_frame["foo"].dtype == np.int64
float_frame["foo"] = "bar"
float_frame["foo"] = 2.5
assert float_frame["foo"].dtype == np.float64
float_frame["something"] = 0
assert float_frame["something"].dtype == np.int64
float_frame["something"] = 2
assert float_frame["something"].dtype == np.int64
float_frame["something"] = 2.5
assert float_frame["something"].dtype == np.float64
def test_setitem_corner(self, float_frame, using_infer_string):
# corner case
df = DataFrame({"B": [1.0, 2.0, 3.0], "C": ["a", "b", "c"]}, index=np.arange(3))
del df["B"]
df["B"] = [1.0, 2.0, 3.0]
assert "B" in df
assert len(df.columns) == 2
df["A"] = "beginning"
df["E"] = "foo"
df["D"] = "bar"
df[datetime.now()] = "date"
df[datetime.now()] = 5.0
# what to do when empty frame with index
dm = DataFrame(index=float_frame.index)
dm["A"] = "foo"
dm["B"] = "bar"
assert len(dm.columns) == 2
assert dm.values.dtype == np.object_
# upcast
dm["C"] = 1
assert dm["C"].dtype == np.int64
dm["E"] = 1.0
assert dm["E"].dtype == np.float64
# set existing column
dm["A"] = "bar"
assert "bar" == dm["A"].iloc[0]
dm = DataFrame(index=np.arange(3))
dm["A"] = 1
dm["foo"] = "bar"
del dm["foo"]
dm["foo"] = "bar"
if using_infer_string:
assert dm["foo"].dtype == "string"
else:
assert dm["foo"].dtype == np.object_
dm["coercible"] = ["1", "2", "3"]
if using_infer_string:
assert dm["coercible"].dtype == "string"
else:
assert dm["coercible"].dtype == np.object_
def test_setitem_corner2(self):
data = {
"title": ["foobar", "bar", "foobar"] + ["foobar"] * 17,
"cruft": np.random.default_rng(2).random(20),
}
df = DataFrame(data)
ix = df[df["title"] == "bar"].index
df.loc[ix, ["title"]] = "foobar"
df.loc[ix, ["cruft"]] = 0
assert df.loc[1, "title"] == "foobar"
assert df.loc[1, "cruft"] == 0
def test_setitem_ambig(self, using_infer_string):
# Difficulties with mixed-type data
# Created as float type
dm = DataFrame(index=range(3), columns=range(3))
coercable_series = Series([Decimal(1) for _ in range(3)], index=range(3))
uncoercable_series = Series(["foo", "bzr", "baz"], index=range(3))
dm[0] = np.ones(3)
assert len(dm.columns) == 3
dm[1] = coercable_series
assert len(dm.columns) == 3
dm[2] = uncoercable_series
assert len(dm.columns) == 3
if using_infer_string:
assert dm[2].dtype == "string"
else:
assert dm[2].dtype == np.object_
def test_setitem_None(self, float_frame, using_infer_string):
# GH #766
float_frame[None] = float_frame["A"]
key = None if not using_infer_string else np.nan
tm.assert_series_equal(
float_frame.iloc[:, -1], float_frame["A"], check_names=False
)
tm.assert_series_equal(
float_frame.loc[:, key], float_frame["A"], check_names=False
)
tm.assert_series_equal(float_frame[key], float_frame["A"], check_names=False)
def test_loc_setitem_boolean_mask_allfalse(self):
# GH 9596
df = DataFrame(
{"a": ["1", "2", "3"], "b": ["11", "22", "33"], "c": ["111", "222", "333"]}
)
result = df.copy()
result.loc[result.b.isna(), "a"] = result.a.copy()
tm.assert_frame_equal(result, df)
def test_getitem_fancy_slice_integers_step(self):
df = DataFrame(np.random.default_rng(2).standard_normal((10, 5)))
# this is OK
df.iloc[:8:2]
df.iloc[:8:2] = np.nan
assert isna(df.iloc[:8:2]).values.all()
def test_getitem_setitem_integer_slice_keyerrors(self):
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 5)), index=range(0, 20, 2)
)
# this is OK
cp = df.copy()
cp.iloc[4:10] = 0
assert (cp.iloc[4:10] == 0).values.all()
# so is this
cp = df.copy()
cp.iloc[3:11] = 0
assert (cp.iloc[3:11] == 0).values.all()
result = df.iloc[2:6]
result2 = df.loc[3:11]
expected = df.reindex([4, 6, 8, 10])
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result2, expected)
# non-monotonic, raise KeyError
df2 = df.iloc[list(range(5)) + list(range(5, 10))[::-1]]
with pytest.raises(KeyError, match=r"^3$"):
df2.loc[3:11]
with pytest.raises(KeyError, match=r"^3$"):
df2.loc[3:11] = 0
@td.skip_array_manager_invalid_test # already covered in test_iloc_col_slice_view
def test_fancy_getitem_slice_mixed(
self, float_frame, float_string_frame, using_copy_on_write, warn_copy_on_write
):
sliced = float_string_frame.iloc[:, -3:]
assert sliced["D"].dtype == np.float64
# get view with single block
# setting it triggers setting with copy
original = float_frame.copy()
sliced = float_frame.iloc[:, -3:]
assert np.shares_memory(sliced["C"]._values, float_frame["C"]._values)
with tm.assert_cow_warning(warn_copy_on_write):
sliced.loc[:, "C"] = 4.0
if not using_copy_on_write:
assert (float_frame["C"] == 4).all()
# with the enforcement of GH#45333 in 2.0, this remains a view
np.shares_memory(sliced["C"]._values, float_frame["C"]._values)
else:
tm.assert_frame_equal(float_frame, original)
def test_getitem_setitem_non_ix_labels(self):
df = DataFrame(range(20), index=date_range("2020-01-01", periods=20))
start, end = df.index[[5, 10]]
result = df.loc[start:end]
result2 = df[start:end]
expected = df[5:11]
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result2, expected)
result = df.copy()
result.loc[start:end] = 0
result2 = df.copy()
result2[start:end] = 0
expected = df.copy()
expected[5:11] = 0
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result2, expected)
def test_ix_multi_take(self):
df = DataFrame(np.random.default_rng(2).standard_normal((3, 2)))
rs = df.loc[df.index == 0, :]
xp = df.reindex([0])
tm.assert_frame_equal(rs, xp)
# GH#1321
df = DataFrame(np.random.default_rng(2).standard_normal((3, 2)))
rs = df.loc[df.index == 0, df.columns == 1]
xp = df.reindex(index=[0], columns=[1])
tm.assert_frame_equal(rs, xp)
def test_getitem_fancy_scalar(self, float_frame):
f = float_frame
ix = f.loc
# individual value
for col in f.columns:
ts = f[col]
for idx in f.index[::5]:
assert ix[idx, col] == ts[idx]
@td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite not using .values
def test_setitem_fancy_scalar(self, float_frame):
f = float_frame
expected = float_frame.copy()
ix = f.loc
# individual value
for j, col in enumerate(f.columns):
f[col]
for idx in f.index[::5]:
i = f.index.get_loc(idx)
val = np.random.default_rng(2).standard_normal()
expected.iloc[i, j] = val
ix[idx, col] = val
tm.assert_frame_equal(f, expected)
def test_getitem_fancy_boolean(self, float_frame):
f = float_frame
ix = f.loc
expected = f.reindex(columns=["B", "D"])
result = ix[:, [False, True, False, True]]
tm.assert_frame_equal(result, expected)
expected = f.reindex(index=f.index[5:10], columns=["B", "D"])
result = ix[f.index[5:10], [False, True, False, True]]
tm.assert_frame_equal(result, expected)
boolvec = f.index > f.index[7]
expected = f.reindex(index=f.index[boolvec])
result = ix[boolvec]
tm.assert_frame_equal(result, expected)
result = ix[boolvec, :]
tm.assert_frame_equal(result, expected)
result = ix[boolvec, f.columns[2:]]
expected = f.reindex(index=f.index[boolvec], columns=["C", "D"])
tm.assert_frame_equal(result, expected)
@td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite not using .values
def test_setitem_fancy_boolean(self, float_frame):
# from 2d, set with booleans
frame = float_frame.copy()
expected = float_frame.copy()
values = expected.values.copy()
mask = frame["A"] > 0
frame.loc[mask] = 0.0
values[mask.values] = 0.0
expected = DataFrame(values, index=expected.index, columns=expected.columns)
tm.assert_frame_equal(frame, expected)
frame = float_frame.copy()
expected = float_frame.copy()
values = expected.values.copy()
frame.loc[mask, ["A", "B"]] = 0.0
values[mask.values, :2] = 0.0
expected = DataFrame(values, index=expected.index, columns=expected.columns)
tm.assert_frame_equal(frame, expected)
def test_getitem_fancy_ints(self, float_frame):
result = float_frame.iloc[[1, 4, 7]]
expected = float_frame.loc[float_frame.index[[1, 4, 7]]]
tm.assert_frame_equal(result, expected)
result = float_frame.iloc[:, [2, 0, 1]]
expected = float_frame.loc[:, float_frame.columns[[2, 0, 1]]]
tm.assert_frame_equal(result, expected)
def test_getitem_setitem_boolean_misaligned(self, float_frame):
# boolean index misaligned labels
mask = float_frame["A"][::-1] > 1
result = float_frame.loc[mask]
expected = float_frame.loc[mask[::-1]]
tm.assert_frame_equal(result, expected)
cp = float_frame.copy()
expected = float_frame.copy()
cp.loc[mask] = 0
expected.loc[mask] = 0
tm.assert_frame_equal(cp, expected)
def test_getitem_setitem_boolean_multi(self):
df = DataFrame(np.random.default_rng(2).standard_normal((3, 2)))
# get
k1 = np.array([True, False, True])
k2 = np.array([False, True])
result = df.loc[k1, k2]
expected = df.loc[[0, 2], [1]]
tm.assert_frame_equal(result, expected)
expected = df.copy()
df.loc[np.array([True, False, True]), np.array([False, True])] = 5
expected.loc[[0, 2], [1]] = 5
tm.assert_frame_equal(df, expected)
def test_getitem_setitem_float_labels(self, using_array_manager):
index = Index([1.5, 2, 3, 4, 5])
df = DataFrame(np.random.default_rng(2).standard_normal((5, 5)), index=index)
result = df.loc[1.5:4]
expected = df.reindex([1.5, 2, 3, 4])
tm.assert_frame_equal(result, expected)
assert len(result) == 4
result = df.loc[4:5]
expected = df.reindex([4, 5]) # reindex with int
tm.assert_frame_equal(result, expected, check_index_type=False)
assert len(result) == 2
result = df.loc[4:5]
expected = df.reindex([4.0, 5.0]) # reindex with float
tm.assert_frame_equal(result, expected)
assert len(result) == 2
# loc_float changes this to work properly
result = df.loc[1:2]
expected = df.iloc[0:2]
tm.assert_frame_equal(result, expected)
expected = df.iloc[0:2]
msg = r"The behavior of obj\[i:j\] with a float-dtype index"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df[1:2]
tm.assert_frame_equal(result, expected)
# #2727
index = Index([1.0, 2.5, 3.5, 4.5, 5.0])
df = DataFrame(np.random.default_rng(2).standard_normal((5, 5)), index=index)
# positional slicing only via iloc!
msg = (
"cannot do positional indexing on Index with "
r"these indexers \[1.0\] of type float"
)
with pytest.raises(TypeError, match=msg):
df.iloc[1.0:5]
result = df.iloc[4:5]
expected = df.reindex([5.0])
tm.assert_frame_equal(result, expected)
assert len(result) == 1
cp = df.copy()
with pytest.raises(TypeError, match=_slice_msg):
cp.iloc[1.0:5] = 0
with pytest.raises(TypeError, match=msg):
result = cp.iloc[1.0:5] == 0
assert result.values.all()
assert (cp.iloc[0:1] == df.iloc[0:1]).values.all()
cp = df.copy()
cp.iloc[4:5] = 0
assert (cp.iloc[4:5] == 0).values.all()
assert (cp.iloc[0:4] == df.iloc[0:4]).values.all()
# float slicing
result = df.loc[1.0:5]
expected = df
tm.assert_frame_equal(result, expected)
assert len(result) == 5
result = df.loc[1.1:5]
expected = df.reindex([2.5, 3.5, 4.5, 5.0])
tm.assert_frame_equal(result, expected)
assert len(result) == 4
result = df.loc[4.51:5]
expected = df.reindex([5.0])
tm.assert_frame_equal(result, expected)
assert len(result) == 1
result = df.loc[1.0:5.0]
expected = df.reindex([1.0, 2.5, 3.5, 4.5, 5.0])
tm.assert_frame_equal(result, expected)
assert len(result) == 5
cp = df.copy()
cp.loc[1.0:5.0] = 0
result = cp.loc[1.0:5.0]
assert (result == 0).values.all()
def test_setitem_single_column_mixed_datetime(self):
df = DataFrame(
np.random.default_rng(2).standard_normal((5, 3)),
index=["a", "b", "c", "d", "e"],
columns=["foo", "bar", "baz"],
)
df["timestamp"] = Timestamp("20010102")
# check our dtypes
result = df.dtypes
expected = Series(
[np.dtype("float64")] * 3 + [np.dtype("datetime64[s]")],
index=["foo", "bar", "baz", "timestamp"],
)
tm.assert_series_equal(result, expected)
# GH#16674 iNaT is treated as an integer when given by the user
with tm.assert_produces_warning(
FutureWarning, match="Setting an item of incompatible dtype"
):
df.loc["b", "timestamp"] = iNaT
assert not isna(df.loc["b", "timestamp"])
assert df["timestamp"].dtype == np.object_
assert df.loc["b", "timestamp"] == iNaT
# allow this syntax (as of GH#3216)
df.loc["c", "timestamp"] = np.nan
assert isna(df.loc["c", "timestamp"])
# allow this syntax
df.loc["d", :] = np.nan
assert not isna(df.loc["c", :]).all()
def test_setitem_mixed_datetime(self):
# GH 9336
expected = DataFrame(
{
"a": [0, 0, 0, 0, 13, 14],
"b": [
datetime(2012, 1, 1),
1,
"x",
"y",
datetime(2013, 1, 1),
datetime(2014, 1, 1),
],
}
)
df = DataFrame(0, columns=list("ab"), index=range(6))
df["b"] = pd.NaT
df.loc[0, "b"] = datetime(2012, 1, 1)
with tm.assert_produces_warning(
FutureWarning, match="Setting an item of incompatible dtype"
):
df.loc[1, "b"] = 1
df.loc[[2, 3], "b"] = "x", "y"
A = np.array(
[
[13, np.datetime64("2013-01-01T00:00:00")],
[14, np.datetime64("2014-01-01T00:00:00")],
]
)
df.loc[[4, 5], ["a", "b"]] = A
tm.assert_frame_equal(df, expected)
def test_setitem_frame_float(self, float_frame):
piece = float_frame.loc[float_frame.index[:2], ["A", "B"]]
float_frame.loc[float_frame.index[-2] :, ["A", "B"]] = piece.values
result = float_frame.loc[float_frame.index[-2:], ["A", "B"]].values
expected = piece.values
tm.assert_almost_equal(result, expected)
def test_setitem_frame_mixed(self, float_string_frame):
# GH 3216
# already aligned
f = float_string_frame.copy()
piece = DataFrame(
[[1.0, 2.0], [3.0, 4.0]], index=f.index[0:2], columns=["A", "B"]
)
key = (f.index[slice(None, 2)], ["A", "B"])
f.loc[key] = piece
tm.assert_almost_equal(f.loc[f.index[0:2], ["A", "B"]].values, piece.values)
def test_setitem_frame_mixed_rows_unaligned(self, float_string_frame):
# GH#3216 rows unaligned
f = float_string_frame.copy()
piece = DataFrame(
[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]],
index=list(f.index[0:2]) + ["foo", "bar"],
columns=["A", "B"],
)
key = (f.index[slice(None, 2)], ["A", "B"])
f.loc[key] = piece
tm.assert_almost_equal(
f.loc[f.index[0:2:], ["A", "B"]].values, piece.values[0:2]
)
def test_setitem_frame_mixed_key_unaligned(self, float_string_frame):
# GH#3216 key is unaligned with values
f = float_string_frame.copy()
piece = f.loc[f.index[:2], ["A"]]
piece.index = f.index[-2:]
key = (f.index[slice(-2, None)], ["A", "B"])
f.loc[key] = piece
piece["B"] = np.nan
tm.assert_almost_equal(f.loc[f.index[-2:], ["A", "B"]].values, piece.values)
def test_setitem_frame_mixed_ndarray(self, float_string_frame):
# GH#3216 ndarray
f = float_string_frame.copy()
piece = float_string_frame.loc[f.index[:2], ["A", "B"]]
key = (f.index[slice(-2, None)], ["A", "B"])
f.loc[key] = piece.values
tm.assert_almost_equal(f.loc[f.index[-2:], ["A", "B"]].values, piece.values)
def test_setitem_frame_upcast(self):
# needs upcasting
df = DataFrame([[1, 2, "foo"], [3, 4, "bar"]], columns=["A", "B", "C"])
df2 = df.copy()
with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
df2.loc[:, ["A", "B"]] = df.loc[:, ["A", "B"]] + 0.5
expected = df.reindex(columns=["A", "B"])
expected += 0.5
expected["C"] = df["C"]
tm.assert_frame_equal(df2, expected)
def test_setitem_frame_align(self, float_frame):
piece = float_frame.loc[float_frame.index[:2], ["A", "B"]]
piece.index = float_frame.index[-2:]
piece.columns = ["A", "B"]
float_frame.loc[float_frame.index[-2:], ["A", "B"]] = piece
result = float_frame.loc[float_frame.index[-2:], ["A", "B"]].values
expected = piece.values
tm.assert_almost_equal(result, expected)
def test_getitem_setitem_ix_duplicates(self):
# #1201
df = DataFrame(
np.random.default_rng(2).standard_normal((5, 3)),
index=["foo", "foo", "bar", "baz", "bar"],
)
result = df.loc["foo"]
expected = df[:2]
tm.assert_frame_equal(result, expected)
result = df.loc["bar"]
expected = df.iloc[[2, 4]]
tm.assert_frame_equal(result, expected)
result = df.loc["baz"]
expected = df.iloc[3]
tm.assert_series_equal(result, expected)
def test_getitem_ix_boolean_duplicates_multiple(self):
# #1201
df = DataFrame(
np.random.default_rng(2).standard_normal((5, 3)),
index=["foo", "foo", "bar", "baz", "bar"],
)
result = df.loc[["bar"]]
exp = df.iloc[[2, 4]]
tm.assert_frame_equal(result, exp)
result = df.loc[df[1] > 0]
exp = df[df[1] > 0]
tm.assert_frame_equal(result, exp)
result = df.loc[df[0] > 0]
exp = df[df[0] > 0]
tm.assert_frame_equal(result, exp)
@pytest.mark.parametrize("bool_value", [True, False])
def test_getitem_setitem_ix_bool_keyerror(self, bool_value):
# #2199
df = DataFrame({"a": [1, 2, 3]})
message = f"{bool_value}: boolean label can not be used without a boolean index"
with pytest.raises(KeyError, match=message):
df.loc[bool_value]
msg = "cannot use a single bool to index into setitem"
with pytest.raises(KeyError, match=msg):
df.loc[bool_value] = 0
# TODO: rename? remove?
def test_single_element_ix_dont_upcast(self, float_frame):
float_frame["E"] = 1
assert issubclass(float_frame["E"].dtype.type, (int, np.integer))
result = float_frame.loc[float_frame.index[5], "E"]
assert is_integer(result)
# GH 11617
df = DataFrame({"a": [1.23]})
df["b"] = 666
result = df.loc[0, "b"]
assert is_integer(result)
expected = Series([666], [0], name="b")
result = df.loc[[0], "b"]
tm.assert_series_equal(result, expected)
def test_iloc_callable_tuple_return_value(self):
# GH53769
df = DataFrame(np.arange(40).reshape(10, 4), index=range(0, 20, 2))
msg = "callable with iloc"
with tm.assert_produces_warning(FutureWarning, match=msg):
df.iloc[lambda _: (0,)]
with tm.assert_produces_warning(FutureWarning, match=msg):
df.iloc[lambda _: (0,)] = 1
def test_iloc_row(self):
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)), index=range(0, 20, 2)
)
result = df.iloc[1]
exp = df.loc[2]
tm.assert_series_equal(result, exp)
result = df.iloc[2]
exp = df.loc[4]
tm.assert_series_equal(result, exp)
# slice
result = df.iloc[slice(4, 8)]
expected = df.loc[8:14]
tm.assert_frame_equal(result, expected)
# list of integers
result = df.iloc[[1, 2, 4, 6]]
expected = df.reindex(df.index[[1, 2, 4, 6]])
tm.assert_frame_equal(result, expected)
def test_iloc_row_slice_view(self, using_copy_on_write, warn_copy_on_write):
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)), index=range(0, 20, 2)
)
original = df.copy()
# verify slice is view
# setting it makes it raise/warn
subset = df.iloc[slice(4, 8)]
assert np.shares_memory(df[2], subset[2])
exp_col = original[2].copy()
with tm.assert_cow_warning(warn_copy_on_write):
subset.loc[:, 2] = 0.0
if not using_copy_on_write:
exp_col._values[4:8] = 0.0
# With the enforcement of GH#45333 in 2.0, this remains a view
assert np.shares_memory(df[2], subset[2])
tm.assert_series_equal(df[2], exp_col)
def test_iloc_col(self):
df = DataFrame(
np.random.default_rng(2).standard_normal((4, 10)), columns=range(0, 20, 2)
)
result = df.iloc[:, 1]
exp = df.loc[:, 2]
tm.assert_series_equal(result, exp)
result = df.iloc[:, 2]
exp = df.loc[:, 4]
tm.assert_series_equal(result, exp)
# slice
result = df.iloc[:, slice(4, 8)]
expected = df.loc[:, 8:14]
tm.assert_frame_equal(result, expected)
# list of integers
result = df.iloc[:, [1, 2, 4, 6]]
expected = df.reindex(columns=df.columns[[1, 2, 4, 6]])
tm.assert_frame_equal(result, expected)
def test_iloc_col_slice_view(
self, using_array_manager, using_copy_on_write, warn_copy_on_write
):
df = DataFrame(
np.random.default_rng(2).standard_normal((4, 10)), columns=range(0, 20, 2)
)
original = df.copy()
subset = df.iloc[:, slice(4, 8)]
if not using_array_manager and not using_copy_on_write:
# verify slice is view
assert np.shares_memory(df[8]._values, subset[8]._values)
with tm.assert_cow_warning(warn_copy_on_write):
subset.loc[:, 8] = 0.0
assert (df[8] == 0).all()
# with the enforcement of GH#45333 in 2.0, this remains a view
assert np.shares_memory(df[8]._values, subset[8]._values)
else:
if using_copy_on_write:
# verify slice is view
assert np.shares_memory(df[8]._values, subset[8]._values)
subset[8] = 0.0
# subset changed
assert (subset[8] == 0).all()
# but df itself did not change (setitem replaces full column)
tm.assert_frame_equal(df, original)
def test_loc_duplicates(self):
# gh-17105
# insert a duplicate element to the index
trange = date_range(
start=Timestamp(year=2017, month=1, day=1),
end=Timestamp(year=2017, month=1, day=5),
)
trange = trange.insert(loc=5, item=Timestamp(year=2017, month=1, day=5))
df = DataFrame(0, index=trange, columns=["A", "B"])
bool_idx = np.array([False, False, False, False, False, True])
# assignment
df.loc[trange[bool_idx], "A"] = 6
expected = DataFrame(
{"A": [0, 0, 0, 0, 6, 6], "B": [0, 0, 0, 0, 0, 0]}, index=trange
)
tm.assert_frame_equal(df, expected)
# in-place
df = DataFrame(0, index=trange, columns=["A", "B"])
df.loc[trange[bool_idx], "A"] += 6
tm.assert_frame_equal(df, expected)
def test_setitem_with_unaligned_tz_aware_datetime_column(self):
# GH 12981
# Assignment of unaligned offset-aware datetime series.
# Make sure timezone isn't lost
column = Series(date_range("2015-01-01", periods=3, tz="utc"), name="dates")
df = DataFrame({"dates": column})
df["dates"] = column[[1, 0, 2]]
tm.assert_series_equal(df["dates"], column)
df = DataFrame({"dates": column})
df.loc[[0, 1, 2], "dates"] = column[[1, 0, 2]]
tm.assert_series_equal(df["dates"], column)
def test_loc_setitem_datetimelike_with_inference(self):
# GH 7592
# assignment of timedeltas with NaT
one_hour = timedelta(hours=1)
df = DataFrame(index=date_range("20130101", periods=4))
df["A"] = np.array([1 * one_hour] * 4, dtype="m8[ns]")
df.loc[:, "B"] = np.array([2 * one_hour] * 4, dtype="m8[ns]")
df.loc[df.index[:3], "C"] = np.array([3 * one_hour] * 3, dtype="m8[ns]")
df.loc[:, "D"] = np.array([4 * one_hour] * 4, dtype="m8[ns]")
df.loc[df.index[:3], "E"] = np.array([5 * one_hour] * 3, dtype="m8[ns]")
df["F"] = np.timedelta64("NaT")
df.loc[df.index[:-1], "F"] = np.array([6 * one_hour] * 3, dtype="m8[ns]")
df.loc[df.index[-3] :, "G"] = date_range("20130101", periods=3)
df["H"] = np.datetime64("NaT")
result = df.dtypes
expected = Series(
[np.dtype("timedelta64[ns]")] * 6 + [np.dtype("datetime64[ns]")] * 2,
index=list("ABCDEFGH"),
)
tm.assert_series_equal(result, expected)
def test_getitem_boolean_indexing_mixed(self):
df = DataFrame(
{
0: {35: np.nan, 40: np.nan, 43: np.nan, 49: np.nan, 50: np.nan},
1: {
35: np.nan,
40: 0.32632316859446198,
43: np.nan,
49: 0.32632316859446198,
50: 0.39114724480578139,
},
2: {
35: np.nan,
40: np.nan,
43: 0.29012581014105987,
49: np.nan,
50: np.nan,
},
3: {35: np.nan, 40: np.nan, 43: np.nan, 49: np.nan, 50: np.nan},
4: {
35: 0.34215328467153283,
40: np.nan,
43: np.nan,
49: np.nan,
50: np.nan,
},
"y": {35: 0, 40: 0, 43: 0, 49: 0, 50: 1},
}
)
# mixed int/float ok
df2 = df.copy()
df2[df2 > 0.3] = 1
expected = df.copy()
expected.loc[40, 1] = 1
expected.loc[49, 1] = 1
expected.loc[50, 1] = 1
expected.loc[35, 4] = 1
tm.assert_frame_equal(df2, expected)
df["foo"] = "test"
msg = "not supported between instances|unorderable types"
with pytest.raises(TypeError, match=msg):
df[df > 0.3] = 1
def test_type_error_multiindex(self):
# See gh-12218
mi = MultiIndex.from_product([["x", "y"], [0, 1]], names=[None, "c"])
dg = DataFrame(
[[1, 1, 2, 2], [3, 3, 4, 4]], columns=mi, index=Index([0, 1], name="i")
)
with pytest.raises(InvalidIndexError, match="slice"):
dg[:, 0]
index = Index(range(2), name="i")
columns = MultiIndex(
levels=[["x", "y"], [0, 1]], codes=[[0, 1], [0, 0]], names=[None, "c"]
)
expected = DataFrame([[1, 2], [3, 4]], columns=columns, index=index)
result = dg.loc[:, (slice(None), 0)]
tm.assert_frame_equal(result, expected)
name = ("x", 0)
index = Index(range(2), name="i")
expected = Series([1, 3], index=index, name=name)
result = dg["x", 0]
tm.assert_series_equal(result, expected)
def test_getitem_interval_index_partial_indexing(self):
# GH#36490
df = DataFrame(
np.ones((3, 4)), columns=pd.IntervalIndex.from_breaks(np.arange(5))
)
expected = df.iloc[:, 0]
res = df[0.5]
tm.assert_series_equal(res, expected)
res = df.loc[:, 0.5]
tm.assert_series_equal(res, expected)
def test_setitem_array_as_cell_value(self):
# GH#43422
df = DataFrame(columns=["a", "b"], dtype=object)
df.loc[0] = {"a": np.zeros((2,)), "b": np.zeros((2, 2))}
expected = DataFrame({"a": [np.zeros((2,))], "b": [np.zeros((2, 2))]})
tm.assert_frame_equal(df, expected)
def test_iloc_setitem_nullable_2d_values(self):
df = DataFrame({"A": [1, 2, 3]}, dtype="Int64")
orig = df.copy()
df.loc[:] = df.values[:, ::-1]
tm.assert_frame_equal(df, orig)
df.loc[:] = pd.core.arrays.NumpyExtensionArray(df.values[:, ::-1])
tm.assert_frame_equal(df, orig)
df.iloc[:] = df.iloc[:, :].copy()
tm.assert_frame_equal(df, orig)
def test_getitem_segfault_with_empty_like_object(self):
# GH#46848
df = DataFrame(np.empty((1, 1), dtype=object))
df[0] = np.empty_like(df[0])
# this produces the segfault
df[[0]]
@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning")
@pytest.mark.parametrize(
"null", [pd.NaT, pd.NaT.to_numpy("M8[ns]"), pd.NaT.to_numpy("m8[ns]")]
)
def test_setting_mismatched_na_into_nullable_fails(
self, null, any_numeric_ea_dtype
):
# GH#44514 don't cast mismatched nulls to pd.NA
df = DataFrame({"A": [1, 2, 3]}, dtype=any_numeric_ea_dtype)
ser = df["A"].copy()
arr = ser._values
msg = "|".join(
[
r"timedelta64\[ns\] cannot be converted to (Floating|Integer)Dtype",
r"datetime64\[ns\] cannot be converted to (Floating|Integer)Dtype",
"'values' contains non-numeric NA",
r"Invalid value '.*' for dtype (U?Int|Float)\d{1,2}",
]
)
with pytest.raises(TypeError, match=msg):
arr[0] = null
with pytest.raises(TypeError, match=msg):
arr[:2] = [null, null]
with pytest.raises(TypeError, match=msg):
ser[0] = null
with pytest.raises(TypeError, match=msg):
ser[:2] = [null, null]
with pytest.raises(TypeError, match=msg):
ser.iloc[0] = null
with pytest.raises(TypeError, match=msg):
ser.iloc[:2] = [null, null]
with pytest.raises(TypeError, match=msg):
df.iloc[0, 0] = null
with pytest.raises(TypeError, match=msg):
df.iloc[:2, 0] = [null, null]
# Multi-Block
df2 = df.copy()
df2["B"] = ser.copy()
with pytest.raises(TypeError, match=msg):
df2.iloc[0, 0] = null
with pytest.raises(TypeError, match=msg):
df2.iloc[:2, 0] = [null, null]
def test_loc_expand_empty_frame_keep_index_name(self):
# GH#45621
df = DataFrame(columns=["b"], index=Index([], name="a"))
df.loc[0] = 1
expected = DataFrame({"b": [1]}, index=Index([0], name="a"))
tm.assert_frame_equal(df, expected)
def test_loc_expand_empty_frame_keep_midx_names(self):
# GH#46317
df = DataFrame(
columns=["d"], index=MultiIndex.from_tuples([], names=["a", "b", "c"])
)
df.loc[(1, 2, 3)] = "foo"
expected = DataFrame(
{"d": ["foo"]},
index=MultiIndex.from_tuples([(1, 2, 3)], names=["a", "b", "c"]),
)
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize(
"val, idxr",
[
("x", "a"),
("x", ["a"]),
(1, "a"),
(1, ["a"]),
],
)
def test_loc_setitem_rhs_frame(self, idxr, val):
# GH#47578
df = DataFrame({"a": [1, 2]})
with tm.assert_produces_warning(
FutureWarning, match="Setting an item of incompatible dtype"
):
df.loc[:, idxr] = DataFrame({"a": [val, 11]}, index=[1, 2])
expected = DataFrame({"a": [np.nan, val]})
tm.assert_frame_equal(df, expected)
@td.skip_array_manager_invalid_test
def test_iloc_setitem_enlarge_no_warning(self, warn_copy_on_write):
# GH#47381
df = DataFrame(columns=["a", "b"])
expected = df.copy()
view = df[:]
df.iloc[:, 0] = np.array([1, 2], dtype=np.float64)
tm.assert_frame_equal(view, expected)
def test_loc_internals_not_updated_correctly(self):
# GH#47867 all steps are necessary to reproduce the initial bug
df = DataFrame(
{"bool_col": True, "a": 1, "b": 2.5},
index=MultiIndex.from_arrays([[1, 2], [1, 2]], names=["idx1", "idx2"]),
)
idx = [(1, 1)]
df["c"] = 3
df.loc[idx, "c"] = 0
df.loc[idx, "c"]
df.loc[idx, ["a", "b"]]
df.loc[idx, "c"] = 15
result = df.loc[idx, "c"]
expected = df = Series(
15,
index=MultiIndex.from_arrays([[1], [1]], names=["idx1", "idx2"]),
name="c",
)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("val", [None, [None], pd.NA, [pd.NA]])
def test_iloc_setitem_string_list_na(self, val):
# GH#45469
df = DataFrame({"a": ["a", "b", "c"]}, dtype="string")
df.iloc[[0], :] = val
expected = DataFrame({"a": [pd.NA, "b", "c"]}, dtype="string")
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize("val", [None, pd.NA])
def test_iloc_setitem_string_na(self, val):
# GH#45469
df = DataFrame({"a": ["a", "b", "c"]}, dtype="string")
df.iloc[0, :] = val
expected = DataFrame({"a": [pd.NA, "b", "c"]}, dtype="string")
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize("func", [list, Series, np.array])
def test_iloc_setitem_ea_null_slice_length_one_list(self, func):
# GH#48016
df = DataFrame({"a": [1, 2, 3]}, dtype="Int64")
df.iloc[:, func([0])] = 5
expected = DataFrame({"a": [5, 5, 5]}, dtype="Int64")
tm.assert_frame_equal(df, expected)
def test_loc_named_tuple_for_midx(self):
# GH#48124
df = DataFrame(
index=MultiIndex.from_product(
[["A", "B"], ["a", "b", "c"]], names=["first", "second"]
)
)
indexer_tuple = namedtuple("Indexer", df.index.names)
idxr = indexer_tuple(first="A", second=["a", "b"])
result = df.loc[idxr, :]
expected = DataFrame(
index=MultiIndex.from_tuples(
[("A", "a"), ("A", "b")], names=["first", "second"]
)
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("indexer", [["a"], "a"])
@pytest.mark.parametrize("col", [{}, {"b": 1}])
def test_set_2d_casting_date_to_int(self, col, indexer):
# GH#49159
df = DataFrame(
{"a": [Timestamp("2022-12-29"), Timestamp("2022-12-30")], **col},
)
df.loc[[1], indexer] = df["a"] + pd.Timedelta(days=1)
expected = DataFrame(
{"a": [Timestamp("2022-12-29"), Timestamp("2022-12-31")], **col},
)
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize("col", [{}, {"name": "a"}])
def test_loc_setitem_reordering_with_all_true_indexer(self, col):
# GH#48701
n = 17
df = DataFrame({**col, "x": range(n), "y": range(n)})
expected = df.copy()
df.loc[n * [True], ["x", "y"]] = df[["x", "y"]]
tm.assert_frame_equal(df, expected)
def test_loc_rhs_empty_warning(self):
# GH48480
df = DataFrame(columns=["a", "b"])
expected = df.copy()
rhs = DataFrame(columns=["a"])
with tm.assert_produces_warning(None):
df.loc[:, "a"] = rhs
tm.assert_frame_equal(df, expected)
def test_iloc_ea_series_indexer(self):
# GH#49521
df = DataFrame([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]])
indexer = Series([0, 1], dtype="Int64")
row_indexer = Series([1], dtype="Int64")
result = df.iloc[row_indexer, indexer]
expected = DataFrame([[5, 6]], index=[1])
tm.assert_frame_equal(result, expected)
result = df.iloc[row_indexer.values, indexer.values]
tm.assert_frame_equal(result, expected)
def test_iloc_ea_series_indexer_with_na(self):
# GH#49521
df = DataFrame([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]])
indexer = Series([0, pd.NA], dtype="Int64")
msg = "cannot convert"
with pytest.raises(ValueError, match=msg):
df.iloc[:, indexer]
with pytest.raises(ValueError, match=msg):
df.iloc[:, indexer.values]
@pytest.mark.parametrize("indexer", [True, (True,)])
@pytest.mark.parametrize("dtype", [bool, "boolean"])
def test_loc_bool_multiindex(self, dtype, indexer):
# GH#47687
midx = MultiIndex.from_arrays(
[
Series([True, True, False, False], dtype=dtype),
Series([True, False, True, False], dtype=dtype),
],
names=["a", "b"],
)
df = DataFrame({"c": [1, 2, 3, 4]}, index=midx)
with tm.maybe_produces_warning(PerformanceWarning, isinstance(indexer, tuple)):
result = df.loc[indexer]
expected = DataFrame(
{"c": [1, 2]}, index=Index([True, False], name="b", dtype=dtype)
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("utc", [False, True])
@pytest.mark.parametrize("indexer", ["date", ["date"]])
def test_loc_datetime_assignment_dtype_does_not_change(self, utc, indexer):
# GH#49837
df = DataFrame(
{
"date": to_datetime(
[datetime(2022, 1, 20), datetime(2022, 1, 22)], utc=utc
),
"update": [True, False],
}
)
expected = df.copy(deep=True)
update_df = df[df["update"]]
df.loc[df["update"], indexer] = update_df["date"]
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize("indexer, idx", [(tm.loc, 1), (tm.iloc, 2)])
def test_setitem_value_coercing_dtypes(self, indexer, idx):
# GH#50467
df = DataFrame([["1", np.nan], ["2", np.nan], ["3", np.nan]], dtype=object)
rhs = DataFrame([[1, np.nan], [2, np.nan]])
indexer(df)[:idx, :] = rhs
expected = DataFrame([[1, np.nan], [2, np.nan], ["3", np.nan]], dtype=object)
tm.assert_frame_equal(df, expected)
class TestDataFrameIndexingUInt64:
def test_setitem(self):
df = DataFrame(
{"A": np.arange(3), "B": [2**63, 2**63 + 5, 2**63 + 10]},
dtype=np.uint64,
)
idx = df["A"].rename("foo")
# setitem
assert "C" not in df.columns
df["C"] = idx
tm.assert_series_equal(df["C"], Series(idx, name="C"))
assert "D" not in df.columns
df["D"] = "foo"
df["D"] = idx
tm.assert_series_equal(df["D"], Series(idx, name="D"))
del df["D"]
# With NaN: because uint64 has no NaN element,
# the column should be cast to object.
df2 = df.copy()
with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
df2.iloc[1, 1] = pd.NaT
df2.iloc[1, 2] = pd.NaT
result = df2["B"]
tm.assert_series_equal(notna(result), Series([True, False, True], name="B"))
tm.assert_series_equal(
df2.dtypes,
Series(
[np.dtype("uint64"), np.dtype("O"), np.dtype("O")],
index=["A", "B", "C"],
),
)
def test_object_casting_indexing_wraps_datetimelike(using_array_manager):
# GH#31649, check the indexing methods all the way down the stack
df = DataFrame(
{
"A": [1, 2],
"B": date_range("2000", periods=2),
"C": pd.timedelta_range("1 Day", periods=2),
}
)
ser = df.loc[0]
assert isinstance(ser.values[1], Timestamp)
assert isinstance(ser.values[2], pd.Timedelta)
ser = df.iloc[0]
assert isinstance(ser.values[1], Timestamp)
assert isinstance(ser.values[2], pd.Timedelta)
ser = df.xs(0, axis=0)
assert isinstance(ser.values[1], Timestamp)
assert isinstance(ser.values[2], pd.Timedelta)
if using_array_manager:
# remainder of the test checking BlockManager internals
return
mgr = df._mgr
mgr._rebuild_blknos_and_blklocs()
arr = mgr.fast_xs(0).array
assert isinstance(arr[1], Timestamp)
assert isinstance(arr[2], pd.Timedelta)
blk = mgr.blocks[mgr.blknos[1]]
assert blk.dtype == "M8[ns]" # we got the right block
val = blk.iget((0, 0))
assert isinstance(val, Timestamp)
blk = mgr.blocks[mgr.blknos[2]]
assert blk.dtype == "m8[ns]" # we got the right block
val = blk.iget((0, 0))
assert isinstance(val, pd.Timedelta)
msg1 = r"Cannot setitem on a Categorical with a new category( \(.*\))?, set the"
msg2 = "Cannot set a Categorical with another, without identical categories"
class TestLocILocDataFrameCategorical:
@pytest.fixture
def orig(self):
cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"])
idx = Index(["h", "i", "j", "k", "l", "m", "n"])
values = [1, 1, 1, 1, 1, 1, 1]
orig = DataFrame({"cats": cats, "values": values}, index=idx)
return orig
@pytest.fixture
def exp_single_row(self):
# The expected values if we change a single row
cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"])
idx1 = Index(["h", "i", "j", "k", "l", "m", "n"])
values1 = [1, 1, 2, 1, 1, 1, 1]
exp_single_row = DataFrame({"cats": cats1, "values": values1}, index=idx1)
return exp_single_row
@pytest.fixture
def exp_multi_row(self):
# assign multiple rows (mixed values) (-> array) -> exp_multi_row
# changed multiple rows
cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"])
idx2 = Index(["h", "i", "j", "k", "l", "m", "n"])
values2 = [1, 1, 2, 2, 1, 1, 1]
exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2)
return exp_multi_row
@pytest.fixture
def exp_parts_cats_col(self):
# changed part of the cats column
cats3 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"])
idx3 = Index(["h", "i", "j", "k", "l", "m", "n"])
values3 = [1, 1, 1, 1, 1, 1, 1]
exp_parts_cats_col = DataFrame({"cats": cats3, "values": values3}, index=idx3)
return exp_parts_cats_col
@pytest.fixture
def exp_single_cats_value(self):
# changed single value in cats col
cats4 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"])
idx4 = Index(["h", "i", "j", "k", "l", "m", "n"])
values4 = [1, 1, 1, 1, 1, 1, 1]
exp_single_cats_value = DataFrame(
{"cats": cats4, "values": values4}, index=idx4
)
return exp_single_cats_value
@pytest.mark.parametrize("indexer", [tm.loc, tm.iloc])
def test_loc_iloc_setitem_list_of_lists(self, orig, exp_multi_row, indexer):
# - assign multiple rows (mixed values) -> exp_multi_row
df = orig.copy()
key = slice(2, 4)
if indexer is tm.loc:
key = slice("j", "k")
indexer(df)[key, :] = [["b", 2], ["b", 2]]
tm.assert_frame_equal(df, exp_multi_row)
df = orig.copy()
with pytest.raises(TypeError, match=msg1):
indexer(df)[key, :] = [["c", 2], ["c", 2]]
@pytest.mark.parametrize("indexer", [tm.loc, tm.iloc, tm.at, tm.iat])
def test_loc_iloc_at_iat_setitem_single_value_in_categories(
self, orig, exp_single_cats_value, indexer
):
# - assign a single value -> exp_single_cats_value
df = orig.copy()
key = (2, 0)
if indexer in [tm.loc, tm.at]:
key = (df.index[2], df.columns[0])
# "b" is among the categories for df["cat"}]
indexer(df)[key] = "b"
tm.assert_frame_equal(df, exp_single_cats_value)
# "c" is not among the categories for df["cat"]
with pytest.raises(TypeError, match=msg1):
indexer(df)[key] = "c"
@pytest.mark.parametrize("indexer", [tm.loc, tm.iloc])
def test_loc_iloc_setitem_mask_single_value_in_categories(
self, orig, exp_single_cats_value, indexer
):
# mask with single True
df = orig.copy()
mask = df.index == "j"
key = 0
if indexer is tm.loc:
key = df.columns[key]
indexer(df)[mask, key] = "b"
tm.assert_frame_equal(df, exp_single_cats_value)
@pytest.mark.parametrize("indexer", [tm.loc, tm.iloc])
def test_loc_iloc_setitem_full_row_non_categorical_rhs(
self, orig, exp_single_row, indexer
):
# - assign a complete row (mixed values) -> exp_single_row
df = orig.copy()
key = 2
if indexer is tm.loc:
key = df.index[2]
# not categorical dtype, but "b" _is_ among the categories for df["cat"]
indexer(df)[key, :] = ["b", 2]
tm.assert_frame_equal(df, exp_single_row)
# "c" is not among the categories for df["cat"]
with pytest.raises(TypeError, match=msg1):
indexer(df)[key, :] = ["c", 2]
@pytest.mark.parametrize("indexer", [tm.loc, tm.iloc])
def test_loc_iloc_setitem_partial_col_categorical_rhs(
self, orig, exp_parts_cats_col, indexer
):
# assign a part of a column with dtype == categorical ->
# exp_parts_cats_col
df = orig.copy()
key = (slice(2, 4), 0)
if indexer is tm.loc:
key = (slice("j", "k"), df.columns[0])
# same categories as we currently have in df["cats"]
compat = Categorical(["b", "b"], categories=["a", "b"])
indexer(df)[key] = compat
tm.assert_frame_equal(df, exp_parts_cats_col)
# categories do not match df["cat"]'s, but "b" is among them
semi_compat = Categorical(list("bb"), categories=list("abc"))
with pytest.raises(TypeError, match=msg2):
# different categories but holdable values
# -> not sure if this should fail or pass
indexer(df)[key] = semi_compat
# categories do not match df["cat"]'s, and "c" is not among them
incompat = Categorical(list("cc"), categories=list("abc"))
with pytest.raises(TypeError, match=msg2):
# different values
indexer(df)[key] = incompat
@pytest.mark.parametrize("indexer", [tm.loc, tm.iloc])
def test_loc_iloc_setitem_non_categorical_rhs(
self, orig, exp_parts_cats_col, indexer
):
# assign a part of a column with dtype != categorical -> exp_parts_cats_col
df = orig.copy()
key = (slice(2, 4), 0)
if indexer is tm.loc:
key = (slice("j", "k"), df.columns[0])
# "b" is among the categories for df["cat"]
indexer(df)[key] = ["b", "b"]
tm.assert_frame_equal(df, exp_parts_cats_col)
# "c" not part of the categories
with pytest.raises(TypeError, match=msg1):
indexer(df)[key] = ["c", "c"]
@pytest.mark.parametrize("indexer", [tm.getitem, tm.loc, tm.iloc])
def test_getitem_preserve_object_index_with_dates(self, indexer):
# https://github.com/pandas-dev/pandas/pull/42950 - when selecting a column
# from dataframe, don't try to infer object dtype index on Series construction
idx = date_range("2012", periods=3).astype(object)
df = DataFrame({0: [1, 2, 3]}, index=idx)
assert df.index.dtype == object
if indexer is tm.getitem:
ser = indexer(df)[0]
else:
ser = indexer(df)[:, 0]
assert ser.index.dtype == object
def test_loc_on_multiindex_one_level(self):
# GH#45779
df = DataFrame(
data=[[0], [1]],
index=MultiIndex.from_tuples([("a",), ("b",)], names=["first"]),
)
expected = DataFrame(
data=[[0]], index=MultiIndex.from_tuples([("a",)], names=["first"])
)
result = df.loc["a"]
tm.assert_frame_equal(result, expected)
class TestDeprecatedIndexers:
@pytest.mark.parametrize(
"key", [{1}, {1: 1}, ({1}, "a"), ({1: 1}, "a"), (1, {"a"}), (1, {"a": "a"})]
)
def test_getitem_dict_and_set_deprecated(self, key):
# GH#42825 enforced in 2.0
df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
with pytest.raises(TypeError, match="as an indexer is not supported"):
df.loc[key]
@pytest.mark.parametrize(
"key",
[
{1},
{1: 1},
(({1}, 2), "a"),
(({1: 1}, 2), "a"),
((1, 2), {"a"}),
((1, 2), {"a": "a"}),
],
)
def test_getitem_dict_and_set_deprecated_multiindex(self, key):
# GH#42825 enforced in 2.0
df = DataFrame(
[[1, 2], [3, 4]],
columns=["a", "b"],
index=MultiIndex.from_tuples([(1, 2), (3, 4)]),
)
with pytest.raises(TypeError, match="as an indexer is not supported"):
df.loc[key]
@pytest.mark.parametrize(
"key", [{1}, {1: 1}, ({1}, "a"), ({1: 1}, "a"), (1, {"a"}), (1, {"a": "a"})]
)
def test_setitem_dict_and_set_disallowed(self, key):
# GH#42825 enforced in 2.0
df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
with pytest.raises(TypeError, match="as an indexer is not supported"):
df.loc[key] = 1
@pytest.mark.parametrize(
"key",
[
{1},
{1: 1},
(({1}, 2), "a"),
(({1: 1}, 2), "a"),
((1, 2), {"a"}),
((1, 2), {"a": "a"}),
],
)
def test_setitem_dict_and_set_disallowed_multiindex(self, key):
# GH#42825 enforced in 2.0
df = DataFrame(
[[1, 2], [3, 4]],
columns=["a", "b"],
index=MultiIndex.from_tuples([(1, 2), (3, 4)]),
)
with pytest.raises(TypeError, match="as an indexer is not supported"):
df.loc[key] = 1
def test_adding_new_conditional_column() -> None:
# https://github.com/pandas-dev/pandas/issues/55025
df = DataFrame({"x": [1]})
df.loc[df["x"] == 1, "y"] = "1"
expected = DataFrame({"x": [1], "y": ["1"]})
tm.assert_frame_equal(df, expected)
df = DataFrame({"x": [1]})
# try inserting something which numpy would store as 'object'
value = lambda x: x
df.loc[df["x"] == 1, "y"] = value
expected = DataFrame({"x": [1], "y": [value]})
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize(
("dtype", "infer_string"),
[
(object, False),
("string[pyarrow_numpy]", True),
],
)
def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None:
# https://github.com/pandas-dev/pandas/issues/56204
pytest.importorskip("pyarrow")
df = DataFrame({"a": [1, 2], "b": [3, 4]})
with pd.option_context("future.infer_string", infer_string):
df.loc[df["a"] == 1, "c"] = "1"
expected = DataFrame({"a": [1, 2], "b": [3, 4], "c": ["1", float("nan")]}).astype(
{"a": "int64", "b": "int64", "c": dtype}
)
tm.assert_frame_equal(df, expected)
def test_add_new_column_infer_string():
# GH#55366
pytest.importorskip("pyarrow")
df = DataFrame({"x": [1]})
with pd.option_context("future.infer_string", True):
df.loc[df["x"] == 1, "y"] = "1"
expected = DataFrame(
{"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")},
columns=Index(["x", "y"], dtype=object),
)
tm.assert_frame_equal(df, expected)
class TestSetitemValidation:
# This is adapted from pandas/tests/arrays/masked/test_indexing.py
# but checks for warnings instead of errors.
def _check_setitem_invalid(self, df, invalid, indexer, warn):
msg = "Setting an item of incompatible dtype is deprecated"
msg = re.escape(msg)
orig_df = df.copy()
# iloc
with tm.assert_produces_warning(warn, match=msg):
df.iloc[indexer, 0] = invalid
df = orig_df.copy()
# loc
with tm.assert_produces_warning(warn, match=msg):
df.loc[indexer, "a"] = invalid
df = orig_df.copy()
_invalid_scalars = [
1 + 2j,
"True",
"1",
"1.0",
pd.NaT,
np.datetime64("NaT"),
np.timedelta64("NaT"),
]
_indexers = [0, [0], slice(0, 1), [True, False, False], slice(None, None, None)]
@pytest.mark.parametrize(
"invalid", _invalid_scalars + [1, 1.0, np.int64(1), np.float64(1)]
)
@pytest.mark.parametrize("indexer", _indexers)
def test_setitem_validation_scalar_bool(self, invalid, indexer):
df = DataFrame({"a": [True, False, False]}, dtype="bool")
self._check_setitem_invalid(df, invalid, indexer, FutureWarning)
@pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)])
@pytest.mark.parametrize("indexer", _indexers)
def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer):
df = DataFrame({"a": [1, 2, 3]}, dtype=any_int_numpy_dtype)
if isna(invalid) and invalid is not pd.NaT and not np.isnat(invalid):
warn = None
else:
warn = FutureWarning
self._check_setitem_invalid(df, invalid, indexer, warn)
@pytest.mark.parametrize("invalid", _invalid_scalars + [True])
@pytest.mark.parametrize("indexer", _indexers)
def test_setitem_validation_scalar_float(self, invalid, float_numpy_dtype, indexer):
df = DataFrame({"a": [1, 2, None]}, dtype=float_numpy_dtype)
self._check_setitem_invalid(df, invalid, indexer, FutureWarning)