from collections import deque
from datetime import (
    datetime,
    timezone,
)
from enum import Enum
import functools
import operator
import re

import numpy as np
import pytest

from pandas._config import using_pyarrow_string_dtype

import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    Series,
)
import pandas._testing as tm
from pandas.core.computation import expressions as expr
from pandas.tests.frame.common import (
    _check_mixed_float,
    _check_mixed_int,
)


@pytest.fixture
def simple_frame():
    """
    Fixture for simple 3x3 DataFrame

    Columns are ['one', 'two', 'three'], index is ['a', 'b', 'c'].

       one  two  three
    a  1.0  2.0    3.0
    b  4.0  5.0    6.0
    c  7.0  8.0    9.0
    """
    arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])

    return DataFrame(arr, columns=["one", "two", "three"], index=["a", "b", "c"])


@pytest.fixture(autouse=True, params=[0, 100], ids=["numexpr", "python"])
def switch_numexpr_min_elements(request, monkeypatch):
    """
    Autouse fixture that runs every test twice, once per ``_MIN_ELEMENTS`` value.

    Patches ``expr._MIN_ELEMENTS`` to ``request.param`` (0 or 100) for the
    duration of the test and yields that value.
    """
    with monkeypatch.context() as m:
        m.setattr(expr, "_MIN_ELEMENTS", request.param)
        yield request.param


class DummyElement:
    """Minimal array-like wrapper exposing ``__array__``, ``astype``, ``view``, ``any``."""

    def __init__(self, value, dtype) -> None:
        self.value = value
        self.dtype = np.dtype(dtype)

    def __array__(self):
        return np.array(self.value, dtype=self.dtype)

    def __str__(self) -> str:
        return f"DummyElement({self.value}, {self.dtype})"

    def __repr__(self) -> str:
        return str(self)

    def astype(self, dtype, copy=False):
        # Mutates in place and returns self; ``copy`` is accepted but unused.
        self.dtype = dtype
        return self

    def view(self, dtype):
        return type(self)(self.value.view(dtype), dtype)

    def any(self, axis=None):
        return bool(self.value)


# -------------------------------------------------------------------
# Comparisons


class TestFrameComparisons:
    # Specifically _not_ flex-comparisons

    def test_comparison_with_categorical_dtype(self):
        # GH#12564

        df = DataFrame({"A": ["foo", "bar", "baz"]})
        exp = DataFrame({"A": [True, False, False]})
        res = df == "foo"
        tm.assert_frame_equal(res, exp)

        # casting to categorical shouldn't affect the result
        df["A"] = df["A"].astype("category")

        res = df == "foo"
        tm.assert_frame_equal(res, exp)

    def test_frame_in_list(self):
        # GH#12689 this should raise at the DataFrame level, not blocks
        df = DataFrame(
            np.random.default_rng(2).standard_normal((6, 4)), columns=list("ABCD")
        )
        msg = "The truth value of a DataFrame is ambiguous"
        with pytest.raises(ValueError, match=msg):
            df in [None]

    @pytest.mark.parametrize(
        "arg, arg2",
        [
            [
                {
                    "a": np.random.default_rng(2).integers(10, size=10),
                    "b": pd.date_range("20010101", periods=10),
                },
                {
                    "a": np.random.default_rng(2).integers(10, size=10),
                    "b": np.random.default_rng(2).integers(10, size=10),
                },
            ],
            [
                {
                    "a": np.random.default_rng(2).integers(10, size=10),
                    "b": np.random.default_rng(2).integers(10, size=10),
                },
                {
                    "a": np.random.default_rng(2).integers(10, size=10),
                    "b": pd.date_range("20010101", periods=10),
                },
            ],
            [
                {
                    "a": pd.date_range("20010101", periods=10),
                    "b": pd.date_range("20010101", periods=10),
                },
                {
                    "a": np.random.default_rng(2).integers(10, size=10),
                    "b": np.random.default_rng(2).integers(10, size=10),
                },
            ],
            [
                {
                    "a": np.random.default_rng(2).integers(10, size=10),
                    "b": pd.date_range("20010101", periods=10),
                },
                {
                    "a": pd.date_range("20010101", periods=10),
                    "b": pd.date_range("20010101", periods=10),
                },
            ],
        ],
    )
    def test_comparison_invalid(self, arg, arg2):
        # GH4968
        # invalid date/int comparisons
        x = DataFrame(arg)
        y = DataFrame(arg2)
        # we expect the result to match Series comparisons for
        # == and !=, inequalities should raise
        result = x == y
        expected = DataFrame(
            {col: x[col] == y[col] for col in x.columns},
            index=x.index,
            columns=x.columns,
        )
        tm.assert_frame_equal(result, expected)

        result = x != y
        expected = DataFrame(
            {col: x[col] != y[col] for col in x.columns},
            index=x.index,
            columns=x.columns,
        )
        tm.assert_frame_equal(result, expected)

        # NOTE(review): the third pattern below looks like it lost text between
        # "DTypes" and "and" (possibly stripped markup) -- confirm against the
        # upstream file before relying on it. Kept exactly as found.
        msgs = [
            r"Invalid comparison between dtype=datetime64\[ns\] and ndarray",
            "invalid type promotion",
            (
                # npdev 1.20.0
                r"The DTypes and "
                r" do not have a common DType."
            ),
        ]
        msg = "|".join(msgs)
        with pytest.raises(TypeError, match=msg):
            x >= y
        with pytest.raises(TypeError, match=msg):
            x > y
        with pytest.raises(TypeError, match=msg):
            x < y
        with pytest.raises(TypeError, match=msg):
            x <= y

    @pytest.mark.parametrize(
        "left, right",
        [
            ("gt", "lt"),
            ("lt", "gt"),
            ("ge", "le"),
            ("le", "ge"),
            ("eq", "eq"),
            ("ne", "ne"),
        ],
    )
    def test_timestamp_compare(self, left, right):
        # make sure we can compare Timestamps on the right AND left hand side
        # GH#4982
        df = DataFrame(
            {
                "dates1": pd.date_range("20010101", periods=10),
                "dates2": pd.date_range("20010102", periods=10),
                "intcol": np.random.default_rng(2).integers(1000000000, size=10),
                "floatcol": np.random.default_rng(2).standard_normal(10),
                "stringcol": [chr(100 + i) for i in range(10)],
            }
        )
        df.loc[np.random.default_rng(2).random(len(df)) > 0.5, "dates2"] = pd.NaT
        left_f = getattr(operator, left)
        right_f = getattr(operator, right)

        # no nats
        if left in ["eq", "ne"]:
            expected = left_f(df, pd.Timestamp("20010109"))
            result = right_f(pd.Timestamp("20010109"), df)
            tm.assert_frame_equal(result, expected)
        else:
            msg = (
                "'(<|>)=?' not supported between "
                "instances of 'numpy.ndarray' and 'Timestamp'"
            )
            with pytest.raises(TypeError, match=msg):
                left_f(df, pd.Timestamp("20010109"))
            with pytest.raises(TypeError, match=msg):
                right_f(pd.Timestamp("20010109"), df)
        # nats
        if left in ["eq", "ne"]:
            expected = left_f(df, pd.Timestamp("nat"))
            result = right_f(pd.Timestamp("nat"), df)
            tm.assert_frame_equal(result, expected)
        else:
            msg = (
                "'(<|>)=?' not supported between "
                "instances of 'numpy.ndarray' and 'NaTType'"
            )
            with pytest.raises(TypeError, match=msg):
                left_f(df, pd.Timestamp("nat"))
            with pytest.raises(TypeError, match=msg):
                right_f(pd.Timestamp("nat"), df)

    @pytest.mark.xfail(
        using_pyarrow_string_dtype(), reason="can't compare string and int"
    )
    def test_mixed_comparison(self):
        # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False,
        # not raise TypeError
        # (this appears to be fixed before GH#22163, not sure when)
        df = DataFrame([["1989-08-01", 1], ["1989-08-01", 2]])
        other = DataFrame([["a", "b"], ["c", "d"]])

        result = df == other
        assert not result.any().any()

        result = df != other
        assert result.all().all()

    def test_df_boolean_comparison_error(self):
        # GH#4576, GH#22880
        # comparing DataFrame against list/tuple with len(obj) matching
        #  len(df.columns) is supported as of GH#22800
        df = DataFrame(np.arange(6).reshape((3, 2)))

        expected = DataFrame([[False, False], [True, False], [False, False]])

        result = df == (2, 2)
        tm.assert_frame_equal(result, expected)

        result = df == [2, 2]
        tm.assert_frame_equal(result, expected)

    def test_df_float_none_comparison(self):
        df = DataFrame(
            np.random.default_rng(2).standard_normal((8, 3)),
            index=range(8),
            columns=["A", "B", "C"],
        )

        result = df.__eq__(None)
        assert not result.any().any()

    def test_df_string_comparison(self):
        df = DataFrame([{"a": 1, "b": "foo"}, {"a": 2, "b": "bar"}])
        mask_a = df.a > 1
        tm.assert_frame_equal(df[mask_a], df.loc[1:1, :])
        tm.assert_frame_equal(df[-mask_a], df.loc[0:0, :])

        mask_b = df.b == "foo"
        tm.assert_frame_equal(df[mask_b], df.loc[0:0, :])
        tm.assert_frame_equal(df[-mask_b], df.loc[1:1, :])


class TestFrameFlexComparisons:
    # TODO: test_bool_flex_frame needs a better name
    @pytest.mark.parametrize("op", ["eq", "ne", "gt", "lt", "ge", "le"])
    def test_bool_flex_frame(self, op):
        data = np.random.default_rng(2).standard_normal((5, 3))
        other_data = np.random.default_rng(2).standard_normal((5, 3))
        df = DataFrame(data)
        other = DataFrame(other_data)
        ndim_5 = np.ones(df.shape + (1, 3))

        # DataFrame
        assert df.eq(df).values.all()
        assert not df.ne(df).values.any()
        f = getattr(df, op)
        o = getattr(operator, op)
        # No NAs
        tm.assert_frame_equal(f(other), o(df, other))
        # Unaligned
        part_o = other.loc[3:, 1:].copy()
        rs = f(part_o)
        xp = o(df, part_o.reindex(index=df.index, columns=df.columns))
        tm.assert_frame_equal(rs, xp)
        # ndarray
        tm.assert_frame_equal(f(other.values), o(df, other.values))
        # scalar
        tm.assert_frame_equal(f(0), o(df, 0))
        # NAs
        msg = "Unable to coerce to Series/DataFrame"
        tm.assert_frame_equal(f(np.nan), o(df, np.nan))
        with pytest.raises(ValueError, match=msg):
            f(ndim_5)

    @pytest.mark.parametrize("box", [np.array, Series])
    def test_bool_flex_series(self, box):
        # Series
        # list/tuple
        data = np.random.default_rng(2).standard_normal((5, 3))
        df = DataFrame(data)
        idx_ser = box(np.random.default_rng(2).standard_normal(5))
        col_ser = box(np.random.default_rng(2).standard_normal(3))

        idx_eq = df.eq(idx_ser, axis=0)
        col_eq = df.eq(col_ser)
        idx_ne = df.ne(idx_ser, axis=0)
        col_ne = df.ne(col_ser)
        tm.assert_frame_equal(col_eq, df == Series(col_ser))
        tm.assert_frame_equal(col_eq, -col_ne)
        tm.assert_frame_equal(idx_eq, -idx_ne)
        tm.assert_frame_equal(idx_eq, df.T.eq(idx_ser).T)
        tm.assert_frame_equal(col_eq, df.eq(list(col_ser)))
        tm.assert_frame_equal(idx_eq, df.eq(Series(idx_ser), axis=0))
        tm.assert_frame_equal(idx_eq, df.eq(list(idx_ser), axis=0))

        idx_gt = df.gt(idx_ser, axis=0)
        col_gt = df.gt(col_ser)
        idx_le = df.le(idx_ser, axis=0)
        col_le = df.le(col_ser)

        tm.assert_frame_equal(col_gt, df > Series(col_ser))
        tm.assert_frame_equal(col_gt, -col_le)
        tm.assert_frame_equal(idx_gt, -idx_le)
        tm.assert_frame_equal(idx_gt, df.T.gt(idx_ser).T)

        idx_ge = df.ge(idx_ser, axis=0)
        col_ge = df.ge(col_ser)
        idx_lt = df.lt(idx_ser, axis=0)
        col_lt = df.lt(col_ser)
        tm.assert_frame_equal(col_ge, df >= Series(col_ser))
        tm.assert_frame_equal(col_ge, -col_lt)
        tm.assert_frame_equal(idx_ge, -idx_lt)
        tm.assert_frame_equal(idx_ge, df.T.ge(idx_ser).T)
        # (two trailing re-assignments of idx_ser/col_ser were never read
        # afterwards and have been dropped)

    def test_bool_flex_frame_na(self):
        df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
        # NA
        df.loc[0, 0] = np.nan
        rs = df.eq(df)
        assert not rs.loc[0, 0]
        rs = df.ne(df)
        assert rs.loc[0, 0]
        rs = df.gt(df)
        assert not rs.loc[0, 0]
        rs = df.lt(df)
        assert not rs.loc[0, 0]
        rs = df.ge(df)
        assert not rs.loc[0, 0]
        rs = df.le(df)
        assert not rs.loc[0, 0]

    def test_bool_flex_frame_complex_dtype(self):
        # complex
        arr = np.array([np.nan, 1, 6, np.nan])
        arr2 = np.array([2j, np.nan, 7, None])
        df = DataFrame({"a": arr})
        df2 = DataFrame({"a": arr2})

        msg = "|".join(
            [
                "'>' not supported between instances of '.*' and 'complex'",
                r"unorderable types: .*complex\(\)",  # PY35
            ]
        )
        with pytest.raises(TypeError, match=msg):
            # inequalities are not well-defined for complex numbers
            df.gt(df2)
        with pytest.raises(TypeError, match=msg):
            # regression test that we get the same behavior for Series
            df["a"].gt(df2["a"])
        with pytest.raises(TypeError, match=msg):
            # Check that we match numpy behavior here
            df.values > df2.values

        rs = df.ne(df2)
        assert rs.values.all()

        arr3 = np.array([2j, np.nan, None])
        df3 = DataFrame({"a": arr3})

        with pytest.raises(TypeError, match=msg):
            # inequalities are not well-defined for complex numbers
            df3.gt(2j)
        with pytest.raises(TypeError, match=msg):
            # regression test that we get the same behavior for Series
            df3["a"].gt(2j)
        with pytest.raises(TypeError, match=msg):
            # Check that we match numpy behavior here
            df3.values > 2j

    def test_bool_flex_frame_object_dtype(self):
        # corner, dtype=object
        df1 = DataFrame({"col": ["foo", np.nan, "bar"]}, dtype=object)
        df2 = DataFrame({"col": ["foo", datetime.now(), "bar"]}, dtype=object)
        result = df1.ne(df2)
        exp = DataFrame({"col": [False, True, False]})
        tm.assert_frame_equal(result, exp)

    def test_flex_comparison_nat(self):
        # GH 15697, GH 22163 df.eq(pd.NaT) should behave like df == pd.NaT,
        # and _definitely_ not be NaN
        df = DataFrame([pd.NaT])

        result = df == pd.NaT
        # result.iloc[0, 0] is a np.bool_ object
        assert result.iloc[0, 0].item() is False

        result = df.eq(pd.NaT)
        assert result.iloc[0, 0].item() is False

        result = df != pd.NaT
        assert result.iloc[0, 0].item() is True

        result = df.ne(pd.NaT)
        assert result.iloc[0, 0].item() is True

    @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"])
    def test_df_flex_cmp_constant_return_types(self, opname):
        # GH 15077, non-empty DataFrame
        df = DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]})
        const = 2

        result = getattr(df, opname)(const).dtypes.value_counts()
        tm.assert_series_equal(
            result, Series([2], index=[np.dtype(bool)], name="count")
        )

    @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"])
    def test_df_flex_cmp_constant_return_types_empty(self, opname):
        # GH 15077 empty DataFrame
        df = DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]})
        const = 2

        empty = df.iloc[:0]
        result = getattr(empty, opname)(const).dtypes.value_counts()
        tm.assert_series_equal(
            result, Series([2], index=[np.dtype(bool)], name="count")
        )

    def test_df_flex_cmp_ea_dtype_with_ndarray_series(self):
        ii = pd.IntervalIndex.from_breaks([1, 2, 3])
        df = DataFrame({"A": ii, "B": ii})

        ser = Series([0, 0])
        res = df.eq(ser, axis=0)

        expected = DataFrame({"A": [False, False], "B": [False, False]})
        tm.assert_frame_equal(res, expected)

        ser2 = Series([1, 2], index=["A", "B"])
        res2 = df.eq(ser2, axis=1)
        tm.assert_frame_equal(res2, expected)


# -------------------------------------------------------------------
# Arithmetic


class TestFrameFlexArithmetic:
    def test_floordiv_axis0(self):
        # make sure we df.floordiv(ser, axis=0) matches column-wise result
        arr = np.arange(3)
        ser = Series(arr)
        df = DataFrame({"A": ser, "B": ser})

        result = df.floordiv(ser, axis=0)

        expected = DataFrame({col: df[col] // ser for col in df.columns})

        tm.assert_frame_equal(result, expected)

        result2 = df.floordiv(ser.values, axis=0)
        tm.assert_frame_equal(result2, expected)

    def test_df_add_td64_columnwise(self):
        # GH 22534 Check that column-wise addition broadcasts correctly
        dti = pd.date_range("2016-01-01", periods=10)
        tdi = pd.timedelta_range("1", periods=10)
        tser = Series(tdi)
        df = DataFrame({0: dti, 1: tdi})

        result = df.add(tser, axis=0)
        expected = DataFrame({0: dti + tdi, 1: tdi + tdi})
        tm.assert_frame_equal(result, expected)

    def test_df_add_flex_filled_mixed_dtypes(self):
        # GH 19611
        dti = pd.date_range("2016-01-01", periods=3)
        ser = Series(["1 Day", "NaT", "2 Days"], dtype="timedelta64[ns]")
        df = DataFrame({"A": dti, "B": ser})
        other = DataFrame({"A": ser, "B": ser})
        fill = pd.Timedelta(days=1).to_timedelta64()
        result = df.add(other, fill_value=fill)

        expected = DataFrame(
            {
                "A": Series(
                    ["2016-01-02", "2016-01-03", "2016-01-05"], dtype="datetime64[ns]"
                ),
                "B": ser * 2,
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_arith_flex_frame(
        self, all_arithmetic_operators, float_frame, mixed_float_frame
    ):
        # one instance of parametrized fixture
        op = all_arithmetic_operators

        def f(x, y):
            # r-versions not in operator-stdlib; get op without "r" and invert
            if op.startswith("__r"):
                return getattr(operator, op.replace("__r", "__"))(y, x)
            return getattr(operator, op)(x, y)

        result = getattr(float_frame, op)(2 * float_frame)
        expected = f(float_frame, 2 * float_frame)
        tm.assert_frame_equal(result, expected)

        # vs mix float
        result = getattr(mixed_float_frame, op)(2 * mixed_float_frame)
        expected = f(mixed_float_frame, 2 * mixed_float_frame)
        tm.assert_frame_equal(result, expected)
        _check_mixed_float(result, dtype={"C": None})

    @pytest.mark.parametrize("op", ["__add__", "__sub__", "__mul__"])
    def test_arith_flex_frame_mixed(
        self,
        op,
        int_frame,
        mixed_int_frame,
        mixed_float_frame,
        switch_numexpr_min_elements,
    ):
        f = getattr(operator, op)

        # vs mix int
        result = getattr(mixed_int_frame, op)(2 + mixed_int_frame)
        expected = f(mixed_int_frame, 2 + mixed_int_frame)

        # no overflow in the uint
        dtype = None
        if op in ["__sub__"]:
            dtype = {"B": "uint64", "C": None}
        elif op in ["__add__", "__mul__"]:
            dtype = {"C": None}
        if expr.USE_NUMEXPR and switch_numexpr_min_elements == 0:
            # when using numexpr, the casting rules are slightly different:
            # in the `2 + mixed_int_frame` operation, int32 column becomes
            # and int64 column (not preserving dtype in operation with Python
            # scalar), and then the int32/int64 combo results in int64 result
            dtype["A"] = (2 + mixed_int_frame)["A"].dtype
        tm.assert_frame_equal(result, expected)
        _check_mixed_int(result, dtype=dtype)

        # vs mix float
        result = getattr(mixed_float_frame, op)(2 * mixed_float_frame)
        expected = f(mixed_float_frame, 2 * mixed_float_frame)
        tm.assert_frame_equal(result, expected)
        _check_mixed_float(result, dtype={"C": None})

        # vs plain int
        result = getattr(int_frame, op)(2 * int_frame)
        expected = f(int_frame, 2 * int_frame)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("dim", range(3, 6))
    def test_arith_flex_frame_raise(self, all_arithmetic_operators, float_frame, dim):
        # one instance of parametrized fixture
        op = all_arithmetic_operators

        # Check that arrays with dim >= 3 raise
        arr = np.ones((1,) * dim)
        msg = "Unable to coerce to Series/DataFrame"
        with pytest.raises(ValueError, match=msg):
            getattr(float_frame, op)(arr)

    def test_arith_flex_frame_corner(self, float_frame):
        const_add = float_frame.add(1)
        tm.assert_frame_equal(const_add, float_frame + 1)

        # corner cases
        result = float_frame.add(float_frame[:0])
        expected = float_frame.sort_index() * np.nan
        tm.assert_frame_equal(result, expected)

        result = float_frame[:0].add(float_frame)
        expected = float_frame.sort_index() * np.nan
        tm.assert_frame_equal(result, expected)

        with pytest.raises(NotImplementedError, match="fill_value"):
            float_frame.add(float_frame.iloc[0], fill_value=3)

        with pytest.raises(NotImplementedError, match="fill_value"):
            float_frame.add(float_frame.iloc[0], axis="index", fill_value=3)

    @pytest.mark.parametrize("op", ["add", "sub", "mul", "mod"])
    def test_arith_flex_series_ops(self, simple_frame, op):
        # after arithmetic refactor, add truediv here
        df = simple_frame

        row = df.xs("a")
        col = df["two"]
        f = getattr(df, op)
        op = getattr(operator, op)
        tm.assert_frame_equal(f(row), op(df, row))
        tm.assert_frame_equal(f(col, axis=0), op(df.T, col).T)

    def test_arith_flex_series(self, simple_frame):
        df = simple_frame

        row = df.xs("a")
        col = df["two"]
        # special case for some reason
        tm.assert_frame_equal(df.add(row, axis=None), df + row)

        # cases which will be refactored after big arithmetic refactor
        tm.assert_frame_equal(df.div(row), df / row)
        tm.assert_frame_equal(df.div(col, axis=0), (df.T / col).T)

    @pytest.mark.parametrize("dtype", ["int64", "float64"])
    def test_arith_flex_series_broadcasting(self, dtype):
        # broadcasting issue in GH 7325
        df = DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype=dtype)
        expected = DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]])
        result = df.div(df[0], axis="index")
        tm.assert_frame_equal(result, expected)

    def test_arith_flex_zero_len_raises(self):
        # GH 19522 passing fill_value to frame flex arith methods should
        # raise even in the zero-length special cases
        ser_len0 = Series([], dtype=object)
        df_len0 = DataFrame(columns=["A", "B"])
        df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"])

        with pytest.raises(NotImplementedError, match="fill_value"):
            df.add(ser_len0, fill_value="E")

        with pytest.raises(NotImplementedError, match="fill_value"):
            df_len0.sub(df["A"], axis=None, fill_value=3)

    def test_flex_add_scalar_fill_value(self):
        # GH#12723
        dat = np.array([0, 1, np.nan, 3, 4, 5], dtype="float")
        df = DataFrame({"foo": dat}, index=range(6))

        exp = df.fillna(0).add(2)
        res = df.add(2, fill_value=0)
        tm.assert_frame_equal(res, exp)

    def test_sub_alignment_with_duplicate_index(self):
        # GH#5185 dup aligning operations should work
        df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
        df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
        expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
        result = df1.sub(df2)
        tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("op", ["__add__", "__mul__", "__sub__", "__truediv__"]) def test_arithmetic_with_duplicate_columns(self, op): # operations df = DataFrame({"A": np.arange(10), "B": np.random.default_rng(2).random(10)}) expected = getattr(df, op)(df) expected.columns = ["A", "A"] df.columns = ["A", "A"] result = getattr(df, op)(df) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("level", [0, None]) def test_broadcast_multiindex(self, level): # GH34388 df1 = DataFrame({"A": [0, 1, 2], "B": [1, 2, 3]}) df1.columns = df1.columns.set_names("L1") df2 = DataFrame({("A", "C"): [0, 0, 0], ("A", "D"): [0, 0, 0]}) df2.columns = df2.columns.set_names(["L1", "L2"]) result = df1.add(df2, level=level) expected = DataFrame({("A", "C"): [0, 1, 2], ("A", "D"): [0, 1, 2]}) expected.columns = expected.columns.set_names(["L1", "L2"]) tm.assert_frame_equal(result, expected) def test_frame_multiindex_operations(self): # GH 43321 df = DataFrame( {2010: [1, 2, 3], 2020: [3, 4, 5]}, index=MultiIndex.from_product( [["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"] ), ) series = Series( [0.4], index=MultiIndex.from_product([["b"], ["a"]], names=["mod", "scen"]), ) expected = DataFrame( {2010: [1.4, 2.4, 3.4], 2020: [3.4, 4.4, 5.4]}, index=MultiIndex.from_product( [["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"] ), ) result = df.add(series, axis=0) tm.assert_frame_equal(result, expected) def test_frame_multiindex_operations_series_index_to_frame_index(self): # GH 43321 df = DataFrame( {2010: [1], 2020: [3]}, index=MultiIndex.from_product([["a"], ["b"]], names=["scen", "mod"]), ) series = Series( [10.0, 20.0, 30.0], index=MultiIndex.from_product( [["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"] ), ) expected = DataFrame( {2010: [11.0, 21, 31.0], 2020: [13.0, 23.0, 33.0]}, index=MultiIndex.from_product( [["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"] ), ) result = df.add(series, axis=0) tm.assert_frame_equal(result, expected) def 
test_frame_multiindex_operations_no_align(self): df = DataFrame( {2010: [1, 2, 3], 2020: [3, 4, 5]}, index=MultiIndex.from_product( [["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"] ), ) series = Series( [0.4], index=MultiIndex.from_product([["c"], ["a"]], names=["mod", "scen"]), ) expected = DataFrame( {2010: np.nan, 2020: np.nan}, index=MultiIndex.from_tuples( [ ("a", "b", 0), ("a", "b", 1), ("a", "b", 2), ("a", "c", np.nan), ], names=["scen", "mod", "id"], ), ) result = df.add(series, axis=0) tm.assert_frame_equal(result, expected) def test_frame_multiindex_operations_part_align(self): df = DataFrame( {2010: [1, 2, 3], 2020: [3, 4, 5]}, index=MultiIndex.from_tuples( [ ("a", "b", 0), ("a", "b", 1), ("a", "c", 2), ], names=["scen", "mod", "id"], ), ) series = Series( [0.4], index=MultiIndex.from_product([["b"], ["a"]], names=["mod", "scen"]), ) expected = DataFrame( {2010: [1.4, 2.4, np.nan], 2020: [3.4, 4.4, np.nan]}, index=MultiIndex.from_tuples( [ ("a", "b", 0), ("a", "b", 1), ("a", "c", 2), ], names=["scen", "mod", "id"], ), ) result = df.add(series, axis=0) tm.assert_frame_equal(result, expected) class TestFrameArithmetic: def test_td64_op_nat_casting(self): # Make sure we don't accidentally treat timedelta64(NaT) as datetime64 # when calling dispatch_to_series in DataFrame arithmetic ser = Series(["NaT", "NaT"], dtype="timedelta64[ns]") df = DataFrame([[1, 2], [3, 4]]) result = df * ser expected = DataFrame({0: ser, 1: ser}) tm.assert_frame_equal(result, expected) def test_df_add_2d_array_rowlike_broadcasts(self): # GH#23000 arr = np.arange(6).reshape(3, 2) df = DataFrame(arr, columns=[True, False], index=["A", "B", "C"]) rowlike = arr[[1], :] # shape --> (1, ncols) assert rowlike.shape == (1, df.shape[1]) expected = DataFrame( [[2, 4], [4, 6], [6, 8]], columns=df.columns, index=df.index, # specify dtype explicitly to avoid failing # on 32bit builds dtype=arr.dtype, ) result = df + rowlike tm.assert_frame_equal(result, expected) result = rowlike + df 
tm.assert_frame_equal(result, expected) def test_df_add_2d_array_collike_broadcasts(self): # GH#23000 arr = np.arange(6).reshape(3, 2) df = DataFrame(arr, columns=[True, False], index=["A", "B", "C"]) collike = arr[:, [1]] # shape --> (nrows, 1) assert collike.shape == (df.shape[0], 1) expected = DataFrame( [[1, 2], [5, 6], [9, 10]], columns=df.columns, index=df.index, # specify dtype explicitly to avoid failing # on 32bit builds dtype=arr.dtype, ) result = df + collike tm.assert_frame_equal(result, expected) result = collike + df tm.assert_frame_equal(result, expected) def test_df_arith_2d_array_rowlike_broadcasts( self, request, all_arithmetic_operators, using_array_manager ): # GH#23000 opname = all_arithmetic_operators if using_array_manager and opname in ("__rmod__", "__rfloordiv__"): # TODO(ArrayManager) decide on dtypes td.mark_array_manager_not_yet_implemented(request) arr = np.arange(6).reshape(3, 2) df = DataFrame(arr, columns=[True, False], index=["A", "B", "C"]) rowlike = arr[[1], :] # shape --> (1, ncols) assert rowlike.shape == (1, df.shape[1]) exvals = [ getattr(df.loc["A"], opname)(rowlike.squeeze()), getattr(df.loc["B"], opname)(rowlike.squeeze()), getattr(df.loc["C"], opname)(rowlike.squeeze()), ] expected = DataFrame(exvals, columns=df.columns, index=df.index) result = getattr(df, opname)(rowlike) tm.assert_frame_equal(result, expected) def test_df_arith_2d_array_collike_broadcasts( self, request, all_arithmetic_operators, using_array_manager ): # GH#23000 opname = all_arithmetic_operators if using_array_manager and opname in ("__rmod__", "__rfloordiv__"): # TODO(ArrayManager) decide on dtypes td.mark_array_manager_not_yet_implemented(request) arr = np.arange(6).reshape(3, 2) df = DataFrame(arr, columns=[True, False], index=["A", "B", "C"]) collike = arr[:, [1]] # shape --> (nrows, 1) assert collike.shape == (df.shape[0], 1) exvals = { True: getattr(df[True], opname)(collike.squeeze()), False: getattr(df[False], opname)(collike.squeeze()), } 
dtype = None if opname in ["__rmod__", "__rfloordiv__"]: # Series ops may return mixed int/float dtypes in cases where # DataFrame op will return all-float. So we upcast `expected` dtype = np.common_type(*(x.values for x in exvals.values())) expected = DataFrame(exvals, columns=df.columns, index=df.index, dtype=dtype) result = getattr(df, opname)(collike) tm.assert_frame_equal(result, expected) def test_df_bool_mul_int(self): # GH 22047, GH 22163 multiplication by 1 should result in int dtype, # not object dtype df = DataFrame([[False, True], [False, False]]) result = df * 1 # On appveyor this comes back as np.int32 instead of np.int64, # so we check dtype.kind instead of just dtype kinds = result.dtypes.apply(lambda x: x.kind) assert (kinds == "i").all() result = 1 * df kinds = result.dtypes.apply(lambda x: x.kind) assert (kinds == "i").all() def test_arith_mixed(self): left = DataFrame({"A": ["a", "b", "c"], "B": [1, 2, 3]}) result = left + left expected = DataFrame({"A": ["aa", "bb", "cc"], "B": [2, 4, 6]}) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("col", ["A", "B"]) def test_arith_getitem_commute(self, all_arithmetic_functions, col): df = DataFrame({"A": [1.1, 3.3], "B": [2.5, -3.9]}) result = all_arithmetic_functions(df, 1)[col] expected = all_arithmetic_functions(df[col], 1) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "values", [[1, 2], (1, 2), np.array([1, 2]), range(1, 3), deque([1, 2])] ) def test_arith_alignment_non_pandas_object(self, values): # GH#17901 df = DataFrame({"A": [1, 1], "B": [1, 1]}) expected = DataFrame({"A": [2, 2], "B": [3, 3]}) result = df + values tm.assert_frame_equal(result, expected) def test_arith_non_pandas_object(self): df = DataFrame( np.arange(1, 10, dtype="f8").reshape(3, 3), columns=["one", "two", "three"], index=["a", "b", "c"], ) val1 = df.xs("a").values added = DataFrame(df.values + val1, index=df.index, columns=df.columns) tm.assert_frame_equal(df + val1, added) added = 
DataFrame((df.values.T + val1).T, index=df.index, columns=df.columns) tm.assert_frame_equal(df.add(val1, axis=0), added) val2 = list(df["two"]) added = DataFrame(df.values + val2, index=df.index, columns=df.columns) tm.assert_frame_equal(df + val2, added) added = DataFrame((df.values.T + val2).T, index=df.index, columns=df.columns) tm.assert_frame_equal(df.add(val2, axis="index"), added) val3 = np.random.default_rng(2).random(df.shape) added = DataFrame(df.values + val3, index=df.index, columns=df.columns) tm.assert_frame_equal(df.add(val3), added) def test_operations_with_interval_categories_index(self, all_arithmetic_operators): # GH#27415 op = all_arithmetic_operators ind = pd.CategoricalIndex(pd.interval_range(start=0.0, end=2.0)) data = [1, 2] df = DataFrame([data], columns=ind) num = 10 result = getattr(df, op)(num) expected = DataFrame([[getattr(n, op)(num) for n in data]], columns=ind) tm.assert_frame_equal(result, expected) def test_frame_with_frame_reindex(self): # GH#31623 df = DataFrame( { "foo": [pd.Timestamp("2019"), pd.Timestamp("2020")], "bar": [pd.Timestamp("2018"), pd.Timestamp("2021")], }, columns=["foo", "bar"], dtype="M8[ns]", ) df2 = df[["foo"]] result = df - df2 expected = DataFrame( {"foo": [pd.Timedelta(0), pd.Timedelta(0)], "bar": [np.nan, np.nan]}, columns=["bar", "foo"], ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "value, dtype", [ (1, "i8"), (1.0, "f8"), (2**63, "f8"), (1j, "complex128"), (2**63, "complex128"), (True, "bool"), (np.timedelta64(20, "ns"), " b tm.assert_frame_equal(result, expected) result = df.values > b tm.assert_numpy_array_equal(result, expected.values) msg1d = "Unable to coerce to Series, length must be 2: given 3" msg2d = "Unable to coerce to DataFrame, shape must be" msg2db = "operands could not be broadcast together with shapes" with pytest.raises(ValueError, match=msg1d): # wrong shape df > lst with pytest.raises(ValueError, match=msg1d): # wrong shape df > tup # broadcasts like ndarray 
(GH#23000) result = df > b_r tm.assert_frame_equal(result, expected) result = df.values > b_r tm.assert_numpy_array_equal(result, expected.values) with pytest.raises(ValueError, match=msg2d): df > b_c with pytest.raises(ValueError, match=msg2db): df.values > b_c # == expected = DataFrame([[False, False], [True, False], [False, False]]) result = df == b tm.assert_frame_equal(result, expected) with pytest.raises(ValueError, match=msg1d): df == lst with pytest.raises(ValueError, match=msg1d): df == tup # broadcasts like ndarray (GH#23000) result = df == b_r tm.assert_frame_equal(result, expected) result = df.values == b_r tm.assert_numpy_array_equal(result, expected.values) with pytest.raises(ValueError, match=msg2d): df == b_c assert df.values.shape != b_c.shape # with alignment df = DataFrame( np.arange(6).reshape((3, 2)), columns=list("AB"), index=list("abc") ) expected.index = df.index expected.columns = df.columns with pytest.raises(ValueError, match=msg1d): df == lst with pytest.raises(ValueError, match=msg1d): df == tup def test_inplace_ops_alignment(self): # inplace ops / ops alignment # GH 8511 columns = list("abcdefg") X_orig = DataFrame( np.arange(10 * len(columns)).reshape(-1, len(columns)), columns=columns, index=range(10), ) Z = 100 * X_orig.iloc[:, 1:-1].copy() block1 = list("bedcf") subs = list("bcdef") # add X = X_orig.copy() result1 = (X[block1] + Z).reindex(columns=subs) X[block1] += Z result2 = X.reindex(columns=subs) X = X_orig.copy() result3 = (X[block1] + Z[block1]).reindex(columns=subs) X[block1] += Z[block1] result4 = X.reindex(columns=subs) tm.assert_frame_equal(result1, result2) tm.assert_frame_equal(result1, result3) tm.assert_frame_equal(result1, result4) # sub X = X_orig.copy() result1 = (X[block1] - Z).reindex(columns=subs) X[block1] -= Z result2 = X.reindex(columns=subs) X = X_orig.copy() result3 = (X[block1] - Z[block1]).reindex(columns=subs) X[block1] -= Z[block1] result4 = X.reindex(columns=subs) tm.assert_frame_equal(result1, 
result2) tm.assert_frame_equal(result1, result3) tm.assert_frame_equal(result1, result4) def test_inplace_ops_identity(self): # GH 5104 # make sure that we are actually changing the object s_orig = Series([1, 2, 3]) df_orig = DataFrame( np.random.default_rng(2).integers(0, 5, size=10).reshape(-1, 5) ) # no dtype change s = s_orig.copy() s2 = s s += 1 tm.assert_series_equal(s, s2) tm.assert_series_equal(s_orig + 1, s) assert s is s2 assert s._mgr is s2._mgr df = df_orig.copy() df2 = df df += 1 tm.assert_frame_equal(df, df2) tm.assert_frame_equal(df_orig + 1, df) assert df is df2 assert df._mgr is df2._mgr # dtype change s = s_orig.copy() s2 = s s += 1.5 tm.assert_series_equal(s, s2) tm.assert_series_equal(s_orig + 1.5, s) df = df_orig.copy() df2 = df df += 1.5 tm.assert_frame_equal(df, df2) tm.assert_frame_equal(df_orig + 1.5, df) assert df is df2 assert df._mgr is df2._mgr # mixed dtype arr = np.random.default_rng(2).integers(0, 10, size=5) df_orig = DataFrame({"A": arr.copy(), "B": "foo"}) df = df_orig.copy() df2 = df df["A"] += 1 expected = DataFrame({"A": arr.copy() + 1, "B": "foo"}) tm.assert_frame_equal(df, expected) tm.assert_frame_equal(df2, expected) assert df._mgr is df2._mgr df = df_orig.copy() df2 = df df["A"] += 1.5 expected = DataFrame({"A": arr.copy() + 1.5, "B": "foo"}) tm.assert_frame_equal(df, expected) tm.assert_frame_equal(df2, expected) assert df._mgr is df2._mgr @pytest.mark.parametrize( "op", [ "add", "and", pytest.param( "div", marks=pytest.mark.xfail( raises=AttributeError, reason="__idiv__ not implemented" ), ), "floordiv", "mod", "mul", "or", "pow", "sub", "truediv", "xor", ], ) def test_inplace_ops_identity2(self, op): df = DataFrame({"a": [1.0, 2.0, 3.0], "b": [1, 2, 3]}) operand = 2 if op in ("and", "or", "xor"): # cannot use floats for boolean ops df["a"] = [True, False, True] df_copy = df.copy() iop = f"__i{op}__" op = f"__{op}__" # no id change and value is correct getattr(df, iop)(operand) expected = getattr(df_copy, op)(operand) 
tm.assert_frame_equal(df, expected) expected = id(df) assert id(df) == expected @pytest.mark.parametrize( "val", [ [1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype=np.int64), range(1, 4), ], ) def test_alignment_non_pandas(self, val): index = ["A", "B", "C"] columns = ["X", "Y", "Z"] df = DataFrame( np.random.default_rng(2).standard_normal((3, 3)), index=index, columns=columns, ) align = DataFrame._align_for_op expected = DataFrame({"X": val, "Y": val, "Z": val}, index=df.index) tm.assert_frame_equal(align(df, val, axis=0)[1], expected) expected = DataFrame( {"X": [1, 1, 1], "Y": [2, 2, 2], "Z": [3, 3, 3]}, index=df.index ) tm.assert_frame_equal(align(df, val, axis=1)[1], expected) @pytest.mark.parametrize("val", [[1, 2], (1, 2), np.array([1, 2]), range(1, 3)]) def test_alignment_non_pandas_length_mismatch(self, val): index = ["A", "B", "C"] columns = ["X", "Y", "Z"] df = DataFrame( np.random.default_rng(2).standard_normal((3, 3)), index=index, columns=columns, ) align = DataFrame._align_for_op # length mismatch msg = "Unable to coerce to Series, length must be 3: given 2" with pytest.raises(ValueError, match=msg): align(df, val, axis=0) with pytest.raises(ValueError, match=msg): align(df, val, axis=1) def test_alignment_non_pandas_index_columns(self): index = ["A", "B", "C"] columns = ["X", "Y", "Z"] df = DataFrame( np.random.default_rng(2).standard_normal((3, 3)), index=index, columns=columns, ) align = DataFrame._align_for_op val = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) tm.assert_frame_equal( align(df, val, axis=0)[1], DataFrame(val, index=df.index, columns=df.columns), ) tm.assert_frame_equal( align(df, val, axis=1)[1], DataFrame(val, index=df.index, columns=df.columns), ) # shape mismatch msg = "Unable to coerce to DataFrame, shape must be" val = np.array([[1, 2, 3], [4, 5, 6]]) with pytest.raises(ValueError, match=msg): align(df, val, axis=0) with pytest.raises(ValueError, match=msg): align(df, val, axis=1) val = np.zeros((3, 3, 3)) msg = re.escape( 
"Unable to coerce to Series/DataFrame, dimension must be <= 2: (3, 3, 3)" ) with pytest.raises(ValueError, match=msg): align(df, val, axis=0) with pytest.raises(ValueError, match=msg): align(df, val, axis=1) def test_no_warning(self, all_arithmetic_operators): df = DataFrame({"A": [0.0, 0.0], "B": [0.0, None]}) b = df["B"] with tm.assert_produces_warning(None): getattr(df, all_arithmetic_operators)(b) def test_dunder_methods_binary(self, all_arithmetic_operators): # GH#??? frame.__foo__ should only accept one argument df = DataFrame({"A": [0.0, 0.0], "B": [0.0, None]}) b = df["B"] with pytest.raises(TypeError, match="takes 2 positional arguments"): getattr(df, all_arithmetic_operators)(b, 0) def test_align_int_fill_bug(self): # GH#910 X = np.arange(10 * 10, dtype="float64").reshape(10, 10) Y = np.ones((10, 1), dtype=int) df1 = DataFrame(X) df1["0.X"] = Y.squeeze() df2 = df1.astype(float) result = df1 - df1.mean() expected = df2 - df2.mean() tm.assert_frame_equal(result, expected) def test_pow_with_realignment(): # GH#32685 pow has special semantics for operating with null values left = DataFrame({"A": [0, 1, 2]}) right = DataFrame(index=[0, 1, 2]) result = left**right expected = DataFrame({"A": [np.nan, 1.0, np.nan]}) tm.assert_frame_equal(result, expected) def test_dataframe_series_extension_dtypes(): # https://github.com/pandas-dev/pandas/issues/34311 df = DataFrame( np.random.default_rng(2).integers(0, 100, (10, 3)), columns=["a", "b", "c"] ) ser = Series([1, 2, 3], index=["a", "b", "c"]) expected = df.to_numpy("int64") + ser.to_numpy("int64").reshape(-1, 3) expected = DataFrame(expected, columns=df.columns, dtype="Int64") df_ea = df.astype("Int64") result = df_ea + ser tm.assert_frame_equal(result, expected) result = df_ea + ser.astype("Int64") tm.assert_frame_equal(result, expected) def test_dataframe_blockwise_slicelike(): # GH#34367 arr = np.random.default_rng(2).integers(0, 1000, (100, 10)) df1 = DataFrame(arr) # Explicit cast to float to avoid implicit 
cast when setting nan df2 = df1.copy().astype({1: "float", 3: "float", 7: "float"}) df2.iloc[0, [1, 3, 7]] = np.nan # Explicit cast to float to avoid implicit cast when setting nan df3 = df1.copy().astype({5: "float"}) df3.iloc[0, [5]] = np.nan # Explicit cast to float to avoid implicit cast when setting nan df4 = df1.copy().astype({2: "float", 3: "float", 4: "float"}) df4.iloc[0, np.arange(2, 5)] = np.nan # Explicit cast to float to avoid implicit cast when setting nan df5 = df1.copy().astype({4: "float", 5: "float", 6: "float"}) df5.iloc[0, np.arange(4, 7)] = np.nan for left, right in [(df1, df2), (df2, df3), (df4, df5)]: res = left + right expected = DataFrame({i: left[i] + right[i] for i in left.columns}) tm.assert_frame_equal(res, expected) @pytest.mark.parametrize( "df, col_dtype", [ (DataFrame([[1.0, 2.0], [4.0, 5.0]], columns=list("ab")), "float64"), ( DataFrame([[1.0, "b"], [4.0, "b"]], columns=list("ab")).astype( {"b": object} ), "object", ), ], ) def test_dataframe_operation_with_non_numeric_types(df, col_dtype): # GH #22663 expected = DataFrame([[0.0, np.nan], [3.0, np.nan]], columns=list("ab")) expected = expected.astype({"b": col_dtype}) result = df + Series([-1.0], index=list("a")) tm.assert_frame_equal(result, expected) def test_arith_reindex_with_duplicates(): # https://github.com/pandas-dev/pandas/issues/35194 df1 = DataFrame(data=[[0]], columns=["second"]) df2 = DataFrame(data=[[0, 0, 0]], columns=["first", "second", "second"]) result = df1 + df2 expected = DataFrame([[np.nan, 0, 0]], columns=["first", "second", "second"]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("to_add", [[Series([1, 1])], [Series([1, 1]), Series([1, 1])]]) def test_arith_list_of_arraylike_raise(to_add): # GH 36702. 
Raise when trying to add list of array-like to DataFrame df = DataFrame({"x": [1, 2], "y": [1, 2]}) msg = f"Unable to coerce list of {type(to_add[0])} to Series/DataFrame" with pytest.raises(ValueError, match=msg): df + to_add with pytest.raises(ValueError, match=msg): to_add + df def test_inplace_arithmetic_series_update(using_copy_on_write, warn_copy_on_write): # https://github.com/pandas-dev/pandas/issues/36373 df = DataFrame({"A": [1, 2, 3]}) df_orig = df.copy() series = df["A"] vals = series._values with tm.assert_cow_warning(warn_copy_on_write): series += 1 if using_copy_on_write: assert series._values is not vals tm.assert_frame_equal(df, df_orig) else: assert series._values is vals expected = DataFrame({"A": [2, 3, 4]}) tm.assert_frame_equal(df, expected) def test_arithmetic_multiindex_align(): """ Regression test for: https://github.com/pandas-dev/pandas/issues/33765 """ df1 = DataFrame( [[1]], index=["a"], columns=MultiIndex.from_product([[0], [1]], names=["a", "b"]), ) df2 = DataFrame([[1]], index=["a"], columns=Index([0], name="a")) expected = DataFrame( [[0]], index=["a"], columns=MultiIndex.from_product([[0], [1]], names=["a", "b"]), ) result = df1 - df2 tm.assert_frame_equal(result, expected) def test_bool_frame_mult_float(): # GH 18549 df = DataFrame(True, list("ab"), list("cd")) result = df * 1.0 expected = DataFrame(np.ones((2, 2)), list("ab"), list("cd")) tm.assert_frame_equal(result, expected) def test_frame_sub_nullable_int(any_int_ea_dtype): # GH 32822 series1 = Series([1, 2, None], dtype=any_int_ea_dtype) series2 = Series([1, 2, 3], dtype=any_int_ea_dtype) expected = DataFrame([0, 0, None], dtype=any_int_ea_dtype) result = series1.to_frame() - series2.to_frame() tm.assert_frame_equal(result, expected) @pytest.mark.filterwarnings( "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" ) def test_frame_op_subclass_nonclass_constructor(): # GH#43201 subclass._constructor is a function, not the subclass itself class 
SubclassedSeries(Series): @property def _constructor(self): return SubclassedSeries @property def _constructor_expanddim(self): return SubclassedDataFrame class SubclassedDataFrame(DataFrame): _metadata = ["my_extra_data"] def __init__(self, my_extra_data, *args, **kwargs) -> None: self.my_extra_data = my_extra_data super().__init__(*args, **kwargs) @property def _constructor(self): return functools.partial(type(self), self.my_extra_data) @property def _constructor_sliced(self): return SubclassedSeries sdf = SubclassedDataFrame("some_data", {"A": [1, 2, 3], "B": [4, 5, 6]}) result = sdf * 2 expected = SubclassedDataFrame("some_data", {"A": [2, 4, 6], "B": [8, 10, 12]}) tm.assert_frame_equal(result, expected) result = sdf + sdf tm.assert_frame_equal(result, expected) def test_enum_column_equality(): Cols = Enum("Cols", "col1 col2") q1 = DataFrame({Cols.col1: [1, 2, 3]}) q2 = DataFrame({Cols.col1: [1, 2, 3]}) result = q1[Cols.col1] == q2[Cols.col1] expected = Series([True, True, True], name=Cols.col1) tm.assert_series_equal(result, expected) def test_mixed_col_index_dtype(): # GH 47382 df1 = DataFrame(columns=list("abc"), data=1.0, index=[0]) df2 = DataFrame(columns=list("abc"), data=0.0, index=[0]) df1.columns = df2.columns.astype("string") result = df1 + df2 expected = DataFrame(columns=list("abc"), data=1.0, index=[0]) tm.assert_frame_equal(result, expected)