You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
390 lines
14 KiB
390 lines
14 KiB
import datetime as dt
|
|
from itertools import combinations
|
|
|
|
import dateutil
|
|
import numpy as np
|
|
import pytest
|
|
|
|
import pandas as pd
|
|
from pandas import (
|
|
DataFrame,
|
|
Index,
|
|
Series,
|
|
Timestamp,
|
|
concat,
|
|
isna,
|
|
)
|
|
import pandas._testing as tm
|
|
|
|
|
|
class TestAppend:
|
|
def test_append(self, sort, float_frame):
|
|
mixed_frame = float_frame.copy()
|
|
mixed_frame["foo"] = "bar"
|
|
|
|
begin_index = float_frame.index[:5]
|
|
end_index = float_frame.index[5:]
|
|
|
|
begin_frame = float_frame.reindex(begin_index)
|
|
end_frame = float_frame.reindex(end_index)
|
|
|
|
appended = begin_frame._append(end_frame)
|
|
tm.assert_almost_equal(appended["A"], float_frame["A"])
|
|
|
|
del end_frame["A"]
|
|
partial_appended = begin_frame._append(end_frame, sort=sort)
|
|
assert "A" in partial_appended
|
|
|
|
partial_appended = end_frame._append(begin_frame, sort=sort)
|
|
assert "A" in partial_appended
|
|
|
|
# mixed type handling
|
|
appended = mixed_frame[:5]._append(mixed_frame[5:])
|
|
tm.assert_frame_equal(appended, mixed_frame)
|
|
|
|
# what to test here
|
|
mixed_appended = mixed_frame[:5]._append(float_frame[5:], sort=sort)
|
|
mixed_appended2 = float_frame[:5]._append(mixed_frame[5:], sort=sort)
|
|
|
|
# all equal except 'foo' column
|
|
tm.assert_frame_equal(
|
|
mixed_appended.reindex(columns=["A", "B", "C", "D"]),
|
|
mixed_appended2.reindex(columns=["A", "B", "C", "D"]),
|
|
)
|
|
|
|
def test_append_empty(self, float_frame):
|
|
empty = DataFrame()
|
|
|
|
appended = float_frame._append(empty)
|
|
tm.assert_frame_equal(float_frame, appended)
|
|
assert appended is not float_frame
|
|
|
|
appended = empty._append(float_frame)
|
|
tm.assert_frame_equal(float_frame, appended)
|
|
assert appended is not float_frame
|
|
|
|
def test_append_overlap_raises(self, float_frame):
|
|
msg = "Indexes have overlapping values"
|
|
with pytest.raises(ValueError, match=msg):
|
|
float_frame._append(float_frame, verify_integrity=True)
|
|
|
|
def test_append_new_columns(self):
|
|
# see gh-6129: new columns
|
|
df = DataFrame({"a": {"x": 1, "y": 2}, "b": {"x": 3, "y": 4}})
|
|
row = Series([5, 6, 7], index=["a", "b", "c"], name="z")
|
|
expected = DataFrame(
|
|
{
|
|
"a": {"x": 1, "y": 2, "z": 5},
|
|
"b": {"x": 3, "y": 4, "z": 6},
|
|
"c": {"z": 7},
|
|
}
|
|
)
|
|
result = df._append(row)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_append_length0_frame(self, sort):
|
|
df = DataFrame(columns=["A", "B", "C"])
|
|
df3 = DataFrame(index=[0, 1], columns=["A", "B"])
|
|
df5 = df._append(df3, sort=sort)
|
|
|
|
expected = DataFrame(index=[0, 1], columns=["A", "B", "C"])
|
|
tm.assert_frame_equal(df5, expected)
|
|
|
|
def test_append_records(self):
|
|
arr1 = np.zeros((2,), dtype=("i4,f4,S10"))
|
|
arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
|
|
|
|
arr2 = np.zeros((3,), dtype=("i4,f4,S10"))
|
|
arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")]
|
|
|
|
df1 = DataFrame(arr1)
|
|
df2 = DataFrame(arr2)
|
|
|
|
result = df1._append(df2, ignore_index=True)
|
|
expected = DataFrame(np.concatenate((arr1, arr2)))
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# rewrite sort fixture, since we also want to test default of None
|
|
def test_append_sorts(self, sort):
|
|
df1 = DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"])
|
|
df2 = DataFrame({"a": [1, 2], "c": [3, 4]}, index=[2, 3])
|
|
|
|
result = df1._append(df2, sort=sort)
|
|
|
|
# for None / True
|
|
expected = DataFrame(
|
|
{"b": [1, 2, None, None], "a": [1, 2, 1, 2], "c": [None, None, 3, 4]},
|
|
columns=["a", "b", "c"],
|
|
)
|
|
if sort is False:
|
|
expected = expected[["b", "a", "c"]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_append_different_columns(self, sort):
|
|
df = DataFrame(
|
|
{
|
|
"bools": np.random.default_rng(2).standard_normal(10) > 0,
|
|
"ints": np.random.default_rng(2).integers(0, 10, 10),
|
|
"floats": np.random.default_rng(2).standard_normal(10),
|
|
"strings": ["foo", "bar"] * 5,
|
|
}
|
|
)
|
|
|
|
a = df[:5].loc[:, ["bools", "ints", "floats"]]
|
|
b = df[5:].loc[:, ["strings", "ints", "floats"]]
|
|
|
|
appended = a._append(b, sort=sort)
|
|
assert isna(appended["strings"][0:4]).all()
|
|
assert isna(appended["bools"][5:]).all()
|
|
|
|
def test_append_many(self, sort, float_frame):
|
|
chunks = [
|
|
float_frame[:5],
|
|
float_frame[5:10],
|
|
float_frame[10:15],
|
|
float_frame[15:],
|
|
]
|
|
|
|
result = chunks[0]._append(chunks[1:])
|
|
tm.assert_frame_equal(result, float_frame)
|
|
|
|
chunks[-1] = chunks[-1].copy()
|
|
chunks[-1]["foo"] = "bar"
|
|
result = chunks[0]._append(chunks[1:], sort=sort)
|
|
tm.assert_frame_equal(result.loc[:, float_frame.columns], float_frame)
|
|
assert (result["foo"][15:] == "bar").all()
|
|
assert result["foo"][:15].isna().all()
|
|
|
|
def test_append_preserve_index_name(self):
|
|
# #980
|
|
df1 = DataFrame(columns=["A", "B", "C"])
|
|
df1 = df1.set_index(["A"])
|
|
df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], columns=["A", "B", "C"])
|
|
df2 = df2.set_index(["A"])
|
|
|
|
msg = "The behavior of array concatenation with empty entries is deprecated"
|
|
with tm.assert_produces_warning(FutureWarning, match=msg):
|
|
result = df1._append(df2)
|
|
assert result.index.name == "A"
|
|
|
|
# One three-element index per kind; used as parametrize input by the
# column-type tests that follow.
indexes_can_append = [
    pd.RangeIndex(3),
    Index([4, 5, 6]),
    Index([4.5, 5.5, 6.5]),
    Index(["a", "b", "c"]),
    pd.CategoricalIndex(["A", "B", "C"]),
    pd.CategoricalIndex(["D", "E", "F"], ordered=True),
    pd.IntervalIndex.from_breaks([7, 8, 9, 10]),
    pd.DatetimeIndex(
        [
            dt.datetime(2013, 1, 3, hour, minute)
            for hour, minute in ((0, 0), (6, 10), (7, 12))
        ]
    ),
    pd.MultiIndex.from_arrays([["A", "B", "C"], ["D", "E", "F"]]),
]
|
|
|
|
@pytest.mark.parametrize(
    "index", indexes_can_append, ids=lambda x: type(x).__name__
)
def test_append_same_columns_type(self, index):
    # GH18359
    row_labels = [0, 1, 2]
    all_labels = index
    first_two = index[:2]

    # df wider than ser
    wide_df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=all_labels)
    short_ser = Series([7, 8], index=first_two, name=2)
    outcome = wide_df._append(short_ser)
    wanted = DataFrame(
        [[1, 2, 3.0], [4, 5, 6], [7, 8, np.nan]],
        index=row_labels,
        columns=all_labels,
    )
    # the two columns that are also labels of the series must be integer
    # typed in the expected frame
    for position in (0, 1):
        assert wanted.dtypes.iloc[position].kind == "i"

    tm.assert_frame_equal(outcome, wanted)

    # ser wider than df
    narrow_df = DataFrame([[1, 2], [4, 5]], columns=first_two)
    long_ser = Series([7, 8, 9], index=all_labels, name=2)
    outcome = narrow_df._append(long_ser)
    wanted = DataFrame(
        [[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]],
        index=row_labels,
        columns=all_labels,
    )
    tm.assert_frame_equal(outcome, wanted)
|
|
|
|
@pytest.mark.parametrize(
    "df_columns, series_index",
    combinations(indexes_can_append, r=2),
    ids=lambda x: type(x).__name__,
)
def test_append_different_columns_types(self, df_columns, series_index):
    # GH18359
    # Frame columns and series labels come from two different entries of
    # ``indexes_can_append``.
    missing = [np.nan] * 3

    frame = DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns)
    row = Series([7, 8, 9], index=series_index, name=2)

    outcome = frame._append(row)

    # expected columns: the frame's labels, then the series labels that
    # the frame does not already have
    only_in_row = row.index.difference(df_columns)
    all_columns = Index(df_columns.tolist()).append(only_in_row)
    wanted = DataFrame(
        [
            [1.0, 2.0, 3.0, *missing],
            [4, 5, 6, *missing],
            [*missing, 7, 8, 9],
        ],
        index=[0, 1, 2],
        columns=all_columns,
    )
    tm.assert_frame_equal(outcome, wanted)
|
|
|
|
def test_append_dtype_coerce(self, sort):
|
|
# GH 4993
|
|
# appending with datetime will incorrectly convert datetime64
|
|
|
|
df1 = DataFrame(
|
|
index=[1, 2],
|
|
data=[dt.datetime(2013, 1, 1, 0, 0), dt.datetime(2013, 1, 2, 0, 0)],
|
|
columns=["start_time"],
|
|
)
|
|
df2 = DataFrame(
|
|
index=[4, 5],
|
|
data=[
|
|
[dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 3, 6, 10)],
|
|
[dt.datetime(2013, 1, 4, 0, 0), dt.datetime(2013, 1, 4, 7, 10)],
|
|
],
|
|
columns=["start_time", "end_time"],
|
|
)
|
|
|
|
expected = concat(
|
|
[
|
|
Series(
|
|
[
|
|
pd.NaT,
|
|
pd.NaT,
|
|
dt.datetime(2013, 1, 3, 6, 10),
|
|
dt.datetime(2013, 1, 4, 7, 10),
|
|
],
|
|
name="end_time",
|
|
),
|
|
Series(
|
|
[
|
|
dt.datetime(2013, 1, 1, 0, 0),
|
|
dt.datetime(2013, 1, 2, 0, 0),
|
|
dt.datetime(2013, 1, 3, 0, 0),
|
|
dt.datetime(2013, 1, 4, 0, 0),
|
|
],
|
|
name="start_time",
|
|
),
|
|
],
|
|
axis=1,
|
|
sort=sort,
|
|
)
|
|
result = df1._append(df2, ignore_index=True, sort=sort)
|
|
if sort:
|
|
expected = expected[["end_time", "start_time"]]
|
|
else:
|
|
expected = expected[["start_time", "end_time"]]
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_append_missing_column_proper_upcast(self, sort):
|
|
df1 = DataFrame({"A": np.array([1, 2, 3, 4], dtype="i8")})
|
|
df2 = DataFrame({"B": np.array([True, False, True, False], dtype=bool)})
|
|
|
|
appended = df1._append(df2, ignore_index=True, sort=sort)
|
|
assert appended["A"].dtype == "f8"
|
|
assert appended["B"].dtype == "O"
|
|
|
|
def test_append_empty_frame_to_series_with_dateutil_tz(self):
|
|
# GH 23682
|
|
date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())
|
|
ser = Series({"a": 1.0, "b": 2.0, "date": date})
|
|
df = DataFrame(columns=["c", "d"])
|
|
result_a = df._append(ser, ignore_index=True)
|
|
expected = DataFrame(
|
|
[[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"]
|
|
)
|
|
# These columns get cast to object after append
|
|
expected["c"] = expected["c"].astype(object)
|
|
expected["d"] = expected["d"].astype(object)
|
|
tm.assert_frame_equal(result_a, expected)
|
|
|
|
expected = DataFrame(
|
|
[[np.nan, np.nan, 1.0, 2.0, date]] * 2, columns=["c", "d", "a", "b", "date"]
|
|
)
|
|
expected["c"] = expected["c"].astype(object)
|
|
expected["d"] = expected["d"].astype(object)
|
|
result_b = result_a._append(ser, ignore_index=True)
|
|
tm.assert_frame_equal(result_b, expected)
|
|
|
|
result = df._append([ser, ser], ignore_index=True)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_append_empty_tz_frame_with_datetime64ns(self, using_array_manager):
|
|
# https://github.com/pandas-dev/pandas/issues/35460
|
|
df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")
|
|
|
|
# pd.NaT gets inferred as tz-naive, so append result is tz-naive
|
|
result = df._append({"a": pd.NaT}, ignore_index=True)
|
|
if using_array_manager:
|
|
expected = DataFrame({"a": [pd.NaT]}, dtype=object)
|
|
else:
|
|
expected = DataFrame({"a": [np.nan]}, dtype=object)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# also test with typed value to append
|
|
df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")
|
|
other = Series({"a": pd.NaT}, dtype="datetime64[ns]")
|
|
result = df._append(other, ignore_index=True)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# mismatched tz
|
|
other = Series({"a": pd.NaT}, dtype="datetime64[ns, US/Pacific]")
|
|
result = df._append(other, ignore_index=True)
|
|
expected = DataFrame({"a": [pd.NaT]}).astype(object)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize(
|
|
"dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
|
|
)
|
|
@pytest.mark.parametrize("val", [1, "NaT"])
|
|
def test_append_empty_frame_with_timedelta64ns_nat(
|
|
self, dtype_str, val, using_array_manager
|
|
):
|
|
# https://github.com/pandas-dev/pandas/issues/35460
|
|
df = DataFrame(columns=["a"]).astype(dtype_str)
|
|
|
|
other = DataFrame({"a": [np.timedelta64(val, "ns")]})
|
|
result = df._append(other, ignore_index=True)
|
|
|
|
expected = other.astype(object)
|
|
if isinstance(val, str) and dtype_str != "int64" and not using_array_manager:
|
|
# TODO: expected used to be `other.astype(object)` which is a more
|
|
# reasonable result. This was changed when tightening
|
|
# assert_frame_equal's treatment of mismatched NAs to match the
|
|
# existing behavior.
|
|
expected = DataFrame({"a": [np.nan]}, dtype=object)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize(
|
|
"dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
|
|
)
|
|
@pytest.mark.parametrize("val", [1, "NaT"])
|
|
def test_append_frame_with_timedelta64ns_nat(self, dtype_str, val):
|
|
# https://github.com/pandas-dev/pandas/issues/35460
|
|
df = DataFrame({"a": pd.array([1], dtype=dtype_str)})
|
|
|
|
other = DataFrame({"a": [np.timedelta64(val, "ns")]})
|
|
result = df._append(other, ignore_index=True)
|
|
|
|
expected = DataFrame({"a": [df.iloc[0, 0], other.iloc[0, 0]]}, dtype=object)
|
|
tm.assert_frame_equal(result, expected)
|