You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
296 lines
10 KiB
296 lines
10 KiB
import numpy as np
|
|
import pytest
|
|
|
|
import pandas as pd
|
|
from pandas import (
|
|
DataFrame,
|
|
RangeIndex,
|
|
Series,
|
|
concat,
|
|
date_range,
|
|
)
|
|
import pandas._testing as tm
|
|
|
|
|
|
class TestEmptyConcat:
|
|
def test_handle_empty_objects(self, sort, using_infer_string):
|
|
df = DataFrame(
|
|
np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd")
|
|
)
|
|
|
|
dfcopy = df[:5].copy()
|
|
dfcopy["foo"] = "bar"
|
|
empty = df[5:5]
|
|
|
|
frames = [dfcopy, empty, empty, df[5:]]
|
|
concatted = concat(frames, axis=0, sort=sort)
|
|
|
|
expected = df.reindex(columns=["a", "b", "c", "d", "foo"])
|
|
expected["foo"] = expected["foo"].astype(
|
|
object if not using_infer_string else "string[pyarrow_numpy]"
|
|
)
|
|
expected.loc[0:4, "foo"] = "bar"
|
|
|
|
tm.assert_frame_equal(concatted, expected)
|
|
|
|
# empty as first element with time series
|
|
# GH3259
|
|
df = DataFrame(
|
|
{"A": range(10000)}, index=date_range("20130101", periods=10000, freq="s")
|
|
)
|
|
empty = DataFrame()
|
|
result = concat([df, empty], axis=1)
|
|
tm.assert_frame_equal(result, df)
|
|
result = concat([empty, df], axis=1)
|
|
tm.assert_frame_equal(result, df)
|
|
|
|
result = concat([df, empty])
|
|
tm.assert_frame_equal(result, df)
|
|
result = concat([empty, df])
|
|
tm.assert_frame_equal(result, df)
|
|
|
|
def test_concat_empty_series(self):
|
|
# GH 11082
|
|
s1 = Series([1, 2, 3], name="x")
|
|
s2 = Series(name="y", dtype="float64")
|
|
res = concat([s1, s2], axis=1)
|
|
exp = DataFrame(
|
|
{"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan]},
|
|
index=RangeIndex(3),
|
|
)
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
s1 = Series([1, 2, 3], name="x")
|
|
s2 = Series(name="y", dtype="float64")
|
|
msg = "The behavior of array concatenation with empty entries is deprecated"
|
|
with tm.assert_produces_warning(FutureWarning, match=msg):
|
|
res = concat([s1, s2], axis=0)
|
|
# name will be reset
|
|
exp = Series([1, 2, 3])
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
# empty Series with no name
|
|
s1 = Series([1, 2, 3], name="x")
|
|
s2 = Series(name=None, dtype="float64")
|
|
res = concat([s1, s2], axis=1)
|
|
exp = DataFrame(
|
|
{"x": [1, 2, 3], 0: [np.nan, np.nan, np.nan]},
|
|
columns=["x", 0],
|
|
index=RangeIndex(3),
|
|
)
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
@pytest.mark.parametrize("tz", [None, "UTC"])
|
|
@pytest.mark.parametrize("values", [[], [1, 2, 3]])
|
|
def test_concat_empty_series_timelike(self, tz, values):
|
|
# GH 18447
|
|
|
|
first = Series([], dtype="M8[ns]").dt.tz_localize(tz)
|
|
dtype = None if values else np.float64
|
|
second = Series(values, dtype=dtype)
|
|
|
|
expected = DataFrame(
|
|
{
|
|
0: Series([pd.NaT] * len(values), dtype="M8[ns]").dt.tz_localize(tz),
|
|
1: values,
|
|
}
|
|
)
|
|
result = concat([first, second], axis=1)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize(
|
|
"left,right,expected",
|
|
[
|
|
# booleans
|
|
(np.bool_, np.int32, np.object_), # changed from int32 in 2.0 GH#39817
|
|
(np.bool_, np.float32, np.object_),
|
|
# datetime-like
|
|
("m8[ns]", np.bool_, np.object_),
|
|
("m8[ns]", np.int64, np.object_),
|
|
("M8[ns]", np.bool_, np.object_),
|
|
("M8[ns]", np.int64, np.object_),
|
|
# categorical
|
|
("category", "category", "category"),
|
|
("category", "object", "object"),
|
|
],
|
|
)
|
|
def test_concat_empty_series_dtypes(self, left, right, expected):
|
|
# GH#39817, GH#45101
|
|
result = concat([Series(dtype=left), Series(dtype=right)])
|
|
assert result.dtype == expected
|
|
|
|
@pytest.mark.parametrize(
|
|
"dtype", ["float64", "int8", "uint8", "bool", "m8[ns]", "M8[ns]"]
|
|
)
|
|
def test_concat_empty_series_dtypes_match_roundtrips(self, dtype):
|
|
dtype = np.dtype(dtype)
|
|
|
|
result = concat([Series(dtype=dtype)])
|
|
assert result.dtype == dtype
|
|
|
|
result = concat([Series(dtype=dtype), Series(dtype=dtype)])
|
|
assert result.dtype == dtype
|
|
|
|
@pytest.mark.parametrize("dtype", ["float64", "int8", "uint8", "m8[ns]", "M8[ns]"])
|
|
@pytest.mark.parametrize(
|
|
"dtype2",
|
|
["float64", "int8", "uint8", "m8[ns]", "M8[ns]"],
|
|
)
|
|
def test_concat_empty_series_dtypes_roundtrips(self, dtype, dtype2):
|
|
# round-tripping with self & like self
|
|
if dtype == dtype2:
|
|
pytest.skip("same dtype is not applicable for test")
|
|
|
|
def int_result_type(dtype, dtype2):
|
|
typs = {dtype.kind, dtype2.kind}
|
|
if not len(typs - {"i", "u", "b"}) and (
|
|
dtype.kind == "i" or dtype2.kind == "i"
|
|
):
|
|
return "i"
|
|
elif not len(typs - {"u", "b"}) and (
|
|
dtype.kind == "u" or dtype2.kind == "u"
|
|
):
|
|
return "u"
|
|
return None
|
|
|
|
def float_result_type(dtype, dtype2):
|
|
typs = {dtype.kind, dtype2.kind}
|
|
if not len(typs - {"f", "i", "u"}) and (
|
|
dtype.kind == "f" or dtype2.kind == "f"
|
|
):
|
|
return "f"
|
|
return None
|
|
|
|
def get_result_type(dtype, dtype2):
|
|
result = float_result_type(dtype, dtype2)
|
|
if result is not None:
|
|
return result
|
|
result = int_result_type(dtype, dtype2)
|
|
if result is not None:
|
|
return result
|
|
return "O"
|
|
|
|
dtype = np.dtype(dtype)
|
|
dtype2 = np.dtype(dtype2)
|
|
expected = get_result_type(dtype, dtype2)
|
|
result = concat([Series(dtype=dtype), Series(dtype=dtype2)]).dtype
|
|
assert result.kind == expected
|
|
|
|
def test_concat_empty_series_dtypes_triple(self):
|
|
assert (
|
|
concat(
|
|
[Series(dtype="M8[ns]"), Series(dtype=np.bool_), Series(dtype=np.int64)]
|
|
).dtype
|
|
== np.object_
|
|
)
|
|
|
|
def test_concat_empty_series_dtype_category_with_array(self):
|
|
# GH#18515
|
|
assert (
|
|
concat(
|
|
[Series(np.array([]), dtype="category"), Series(dtype="float64")]
|
|
).dtype
|
|
== "float64"
|
|
)
|
|
|
|
def test_concat_empty_series_dtypes_sparse(self):
|
|
result = concat(
|
|
[
|
|
Series(dtype="float64").astype("Sparse"),
|
|
Series(dtype="float64").astype("Sparse"),
|
|
]
|
|
)
|
|
assert result.dtype == "Sparse[float64]"
|
|
|
|
result = concat(
|
|
[Series(dtype="float64").astype("Sparse"), Series(dtype="float64")]
|
|
)
|
|
expected = pd.SparseDtype(np.float64)
|
|
assert result.dtype == expected
|
|
|
|
result = concat(
|
|
[Series(dtype="float64").astype("Sparse"), Series(dtype="object")]
|
|
)
|
|
expected = pd.SparseDtype("object")
|
|
assert result.dtype == expected
|
|
|
|
def test_concat_empty_df_object_dtype(self):
|
|
# GH 9149
|
|
df_1 = DataFrame({"Row": [0, 1, 1], "EmptyCol": np.nan, "NumberCol": [1, 2, 3]})
|
|
df_2 = DataFrame(columns=df_1.columns)
|
|
result = concat([df_1, df_2], axis=0)
|
|
expected = df_1.astype(object)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_concat_empty_dataframe_dtypes(self):
|
|
df = DataFrame(columns=list("abc"))
|
|
df["a"] = df["a"].astype(np.bool_)
|
|
df["b"] = df["b"].astype(np.int32)
|
|
df["c"] = df["c"].astype(np.float64)
|
|
|
|
result = concat([df, df])
|
|
assert result["a"].dtype == np.bool_
|
|
assert result["b"].dtype == np.int32
|
|
assert result["c"].dtype == np.float64
|
|
|
|
result = concat([df, df.astype(np.float64)])
|
|
assert result["a"].dtype == np.object_
|
|
assert result["b"].dtype == np.float64
|
|
assert result["c"].dtype == np.float64
|
|
|
|
def test_concat_inner_join_empty(self):
|
|
# GH 15328
|
|
df_empty = DataFrame()
|
|
df_a = DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64")
|
|
df_expected = DataFrame({"a": []}, index=RangeIndex(0), dtype="int64")
|
|
|
|
result = concat([df_a, df_empty], axis=1, join="inner")
|
|
tm.assert_frame_equal(result, df_expected)
|
|
|
|
result = concat([df_a, df_empty], axis=1, join="outer")
|
|
tm.assert_frame_equal(result, df_a)
|
|
|
|
def test_empty_dtype_coerce(self):
|
|
# xref to #12411
|
|
# xref to #12045
|
|
# xref to #11594
|
|
# see below
|
|
|
|
# 10571
|
|
df1 = DataFrame(data=[[1, None], [2, None]], columns=["a", "b"])
|
|
df2 = DataFrame(data=[[3, None], [4, None]], columns=["a", "b"])
|
|
result = concat([df1, df2])
|
|
expected = df1.dtypes
|
|
tm.assert_series_equal(result.dtypes, expected)
|
|
|
|
def test_concat_empty_dataframe(self):
|
|
# 39037
|
|
df1 = DataFrame(columns=["a", "b"])
|
|
df2 = DataFrame(columns=["b", "c"])
|
|
result = concat([df1, df2, df1])
|
|
expected = DataFrame(columns=["a", "b", "c"])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
df3 = DataFrame(columns=["a", "b"])
|
|
df4 = DataFrame(columns=["b"])
|
|
result = concat([df3, df4])
|
|
expected = DataFrame(columns=["a", "b"])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_concat_empty_dataframe_different_dtypes(self, using_infer_string):
|
|
# 39037
|
|
df1 = DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
|
|
df2 = DataFrame({"a": [1, 2, 3]})
|
|
|
|
result = concat([df1[:0], df2[:0]])
|
|
assert result["a"].dtype == np.int64
|
|
assert result["b"].dtype == np.object_ if not using_infer_string else "string"
|
|
|
|
def test_concat_to_empty_ea(self):
|
|
"""48510 `concat` to an empty EA should maintain type EA dtype."""
|
|
df_empty = DataFrame({"a": pd.array([], dtype=pd.Int64Dtype())})
|
|
df_new = DataFrame({"a": pd.array([1, 2, 3], dtype=pd.Int64Dtype())})
|
|
expected = df_new.copy()
|
|
result = concat([df_empty, df_new])
|
|
tm.assert_frame_equal(result, expected)
|