You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
473 lines
17 KiB
473 lines
17 KiB
from copy import deepcopy
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from pandas.errors import PerformanceWarning
|
|
|
|
import pandas as pd
|
|
from pandas import (
|
|
DataFrame,
|
|
Index,
|
|
MultiIndex,
|
|
Series,
|
|
concat,
|
|
)
|
|
import pandas._testing as tm
|
|
|
|
|
|
class TestIndexConcat:
    """Checks on the index and columns ``concat`` produces for flat indexes."""

    def test_concat_ignore_index(self, sort):
        """Concat along axis=1 with ``ignore_index=True`` against a hand-built frame."""
        left = DataFrame(
            {"test1": ["a", "b", "c"], "test2": [1, 2, 3], "test3": [4.5, 3.2, 1.2]}
        )
        left.index = Index(["x", "y", "z"])
        right = DataFrame({"test3": [5.2, 2.2, 4.3]})
        right.index = Index(["x", "y", "q"])

        result = concat([left, right], axis=1, ignore_index=True, sort=sort)

        nan = np.nan
        rows = [
            [nan, nan, nan, 4.3],
            ["a", 1, 4.5, 5.2],
            ["b", 2, 3.2, 2.2],
            ["c", 3, 1.2, nan],
        ]
        expected = DataFrame(rows, index=Index(["q", "x", "y", "z"]))
        if not sort:
            # the unsorted result is expected in x, y, z, q row order
            expected = expected.loc[["x", "y", "z", "q"]]

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "name_in1,name_in2,name_in3,name_out",
        [
            ("idx", "idx", "idx", "idx"),
            ("idx", "idx", None, None),
            ("idx", None, None, None),
            ("idx1", "idx2", None, None),
            ("idx1", "idx1", "idx2", None),
            ("idx1", "idx2", "idx3", None),
            (None, None, None, None),
        ],
    )
    def test_concat_same_index_names(self, name_in1, name_in2, name_in3, name_out):
        """Check the result index name for each combination of input index names."""
        # GH13475
        idx_x = Index(["a", "b", "c"], name=name_in1)
        idx_y = Index(["b", "c", "d"], name=name_in2)
        idx_z = Index(["c", "d", "e"], name=name_in3)
        frames = [
            DataFrame({"x": [0, 1, 2]}, index=idx_x),
            DataFrame({"y": [0, 1, 2]}, index=idx_y),
            DataFrame({"z": [0, 1, 2]}, index=idx_z),
        ]
        result = concat(frames, axis=1)

        expected_columns = {
            "x": [0, 1, 2, np.nan, np.nan],
            "y": [np.nan, 0, 1, 2, np.nan],
            "z": [np.nan, np.nan, 0, 1, 2],
        }
        expected = DataFrame(
            expected_columns,
            index=Index(["a", "b", "c", "d", "e"], name=name_out),
        )

        tm.assert_frame_equal(result, expected)

    def test_concat_rename_index(self):
        """``names`` passed to concat should also rename the inner index level."""

        def _random_frame(index_name):
            # each call uses its own generator seeded with 2
            return DataFrame(
                np.random.default_rng(2).random((3, 3)),
                columns=list("ABC"),
                index=Index(list("abc"), name=index_name),
            )

        a = _random_frame("index_a")
        b = _random_frame("index_b")

        result = concat([a, b], keys=["key0", "key1"], names=["lvl0", "lvl1"])

        # build the expectation by naming only the outer level, then renaming
        # the second level by hand
        exp = concat([a, b], keys=["key0", "key1"], names=["lvl0"])
        level_names = list(exp.index.names)
        level_names[1] = "lvl1"
        exp.index.set_names(level_names, inplace=True)

        tm.assert_frame_equal(result, exp)
        assert result.index.names == exp.index.names

    def test_concat_copy_index_series(self, axis, using_copy_on_write):
        """Identity of the result index relative to the input Series index."""
        # GH 29879
        ser = Series([1, 2])
        comb = concat([ser, ser], axis=axis, copy=True)

        same_index_expected = using_copy_on_write and axis not in [0, "index"]
        if same_index_expected:
            assert comb.index is ser.index
        else:
            assert comb.index is not ser.index

    def test_concat_copy_index_frame(self, axis, using_copy_on_write):
        """Which axes of the result are views of the input DataFrame's axes."""
        # GH 29879
        df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
        comb = concat([df, df], axis=axis, copy=True)

        if not using_copy_on_write:
            assert not comb.index.is_(df.index)
            assert not comb.columns.is_(df.columns)
            return

        if axis in [0, "index"]:
            assert not comb.index.is_(df.index)
            assert comb.columns.is_(df.columns)
        elif axis in [1, "columns"]:
            assert comb.index.is_(df.index)
            assert not comb.columns.is_(df.columns)

    def test_default_index(self):
        """Results that get a default axis should carry a RangeIndex there."""
        # use check_index_type=True / check_column_type=True to check the
        # result has RangeIndex (default index)
        strict = {"check_index_type": True, "check_column_type": True}

        # is_series and ignore_index
        named_x = Series([1, 2, 3], name="x")
        named_y = Series([4, 5, 6], name="y")
        res = concat([named_x, named_y], axis=1, ignore_index=True)
        assert isinstance(res.columns, pd.RangeIndex)
        exp = DataFrame([[1, 4], [2, 5], [3, 6]])
        tm.assert_frame_equal(res, exp, **strict)

        # is_series and all inputs have no names
        unnamed_1 = Series([1, 2, 3])
        unnamed_2 = Series([4, 5, 6])
        res = concat([unnamed_1, unnamed_2], axis=1, ignore_index=False)
        assert isinstance(res.columns, pd.RangeIndex)
        exp = DataFrame([[1, 4], [2, 5], [3, 6]])
        exp.columns = pd.RangeIndex(2)
        tm.assert_frame_equal(res, exp, **strict)

        # is_dataframe and ignore_index
        df1 = DataFrame({"A": [1, 2], "B": [5, 6]})
        df2 = DataFrame({"A": [3, 4], "B": [7, 8]})

        res = concat([df1, df2], axis=0, ignore_index=True)
        exp = DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], columns=["A", "B"])
        tm.assert_frame_equal(res, exp, **strict)

        res = concat([df1, df2], axis=1, ignore_index=True)
        exp = DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]])
        tm.assert_frame_equal(res, exp, **strict)

    def test_dups_index(self):
        """Concat and ``_append`` on frames whose column labels are duplicated."""
        # GH 4771

        # single dtypes
        int_values = np.random.default_rng(2).integers(0, 10, size=40).reshape(10, 4)
        df = DataFrame(int_values, columns=["A", "A", "C", "C"])

        wide = concat([df, df], axis=1)
        tm.assert_frame_equal(wide.iloc[:, :4], df)
        tm.assert_frame_equal(wide.iloc[:, 4:], df)

        tall = concat([df, df], axis=0)
        tm.assert_frame_equal(tall.iloc[:10], df)
        tm.assert_frame_equal(tall.iloc[10:], df)

        # multi dtypes
        float_part = DataFrame(
            np.random.default_rng(2).standard_normal((10, 4)),
            columns=["A", "A", "B", "B"],
        )
        int_part = DataFrame(
            np.random.default_rng(2).integers(0, 10, size=20).reshape(10, 2),
            columns=["A", "C"],
        )
        df = concat([float_part, int_part], axis=1)

        wide = concat([df, df], axis=1)
        tm.assert_frame_equal(wide.iloc[:, :6], df)
        tm.assert_frame_equal(wide.iloc[:, 6:], df)

        tall = concat([df, df], axis=0)
        tm.assert_frame_equal(tall.iloc[:10], df)
        tm.assert_frame_equal(tall.iloc[10:], df)

        # append
        head = df.iloc[0:8, :]
        result = head._append(df.iloc[8:])
        tm.assert_frame_equal(result, df)

        result = df.iloc[0:8, :]._append(df.iloc[8:9])._append(df.iloc[9:10])
        tm.assert_frame_equal(result, df)

        expected = concat([df, df], axis=0)
        result = df._append(df)
        tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
class TestMultiIndexConcat:
    """Checks on ``concat`` results that involve MultiIndex rows or columns."""

    def test_concat_multiindex_with_keys(self, multiindex_dataframe_random_data):
        """``keys`` adds an outer level named via ``names`` above the existing levels."""
        frame = multiindex_dataframe_random_data
        original_index = frame.index

        result = concat([frame, frame], keys=[0, 1], names=["iteration"])

        assert result.index.names == ("iteration",) + original_index.names
        for key in (0, 1):
            tm.assert_frame_equal(result.loc[key], frame)
        assert result.index.nlevels == 3

    def test_concat_multiindex_with_none_in_index_names(self):
        """An unnamed (``None``) input level stays unnamed when keys are added."""
        # GH 15787
        base_index = MultiIndex.from_product([[1], range(5)], names=["level1", None])
        df = DataFrame({"col": range(5)}, index=base_index, dtype=np.int32)
        out_names = ["level2", "level1", None]

        # two inputs of equal length
        result = concat([df, df], keys=[1, 2], names=["level2"])
        exp_index = MultiIndex.from_product([[1, 2], [1], range(5)], names=out_names)
        expected = DataFrame(
            {"col": list(range(5)) * 2}, index=exp_index, dtype=np.int32
        )
        tm.assert_frame_equal(result, expected)

        # second input truncated to its first two rows
        result = concat([df, df[:2]], keys=[1, 2], names=["level2"])
        level2 = [1] * 5 + [2] * 2
        level1 = [1] * 7
        no_name = list(range(5)) + list(range(2))
        exp_index = MultiIndex.from_tuples(
            list(zip(level2, level1, no_name)), names=out_names
        )
        expected = DataFrame({"col": no_name}, index=exp_index, dtype=np.int32)
        tm.assert_frame_equal(result, expected)

    def test_concat_multiindex_rangeindex(self):
        """Concat a 3-row slice with a 1-row slice of a RangeIndex-level frame."""
        # GH13542
        # when multi-index levels are RangeIndex objects
        # there is a bug in concat with objects of len 1
        df = DataFrame(np.random.default_rng(2).standard_normal((9, 2)))
        outer_codes = np.repeat(np.arange(3), 3)
        inner_codes = np.tile(np.arange(3), 3)
        df.index = MultiIndex(
            levels=[pd.RangeIndex(3), pd.RangeIndex(3)],
            codes=[outer_codes, inner_codes],
        )

        res = concat([df.iloc[[2, 3, 4], :], df.iloc[[5], :]])
        exp = df.iloc[[2, 3, 4, 5], :]
        tm.assert_frame_equal(res, exp)

    def test_concat_multiindex_dfs_with_deepcopy(self):
        """Concat of a dict of frames gives the same result for a deep-copied dict."""
        # GH 9967
        frame_b = DataFrame([0], index=MultiIndex.from_product([["a"], ["b"]]))
        frame_c = DataFrame([1], index=MultiIndex.from_product([["a"], ["c"]]))
        example_dict = {"s1": frame_b, "s2": frame_c}

        expected_index = MultiIndex(
            levels=[["s1", "s2"], ["a"], ["b", "c"]],
            codes=[[0, 1], [0, 0], [0, 1]],
            names=["testname", None, None],
        )
        expected = DataFrame([[0], [1]], index=expected_index)

        result_copy = concat(deepcopy(example_dict), names=["testname"])
        tm.assert_frame_equal(result_copy, expected)

        result_no_copy = concat(example_dict, names=["testname"])
        tm.assert_frame_equal(result_no_copy, expected)

    @pytest.mark.parametrize(
        "mi1_list",
        [
            [["a"], range(2)],
            [["b"], np.arange(2.0, 4.0)],
            [["c"], ["A", "B"]],
            [["d"], pd.date_range(start="2017", end="2018", periods=2)],
        ],
    )
    @pytest.mark.parametrize(
        "mi2_list",
        [
            [["a"], range(2)],
            [["b"], np.arange(2.0, 4.0)],
            [["c"], ["A", "B"]],
            [["d"], pd.date_range(start="2017", end="2018", periods=2)],
        ],
    )
    def test_concat_with_various_multiindex_dtypes(
        self, mi1_list: list, mi2_list: list
    ):
        """Column-wise concat over MultiIndex columns of assorted level dtypes."""
        # GitHub #23478
        mi1 = MultiIndex.from_product(mi1_list)
        mi2 = MultiIndex.from_product(mi2_list)

        df1 = DataFrame(np.zeros((1, len(mi1))), columns=mi1)
        df2 = DataFrame(np.zeros((1, len(mi2))), columns=mi2)

        same_outer_level = mi1_list[0] == mi2_list[0]
        if same_outer_level:
            # both inputs share their levels, so the codes simply repeat
            expected_mi = MultiIndex(
                levels=[mi1_list[0], list(mi1_list[1])],
                codes=[[0, 0, 0, 0], [0, 1, 0, 1]],
            )
        else:
            outer_level = mi1_list[0] + mi2_list[0]
            inner_level = list(mi1_list[1]) + list(mi2_list[1])
            expected_mi = MultiIndex(
                levels=[outer_level, inner_level],
                codes=[[0, 0, 1, 1], [0, 1, 2, 3]],
            )

        expected_df = DataFrame(np.zeros((1, len(expected_mi))), columns=expected_mi)

        # the concat itself must not emit any warning
        with tm.assert_produces_warning(None):
            result_df = concat((df1, df2), axis=1)

        tm.assert_frame_equal(expected_df, result_df)

    def test_concat_multiindex_(self):
        """Single-frame concat with ``keys`` on an index with duplicate labels."""
        # GitHub #44786
        df = DataFrame({"col": ["a", "b", "c"]}, index=["1", "2", "2"])
        df = concat([df], keys=["X"])

        iterables = [["X"], ["1", "2", "2"]]
        expected_index = MultiIndex.from_product(iterables)
        tm.assert_index_equal(df.index, expected_index)

        expected_df = DataFrame(
            {"col": ["a", "b", "c"]}, index=MultiIndex.from_product(iterables)
        )
        tm.assert_frame_equal(df, expected_df)

    def test_concat_with_key_not_unique(self):
        """Repeated ``keys`` give the same ``.loc`` lookups as a hand-built index."""
        # GitHub #46519
        # the warning is caused by indexing unsorted multi-index
        lexsort_msg = "indexing past lexsort depth"

        # one row per input frame
        singles = [DataFrame({"name": [value]}) for value in (1, 2, 3)]
        df_a = concat(singles, keys=["x", "y", "x"])
        with tm.assert_produces_warning(PerformanceWarning, match=lexsort_msg):
            out_a = df_a.loc[("x", 0), :]

        df_b = DataFrame(
            {"name": [1, 2, 3]}, index=Index([("x", 0), ("y", 0), ("x", 0)])
        )
        with tm.assert_produces_warning(PerformanceWarning, match=lexsort_msg):
            out_b = df_b.loc[("x", 0)]

        tm.assert_frame_equal(out_a, out_b)

        # several rows per input frame
        df1 = DataFrame({"name": ["a", "a", "b"]})
        df2 = DataFrame({"name": ["a", "b"]})
        df3 = DataFrame({"name": ["c", "d"]})
        df_a = concat([df1, df2, df3], keys=["x", "y", "x"])
        with tm.assert_produces_warning(PerformanceWarning, match=lexsort_msg):
            out_a = df_a.loc[("x", 0), :]

        flat = DataFrame(
            {
                "a": ["x", "x", "x", "y", "y", "x", "x"],
                "b": [0, 1, 2, 0, 1, 0, 1],
                "name": list("aababcd"),
            }
        )
        df_b = flat.set_index(["a", "b"])
        df_b.index.names = [None, None]
        with tm.assert_produces_warning(PerformanceWarning, match=lexsort_msg):
            out_b = df_b.loc[("x", 0), :]

        tm.assert_frame_equal(out_a, out_b)

    def test_concat_with_duplicated_levels(self):
        """Duplicate values inside ``levels`` must raise ``ValueError``."""
        # keyword levels should be unique
        df1 = DataFrame({"A": [1]}, index=["x"])
        df2 = DataFrame({"A": [1]}, index=["y"])
        msg = r"Level values not unique: \['x', 'y', 'y'\]"
        with pytest.raises(ValueError, match=msg):
            concat([df1, df2], keys=["x", "y"], levels=[["x", "y", "y"]])

    @pytest.mark.parametrize("levels", [[["x", "y"]], [["x", "y", "y"]]])
    def test_concat_with_levels_with_none_keys(self, levels):
        """Passing ``levels`` without ``keys`` must raise ``ValueError``."""
        df1 = DataFrame({"A": [1]}, index=["x"])
        df2 = DataFrame({"A": [1]}, index=["y"])
        msg = "levels supported only when keys is not None"
        with pytest.raises(ValueError, match=msg):
            concat([df1, df2], levels=levels)

    def test_concat_range_index_result(self):
        """Axis=1 concat with ``sort=True`` should still return a RangeIndex."""
        # GH#47501
        df1 = DataFrame({"a": [1, 2]})
        df2 = DataFrame({"b": [1, 2]})

        result = concat([df1, df2], sort=True, axis=1)

        expected = DataFrame({"a": [1, 2], "b": [1, 2]})
        tm.assert_frame_equal(result, expected)
        tm.assert_index_equal(result.index, pd.RangeIndex(0, 2), exact=True)

    def test_concat_index_keep_dtype(self):
        """Object-dtype column labels keep their dtype through an outer concat."""
        # GH#47329
        wide_cols = Index([1, 2, 3], dtype="object")
        narrow_cols = Index([1, 2], dtype="object")
        df1 = DataFrame([[0, 1, 1]], columns=wide_cols)
        df2 = DataFrame([[0, 1]], columns=narrow_cols)

        result = concat([df1, df2], ignore_index=True, join="outer", sort=True)

        expected = DataFrame(
            [[0, 1, 1.0], [0, 1, np.nan]], columns=Index([1, 2, 3], dtype="object")
        )
        tm.assert_frame_equal(result, expected)

    def test_concat_index_keep_dtype_ea_numeric(self, any_numeric_ea_dtype):
        """Column labels built with the fixture dtype keep it through an outer concat."""
        # GH#47329
        wide_cols = Index([1, 2, 3], dtype=any_numeric_ea_dtype)
        narrow_cols = Index([1, 2], dtype=any_numeric_ea_dtype)
        df1 = DataFrame([[0, 1, 1]], columns=wide_cols)
        df2 = DataFrame([[0, 1]], columns=narrow_cols)

        result = concat([df1, df2], ignore_index=True, join="outer", sort=True)

        expected = DataFrame(
            [[0, 1, 1.0], [0, 1, np.nan]],
            columns=Index([1, 2, 3], dtype=any_numeric_ea_dtype),
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("dtype", ["Int8", "Int16", "Int32"])
    def test_concat_index_find_common(self, dtype):
        """Mixing ``dtype`` and ``Int32`` column labels should give ``Int32`` labels."""
        # GH#47329
        df1 = DataFrame([[0, 1, 1]], columns=Index([1, 2, 3], dtype=dtype))
        df2 = DataFrame([[0, 1]], columns=Index([1, 2], dtype="Int32"))

        result = concat([df1, df2], ignore_index=True, join="outer", sort=True)

        expected_cols = Index([1, 2, 3], dtype="Int32")
        expected = DataFrame([[0, 1, 1.0], [0, 1, np.nan]], columns=expected_cols)
        tm.assert_frame_equal(result, expected)

    def test_concat_axis_1_sort_false_rangeindex(self, using_infer_string):
        """Axis=1 outer concat of unequal-length Series, one of them empty."""
        # GH 46675
        str_dtype = object if not using_infer_string else "string[pyarrow_numpy]"

        s1 = Series(["a", "b", "c"])
        s2 = Series(["a", "b"])
        s3 = Series(["a", "b", "c", "d"])
        s4 = Series([], dtype=str_dtype)

        result = concat(
            [s1, s2, s3, s4], sort=False, join="outer", ignore_index=False, axis=1
        )

        expected_rows = [
            ["a"] * 3 + [np.nan],
            ["b"] * 3 + [np.nan],
            ["c", np.nan] * 2,
            [np.nan] * 2 + ["d"] + [np.nan],
        ]
        expected = DataFrame(expected_rows, dtype=str_dtype)
        tm.assert_frame_equal(
            result, expected, check_index_type=True, check_column_type=True
        )
|