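"""
Tests for pd.concat with datetime64, timezone-aware, NaT, Period and
timedelta64 data (module summary added for orientation).
"""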
import datetime as dt
from datetime import datetime

import dateutil
import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    DatetimeIndex,
    Index,
    MultiIndex,
    Series,
    Timestamp,
    concat,
    date_range,
    to_timedelta,
)
import pandas._testing as tm


class TestDatetimeConcat:
    def test_concat_datetime64_block(self):
        rng = date_range("1/1/2000", periods=10)
        df = DataFrame({"time": rng})
        result = concat([df, df])
        assert (result.iloc[:10]["time"] == rng).all()
        assert (result.iloc[10:]["time"] == rng).all()

    def test_concat_datetime_datetime64_frame(self):
        # GH#2624
        rows = []
        rows.append([datetime(2010, 1, 1), 1])
        rows.append([datetime(2010, 1, 2), "hi"])
        df2_obj = DataFrame.from_records(rows, columns=["date", "test"])
        ind = date_range(start="2000/1/1", freq="D", periods=10)
        df1 = DataFrame({"date": ind, "test": range(10)})
        # it works!
        concat([df1, df2_obj])

    def test_concat_datetime_timezone(self):
        # GH 18523
        idx1 = date_range("2011-01-01", periods=3, freq="h", tz="Europe/Paris")
        idx2 = date_range(start=idx1[0], end=idx1[-1], freq="h")
        df1 = DataFrame({"a": [1, 2, 3]}, index=idx1)
        df2 = DataFrame({"b": [1, 2, 3]}, index=idx2)
        result = concat([df1, df2], axis=1)

        exp_idx = DatetimeIndex(
            [
                "2011-01-01 00:00:00+01:00",
                "2011-01-01 01:00:00+01:00",
                "2011-01-01 02:00:00+01:00",
            ],
            dtype="M8[ns, Europe/Paris]",
            freq="h",
        )
        expected = DataFrame(
            [[1, 1], [2, 2], [3, 3]], index=exp_idx, columns=["a", "b"]
        )
        tm.assert_frame_equal(result, expected)

        idx3 = date_range("2011-01-01", periods=3, freq="h", tz="Asia/Tokyo")
        df3 = DataFrame({"b": [1, 2, 3]}, index=idx3)
        result = concat([df1, df3], axis=1)

        exp_idx = DatetimeIndex(
            [
                "2010-12-31 15:00:00+00:00",
                "2010-12-31 16:00:00+00:00",
                "2010-12-31 17:00:00+00:00",
                "2010-12-31 23:00:00+00:00",
                "2011-01-01 00:00:00+00:00",
                "2011-01-01 01:00:00+00:00",
            ]
        ).as_unit("ns")
        expected = DataFrame(
            [
                [np.nan, 1],
                [np.nan, 2],
                [np.nan, 3],
                [1, np.nan],
                [2, np.nan],
                [3, np.nan],
            ],
            index=exp_idx,
            columns=["a", "b"],
        )
        tm.assert_frame_equal(result, expected)

        # GH 13783: Concat after resample
        result = concat(
            [df1.resample("h").mean(), df2.resample("h").mean()], sort=True
        )
        expected = DataFrame(
            {"a": [1, 2, 3] + [np.nan] * 3, "b": [np.nan] * 3 + [1, 2, 3]},
            index=idx1.append(idx1),
        )
        tm.assert_frame_equal(result, expected)

    def test_concat_datetimeindex_freq(self):
        # GH 3232
        # Monotonic index result
        dr = date_range("01-Jan-2013", periods=100, freq="50ms", tz="UTC")
        data = list(range(100))
        expected = DataFrame(data, index=dr)
        result = concat([expected[:50], expected[50:]])
        tm.assert_frame_equal(result, expected)

        # Non-monotonic index result
        result = concat([expected[50:], expected[:50]])
        expected = DataFrame(data[50:] + data[:50], index=dr[50:].append(dr[:50]))
        expected.index._data.freq = None
        tm.assert_frame_equal(result, expected)

    def test_concat_multiindex_datetime_object_index(self):
        # https://github.com/pandas-dev/pandas/issues/11058
        idx = Index(
            [dt.date(2013, 1, 1), dt.date(2014, 1, 1), dt.date(2015, 1, 1)],
            dtype="object",
        )

        s = Series(
            ["a", "b"],
            index=MultiIndex.from_arrays(
                [
                    [1, 2],
                    idx[:-1],
                ],
                names=["first", "second"],
            ),
        )
        s2 = Series(
            ["a", "b"],
            index=MultiIndex.from_arrays(
                [[1, 2], idx[::2]],
                names=["first", "second"],
            ),
        )
        mi = MultiIndex.from_arrays(
            [[1, 2, 2], idx],
            names=["first", "second"],
        )
        assert mi.levels[1].dtype == object

        expected = DataFrame(
            [["a", "a"], ["b", np.nan], [np.nan, "b"]],
            index=mi,
        )
        result = concat([s, s2], axis=1)
        tm.assert_frame_equal(result, expected)

    def test_concat_NaT_series(self):
        # GH 11693
        # test for merging NaT series with datetime series.
        x = Series(
            date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="US/Eastern")
        )
        y = Series(pd.NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]")
        expected = Series([x[0], x[1], pd.NaT, pd.NaT])
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)

        # all NaT with tz
        expected = Series(pd.NaT, index=range(4), dtype="datetime64[ns, US/Eastern]")
        result = concat([y, y], ignore_index=True)
        tm.assert_series_equal(result, expected)

    def test_concat_NaT_series2(self):
        # without tz
        x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h"))
        y = Series(date_range("20151124 10:00", "20151124 11:00", freq="1h"))
        y[:] = pd.NaT
        expected = Series([x[0], x[1], pd.NaT, pd.NaT])
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)

        # all NaT without tz
        x[:] = pd.NaT
        expected = Series(pd.NaT, index=range(4), dtype="datetime64[ns]")
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("tz", [None, "UTC"])
    def test_concat_NaT_dataframes(self, tz):
        # GH 12396
        dti = DatetimeIndex([pd.NaT, pd.NaT], tz=tz)
        first = DataFrame({0: dti})
        second = DataFrame(
            [[Timestamp("2015/01/01", tz=tz)], [Timestamp("2016/01/01", tz=tz)]],
            index=[2, 3],
        )
        expected = DataFrame(
            [
                pd.NaT,
                pd.NaT,
                Timestamp("2015/01/01", tz=tz),
                Timestamp("2016/01/01", tz=tz),
            ]
        )
        result = concat([first, second], axis=0)
        tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("tz1", [None, "UTC"])
@pytest.mark.parametrize("tz2", [None, "UTC"])
@pytest.mark.parametrize("item", [pd.NaT, Timestamp("20150101")])
def test_concat_NaT_dataframes_all_NaT_axis_0(
self, tz1, tz2, item, using_array_manager
):
# GH 12396
# tz-naive
first = DataFrame([[pd.NaT], [pd.NaT]]).apply(lambda x: x.dt.tz_localize(tz1))
second = DataFrame([item]).apply(lambda x: x.dt.tz_localize(tz2))
result = concat([first, second], axis=0)
expected = DataFrame(Series([pd.NaT, pd.NaT, item], index=[0, 1, 0]))
expected = expected.apply(lambda x: x.dt.tz_localize(tz2))
if tz1 != tz2:
expected = expected.astype(object)
if item is pd.NaT and not using_array_manager:
# GH#18463
# TODO: setting nan here is to keep the test passing as we
# make assert_frame_equal stricter, but is nan really the
# ideal behavior here?
if tz1 is not None:
expected.iloc[-1, 0] = np.nan
else:
expected.iloc[:-1, 0] = np.nan
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("tz1", [None, "UTC"])
@pytest.mark.parametrize("tz2", [None, "UTC"])
def test_concat_NaT_dataframes_all_NaT_axis_1(self, tz1, tz2):
# GH 12396
first = DataFrame(Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1))
second = DataFrame(Series([pd.NaT]).dt.tz_localize(tz2), columns=[1])
expected = DataFrame(
{
0: Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1),
1: Series([pd.NaT, pd.NaT]).dt.tz_localize(tz2),
}
)
result = concat([first, second], axis=1)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("tz1", [None, "UTC"])
@pytest.mark.parametrize("tz2", [None, "UTC"])
def test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2):
# GH 12396
# tz-naive
first = Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1)
second = DataFrame(
[
[Timestamp("2015/01/01", tz=tz2)],
[Timestamp("2016/01/01", tz=tz2)],
],
index=[2, 3],
)
expected = DataFrame(
[
pd.NaT,
pd.NaT,
Timestamp("2015/01/01", tz=tz2),
Timestamp("2016/01/01", tz=tz2),
]
)
if tz1 != tz2:
expected = expected.astype(object)
result = concat([first, second])
tm.assert_frame_equal(result, expected)


class TestTimezoneConcat:
    def test_concat_tz_series(self):
        # gh-11755: tz and no tz
        x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC"))
        y = Series(date_range("2012-01-01", "2012-01-02"))
        expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)

    def test_concat_tz_series2(self):
        # gh-11887: concat tz and object
        x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC"))
        y = Series(["a", "b"])
        expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)

    def test_concat_tz_series3(self, unit, unit2):
        # see gh-12217 and gh-12306
        # Concatenating two UTC times
        first = DataFrame([[datetime(2016, 1, 1)]], dtype=f"M8[{unit}]")
        first[0] = first[0].dt.tz_localize("UTC")

        second = DataFrame([[datetime(2016, 1, 2)]], dtype=f"M8[{unit2}]")
        second[0] = second[0].dt.tz_localize("UTC")

        result = concat([first, second])
        exp_unit = tm.get_finest_unit(unit, unit2)
        assert result[0].dtype == f"datetime64[{exp_unit}, UTC]"

    def test_concat_tz_series4(self, unit, unit2):
        # Concatenating two London times
        first = DataFrame([[datetime(2016, 1, 1)]], dtype=f"M8[{unit}]")
        first[0] = first[0].dt.tz_localize("Europe/London")

        second = DataFrame([[datetime(2016, 1, 2)]], dtype=f"M8[{unit2}]")
        second[0] = second[0].dt.tz_localize("Europe/London")

        result = concat([first, second])
        exp_unit = tm.get_finest_unit(unit, unit2)
        assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]"

    def test_concat_tz_series5(self, unit, unit2):
        # Concatenating 2+1 London times
        first = DataFrame(
            [[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]], dtype=f"M8[{unit}]"
        )
        first[0] = first[0].dt.tz_localize("Europe/London")

        second = DataFrame([[datetime(2016, 1, 3)]], dtype=f"M8[{unit2}]")
        second[0] = second[0].dt.tz_localize("Europe/London")

        result = concat([first, second])
        exp_unit = tm.get_finest_unit(unit, unit2)
        assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]"

    def test_concat_tz_series6(self, unit, unit2):
        # Concatenating 1+2 London times
        first = DataFrame([[datetime(2016, 1, 1)]], dtype=f"M8[{unit}]")
        first[0] = first[0].dt.tz_localize("Europe/London")

        second = DataFrame(
            [[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]], dtype=f"M8[{unit2}]"
        )
        second[0] = second[0].dt.tz_localize("Europe/London")

        result = concat([first, second])
        exp_unit = tm.get_finest_unit(unit, unit2)
        assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]"

    def test_concat_tz_series_tzlocal(self):
        # see gh-13583
        x = [
            Timestamp("2011-01-01", tz=dateutil.tz.tzlocal()),
            Timestamp("2011-02-01", tz=dateutil.tz.tzlocal()),
        ]
        y = [
            Timestamp("2012-01-01", tz=dateutil.tz.tzlocal()),
            Timestamp("2012-02-01", tz=dateutil.tz.tzlocal()),
        ]

        result = concat([Series(x), Series(y)], ignore_index=True)
        tm.assert_series_equal(result, Series(x + y))
        assert result.dtype == "datetime64[ns, tzlocal()]"

    def test_concat_tz_series_with_datetimelike(self):
        # see gh-12620: tz and timedelta
        x = [
            Timestamp("2011-01-01", tz="US/Eastern"),
            Timestamp("2011-02-01", tz="US/Eastern"),
        ]
        y = [pd.Timedelta("1 day"), pd.Timedelta("2 day")]
        result = concat([Series(x), Series(y)], ignore_index=True)
        tm.assert_series_equal(result, Series(x + y, dtype="object"))

        # tz and period
        y = [pd.Period("2011-03", freq="M"), pd.Period("2011-04", freq="M")]
        result = concat([Series(x), Series(y)], ignore_index=True)
        tm.assert_series_equal(result, Series(x + y, dtype="object"))

    def test_concat_tz_frame(self):
        df2 = DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
            },
            index=range(5),
        )

        # concat
        df3 = concat([df2.A.to_frame(), df2.B.to_frame()], axis=1)
        tm.assert_frame_equal(df2, df3)

    def test_concat_multiple_tzs(self):
        # GH#12467
        # combining datetime tz-aware and naive DataFrames
        ts1 = Timestamp("2015-01-01", tz=None)
        ts2 = Timestamp("2015-01-01", tz="UTC")
        ts3 = Timestamp("2015-01-01", tz="EST")

        df1 = DataFrame({"time": [ts1]})
        df2 = DataFrame({"time": [ts2]})
        df3 = DataFrame({"time": [ts3]})

        results = concat([df1, df2]).reset_index(drop=True)
        expected = DataFrame({"time": [ts1, ts2]}, dtype=object)
        tm.assert_frame_equal(results, expected)

        results = concat([df1, df3]).reset_index(drop=True)
        expected = DataFrame({"time": [ts1, ts3]}, dtype=object)
        tm.assert_frame_equal(results, expected)

        results = concat([df2, df3]).reset_index(drop=True)
        expected = DataFrame({"time": [ts2, ts3]})
        tm.assert_frame_equal(results, expected)

    def test_concat_multiindex_with_tz(self):
        # GH 6606
        df = DataFrame(
            {
                "dt": DatetimeIndex(
                    [
                        datetime(2014, 1, 1),
                        datetime(2014, 1, 2),
                        datetime(2014, 1, 3),
                    ],
                    dtype="M8[ns, US/Pacific]",
                ),
                "b": ["A", "B", "C"],
                "c": [1, 2, 3],
                "d": [4, 5, 6],
            }
        )
        df = df.set_index(["dt", "b"])

        exp_idx1 = DatetimeIndex(
            ["2014-01-01", "2014-01-02", "2014-01-03"] * 2,
            dtype="M8[ns, US/Pacific]",
            name="dt",
        )
        exp_idx2 = Index(["A", "B", "C"] * 2, name="b")
        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
        expected = DataFrame(
            {"c": [1, 2, 3] * 2, "d": [4, 5, 6] * 2}, index=exp_idx, columns=["c", "d"]
        )

        result = concat([df, df])
        tm.assert_frame_equal(result, expected)

    def test_concat_tz_not_aligned(self):
        # GH#22796
        ts = pd.to_datetime([1, 2]).tz_localize("UTC")
        a = DataFrame({"A": ts})
        b = DataFrame({"A": ts, "B": ts})
        result = concat([a, b], sort=True, ignore_index=True)
        expected = DataFrame(
            {"A": list(ts) + list(ts), "B": [pd.NaT, pd.NaT] + list(ts)}
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "t1",
        [
            "2015-01-01",
            pytest.param(
                pd.NaT,
                marks=pytest.mark.xfail(
                    reason="GH23037 incorrect dtype when concatenating"
                ),
            ),
        ],
    )
    def test_concat_tz_NaT(self, t1):
        # GH#22796
        # Concatenating tz-aware multicolumn DataFrames
        ts1 = Timestamp(t1, tz="UTC")
        ts2 = Timestamp("2015-01-01", tz="UTC")
        ts3 = Timestamp("2015-01-01", tz="UTC")

        df1 = DataFrame([[ts1, ts2]])
        df2 = DataFrame([[ts3]])

        result = concat([df1, df2])
        expected = DataFrame([[ts1, ts2], [ts3, pd.NaT]], index=[0, 0])
        tm.assert_frame_equal(result, expected)

    def test_concat_tz_with_empty(self):
        # GH 9188
        result = concat(
            [DataFrame(date_range("2000", periods=1, tz="UTC")), DataFrame()]
        )
        expected = DataFrame(date_range("2000", periods=1, tz="UTC"))
        tm.assert_frame_equal(result, expected)


class TestPeriodConcat:
    def test_concat_period_series(self):
        x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D"))
        y = Series(pd.PeriodIndex(["2015-10-01", "2016-01-01"], freq="D"))
        expected = Series([x[0], x[1], y[0], y[1]], dtype="Period[D]")
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)

    def test_concat_period_multiple_freq_series(self):
        x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D"))
        y = Series(pd.PeriodIndex(["2015-10-01", "2016-01-01"], freq="M"))
        expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)
        assert result.dtype == "object"

    def test_concat_period_other_series(self):
        x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D"))
        y = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="M"))
        expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)
        assert result.dtype == "object"

    def test_concat_period_other_series2(self):
        # non-period
        x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D"))
        y = Series(DatetimeIndex(["2015-11-01", "2015-12-01"]))
        expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)
        assert result.dtype == "object"

    def test_concat_period_other_series3(self):
        x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D"))
        y = Series(["A", "B"])
        expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)
        assert result.dtype == "object"


def test_concat_timedelta64_block():
    rng = to_timedelta(np.arange(10), unit="s")
    df = DataFrame({"time": rng})
    result = concat([df, df])
    tm.assert_frame_equal(result.iloc[:10], df)
    tm.assert_frame_equal(result.iloc[10:], df)


def test_concat_multiindex_datetime_nat():
    # GH#44900
    left = DataFrame({"a": 1}, index=MultiIndex.from_tuples([(1, pd.NaT)]))
    right = DataFrame(
        {"b": 2}, index=MultiIndex.from_tuples([(1, pd.NaT), (2, pd.NaT)])
    )
    result = concat([left, right], axis="columns")
    expected = DataFrame(
        {"a": [1.0, np.nan], "b": 2},
        MultiIndex.from_tuples([(1, pd.NaT), (2, pd.NaT)]),
    )
    tm.assert_frame_equal(result, expected)


def test_concat_float_datetime64(using_array_manager):
    # GH#32934
    df_time = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")})
    df_float = DataFrame({"A": pd.array([1.0], dtype="float64")})

    expected = DataFrame(
        {
            "A": [
                pd.array(["2000"], dtype="datetime64[ns]")[0],
                pd.array([1.0], dtype="float64")[0],
            ]
        },
        index=[0, 0],
    )
    result = concat([df_time, df_float])
    tm.assert_frame_equal(result, expected)

    expected = DataFrame({"A": pd.array([], dtype="object")})
    result = concat([df_time.iloc[:0], df_float.iloc[:0]])
    tm.assert_frame_equal(result, expected)

    expected = DataFrame({"A": pd.array([1.0], dtype="object")})
    result = concat([df_time.iloc[:0], df_float])
    tm.assert_frame_equal(result, expected)

    if not using_array_manager:
        expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")})
        msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = concat([df_time, df_float.iloc[:0]])
        tm.assert_frame_equal(result, expected)
    else:
        expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")}).astype(
            {"A": "object"}
        )
        result = concat([df_time, df_float.iloc[:0]])
        tm.assert_frame_equal(result, expected)