You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
375 lines
12 KiB
375 lines
12 KiB
import re
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from pandas._libs.tslibs import Timestamp
|
|
|
|
import pandas as pd
|
|
from pandas import (
|
|
DataFrame,
|
|
HDFStore,
|
|
Index,
|
|
MultiIndex,
|
|
Series,
|
|
_testing as tm,
|
|
concat,
|
|
date_range,
|
|
)
|
|
from pandas.tests.io.pytables.common import (
|
|
_maybe_remove,
|
|
ensure_clean_store,
|
|
)
|
|
from pandas.util import _test_decorators as td
|
|
|
|
pytestmark = pytest.mark.single_cpu
|
|
|
|
|
|
def test_format_type(tmp_path, setup_path):
|
|
df = DataFrame({"A": [1, 2]})
|
|
with HDFStore(tmp_path / setup_path) as store:
|
|
store.put("a", df, format="fixed")
|
|
store.put("b", df, format="table")
|
|
|
|
assert store.get_storer("a").format_type == "fixed"
|
|
assert store.get_storer("b").format_type == "table"
|
|
|
|
|
|
def test_format_kwarg_in_constructor(tmp_path, setup_path):
|
|
# GH 13291
|
|
|
|
msg = "format is not a defined argument for HDFStore"
|
|
|
|
with pytest.raises(ValueError, match=msg):
|
|
HDFStore(tmp_path / setup_path, format="table")
|
|
|
|
|
|
def test_api_default_format(tmp_path, setup_path):
|
|
# default_format option
|
|
with ensure_clean_store(setup_path) as store:
|
|
df = DataFrame(
|
|
1.1 * np.arange(120).reshape((30, 4)),
|
|
columns=Index(list("ABCD"), dtype=object),
|
|
index=Index([f"i-{i}" for i in range(30)], dtype=object),
|
|
)
|
|
|
|
with pd.option_context("io.hdf.default_format", "fixed"):
|
|
_maybe_remove(store, "df")
|
|
store.put("df", df)
|
|
assert not store.get_storer("df").is_table
|
|
|
|
msg = "Can only append to Tables"
|
|
with pytest.raises(ValueError, match=msg):
|
|
store.append("df2", df)
|
|
|
|
with pd.option_context("io.hdf.default_format", "table"):
|
|
_maybe_remove(store, "df")
|
|
store.put("df", df)
|
|
assert store.get_storer("df").is_table
|
|
|
|
_maybe_remove(store, "df2")
|
|
store.append("df2", df)
|
|
assert store.get_storer("df").is_table
|
|
|
|
path = tmp_path / setup_path
|
|
df = DataFrame(
|
|
1.1 * np.arange(120).reshape((30, 4)),
|
|
columns=Index(list("ABCD"), dtype=object),
|
|
index=Index([f"i-{i}" for i in range(30)], dtype=object),
|
|
)
|
|
|
|
with pd.option_context("io.hdf.default_format", "fixed"):
|
|
df.to_hdf(path, key="df")
|
|
with HDFStore(path) as store:
|
|
assert not store.get_storer("df").is_table
|
|
with pytest.raises(ValueError, match=msg):
|
|
df.to_hdf(path, key="df2", append=True)
|
|
|
|
with pd.option_context("io.hdf.default_format", "table"):
|
|
df.to_hdf(path, key="df3")
|
|
with HDFStore(path) as store:
|
|
assert store.get_storer("df3").is_table
|
|
df.to_hdf(path, key="df4", append=True)
|
|
with HDFStore(path) as store:
|
|
assert store.get_storer("df4").is_table
|
|
|
|
|
|
def test_put(setup_path):
|
|
with ensure_clean_store(setup_path) as store:
|
|
ts = Series(
|
|
np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
|
|
)
|
|
df = DataFrame(
|
|
np.random.default_rng(2).standard_normal((20, 4)),
|
|
columns=Index(list("ABCD"), dtype=object),
|
|
index=date_range("2000-01-01", periods=20, freq="B"),
|
|
)
|
|
store["a"] = ts
|
|
store["b"] = df[:10]
|
|
store["foo/bar/bah"] = df[:10]
|
|
store["foo"] = df[:10]
|
|
store["/foo"] = df[:10]
|
|
store.put("c", df[:10], format="table")
|
|
|
|
# not OK, not a table
|
|
msg = "Can only append to Tables"
|
|
with pytest.raises(ValueError, match=msg):
|
|
store.put("b", df[10:], append=True)
|
|
|
|
# node does not currently exist, test _is_table_type returns False
|
|
# in this case
|
|
_maybe_remove(store, "f")
|
|
with pytest.raises(ValueError, match=msg):
|
|
store.put("f", df[10:], append=True)
|
|
|
|
# can't put to a table (use append instead)
|
|
with pytest.raises(ValueError, match=msg):
|
|
store.put("c", df[10:], append=True)
|
|
|
|
# overwrite table
|
|
store.put("c", df[:10], format="table", append=False)
|
|
tm.assert_frame_equal(df[:10], store["c"])
|
|
|
|
|
|
def test_put_string_index(setup_path):
|
|
with ensure_clean_store(setup_path) as store:
|
|
index = Index([f"I am a very long string index: {i}" for i in range(20)])
|
|
s = Series(np.arange(20), index=index)
|
|
df = DataFrame({"A": s, "B": s})
|
|
|
|
store["a"] = s
|
|
tm.assert_series_equal(store["a"], s)
|
|
|
|
store["b"] = df
|
|
tm.assert_frame_equal(store["b"], df)
|
|
|
|
# mixed length
|
|
index = Index(
|
|
["abcdefghijklmnopqrstuvwxyz1234567890"]
|
|
+ [f"I am a very long string index: {i}" for i in range(20)]
|
|
)
|
|
s = Series(np.arange(21), index=index)
|
|
df = DataFrame({"A": s, "B": s})
|
|
store["a"] = s
|
|
tm.assert_series_equal(store["a"], s)
|
|
|
|
store["b"] = df
|
|
tm.assert_frame_equal(store["b"], df)
|
|
|
|
|
|
def test_put_compression(setup_path):
|
|
with ensure_clean_store(setup_path) as store:
|
|
df = DataFrame(
|
|
np.random.default_rng(2).standard_normal((10, 4)),
|
|
columns=Index(list("ABCD"), dtype=object),
|
|
index=date_range("2000-01-01", periods=10, freq="B"),
|
|
)
|
|
|
|
store.put("c", df, format="table", complib="zlib")
|
|
tm.assert_frame_equal(store["c"], df)
|
|
|
|
# can't compress if format='fixed'
|
|
msg = "Compression not supported on Fixed format stores"
|
|
with pytest.raises(ValueError, match=msg):
|
|
store.put("b", df, format="fixed", complib="zlib")
|
|
|
|
|
|
@td.skip_if_windows
|
|
def test_put_compression_blosc(setup_path):
|
|
df = DataFrame(
|
|
np.random.default_rng(2).standard_normal((10, 4)),
|
|
columns=Index(list("ABCD"), dtype=object),
|
|
index=date_range("2000-01-01", periods=10, freq="B"),
|
|
)
|
|
|
|
with ensure_clean_store(setup_path) as store:
|
|
# can't compress if format='fixed'
|
|
msg = "Compression not supported on Fixed format stores"
|
|
with pytest.raises(ValueError, match=msg):
|
|
store.put("b", df, format="fixed", complib="blosc")
|
|
|
|
store.put("c", df, format="table", complib="blosc")
|
|
tm.assert_frame_equal(store["c"], df)
|
|
|
|
|
|
def test_put_mixed_type(setup_path):
|
|
df = DataFrame(
|
|
np.random.default_rng(2).standard_normal((10, 4)),
|
|
columns=Index(list("ABCD"), dtype=object),
|
|
index=date_range("2000-01-01", periods=10, freq="B"),
|
|
)
|
|
df["obj1"] = "foo"
|
|
df["obj2"] = "bar"
|
|
df["bool1"] = df["A"] > 0
|
|
df["bool2"] = df["B"] > 0
|
|
df["bool3"] = True
|
|
df["int1"] = 1
|
|
df["int2"] = 2
|
|
df["timestamp1"] = Timestamp("20010102").as_unit("ns")
|
|
df["timestamp2"] = Timestamp("20010103").as_unit("ns")
|
|
df["datetime1"] = Timestamp("20010102").as_unit("ns")
|
|
df["datetime2"] = Timestamp("20010103").as_unit("ns")
|
|
df.loc[df.index[3:6], ["obj1"]] = np.nan
|
|
df = df._consolidate()
|
|
|
|
with ensure_clean_store(setup_path) as store:
|
|
_maybe_remove(store, "df")
|
|
|
|
with tm.assert_produces_warning(pd.errors.PerformanceWarning):
|
|
store.put("df", df)
|
|
|
|
expected = store.get("df")
|
|
tm.assert_frame_equal(expected, df)
|
|
|
|
|
|
@pytest.mark.parametrize("format", ["table", "fixed"])
|
|
@pytest.mark.parametrize(
|
|
"index",
|
|
[
|
|
Index([str(i) for i in range(10)]),
|
|
Index(np.arange(10, dtype=float)),
|
|
Index(np.arange(10)),
|
|
date_range("2020-01-01", periods=10),
|
|
pd.period_range("2020-01-01", periods=10),
|
|
],
|
|
)
|
|
def test_store_index_types(setup_path, format, index):
|
|
# GH5386
|
|
# test storing various index types
|
|
|
|
with ensure_clean_store(setup_path) as store:
|
|
df = DataFrame(
|
|
np.random.default_rng(2).standard_normal((10, 2)),
|
|
columns=list("AB"),
|
|
index=index,
|
|
)
|
|
_maybe_remove(store, "df")
|
|
store.put("df", df, format=format)
|
|
tm.assert_frame_equal(df, store["df"])
|
|
|
|
|
|
def test_column_multiindex(setup_path):
|
|
# GH 4710
|
|
# recreate multi-indexes properly
|
|
|
|
index = MultiIndex.from_tuples(
|
|
[("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")], names=["first", "second"]
|
|
)
|
|
df = DataFrame(np.arange(12).reshape(3, 4), columns=index)
|
|
expected = df.set_axis(df.index.to_numpy())
|
|
|
|
with ensure_clean_store(setup_path) as store:
|
|
store.put("df", df)
|
|
tm.assert_frame_equal(
|
|
store["df"], expected, check_index_type=True, check_column_type=True
|
|
)
|
|
|
|
store.put("df1", df, format="table")
|
|
tm.assert_frame_equal(
|
|
store["df1"], expected, check_index_type=True, check_column_type=True
|
|
)
|
|
|
|
msg = re.escape("cannot use a multi-index on axis [1] with data_columns ['A']")
|
|
with pytest.raises(ValueError, match=msg):
|
|
store.put("df2", df, format="table", data_columns=["A"])
|
|
msg = re.escape("cannot use a multi-index on axis [1] with data_columns True")
|
|
with pytest.raises(ValueError, match=msg):
|
|
store.put("df3", df, format="table", data_columns=True)
|
|
|
|
# appending multi-column on existing table (see GH 6167)
|
|
with ensure_clean_store(setup_path) as store:
|
|
store.append("df2", df)
|
|
store.append("df2", df)
|
|
|
|
tm.assert_frame_equal(store["df2"], concat((df, df)))
|
|
|
|
# non_index_axes name
|
|
df = DataFrame(np.arange(12).reshape(3, 4), columns=Index(list("ABCD"), name="foo"))
|
|
expected = df.set_axis(df.index.to_numpy())
|
|
|
|
with ensure_clean_store(setup_path) as store:
|
|
store.put("df1", df, format="table")
|
|
tm.assert_frame_equal(
|
|
store["df1"], expected, check_index_type=True, check_column_type=True
|
|
)
|
|
|
|
|
|
def test_store_multiindex(setup_path):
|
|
# validate multi-index names
|
|
# GH 5527
|
|
with ensure_clean_store(setup_path) as store:
|
|
|
|
def make_index(names=None):
|
|
dti = date_range("2013-12-01", "2013-12-02")
|
|
mi = MultiIndex.from_product([dti, range(2), range(3)], names=names)
|
|
return mi
|
|
|
|
# no names
|
|
_maybe_remove(store, "df")
|
|
df = DataFrame(np.zeros((12, 2)), columns=["a", "b"], index=make_index())
|
|
store.append("df", df)
|
|
tm.assert_frame_equal(store.select("df"), df)
|
|
|
|
# partial names
|
|
_maybe_remove(store, "df")
|
|
df = DataFrame(
|
|
np.zeros((12, 2)),
|
|
columns=["a", "b"],
|
|
index=make_index(["date", None, None]),
|
|
)
|
|
store.append("df", df)
|
|
tm.assert_frame_equal(store.select("df"), df)
|
|
|
|
# series
|
|
_maybe_remove(store, "ser")
|
|
ser = Series(np.zeros(12), index=make_index(["date", None, None]))
|
|
store.append("ser", ser)
|
|
xp = Series(np.zeros(12), index=make_index(["date", "level_1", "level_2"]))
|
|
tm.assert_series_equal(store.select("ser"), xp)
|
|
|
|
# dup with column
|
|
_maybe_remove(store, "df")
|
|
df = DataFrame(
|
|
np.zeros((12, 2)),
|
|
columns=["a", "b"],
|
|
index=make_index(["date", "a", "t"]),
|
|
)
|
|
msg = "duplicate names/columns in the multi-index when storing as a table"
|
|
with pytest.raises(ValueError, match=msg):
|
|
store.append("df", df)
|
|
|
|
# dup within level
|
|
_maybe_remove(store, "df")
|
|
df = DataFrame(
|
|
np.zeros((12, 2)),
|
|
columns=["a", "b"],
|
|
index=make_index(["date", "date", "date"]),
|
|
)
|
|
with pytest.raises(ValueError, match=msg):
|
|
store.append("df", df)
|
|
|
|
# fully names
|
|
_maybe_remove(store, "df")
|
|
df = DataFrame(
|
|
np.zeros((12, 2)),
|
|
columns=["a", "b"],
|
|
index=make_index(["date", "s", "t"]),
|
|
)
|
|
store.append("df", df)
|
|
tm.assert_frame_equal(store.select("df"), df)
|
|
|
|
|
|
@pytest.mark.parametrize("format", ["fixed", "table"])
|
|
def test_store_periodindex(tmp_path, setup_path, format):
|
|
# GH 7796
|
|
# test of PeriodIndex in HDFStore
|
|
df = DataFrame(
|
|
np.random.default_rng(2).standard_normal((5, 1)),
|
|
index=pd.period_range("20220101", freq="M", periods=5),
|
|
)
|
|
|
|
path = tmp_path / setup_path
|
|
df.to_hdf(path, key="df", mode="w", format=format)
|
|
expected = pd.read_hdf(path, "df")
|
|
tm.assert_frame_equal(df, expected)
|