You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1048 lines
36 KiB

import numpy as np
import pytest
from pandas._libs.tslibs import Timestamp
import pandas as pd
from pandas import (
DataFrame,
HDFStore,
Index,
MultiIndex,
Series,
_testing as tm,
bdate_range,
concat,
date_range,
isna,
read_hdf,
)
from pandas.tests.io.pytables.common import (
_maybe_remove,
ensure_clean_store,
)
from pandas.io.pytables import Term
pytestmark = pytest.mark.single_cpu
def test_select_columns_in_where(setup_path):
# GH 6169
# recreate multi-indexes when columns is passed
# in the `where` argument
index = MultiIndex(
levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=["foo_name", "bar_name"],
)
# With a DataFrame
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 3)),
index=index,
columns=["A", "B", "C"],
)
with ensure_clean_store(setup_path) as store:
store.put("df", df, format="table")
expected = df[["A"]]
tm.assert_frame_equal(store.select("df", columns=["A"]), expected)
tm.assert_frame_equal(store.select("df", where="columns=['A']"), expected)
# With a Series
s = Series(np.random.default_rng(2).standard_normal(10), index=index, name="A")
with ensure_clean_store(setup_path) as store:
store.put("s", s, format="table")
tm.assert_series_equal(store.select("s", where="columns=['A']"), s)
def test_select_with_dups(setup_path):
# single dtypes
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "A", "B", "B"]
)
df.index = date_range("20130101 9:30", periods=10, freq="min")
with ensure_clean_store(setup_path) as store:
store.append("df", df)
result = store.select("df")
expected = df
tm.assert_frame_equal(result, expected, by_blocks=True)
result = store.select("df", columns=df.columns)
expected = df
tm.assert_frame_equal(result, expected, by_blocks=True)
result = store.select("df", columns=["A"])
expected = df.loc[:, ["A"]]
tm.assert_frame_equal(result, expected)
# dups across dtypes
df = concat(
[
DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=["A", "A", "B", "B"],
),
DataFrame(
np.random.default_rng(2).integers(0, 10, size=20).reshape(10, 2),
columns=["A", "C"],
),
],
axis=1,
)
df.index = date_range("20130101 9:30", periods=10, freq="min")
with ensure_clean_store(setup_path) as store:
store.append("df", df)
result = store.select("df")
expected = df
tm.assert_frame_equal(result, expected, by_blocks=True)
result = store.select("df", columns=df.columns)
expected = df
tm.assert_frame_equal(result, expected, by_blocks=True)
expected = df.loc[:, ["A"]]
result = store.select("df", columns=["A"])
tm.assert_frame_equal(result, expected, by_blocks=True)
expected = df.loc[:, ["B", "A"]]
result = store.select("df", columns=["B", "A"])
tm.assert_frame_equal(result, expected, by_blocks=True)
# duplicates on both index and columns
with ensure_clean_store(setup_path) as store:
store.append("df", df)
store.append("df", df)
expected = df.loc[:, ["B", "A"]]
expected = concat([expected, expected])
result = store.select("df", columns=["B", "A"])
tm.assert_frame_equal(result, expected, by_blocks=True)
def test_select(setup_path):
with ensure_clean_store(setup_path) as store:
# select with columns=
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=10, freq="B"),
)
_maybe_remove(store, "df")
store.append("df", df)
result = store.select("df", columns=["A", "B"])
expected = df.reindex(columns=["A", "B"])
tm.assert_frame_equal(expected, result)
# equivalently
result = store.select("df", [("columns=['A', 'B']")])
expected = df.reindex(columns=["A", "B"])
tm.assert_frame_equal(expected, result)
# with a data column
_maybe_remove(store, "df")
store.append("df", df, data_columns=["A"])
result = store.select("df", ["A > 0"], columns=["A", "B"])
expected = df[df.A > 0].reindex(columns=["A", "B"])
tm.assert_frame_equal(expected, result)
# all a data columns
_maybe_remove(store, "df")
store.append("df", df, data_columns=True)
result = store.select("df", ["A > 0"], columns=["A", "B"])
expected = df[df.A > 0].reindex(columns=["A", "B"])
tm.assert_frame_equal(expected, result)
# with a data column, but different columns
_maybe_remove(store, "df")
store.append("df", df, data_columns=["A"])
result = store.select("df", ["A > 0"], columns=["C", "D"])
expected = df[df.A > 0].reindex(columns=["C", "D"])
tm.assert_frame_equal(expected, result)
def test_select_dtypes(setup_path):
with ensure_clean_store(setup_path) as store:
# with a Timestamp data column (GH #2637)
df = DataFrame(
{
"ts": bdate_range("2012-01-01", periods=300),
"A": np.random.default_rng(2).standard_normal(300),
}
)
_maybe_remove(store, "df")
store.append("df", df, data_columns=["ts", "A"])
result = store.select("df", "ts>=Timestamp('2012-02-01')")
expected = df[df.ts >= Timestamp("2012-02-01")]
tm.assert_frame_equal(expected, result)
# bool columns (GH #2849)
df = DataFrame(
np.random.default_rng(2).standard_normal((5, 2)), columns=["A", "B"]
)
df["object"] = "foo"
df.loc[4:5, "object"] = "bar"
df["boolv"] = df["A"] > 0
_maybe_remove(store, "df")
store.append("df", df, data_columns=True)
expected = df[df.boolv == True].reindex(columns=["A", "boolv"]) # noqa: E712
for v in [True, "true", 1]:
result = store.select("df", f"boolv == {v}", columns=["A", "boolv"])
tm.assert_frame_equal(expected, result)
expected = df[df.boolv == False].reindex(columns=["A", "boolv"]) # noqa: E712
for v in [False, "false", 0]:
result = store.select("df", f"boolv == {v}", columns=["A", "boolv"])
tm.assert_frame_equal(expected, result)
# integer index
df = DataFrame(
{
"A": np.random.default_rng(2).random(20),
"B": np.random.default_rng(2).random(20),
}
)
_maybe_remove(store, "df_int")
store.append("df_int", df)
result = store.select("df_int", "index<10 and columns=['A']")
expected = df.reindex(index=list(df.index)[0:10], columns=["A"])
tm.assert_frame_equal(expected, result)
# float index
df = DataFrame(
{
"A": np.random.default_rng(2).random(20),
"B": np.random.default_rng(2).random(20),
"index": np.arange(20, dtype="f8"),
}
)
_maybe_remove(store, "df_float")
store.append("df_float", df)
result = store.select("df_float", "index<10.0 and columns=['A']")
expected = df.reindex(index=list(df.index)[0:10], columns=["A"])
tm.assert_frame_equal(expected, result)
with ensure_clean_store(setup_path) as store:
# floats w/o NaN
df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64")
df["cols"] = (df["cols"] + 10).apply(str)
store.append("df1", df, data_columns=True)
result = store.select("df1", where="values>2.0")
expected = df[df["values"] > 2.0]
tm.assert_frame_equal(expected, result)
# floats with NaN
df.iloc[0] = np.nan
expected = df[df["values"] > 2.0]
store.append("df2", df, data_columns=True, index=False)
result = store.select("df2", where="values>2.0")
tm.assert_frame_equal(expected, result)
# https://github.com/PyTables/PyTables/issues/282
# bug in selection when 0th row has a np.nan and an index
# store.append('df3',df,data_columns=True)
# result = store.select(
# 'df3', where='values>2.0')
# tm.assert_frame_equal(expected, result)
# not in first position float with NaN ok too
df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64")
df["cols"] = (df["cols"] + 10).apply(str)
df.iloc[1] = np.nan
expected = df[df["values"] > 2.0]
store.append("df4", df, data_columns=True)
result = store.select("df4", where="values>2.0")
tm.assert_frame_equal(expected, result)
# test selection with comparison against numpy scalar
# GH 11283
with ensure_clean_store(setup_path) as store:
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
)
expected = df[df["A"] > 0]
store.append("df", df, data_columns=True)
np_zero = np.float64(0) # noqa: F841
result = store.select("df", where=["A>np_zero"])
tm.assert_frame_equal(expected, result)
def test_select_with_many_inputs(setup_path):
with ensure_clean_store(setup_path) as store:
df = DataFrame(
{
"ts": bdate_range("2012-01-01", periods=300),
"A": np.random.default_rng(2).standard_normal(300),
"B": range(300),
"users": ["a"] * 50
+ ["b"] * 50
+ ["c"] * 100
+ [f"a{i:03d}" for i in range(100)],
}
)
_maybe_remove(store, "df")
store.append("df", df, data_columns=["ts", "A", "B", "users"])
# regular select
result = store.select("df", "ts>=Timestamp('2012-02-01')")
expected = df[df.ts >= Timestamp("2012-02-01")]
tm.assert_frame_equal(expected, result)
# small selector
result = store.select("df", "ts>=Timestamp('2012-02-01') & users=['a','b','c']")
expected = df[
(df.ts >= Timestamp("2012-02-01")) & df.users.isin(["a", "b", "c"])
]
tm.assert_frame_equal(expected, result)
# big selector along the columns
selector = ["a", "b", "c"] + [f"a{i:03d}" for i in range(60)]
result = store.select("df", "ts>=Timestamp('2012-02-01') and users=selector")
expected = df[(df.ts >= Timestamp("2012-02-01")) & df.users.isin(selector)]
tm.assert_frame_equal(expected, result)
selector = range(100, 200)
result = store.select("df", "B=selector")
expected = df[df.B.isin(selector)]
tm.assert_frame_equal(expected, result)
assert len(result) == 100
# big selector along the index
selector = Index(df.ts[0:100].values)
result = store.select("df", "ts=selector")
expected = df[df.ts.isin(selector.values)]
tm.assert_frame_equal(expected, result)
assert len(result) == 100
def test_select_iterator(tmp_path, setup_path):
# single table
with ensure_clean_store(setup_path) as store:
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=10, freq="B"),
)
_maybe_remove(store, "df")
store.append("df", df)
expected = store.select("df")
results = list(store.select("df", iterator=True))
result = concat(results)
tm.assert_frame_equal(expected, result)
results = list(store.select("df", chunksize=2))
assert len(results) == 5
result = concat(results)
tm.assert_frame_equal(expected, result)
results = list(store.select("df", chunksize=2))
result = concat(results)
tm.assert_frame_equal(result, expected)
path = tmp_path / setup_path
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=10, freq="B"),
)
df.to_hdf(path, key="df_non_table")
msg = "can only use an iterator or chunksize on a table"
with pytest.raises(TypeError, match=msg):
read_hdf(path, "df_non_table", chunksize=2)
with pytest.raises(TypeError, match=msg):
read_hdf(path, "df_non_table", iterator=True)
path = tmp_path / setup_path
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=10, freq="B"),
)
df.to_hdf(path, key="df", format="table")
results = list(read_hdf(path, "df", chunksize=2))
result = concat(results)
assert len(results) == 5
tm.assert_frame_equal(result, df)
tm.assert_frame_equal(result, read_hdf(path, "df"))
# multiple
with ensure_clean_store(setup_path) as store:
df1 = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=10, freq="B"),
)
store.append("df1", df1, data_columns=True)
df2 = df1.copy().rename(columns="{}_2".format)
df2["foo"] = "bar"
store.append("df2", df2)
df = concat([df1, df2], axis=1)
# full selection
expected = store.select_as_multiple(["df1", "df2"], selector="df1")
results = list(
store.select_as_multiple(["df1", "df2"], selector="df1", chunksize=2)
)
result = concat(results)
tm.assert_frame_equal(expected, result)
def test_select_iterator_complete_8014(setup_path):
# GH 8014
# using iterator and where clause
chunksize = 1e4
# no iterator
with ensure_clean_store(setup_path) as store:
expected = DataFrame(
np.random.default_rng(2).standard_normal((100064, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=100064, freq="s"),
)
_maybe_remove(store, "df")
store.append("df", expected)
beg_dt = expected.index[0]
end_dt = expected.index[-1]
# select w/o iteration and no where clause works
result = store.select("df")
tm.assert_frame_equal(expected, result)
# select w/o iterator and where clause, single term, begin
# of range, works
where = f"index >= '{beg_dt}'"
result = store.select("df", where=where)
tm.assert_frame_equal(expected, result)
# select w/o iterator and where clause, single term, end
# of range, works
where = f"index <= '{end_dt}'"
result = store.select("df", where=where)
tm.assert_frame_equal(expected, result)
# select w/o iterator and where clause, inclusive range,
# works
where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
result = store.select("df", where=where)
tm.assert_frame_equal(expected, result)
# with iterator, full range
with ensure_clean_store(setup_path) as store:
expected = DataFrame(
np.random.default_rng(2).standard_normal((100064, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=100064, freq="s"),
)
_maybe_remove(store, "df")
store.append("df", expected)
beg_dt = expected.index[0]
end_dt = expected.index[-1]
# select w/iterator and no where clause works
results = list(store.select("df", chunksize=chunksize))
result = concat(results)
tm.assert_frame_equal(expected, result)
# select w/iterator and where clause, single term, begin of range
where = f"index >= '{beg_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
result = concat(results)
tm.assert_frame_equal(expected, result)
# select w/iterator and where clause, single term, end of range
where = f"index <= '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
result = concat(results)
tm.assert_frame_equal(expected, result)
# select w/iterator and where clause, inclusive range
where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
result = concat(results)
tm.assert_frame_equal(expected, result)
def test_select_iterator_non_complete_8014(setup_path):
# GH 8014
# using iterator and where clause
chunksize = 1e4
# with iterator, non complete range
with ensure_clean_store(setup_path) as store:
expected = DataFrame(
np.random.default_rng(2).standard_normal((100064, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=100064, freq="s"),
)
_maybe_remove(store, "df")
store.append("df", expected)
beg_dt = expected.index[1]
end_dt = expected.index[-2]
# select w/iterator and where clause, single term, begin of range
where = f"index >= '{beg_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
result = concat(results)
rexpected = expected[expected.index >= beg_dt]
tm.assert_frame_equal(rexpected, result)
# select w/iterator and where clause, single term, end of range
where = f"index <= '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
result = concat(results)
rexpected = expected[expected.index <= end_dt]
tm.assert_frame_equal(rexpected, result)
# select w/iterator and where clause, inclusive range
where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
result = concat(results)
rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)]
tm.assert_frame_equal(rexpected, result)
# with iterator, empty where
with ensure_clean_store(setup_path) as store:
expected = DataFrame(
np.random.default_rng(2).standard_normal((100064, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=100064, freq="s"),
)
_maybe_remove(store, "df")
store.append("df", expected)
end_dt = expected.index[-1]
# select w/iterator and where clause, single term, begin of range
where = f"index > '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
assert 0 == len(results)
def test_select_iterator_many_empty_frames(setup_path):
# GH 8014
# using iterator and where clause can return many empty
# frames.
chunksize = 10_000
# with iterator, range limited to the first chunk
with ensure_clean_store(setup_path) as store:
expected = DataFrame(
np.random.default_rng(2).standard_normal((100064, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=100064, freq="s"),
)
_maybe_remove(store, "df")
store.append("df", expected)
beg_dt = expected.index[0]
end_dt = expected.index[chunksize - 1]
# select w/iterator and where clause, single term, begin of range
where = f"index >= '{beg_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
result = concat(results)
rexpected = expected[expected.index >= beg_dt]
tm.assert_frame_equal(rexpected, result)
# select w/iterator and where clause, single term, end of range
where = f"index <= '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
assert len(results) == 1
result = concat(results)
rexpected = expected[expected.index <= end_dt]
tm.assert_frame_equal(rexpected, result)
# select w/iterator and where clause, inclusive range
where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
# should be 1, is 10
assert len(results) == 1
result = concat(results)
rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)]
tm.assert_frame_equal(rexpected, result)
# select w/iterator and where clause which selects
# *nothing*.
#
# To be consistent with Python idiom I suggest this should
# return [] e.g. `for e in []: print True` never prints
# True.
where = f"index <= '{beg_dt}' & index >= '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
# should be []
assert len(results) == 0
def test_frame_select(setup_path):
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=10, freq="B"),
)
with ensure_clean_store(setup_path) as store:
store.put("frame", df, format="table")
date = df.index[len(df) // 2]
crit1 = Term("index>=date")
assert crit1.env.scope["date"] == date
crit2 = "columns=['A', 'D']"
crit3 = "columns=A"
result = store.select("frame", [crit1, crit2])
expected = df.loc[date:, ["A", "D"]]
tm.assert_frame_equal(result, expected)
result = store.select("frame", [crit3])
expected = df.loc[:, ["A"]]
tm.assert_frame_equal(result, expected)
# invalid terms
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=10, freq="B"),
)
store.append("df_time", df)
msg = "day is out of range for month: 0"
with pytest.raises(ValueError, match=msg):
store.select("df_time", "index>0")
# can't select if not written as table
# store['frame'] = df
# with pytest.raises(ValueError):
# store.select('frame', [crit1, crit2])
def test_frame_select_complex(setup_path):
# select via complex criteria
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=10, freq="B"),
)
df["string"] = "foo"
df.loc[df.index[0:4], "string"] = "bar"
with ensure_clean_store(setup_path) as store:
store.put("df", df, format="table", data_columns=["string"])
# empty
result = store.select("df", 'index>df.index[3] & string="bar"')
expected = df.loc[(df.index > df.index[3]) & (df.string == "bar")]
tm.assert_frame_equal(result, expected)
result = store.select("df", 'index>df.index[3] & string="foo"')
expected = df.loc[(df.index > df.index[3]) & (df.string == "foo")]
tm.assert_frame_equal(result, expected)
# or
result = store.select("df", 'index>df.index[3] | string="bar"')
expected = df.loc[(df.index > df.index[3]) | (df.string == "bar")]
tm.assert_frame_equal(result, expected)
result = store.select(
"df", '(index>df.index[3] & index<=df.index[6]) | string="bar"'
)
expected = df.loc[
((df.index > df.index[3]) & (df.index <= df.index[6]))
| (df.string == "bar")
]
tm.assert_frame_equal(result, expected)
# invert
result = store.select("df", 'string!="bar"')
expected = df.loc[df.string != "bar"]
tm.assert_frame_equal(result, expected)
# invert not implemented in numexpr :(
msg = "cannot use an invert condition when passing to numexpr"
with pytest.raises(NotImplementedError, match=msg):
store.select("df", '~(string="bar")')
# invert ok for filters
result = store.select("df", "~(columns=['A','B'])")
expected = df.loc[:, df.columns.difference(["A", "B"])]
tm.assert_frame_equal(result, expected)
# in
result = store.select("df", "index>df.index[3] & columns in ['A','B']")
expected = df.loc[df.index > df.index[3]].reindex(columns=["A", "B"])
tm.assert_frame_equal(result, expected)
def test_frame_select_complex2(tmp_path):
pp = tmp_path / "params.hdf"
hh = tmp_path / "hist.hdf"
# use non-trivial selection criteria
params = DataFrame({"A": [1, 1, 2, 2, 3]})
params.to_hdf(pp, key="df", mode="w", format="table", data_columns=["A"])
selection = read_hdf(pp, "df", where="A=[2,3]")
hist = DataFrame(
np.random.default_rng(2).standard_normal((25, 1)),
columns=["data"],
index=MultiIndex.from_tuples(
[(i, j) for i in range(5) for j in range(5)], names=["l1", "l2"]
),
)
hist.to_hdf(hh, key="df", mode="w", format="table")
expected = read_hdf(hh, "df", where="l1=[2, 3, 4]")
# scope with list like
l0 = selection.index.tolist() # noqa: F841
with HDFStore(hh) as store:
result = store.select("df", where="l1=l0")
tm.assert_frame_equal(result, expected)
result = read_hdf(hh, "df", where="l1=l0")
tm.assert_frame_equal(result, expected)
# index
index = selection.index # noqa: F841
result = read_hdf(hh, "df", where="l1=index")
tm.assert_frame_equal(result, expected)
result = read_hdf(hh, "df", where="l1=selection.index")
tm.assert_frame_equal(result, expected)
result = read_hdf(hh, "df", where="l1=selection.index.tolist()")
tm.assert_frame_equal(result, expected)
result = read_hdf(hh, "df", where="l1=list(selection.index)")
tm.assert_frame_equal(result, expected)
# scope with index
with HDFStore(hh) as store:
result = store.select("df", where="l1=index")
tm.assert_frame_equal(result, expected)
result = store.select("df", where="l1=selection.index")
tm.assert_frame_equal(result, expected)
result = store.select("df", where="l1=selection.index.tolist()")
tm.assert_frame_equal(result, expected)
result = store.select("df", where="l1=list(selection.index)")
tm.assert_frame_equal(result, expected)
def test_invalid_filtering(setup_path):
# can't use more than one filter (atm)
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=10, freq="B"),
)
with ensure_clean_store(setup_path) as store:
store.put("df", df, format="table")
msg = "unable to collapse Joint Filters"
# not implemented
with pytest.raises(NotImplementedError, match=msg):
store.select("df", "columns=['A'] | columns=['B']")
# in theory we could deal with this
with pytest.raises(NotImplementedError, match=msg):
store.select("df", "columns=['A','B'] & columns=['C']")
def test_string_select(setup_path):
# GH 2973
with ensure_clean_store(setup_path) as store:
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=10, freq="B"),
)
# test string ==/!=
df["x"] = "none"
df.loc[df.index[2:7], "x"] = ""
store.append("df", df, data_columns=["x"])
result = store.select("df", "x=none")
expected = df[df.x == "none"]
tm.assert_frame_equal(result, expected)
result = store.select("df", "x!=none")
expected = df[df.x != "none"]
tm.assert_frame_equal(result, expected)
df2 = df.copy()
df2.loc[df2.x == "", "x"] = np.nan
store.append("df2", df2, data_columns=["x"])
result = store.select("df2", "x!=none")
expected = df2[isna(df2.x)]
tm.assert_frame_equal(result, expected)
# int ==/!=
df["int"] = 1
df.loc[df.index[2:7], "int"] = 2
store.append("df3", df, data_columns=["int"])
result = store.select("df3", "int=2")
expected = df[df.int == 2]
tm.assert_frame_equal(result, expected)
result = store.select("df3", "int!=2")
expected = df[df.int != 2]
tm.assert_frame_equal(result, expected)
def test_select_as_multiple(setup_path):
df1 = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=10, freq="B"),
)
df2 = df1.copy().rename(columns="{}_2".format)
df2["foo"] = "bar"
with ensure_clean_store(setup_path) as store:
msg = "keys must be a list/tuple"
# no tables stored
with pytest.raises(TypeError, match=msg):
store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1")
store.append("df1", df1, data_columns=["A", "B"])
store.append("df2", df2)
# exceptions
with pytest.raises(TypeError, match=msg):
store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1")
with pytest.raises(TypeError, match=msg):
store.select_as_multiple([None], where=["A>0", "B>0"], selector="df1")
msg = "'No object named df3 in the file'"
with pytest.raises(KeyError, match=msg):
store.select_as_multiple(
["df1", "df3"], where=["A>0", "B>0"], selector="df1"
)
with pytest.raises(KeyError, match=msg):
store.select_as_multiple(["df3"], where=["A>0", "B>0"], selector="df1")
with pytest.raises(KeyError, match="'No object named df4 in the file'"):
store.select_as_multiple(
["df1", "df2"], where=["A>0", "B>0"], selector="df4"
)
# default select
result = store.select("df1", ["A>0", "B>0"])
expected = store.select_as_multiple(
["df1"], where=["A>0", "B>0"], selector="df1"
)
tm.assert_frame_equal(result, expected)
expected = store.select_as_multiple("df1", where=["A>0", "B>0"], selector="df1")
tm.assert_frame_equal(result, expected)
# multiple
result = store.select_as_multiple(
["df1", "df2"], where=["A>0", "B>0"], selector="df1"
)
expected = concat([df1, df2], axis=1)
expected = expected[(expected.A > 0) & (expected.B > 0)]
tm.assert_frame_equal(result, expected, check_freq=False)
# FIXME: 2021-01-20 this is failing with freq None vs 4B on some builds
# multiple (diff selector)
result = store.select_as_multiple(
["df1", "df2"], where="index>df2.index[4]", selector="df2"
)
expected = concat([df1, df2], axis=1)
expected = expected[5:]
tm.assert_frame_equal(result, expected)
# test exception for diff rows
df3 = df1.copy().head(2)
store.append("df3", df3)
msg = "all tables must have exactly the same nrows!"
with pytest.raises(ValueError, match=msg):
store.select_as_multiple(
["df1", "df3"], where=["A>0", "B>0"], selector="df1"
)
def test_nan_selection_bug_4858(setup_path):
with ensure_clean_store(setup_path) as store:
df = DataFrame({"cols": range(6), "values": range(6)}, dtype="float64")
df["cols"] = (df["cols"] + 10).apply(str)
df.iloc[0] = np.nan
expected = DataFrame(
{"cols": ["13.0", "14.0", "15.0"], "values": [3.0, 4.0, 5.0]},
index=[3, 4, 5],
)
# write w/o the index on that particular column
store.append("df", df, data_columns=True, index=["cols"])
result = store.select("df", where="values>2.0")
tm.assert_frame_equal(result, expected)
def test_query_with_nested_special_character(setup_path):
df = DataFrame(
{
"a": ["a", "a", "c", "b", "test & test", "c", "b", "e"],
"b": [1, 2, 3, 4, 5, 6, 7, 8],
}
)
expected = df[df.a == "test & test"]
with ensure_clean_store(setup_path) as store:
store.append("test", df, format="table", data_columns=True)
result = store.select("test", 'a = "test & test"')
tm.assert_frame_equal(expected, result)
def test_query_long_float_literal(setup_path):
# GH 14241
df = DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]})
with ensure_clean_store(setup_path) as store:
store.append("test", df, format="table", data_columns=True)
cutoff = 1000000000.0006
result = store.select("test", f"A < {cutoff:.4f}")
assert result.empty
cutoff = 1000000000.0010
result = store.select("test", f"A > {cutoff:.4f}")
expected = df.loc[[1, 2], :]
tm.assert_frame_equal(expected, result)
exact = 1000000000.0011
result = store.select("test", f"A == {exact:.4f}")
expected = df.loc[[1], :]
tm.assert_frame_equal(expected, result)
def test_query_compare_column_type(setup_path):
# GH 15492
df = DataFrame(
{
"date": ["2014-01-01", "2014-01-02"],
"real_date": date_range("2014-01-01", periods=2),
"float": [1.1, 1.2],
"int": [1, 2],
},
columns=["date", "real_date", "float", "int"],
)
with ensure_clean_store(setup_path) as store:
store.append("test", df, format="table", data_columns=True)
ts = Timestamp("2014-01-01") # noqa: F841
result = store.select("test", where="real_date > ts")
expected = df.loc[[1], :]
tm.assert_frame_equal(expected, result)
for op in ["<", ">", "=="]:
# non strings to string column always fail
for v in [2.1, True, Timestamp("2014-01-01"), pd.Timedelta(1, "s")]:
query = f"date {op} v"
msg = f"Cannot compare {v} of type {type(v)} to string column"
with pytest.raises(TypeError, match=msg):
store.select("test", where=query)
# strings to other columns must be convertible to type
v = "a"
for col in ["int", "float", "real_date"]:
query = f"{col} {op} v"
if col == "real_date":
msg = 'Given date string "a" not likely a datetime'
else:
msg = "could not convert string to"
with pytest.raises(ValueError, match=msg):
store.select("test", where=query)
for v, col in zip(
["1", "1.1", "2014-01-01"], ["int", "float", "real_date"]
):
query = f"{col} {op} v"
result = store.select("test", where=query)
if op == "==":
expected = df.loc[[0], :]
elif op == ">":
expected = df.loc[[1], :]
else:
expected = df.loc[[], :]
tm.assert_frame_equal(expected, result)
@pytest.mark.parametrize("where", ["", (), (None,), [], [None]])
def test_select_empty_where(tmp_path, where):
# GH26610
df = DataFrame([1, 2, 3])
path = tmp_path / "empty_where.h5"
with HDFStore(path) as store:
store.put("df", df, "t")
result = read_hdf(store, "df", where=where)
tm.assert_frame_equal(result, df)
def test_select_large_integer(tmp_path):
path = tmp_path / "large_int.h5"
df = DataFrame(
zip(
["a", "b", "c", "d"],
[-9223372036854775801, -9223372036854775802, -9223372036854775803, 123],
),
columns=["x", "y"],
)
result = None
with HDFStore(path) as s:
s.append("data", df, data_columns=True, index=False)
result = s.select("data", where="y==-9223372036854775801").get("y").get(0)
expected = df["y"][0]
assert expected == result