You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

437 lines
13 KiB

6 months ago
""" test orc compat """
import datetime
from decimal import Decimal
from io import BytesIO
import os
import pathlib
import numpy as np
import pytest
import pandas as pd
from pandas import read_orc
import pandas._testing as tm
from pandas.core.arrays import StringArray
pytest.importorskip("pyarrow.orc")
import pyarrow as pa
# Module-level pytest mark: applies a warnings filter to the tests in this
# file so that a DeprecationWarning whose message begins with
# "Passing a BlockManager to DataFrame" is ignored.
# NOTE(review): presumably raised while pyarrow builds DataFrames during
# ORC reads -- confirm the source of the warning before removing this.
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@pytest.fixture
def dirpath(datapath):
    # Location of the ORC sample files, resolved through the ``datapath``
    # fixture; the reader tests join individual file names onto this value.
    orc_data_dir = datapath("io", "data", "orc")
    return orc_data_dir
@pytest.fixture(
    params=[
        np.array([1, 20], dtype="uint64"),
        pd.Series(["a", "b", "a"], dtype="category"),
        [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)],
        [pd.Period("2022-01-03", freq="D"), pd.Period("2022-01-04", freq="D")],
    ]
)
def orc_writer_dtypes_not_supported(request):
    # One single-column DataFrame per param. The params cover, in order:
    # unsigned integers, category, interval and period data, which the
    # writer test expects ``to_orc`` to reject.
    # NOTE(review): sparse data is not exercised by these params.
    unsupported_values = request.param
    return pd.DataFrame({"unimpl": unsupported_values})
def test_orc_reader_empty(dirpath):
    # Reading the empty sample file with an explicit column selection should
    # give a zero-row frame whose columns carry the dtypes listed here.
    schema = {
        "boolean1": "bool",
        "byte1": "int8",
        "short1": "int16",
        "int1": "int32",
        "long1": "int64",
        "float1": "float32",
        "double1": "float64",
        "bytes1": "object",
        "string1": "object",
    }

    # Build the expected frame column by column on top of an empty RangeIndex.
    expected = pd.DataFrame(index=pd.RangeIndex(0))
    for name, dtype_name in schema.items():
        expected[name] = pd.Series(dtype=dtype_name)

    orc_file = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
    got = read_orc(orc_file, columns=list(schema))

    tm.assert_equal(expected, got)
def test_orc_reader_basic(dirpath):
    # Read the two-row TestOrcFile.test1.orc sample, restricted to the
    # columns described below, and compare with the expected frame.
    # Each entry is (column name, values, numpy dtype).
    column_specs = [
        ("boolean1", [False, True], "bool"),
        ("byte1", [1, 100], "int8"),
        ("short1", [1024, 2048], "int16"),
        ("int1", [65536, 65536], "int32"),
        ("long1", [9223372036854775807, 9223372036854775807], "int64"),
        ("float1", [1.0, 2.0], "float32"),
        ("double1", [-15.0, -5.0], "float64"),
        ("bytes1", [b"\x00\x01\x02\x03\x04", b""], "object"),
        ("string1", ["hi", "bye"], "object"),
    ]
    data = {
        name: np.array(values, dtype=dtype) for name, values, dtype in column_specs
    }
    expected = pd.DataFrame.from_dict(data)

    orc_file = os.path.join(dirpath, "TestOrcFile.test1.orc")
    got = read_orc(orc_file, columns=data.keys())

    tm.assert_equal(expected, got)
def test_orc_reader_decimal(dirpath):
    # Only the first 10 rows of the decimal sample file are compared.
    decimal_texts = (
        "-1000.50000",
        "-999.60000",
        "-998.70000",
        "-997.80000",
        "-996.90000",
        "-995.10000",
        "-994.11000",
        "-993.12000",
        "-992.13000",
        "-991.14000",
    )
    # Decimal values are held in an object-dtype column.
    decimals = np.array([Decimal(text) for text in decimal_texts], dtype="object")
    expected = pd.DataFrame.from_dict({"_col0": decimals})

    orc_file = os.path.join(dirpath, "TestOrcFile.decimal.orc")
    got = read_orc(orc_file).iloc[:10]

    tm.assert_equal(expected, got)
def test_orc_reader_date_low(dirpath):
    # First ten rows of TestOrcFile.testDate1900.orc: a datetime64[ns]
    # "time" column and an object "date" column of datetime.date values.
    timestamps = [
        "1900-05-05 12:34:56.100000",
        "1900-05-05 12:34:56.100100",
        "1900-05-05 12:34:56.100200",
        "1900-05-05 12:34:56.100300",
        "1900-05-05 12:34:56.100400",
        "1900-05-05 12:34:56.100500",
        "1900-05-05 12:34:56.100600",
        "1900-05-05 12:34:56.100700",
        "1900-05-05 12:34:56.100800",
        "1900-05-05 12:34:56.100900",
    ]
    # The same date is expected in each of the ten rows.
    dates = [datetime.date(1900, 12, 25)] * 10
    expected = pd.DataFrame.from_dict(
        {
            "time": np.array(timestamps, dtype="datetime64[ns]"),
            "date": np.array(dates, dtype="object"),
        }
    )

    orc_file = os.path.join(dirpath, "TestOrcFile.testDate1900.orc")
    got = read_orc(orc_file).iloc[:10]

    tm.assert_equal(expected, got)
def test_orc_reader_date_high(dirpath):
    # First ten rows of TestOrcFile.testDate2038.orc: a datetime64[ns]
    # "time" column and an object "date" column of datetime.date values.
    timestamps = [
        "2038-05-05 12:34:56.100000",
        "2038-05-05 12:34:56.100100",
        "2038-05-05 12:34:56.100200",
        "2038-05-05 12:34:56.100300",
        "2038-05-05 12:34:56.100400",
        "2038-05-05 12:34:56.100500",
        "2038-05-05 12:34:56.100600",
        "2038-05-05 12:34:56.100700",
        "2038-05-05 12:34:56.100800",
        "2038-05-05 12:34:56.100900",
    ]
    # The same date is expected in each of the ten rows.
    dates = [datetime.date(2038, 12, 25)] * 10
    expected = pd.DataFrame.from_dict(
        {
            "time": np.array(timestamps, dtype="datetime64[ns]"),
            "date": np.array(dates, dtype="object"),
        }
    )

    orc_file = os.path.join(dirpath, "TestOrcFile.testDate2038.orc")
    got = read_orc(orc_file).iloc[:10]

    tm.assert_equal(expected, got)
def test_orc_reader_snappy_compressed(dirpath):
    # First ten rows of TestOrcFile.testSnappy.orc: an int32 column and an
    # object column of strings.
    int_values = [
        -1160101563,
        1181413113,
        2065821249,
        -267157795,
        172111193,
        1752363137,
        1406072123,
        1911809390,
        -1308542224,
        -467100286,
    ]
    string_values = [
        "f50dcb8",
        "382fdaaa",
        "90758c6",
        "9e8caf3f",
        "ee97332b",
        "d634da1",
        "2bea4396",
        "d67d89e8",
        "ad71007e",
        "e8c82066",
    ]
    expected = pd.DataFrame.from_dict(
        {
            "int1": np.array(int_values, dtype="int32"),
            "string1": np.array(string_values, dtype="object"),
        }
    )

    orc_file = os.path.join(dirpath, "TestOrcFile.testSnappy.orc")
    got = read_orc(orc_file).iloc[:10]

    tm.assert_equal(expected, got)
def test_orc_roundtrip_file(dirpath):
    # GH44554
    # Write a frame of primitive columns to a temporary ORC file, read it
    # back, and expect to get the same frame.
    pytest.importorskip("pyarrow")

    int64_max = 9223372036854775807
    expected = pd.DataFrame.from_dict(
        {
            "boolean1": np.array([False, True], dtype="bool"),
            "byte1": np.array([1, 100], dtype="int8"),
            "short1": np.array([1024, 2048], dtype="int16"),
            "int1": np.array([65536, 65536], dtype="int32"),
            "long1": np.array([int64_max, int64_max], dtype="int64"),
            "float1": np.array([1.0, 2.0], dtype="float32"),
            "double1": np.array([-15.0, -5.0], dtype="float64"),
            "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
            "string1": np.array(["hi", "bye"], dtype="object"),
        }
    )

    # The temporary path only exists inside this context manager.
    with tm.ensure_clean() as path:
        expected.to_orc(path)
        got = read_orc(path)

        tm.assert_equal(expected, got)
def test_orc_roundtrip_bytesio():
    # GH44554
    # Serialize a frame of primitive columns with ``to_orc()`` (no path, so
    # the serialized data is returned), read it back through a BytesIO
    # buffer, and expect to get the same frame.
    pytest.importorskip("pyarrow")

    data = {
        "boolean1": np.array([False, True], dtype="bool"),
        "byte1": np.array([1, 100], dtype="int8"),
        "short1": np.array([1024, 2048], dtype="int16"),
        "int1": np.array([65536, 65536], dtype="int32"),
        "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
        "float1": np.array([1.0, 2.0], dtype="float32"),
        "double1": np.array([-15.0, -5.0], dtype="float64"),
        "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
        "string1": np.array(["hi", "bye"], dtype="object"),
    }
    expected = pd.DataFrame.from_dict(data)

    # Named ``bytes_data`` (as in the dtype_backend tests) so the ``bytes``
    # builtin is not shadowed inside this function.
    bytes_data = expected.to_orc()
    got = read_orc(BytesIO(bytes_data))

    tm.assert_equal(expected, got)
def test_orc_writer_dtypes_not_supported(orc_writer_dtypes_not_supported):
    # GH44554
    # Each frame supplied by the fixture must make ``to_orc`` raise
    # NotImplementedError with the message below.
    pytest.importorskip("pyarrow")

    unsupported_frame = orc_writer_dtypes_not_supported
    expected_error = "The dtype of one or more columns is not supported yet."
    with pytest.raises(NotImplementedError, match=expected_error):
        unsupported_frame.to_orc()
def test_orc_dtype_backend_pyarrow():
    # Round-trip a mixed-dtype frame through ORC bytes and read it back with
    # dtype_backend="pyarrow"; every column should come back as an
    # ArrowExtensionArray built from the corresponding input column.
    pytest.importorskip("pyarrow")

    source_columns = {
        "string": list("abc"),
        "string_with_nan": ["a", np.nan, "c"],
        "string_with_none": ["a", None, "c"],
        "bytes": [b"foo", b"bar", None],
        "int": list(range(1, 4)),
        "float": np.arange(4.0, 7.0, dtype="float64"),
        "float_with_nan": [2.0, np.nan, 3.0],
        "bool": [True, False, True],
        "bool_with_na": [True, False, None],
        "datetime": pd.date_range("20130101", periods=3),
        "datetime_with_nat": [
            pd.Timestamp("20130101"),
            pd.NaT,
            pd.Timestamp("20130103"),
        ],
    }
    df = pd.DataFrame(source_columns)

    bytes_data = df.copy().to_orc()
    result = read_orc(BytesIO(bytes_data), dtype_backend="pyarrow")

    # Convert each input column to its pyarrow-backed equivalent.
    expected_columns = {}
    for name in df.columns:
        arrow_values = pa.array(df[name], from_pandas=True)
        expected_columns[name] = pd.arrays.ArrowExtensionArray(arrow_values)
    expected = pd.DataFrame(expected_columns)

    tm.assert_frame_equal(result, expected)
def test_orc_dtype_backend_numpy_nullable():
    # GH#50503
    # Round-trip a mixed-dtype frame through ORC bytes and read it back with
    # dtype_backend="numpy_nullable"; the expected frame uses StringArray,
    # Int64, Float64 and boolean columns with pd.NA for missing values.
    pytest.importorskip("pyarrow")

    def _string_array(values):
        # Wrap a list of str / pd.NA in a StringArray over an object ndarray.
        return StringArray(np.array(values, dtype=np.object_))

    source_columns = {
        "string": list("abc"),
        "string_with_nan": ["a", np.nan, "c"],
        "string_with_none": ["a", None, "c"],
        "int": list(range(1, 4)),
        "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
        "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
        "float": np.arange(4.0, 7.0, dtype="float64"),
        "float_with_nan": [2.0, np.nan, 3.0],
        "bool": [True, False, True],
        "bool_with_na": [True, False, None],
    }
    df = pd.DataFrame(source_columns)

    bytes_data = df.copy().to_orc()
    result = read_orc(BytesIO(bytes_data), dtype_backend="numpy_nullable")

    expected_columns = {
        "string": _string_array(["a", "b", "c"]),
        "string_with_nan": _string_array(["a", pd.NA, "c"]),
        "string_with_none": _string_array(["a", pd.NA, "c"]),
        "int": pd.Series([1, 2, 3], dtype="Int64"),
        "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
        "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
        "float": pd.Series([4.0, 5.0, 6.0], dtype="Float64"),
        "float_with_nan": pd.Series([2.0, pd.NA, 3.0], dtype="Float64"),
        "bool": pd.Series([True, False, True], dtype="boolean"),
        "bool_with_na": pd.Series([True, False, pd.NA], dtype="boolean"),
    }
    expected = pd.DataFrame(expected_columns)

    tm.assert_frame_equal(result, expected)
def test_orc_uri_path():
    # ``read_orc`` should accept the URI form of a local path
    # (as produced by ``pathlib.Path.as_uri``) instead of a plain path.
    expected = pd.DataFrame({"int": [1, 2, 3]})

    with tm.ensure_clean("tmp.orc") as path:
        expected.to_orc(path)
        file_uri = pathlib.Path(path).as_uri()
        result = read_orc(file_uri)

    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "index",
    [
        pd.RangeIndex(start=2, stop=5, step=1),
        pd.RangeIndex(start=0, stop=3, step=1, name="non-default"),
        pd.Index([1, 2, 3]),
    ],
)
def test_to_orc_non_default_index(index):
    # ``to_orc`` must raise ValueError for each parametrized index; either
    # of the two messages below is accepted.
    frame = pd.DataFrame({"a": [1, 2, 3]}, index=index)

    accepted_messages = (
        "orc does not support serializing a non-default index",
        "orc does not serialize index meta-data",
    )
    # Join into a single regex alternation for ``match``.
    pattern = "|".join(accepted_messages)

    with pytest.raises(ValueError, match=pattern):
        frame.to_orc()
def test_invalid_dtype_backend():
    # Passing dtype_backend="numpy" to ``read_orc`` must raise ValueError
    # with the message below.
    frame = pd.DataFrame({"int": [1, 2, 3]})
    expected_error = (
        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
        "'pyarrow' are allowed."
    )

    with tm.ensure_clean("tmp.orc") as path:
        # A real ORC file is written first so the read call has a valid input.
        frame.to_orc(path)
        with pytest.raises(ValueError, match=expected_error):
            read_orc(path, dtype_backend="numpy")
def test_string_inference(tmp_path):
    # GH#54431
    # With the "future.infer_string" option enabled, string data read from
    # ORC (values and column labels) should use "string[pyarrow_numpy]".
    path = tmp_path / "test_string_inference.p"
    source = pd.DataFrame(data={"a": ["x", "y"]})
    source.to_orc(path)

    # Only the read happens under the option.
    with pd.option_context("future.infer_string", True):
        result = read_orc(path)

    expected_labels = pd.Index(["a"], dtype="string[pyarrow_numpy]")
    expected = pd.DataFrame(
        data={"a": ["x", "y"]},
        dtype="string[pyarrow_numpy]",
        columns=expected_labels,
    )
    tm.assert_frame_equal(result, expected)