keshe/.venv/Lib/site-packages/pandas/tests/io/test_orc.py

""" test orc compat """
import datetime
from decimal import Decimal
from io import BytesIO
import os
import pathlib

import numpy as np
import pytest

import pandas as pd
from pandas import read_orc
import pandas._testing as tm
from pandas.core.arrays import StringArray

pytest.importorskip("pyarrow.orc")

import pyarrow as pa

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@pytest.fixture
def dirpath(datapath):
    return datapath("io", "data", "orc")


@pytest.fixture(
    params=[
        np.array([1, 20], dtype="uint64"),
        pd.Series(["a", "b", "a"], dtype="category"),
        [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)],
        [pd.Period("2022-01-03", freq="D"), pd.Period("2022-01-04", freq="D")],
    ]
)
def orc_writer_dtypes_not_supported(request):
    # Examples of dataframes with dtypes for which conversion to ORC
    # hasn't been implemented yet, that is, Category, unsigned integers,
    # interval, period and sparse.
    return pd.DataFrame({"unimpl": request.param})


def test_orc_reader_empty(dirpath):
    columns = [
        "boolean1",
        "byte1",
        "short1",
        "int1",
        "long1",
        "float1",
        "double1",
        "bytes1",
        "string1",
    ]
    dtypes = [
        "bool",
        "int8",
        "int16",
        "int32",
        "int64",
        "float32",
        "float64",
        "object",
        "object",
    ]
    expected = pd.DataFrame(index=pd.RangeIndex(0))
    for colname, dtype in zip(columns, dtypes):
        expected[colname] = pd.Series(dtype=dtype)

    inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
    got = read_orc(inputfile, columns=columns)

    tm.assert_equal(expected, got)


def test_orc_reader_basic(dirpath):
    data = {
        "boolean1": np.array([False, True], dtype="bool"),
        "byte1": np.array([1, 100], dtype="int8"),
        "short1": np.array([1024, 2048], dtype="int16"),
        "int1": np.array([65536, 65536], dtype="int32"),
        "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
        "float1": np.array([1.0, 2.0], dtype="float32"),
        "double1": np.array([-15.0, -5.0], dtype="float64"),
        "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
        "string1": np.array(["hi", "bye"], dtype="object"),
    }
    expected = pd.DataFrame.from_dict(data)

    inputfile = os.path.join(dirpath, "TestOrcFile.test1.orc")
    got = read_orc(inputfile, columns=data.keys())

    tm.assert_equal(expected, got)


def test_orc_reader_decimal(dirpath):
    # Only testing the first 10 rows of data
    data = {
        "_col0": np.array(
            [
                Decimal("-1000.50000"),
                Decimal("-999.60000"),
                Decimal("-998.70000"),
                Decimal("-997.80000"),
                Decimal("-996.90000"),
                Decimal("-995.10000"),
                Decimal("-994.11000"),
                Decimal("-993.12000"),
                Decimal("-992.13000"),
                Decimal("-991.14000"),
            ],
            dtype="object",
        )
    }
    expected = pd.DataFrame.from_dict(data)

    inputfile = os.path.join(dirpath, "TestOrcFile.decimal.orc")
    got = read_orc(inputfile).iloc[:10]

    tm.assert_equal(expected, got)


def test_orc_reader_date_low(dirpath):
    data = {
        "time": np.array(
            [
                "1900-05-05 12:34:56.100000",
                "1900-05-05 12:34:56.100100",
                "1900-05-05 12:34:56.100200",
                "1900-05-05 12:34:56.100300",
                "1900-05-05 12:34:56.100400",
                "1900-05-05 12:34:56.100500",
                "1900-05-05 12:34:56.100600",
                "1900-05-05 12:34:56.100700",
                "1900-05-05 12:34:56.100800",
                "1900-05-05 12:34:56.100900",
            ],
            dtype="datetime64[ns]",
        ),
        "date": np.array(
            [
                datetime.date(1900, 12, 25),
                datetime.date(1900, 12, 25),
                datetime.date(1900, 12, 25),
                datetime.date(1900, 12, 25),
                datetime.date(1900, 12, 25),
                datetime.date(1900, 12, 25),
                datetime.date(1900, 12, 25),
                datetime.date(1900, 12, 25),
                datetime.date(1900, 12, 25),
                datetime.date(1900, 12, 25),
            ],
            dtype="object",
        ),
    }
    expected = pd.DataFrame.from_dict(data)

    inputfile = os.path.join(dirpath, "TestOrcFile.testDate1900.orc")
    got = read_orc(inputfile).iloc[:10]

    tm.assert_equal(expected, got)


def test_orc_reader_date_high(dirpath):
    data = {
        "time": np.array(
            [
                "2038-05-05 12:34:56.100000",
                "2038-05-05 12:34:56.100100",
                "2038-05-05 12:34:56.100200",
                "2038-05-05 12:34:56.100300",
                "2038-05-05 12:34:56.100400",
                "2038-05-05 12:34:56.100500",
                "2038-05-05 12:34:56.100600",
                "2038-05-05 12:34:56.100700",
                "2038-05-05 12:34:56.100800",
                "2038-05-05 12:34:56.100900",
            ],
            dtype="datetime64[ns]",
        ),
        "date": np.array(
            [
                datetime.date(2038, 12, 25),
                datetime.date(2038, 12, 25),
                datetime.date(2038, 12, 25),
                datetime.date(2038, 12, 25),
                datetime.date(2038, 12, 25),
                datetime.date(2038, 12, 25),
                datetime.date(2038, 12, 25),
                datetime.date(2038, 12, 25),
                datetime.date(2038, 12, 25),
                datetime.date(2038, 12, 25),
            ],
            dtype="object",
        ),
    }
    expected = pd.DataFrame.from_dict(data)

    inputfile = os.path.join(dirpath, "TestOrcFile.testDate2038.orc")
    got = read_orc(inputfile).iloc[:10]

    tm.assert_equal(expected, got)


def test_orc_reader_snappy_compressed(dirpath):
    data = {
        "int1": np.array(
            [
                -1160101563,
                1181413113,
                2065821249,
                -267157795,
                172111193,
                1752363137,
                1406072123,
                1911809390,
                -1308542224,
                -467100286,
            ],
            dtype="int32",
        ),
        "string1": np.array(
            [
                "f50dcb8",
                "382fdaaa",
                "90758c6",
                "9e8caf3f",
                "ee97332b",
                "d634da1",
                "2bea4396",
                "d67d89e8",
                "ad71007e",
                "e8c82066",
            ],
            dtype="object",
        ),
    }
    expected = pd.DataFrame.from_dict(data)

    inputfile = os.path.join(dirpath, "TestOrcFile.testSnappy.orc")
    got = read_orc(inputfile).iloc[:10]

    tm.assert_equal(expected, got)


def test_orc_roundtrip_file(dirpath):
    # GH44554
    # PyArrow gained ORC write support with the current argument order
    pytest.importorskip("pyarrow")

    data = {
        "boolean1": np.array([False, True], dtype="bool"),
        "byte1": np.array([1, 100], dtype="int8"),
        "short1": np.array([1024, 2048], dtype="int16"),
        "int1": np.array([65536, 65536], dtype="int32"),
        "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
        "float1": np.array([1.0, 2.0], dtype="float32"),
        "double1": np.array([-15.0, -5.0], dtype="float64"),
        "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
        "string1": np.array(["hi", "bye"], dtype="object"),
    }
    expected = pd.DataFrame.from_dict(data)

    with tm.ensure_clean() as path:
        expected.to_orc(path)
        got = read_orc(path)

        tm.assert_equal(expected, got)


def test_orc_roundtrip_bytesio():
    # GH44554
    # PyArrow gained ORC write support with the current argument order
    pytest.importorskip("pyarrow")

    data = {
        "boolean1": np.array([False, True], dtype="bool"),
        "byte1": np.array([1, 100], dtype="int8"),
        "short1": np.array([1024, 2048], dtype="int16"),
        "int1": np.array([65536, 65536], dtype="int32"),
        "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
        "float1": np.array([1.0, 2.0], dtype="float32"),
        "double1": np.array([-15.0, -5.0], dtype="float64"),
        "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
        "string1": np.array(["hi", "bye"], dtype="object"),
    }
    expected = pd.DataFrame.from_dict(data)

    bytes = expected.to_orc()
    got = read_orc(BytesIO(bytes))

    tm.assert_equal(expected, got)


def test_orc_writer_dtypes_not_supported(orc_writer_dtypes_not_supported):
    # GH44554
    # PyArrow gained ORC write support with the current argument order
    pytest.importorskip("pyarrow")

    msg = "The dtype of one or more columns is not supported yet."
    with pytest.raises(NotImplementedError, match=msg):
        orc_writer_dtypes_not_supported.to_orc()


def test_orc_dtype_backend_pyarrow():
    pytest.importorskip("pyarrow")
    df = pd.DataFrame(
        {
            "string": list("abc"),
            "string_with_nan": ["a", np.nan, "c"],
            "string_with_none": ["a", None, "c"],
            "bytes": [b"foo", b"bar", None],
            "int": list(range(1, 4)),
            "float": np.arange(4.0, 7.0, dtype="float64"),
            "float_with_nan": [2.0, np.nan, 3.0],
            "bool": [True, False, True],
            "bool_with_na": [True, False, None],
            "datetime": pd.date_range("20130101", periods=3),
            "datetime_with_nat": [
                pd.Timestamp("20130101"),
                pd.NaT,
                pd.Timestamp("20130103"),
            ],
        }
    )

    bytes_data = df.copy().to_orc()
    result = read_orc(BytesIO(bytes_data), dtype_backend="pyarrow")

    expected = pd.DataFrame(
        {
            col: pd.arrays.ArrowExtensionArray(pa.array(df[col], from_pandas=True))
            for col in df.columns
        }
    )

    tm.assert_frame_equal(result, expected)


def test_orc_dtype_backend_numpy_nullable():
    # GH#50503
    pytest.importorskip("pyarrow")
    df = pd.DataFrame(
        {
            "string": list("abc"),
            "string_with_nan": ["a", np.nan, "c"],
            "string_with_none": ["a", None, "c"],
            "int": list(range(1, 4)),
            "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
            "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
            "float": np.arange(4.0, 7.0, dtype="float64"),
            "float_with_nan": [2.0, np.nan, 3.0],
            "bool": [True, False, True],
            "bool_with_na": [True, False, None],
        }
    )

    bytes_data = df.copy().to_orc()
    result = read_orc(BytesIO(bytes_data), dtype_backend="numpy_nullable")

    expected = pd.DataFrame(
        {
            "string": StringArray(np.array(["a", "b", "c"], dtype=np.object_)),
            "string_with_nan": StringArray(
                np.array(["a", pd.NA, "c"], dtype=np.object_)
            ),
            "string_with_none": StringArray(
                np.array(["a", pd.NA, "c"], dtype=np.object_)
            ),
            "int": pd.Series([1, 2, 3], dtype="Int64"),
            "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
            "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
            "float": pd.Series([4.0, 5.0, 6.0], dtype="Float64"),
            "float_with_nan": pd.Series([2.0, pd.NA, 3.0], dtype="Float64"),
            "bool": pd.Series([True, False, True], dtype="boolean"),
            "bool_with_na": pd.Series([True, False, pd.NA], dtype="boolean"),
        }
    )

    tm.assert_frame_equal(result, expected)


def test_orc_uri_path():
    expected = pd.DataFrame({"int": list(range(1, 4))})
    with tm.ensure_clean("tmp.orc") as path:
        expected.to_orc(path)
        uri = pathlib.Path(path).as_uri()
        result = read_orc(uri)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "index",
    [
        pd.RangeIndex(start=2, stop=5, step=1),
        pd.RangeIndex(start=0, stop=3, step=1, name="non-default"),
        pd.Index([1, 2, 3]),
    ],
)
def test_to_orc_non_default_index(index):
    df = pd.DataFrame({"a": [1, 2, 3]}, index=index)
    msg = (
        "orc does not support serializing a non-default index|"
        "orc does not serialize index meta-data"
    )
    with pytest.raises(ValueError, match=msg):
        df.to_orc()


def test_invalid_dtype_backend():
    msg = (
        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
        "'pyarrow' are allowed."
    )
    df = pd.DataFrame({"int": list(range(1, 4))})
    with tm.ensure_clean("tmp.orc") as path:
        df.to_orc(path)
        with pytest.raises(ValueError, match=msg):
            read_orc(path, dtype_backend="numpy")


def test_string_inference(tmp_path):
    # GH#54431
    path = tmp_path / "test_string_inference.p"
    df = pd.DataFrame(data={"a": ["x", "y"]})
    df.to_orc(path)
    with pd.option_context("future.infer_string", True):
        result = read_orc(path)
    expected = pd.DataFrame(
        data={"a": ["x", "y"]},
        dtype="string[pyarrow_numpy]",
        columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"),
    )
    tm.assert_frame_equal(result, expected)