You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

420 lines
14 KiB

import contextlib
from datetime import datetime
import io
import os
from pathlib import Path
import numpy as np
import pytest
from pandas.compat import IS64
from pandas.errors import EmptyDataError
import pandas.util._test_decorators as td
import pandas as pd
import pandas._testing as tm
from pandas.io.sas.sas7bdat import SAS7BDATReader
@pytest.fixture
def dirpath(datapath):
return datapath("io", "sas", "data")
@pytest.fixture(params=[(1, range(1, 16)), (2, [16])])
def data_test_ix(request, dirpath):
i, test_ix = request.param
fname = os.path.join(dirpath, f"test_sas7bdat_{i}.csv")
df = pd.read_csv(fname)
epoch = datetime(1960, 1, 1)
t1 = pd.to_timedelta(df["Column4"], unit="d")
df["Column4"] = (epoch + t1).astype("M8[s]")
t2 = pd.to_timedelta(df["Column12"], unit="d")
df["Column12"] = (epoch + t2).astype("M8[s]")
for k in range(df.shape[1]):
col = df.iloc[:, k]
if col.dtype == np.int64:
df.isetitem(k, df.iloc[:, k].astype(np.float64))
return df, test_ix
# https://github.com/cython/cython/issues/1720
class TestSAS7BDAT:
@pytest.mark.slow
def test_from_file(self, dirpath, data_test_ix):
expected, test_ix = data_test_ix
for k in test_ix:
fname = os.path.join(dirpath, f"test{k}.sas7bdat")
df = pd.read_sas(fname, encoding="utf-8")
tm.assert_frame_equal(df, expected)
@pytest.mark.slow
def test_from_buffer(self, dirpath, data_test_ix):
expected, test_ix = data_test_ix
for k in test_ix:
fname = os.path.join(dirpath, f"test{k}.sas7bdat")
with open(fname, "rb") as f:
byts = f.read()
buf = io.BytesIO(byts)
with pd.read_sas(
buf, format="sas7bdat", iterator=True, encoding="utf-8"
) as rdr:
df = rdr.read()
tm.assert_frame_equal(df, expected)
@pytest.mark.slow
def test_from_iterator(self, dirpath, data_test_ix):
expected, test_ix = data_test_ix
for k in test_ix:
fname = os.path.join(dirpath, f"test{k}.sas7bdat")
with pd.read_sas(fname, iterator=True, encoding="utf-8") as rdr:
df = rdr.read(2)
tm.assert_frame_equal(df, expected.iloc[0:2, :])
df = rdr.read(3)
tm.assert_frame_equal(df, expected.iloc[2:5, :])
@pytest.mark.slow
def test_path_pathlib(self, dirpath, data_test_ix):
expected, test_ix = data_test_ix
for k in test_ix:
fname = Path(os.path.join(dirpath, f"test{k}.sas7bdat"))
df = pd.read_sas(fname, encoding="utf-8")
tm.assert_frame_equal(df, expected)
@td.skip_if_no("py.path")
@pytest.mark.slow
def test_path_localpath(self, dirpath, data_test_ix):
from py.path import local as LocalPath
expected, test_ix = data_test_ix
for k in test_ix:
fname = LocalPath(os.path.join(dirpath, f"test{k}.sas7bdat"))
df = pd.read_sas(fname, encoding="utf-8")
tm.assert_frame_equal(df, expected)
@pytest.mark.slow
@pytest.mark.parametrize("chunksize", (3, 5, 10, 11))
@pytest.mark.parametrize("k", range(1, 17))
def test_iterator_loop(self, dirpath, k, chunksize):
# github #13654
fname = os.path.join(dirpath, f"test{k}.sas7bdat")
with pd.read_sas(fname, chunksize=chunksize, encoding="utf-8") as rdr:
y = 0
for x in rdr:
y += x.shape[0]
assert y == rdr.row_count
def test_iterator_read_too_much(self, dirpath):
# github #14734
fname = os.path.join(dirpath, "test1.sas7bdat")
with pd.read_sas(
fname, format="sas7bdat", iterator=True, encoding="utf-8"
) as rdr:
d1 = rdr.read(rdr.row_count + 20)
with pd.read_sas(fname, iterator=True, encoding="utf-8") as rdr:
d2 = rdr.read(rdr.row_count + 20)
tm.assert_frame_equal(d1, d2)
def test_encoding_options(datapath):
fname = datapath("io", "sas", "data", "test1.sas7bdat")
df1 = pd.read_sas(fname)
df2 = pd.read_sas(fname, encoding="utf-8")
for col in df1.columns:
try:
df1[col] = df1[col].str.decode("utf-8")
except AttributeError:
pass
tm.assert_frame_equal(df1, df2)
with contextlib.closing(SAS7BDATReader(fname, convert_header_text=False)) as rdr:
df3 = rdr.read()
for x, y in zip(df1.columns, df3.columns):
assert x == y.decode()
def test_encoding_infer(datapath):
fname = datapath("io", "sas", "data", "test1.sas7bdat")
with pd.read_sas(fname, encoding="infer", iterator=True) as df1_reader:
# check: is encoding inferred correctly from file
assert df1_reader.inferred_encoding == "cp1252"
df1 = df1_reader.read()
with pd.read_sas(fname, encoding="cp1252", iterator=True) as df2_reader:
df2 = df2_reader.read()
# check: reader reads correct information
tm.assert_frame_equal(df1, df2)
def test_productsales(datapath):
fname = datapath("io", "sas", "data", "productsales.sas7bdat")
df = pd.read_sas(fname, encoding="utf-8")
fname = datapath("io", "sas", "data", "productsales.csv")
df0 = pd.read_csv(fname, parse_dates=["MONTH"])
vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"]
df0[vn] = df0[vn].astype(np.float64)
df0["MONTH"] = df0["MONTH"].astype("M8[s]")
tm.assert_frame_equal(df, df0)
def test_12659(datapath):
fname = datapath("io", "sas", "data", "test_12659.sas7bdat")
df = pd.read_sas(fname)
fname = datapath("io", "sas", "data", "test_12659.csv")
df0 = pd.read_csv(fname)
df0 = df0.astype(np.float64)
tm.assert_frame_equal(df, df0)
def test_airline(datapath):
fname = datapath("io", "sas", "data", "airline.sas7bdat")
df = pd.read_sas(fname)
fname = datapath("io", "sas", "data", "airline.csv")
df0 = pd.read_csv(fname)
df0 = df0.astype(np.float64)
tm.assert_frame_equal(df, df0)
def test_date_time(datapath):
# Support of different SAS date/datetime formats (PR #15871)
fname = datapath("io", "sas", "data", "datetime.sas7bdat")
df = pd.read_sas(fname)
fname = datapath("io", "sas", "data", "datetime.csv")
df0 = pd.read_csv(
fname, parse_dates=["Date1", "Date2", "DateTime", "DateTimeHi", "Taiw"]
)
# GH 19732: Timestamps imported from sas will incur floating point errors
# See GH#56014 for discussion of the correct "expected" results
# We are really just testing that we are "close". This only seems to be
# an issue near the implementation bounds.
df[df.columns[3]] = df.iloc[:, 3].dt.round("us")
df0["Date1"] = df0["Date1"].astype("M8[s]")
df0["Date2"] = df0["Date2"].astype("M8[s]")
df0["DateTime"] = df0["DateTime"].astype("M8[ms]")
df0["Taiw"] = df0["Taiw"].astype("M8[s]")
res = df0["DateTimeHi"].astype("M8[us]").dt.round("ms")
df0["DateTimeHi"] = res.astype("M8[ms]")
if not IS64:
# No good reason for this, just what we get on the CI
df0.loc[0, "DateTimeHi"] += np.timedelta64(1, "ms")
df0.loc[[2, 3], "DateTimeHi"] -= np.timedelta64(1, "ms")
tm.assert_frame_equal(df, df0)
@pytest.mark.parametrize("column", ["WGT", "CYL"])
def test_compact_numerical_values(datapath, column):
# Regression test for #21616
fname = datapath("io", "sas", "data", "cars.sas7bdat")
df = pd.read_sas(fname, encoding="latin-1")
# The two columns CYL and WGT in cars.sas7bdat have column
# width < 8 and only contain integral values.
# Test that pandas doesn't corrupt the numbers by adding
# decimals.
result = df[column]
expected = df[column].round()
tm.assert_series_equal(result, expected, check_exact=True)
def test_many_columns(datapath):
# Test for looking for column information in more places (PR #22628)
fname = datapath("io", "sas", "data", "many_columns.sas7bdat")
df = pd.read_sas(fname, encoding="latin-1")
fname = datapath("io", "sas", "data", "many_columns.csv")
df0 = pd.read_csv(fname, encoding="latin-1")
tm.assert_frame_equal(df, df0)
def test_inconsistent_number_of_rows(datapath):
# Regression test for issue #16615. (PR #22628)
fname = datapath("io", "sas", "data", "load_log.sas7bdat")
df = pd.read_sas(fname, encoding="latin-1")
assert len(df) == 2097
def test_zero_variables(datapath):
# Check if the SAS file has zero variables (PR #18184)
fname = datapath("io", "sas", "data", "zero_variables.sas7bdat")
with pytest.raises(EmptyDataError, match="No columns to parse from file"):
pd.read_sas(fname)
def test_zero_rows(datapath):
# GH 18198
fname = datapath("io", "sas", "data", "zero_rows.sas7bdat")
result = pd.read_sas(fname)
expected = pd.DataFrame([{"char_field": "a", "num_field": 1.0}]).iloc[:0]
tm.assert_frame_equal(result, expected)
def test_corrupt_read(datapath):
# We don't really care about the exact failure, the important thing is
# that the resource should be cleaned up afterwards (BUG #35566)
fname = datapath("io", "sas", "data", "corrupt.sas7bdat")
msg = "'SAS7BDATReader' object has no attribute 'row_count'"
with pytest.raises(AttributeError, match=msg):
pd.read_sas(fname)
def test_max_sas_date(datapath):
# GH 20927
# NB. max datetime in SAS dataset is 31DEC9999:23:59:59.999
# but this is read as 29DEC9999:23:59:59.998993 by a buggy
# sas7bdat module
# See also GH#56014 for discussion of the correct "expected" results.
fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat")
df = pd.read_sas(fname, encoding="iso-8859-1")
expected = pd.DataFrame(
{
"text": ["max", "normal"],
"dt_as_float": [253717747199.999, 1880323199.999],
"dt_as_dt": np.array(
[
datetime(9999, 12, 29, 23, 59, 59, 999000),
datetime(2019, 8, 1, 23, 59, 59, 999000),
],
dtype="M8[ms]",
),
"date_as_float": [2936547.0, 21762.0],
"date_as_date": np.array(
[
datetime(9999, 12, 29),
datetime(2019, 8, 1),
],
dtype="M8[s]",
),
},
columns=["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"],
)
if not IS64:
# No good reason for this, just what we get on the CI
expected.loc[:, "dt_as_dt"] -= np.timedelta64(1, "ms")
tm.assert_frame_equal(df, expected)
def test_max_sas_date_iterator(datapath):
# GH 20927
# when called as an iterator, only those chunks with a date > pd.Timestamp.max
# are returned as datetime.datetime, if this happens that whole chunk is returned
# as datetime.datetime
col_order = ["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"]
fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat")
results = []
for df in pd.read_sas(fname, encoding="iso-8859-1", chunksize=1):
# GH 19732: Timestamps imported from sas will incur floating point errors
df.reset_index(inplace=True, drop=True)
results.append(df)
expected = [
pd.DataFrame(
{
"text": ["max"],
"dt_as_float": [253717747199.999],
"dt_as_dt": np.array(
[datetime(9999, 12, 29, 23, 59, 59, 999000)], dtype="M8[ms]"
),
"date_as_float": [2936547.0],
"date_as_date": np.array([datetime(9999, 12, 29)], dtype="M8[s]"),
},
columns=col_order,
),
pd.DataFrame(
{
"text": ["normal"],
"dt_as_float": [1880323199.999],
"dt_as_dt": np.array(["2019-08-01 23:59:59.999"], dtype="M8[ms]"),
"date_as_float": [21762.0],
"date_as_date": np.array(["2019-08-01"], dtype="M8[s]"),
},
columns=col_order,
),
]
if not IS64:
# No good reason for this, just what we get on the CI
expected[0].loc[0, "dt_as_dt"] -= np.timedelta64(1, "ms")
expected[1].loc[0, "dt_as_dt"] -= np.timedelta64(1, "ms")
tm.assert_frame_equal(results[0], expected[0])
tm.assert_frame_equal(results[1], expected[1])
def test_null_date(datapath):
fname = datapath("io", "sas", "data", "dates_null.sas7bdat")
df = pd.read_sas(fname, encoding="utf-8")
expected = pd.DataFrame(
{
"datecol": np.array(
[
datetime(9999, 12, 29),
np.datetime64("NaT"),
],
dtype="M8[s]",
),
"datetimecol": np.array(
[
datetime(9999, 12, 29, 23, 59, 59, 999000),
np.datetime64("NaT"),
],
dtype="M8[ms]",
),
},
)
if not IS64:
# No good reason for this, just what we get on the CI
expected.loc[0, "datetimecol"] -= np.timedelta64(1, "ms")
tm.assert_frame_equal(df, expected)
def test_meta2_page(datapath):
# GH 35545
fname = datapath("io", "sas", "data", "test_meta2_page.sas7bdat")
df = pd.read_sas(fname)
assert len(df) == 1000
@pytest.mark.parametrize(
"test_file, override_offset, override_value, expected_msg",
[
("test2.sas7bdat", 0x10000 + 55229, 0x80 | 0x0F, "Out of bounds"),
("test2.sas7bdat", 0x10000 + 55229, 0x10, "unknown control byte"),
("test3.sas7bdat", 118170, 184, "Out of bounds"),
],
)
def test_rle_rdc_exceptions(
datapath, test_file, override_offset, override_value, expected_msg
):
"""Errors in RLE/RDC decompression should propagate."""
with open(datapath("io", "sas", "data", test_file), "rb") as fd:
data = bytearray(fd.read())
data[override_offset] = override_value
with pytest.raises(Exception, match=expected_msg):
pd.read_sas(io.BytesIO(data), format="sas7bdat")
def test_0x40_control_byte(datapath):
# GH 31243
fname = datapath("io", "sas", "data", "0x40controlbyte.sas7bdat")
df = pd.read_sas(fname, encoding="ascii")
fname = datapath("io", "sas", "data", "0x40controlbyte.csv")
df0 = pd.read_csv(fname, dtype="object")
tm.assert_frame_equal(df, df0)
def test_0x00_control_byte(datapath):
# GH 47099
fname = datapath("io", "sas", "data", "0x00controlbyte.sas7bdat.bz2")
df = next(pd.read_sas(fname, chunksize=11_000))
assert df.shape == (11_000, 20)