You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
264 lines
7.3 KiB
264 lines
7.3 KiB
"""
|
|
Tests column conversion functionality during parsing
|
|
for all of the parsers defined in parsers.py
|
|
"""
|
|
from io import StringIO
|
|
|
|
from dateutil.parser import parse
|
|
import numpy as np
|
|
import pytest
|
|
|
|
import pandas as pd
|
|
from pandas import (
|
|
DataFrame,
|
|
Index,
|
|
)
|
|
import pandas._testing as tm
|
|
|
|
|
|
def test_converters_type_must_be_dict(all_parsers):
|
|
parser = all_parsers
|
|
data = """index,A,B,C,D
|
|
foo,2,3,4,5
|
|
"""
|
|
if parser.engine == "pyarrow":
|
|
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
|
|
with pytest.raises(ValueError, match=msg):
|
|
parser.read_csv(StringIO(data), converters=0)
|
|
return
|
|
with pytest.raises(TypeError, match="Type converters.+"):
|
|
parser.read_csv(StringIO(data), converters=0)
|
|
|
|
|
|
@pytest.mark.parametrize("column", [3, "D"])
|
|
@pytest.mark.parametrize(
|
|
"converter", [parse, lambda x: int(x.split("/")[2])] # Produce integer.
|
|
)
|
|
def test_converters(all_parsers, column, converter):
|
|
parser = all_parsers
|
|
data = """A,B,C,D
|
|
a,1,2,01/01/2009
|
|
b,3,4,01/02/2009
|
|
c,4,5,01/03/2009
|
|
"""
|
|
if parser.engine == "pyarrow":
|
|
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
|
|
with pytest.raises(ValueError, match=msg):
|
|
parser.read_csv(StringIO(data), converters={column: converter})
|
|
return
|
|
|
|
result = parser.read_csv(StringIO(data), converters={column: converter})
|
|
|
|
expected = parser.read_csv(StringIO(data))
|
|
expected["D"] = expected["D"].map(converter)
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_converters_no_implicit_conv(all_parsers):
|
|
# see gh-2184
|
|
parser = all_parsers
|
|
data = """000102,1.2,A\n001245,2,B"""
|
|
|
|
converters = {0: lambda x: x.strip()}
|
|
|
|
if parser.engine == "pyarrow":
|
|
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
|
|
with pytest.raises(ValueError, match=msg):
|
|
parser.read_csv(StringIO(data), header=None, converters=converters)
|
|
return
|
|
|
|
result = parser.read_csv(StringIO(data), header=None, converters=converters)
|
|
|
|
# Column 0 should not be casted to numeric and should remain as object.
|
|
expected = DataFrame([["000102", 1.2, "A"], ["001245", 2, "B"]])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_converters_euro_decimal_format(all_parsers):
|
|
# see gh-583
|
|
converters = {}
|
|
parser = all_parsers
|
|
|
|
data = """Id;Number1;Number2;Text1;Text2;Number3
|
|
1;1521,1541;187101,9543;ABC;poi;4,7387
|
|
2;121,12;14897,76;DEF;uyt;0,3773
|
|
3;878,158;108013,434;GHI;rez;2,7356"""
|
|
converters["Number1"] = converters["Number2"] = converters[
|
|
"Number3"
|
|
] = lambda x: float(x.replace(",", "."))
|
|
|
|
if parser.engine == "pyarrow":
|
|
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
|
|
with pytest.raises(ValueError, match=msg):
|
|
parser.read_csv(StringIO(data), sep=";", converters=converters)
|
|
return
|
|
|
|
result = parser.read_csv(StringIO(data), sep=";", converters=converters)
|
|
expected = DataFrame(
|
|
[
|
|
[1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387],
|
|
[2, 121.12, 14897.76, "DEF", "uyt", 0.3773],
|
|
[3, 878.158, 108013.434, "GHI", "rez", 2.7356],
|
|
],
|
|
columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_converters_corner_with_nans(all_parsers):
|
|
parser = all_parsers
|
|
data = """id,score,days
|
|
1,2,12
|
|
2,2-5,
|
|
3,,14+
|
|
4,6-12,2"""
|
|
|
|
# Example converters.
|
|
def convert_days(x):
|
|
x = x.strip()
|
|
|
|
if not x:
|
|
return np.nan
|
|
|
|
is_plus = x.endswith("+")
|
|
|
|
if is_plus:
|
|
x = int(x[:-1]) + 1
|
|
else:
|
|
x = int(x)
|
|
|
|
return x
|
|
|
|
def convert_days_sentinel(x):
|
|
x = x.strip()
|
|
|
|
if not x:
|
|
return np.nan
|
|
|
|
is_plus = x.endswith("+")
|
|
|
|
if is_plus:
|
|
x = int(x[:-1]) + 1
|
|
else:
|
|
x = int(x)
|
|
|
|
return x
|
|
|
|
def convert_score(x):
|
|
x = x.strip()
|
|
|
|
if not x:
|
|
return np.nan
|
|
|
|
if x.find("-") > 0:
|
|
val_min, val_max = map(int, x.split("-"))
|
|
val = 0.5 * (val_min + val_max)
|
|
else:
|
|
val = float(x)
|
|
|
|
return val
|
|
|
|
results = []
|
|
|
|
for day_converter in [convert_days, convert_days_sentinel]:
|
|
if parser.engine == "pyarrow":
|
|
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
|
|
with pytest.raises(ValueError, match=msg):
|
|
parser.read_csv(
|
|
StringIO(data),
|
|
converters={"score": convert_score, "days": day_converter},
|
|
na_values=["", None],
|
|
)
|
|
continue
|
|
|
|
result = parser.read_csv(
|
|
StringIO(data),
|
|
converters={"score": convert_score, "days": day_converter},
|
|
na_values=["", None],
|
|
)
|
|
assert pd.isna(result["days"][1])
|
|
results.append(result)
|
|
|
|
if parser.engine != "pyarrow":
|
|
tm.assert_frame_equal(results[0], results[1])
|
|
|
|
|
|
@pytest.mark.parametrize("conv_f", [lambda x: x, str])
|
|
def test_converter_index_col_bug(all_parsers, conv_f):
|
|
# see gh-1835 , GH#40589
|
|
parser = all_parsers
|
|
data = "A;B\n1;2\n3;4"
|
|
|
|
if parser.engine == "pyarrow":
|
|
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
|
|
with pytest.raises(ValueError, match=msg):
|
|
parser.read_csv(
|
|
StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
|
|
)
|
|
return
|
|
|
|
rs = parser.read_csv(
|
|
StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
|
|
)
|
|
|
|
xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object"))
|
|
tm.assert_frame_equal(rs, xp)
|
|
|
|
|
|
def test_converter_identity_object(all_parsers):
|
|
# GH#40589
|
|
parser = all_parsers
|
|
data = "A,B\n1,2\n3,4"
|
|
|
|
if parser.engine == "pyarrow":
|
|
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
|
|
with pytest.raises(ValueError, match=msg):
|
|
parser.read_csv(StringIO(data), converters={"A": lambda x: x})
|
|
return
|
|
|
|
rs = parser.read_csv(StringIO(data), converters={"A": lambda x: x})
|
|
|
|
xp = DataFrame({"A": ["1", "3"], "B": [2, 4]})
|
|
tm.assert_frame_equal(rs, xp)
|
|
|
|
|
|
def test_converter_multi_index(all_parsers):
|
|
# GH 42446
|
|
parser = all_parsers
|
|
data = "A,B,B\nX,Y,Z\n1,2,3"
|
|
|
|
if parser.engine == "pyarrow":
|
|
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
|
|
with pytest.raises(ValueError, match=msg):
|
|
parser.read_csv(
|
|
StringIO(data),
|
|
header=list(range(2)),
|
|
converters={
|
|
("A", "X"): np.int32,
|
|
("B", "Y"): np.int32,
|
|
("B", "Z"): np.float32,
|
|
},
|
|
)
|
|
return
|
|
|
|
result = parser.read_csv(
|
|
StringIO(data),
|
|
header=list(range(2)),
|
|
converters={
|
|
("A", "X"): np.int32,
|
|
("B", "Y"): np.int32,
|
|
("B", "Z"): np.float32,
|
|
},
|
|
)
|
|
|
|
expected = DataFrame(
|
|
{
|
|
("A", "X"): np.int32([1]),
|
|
("B", "Y"): np.int32([2]),
|
|
("B", "Z"): np.float32([3]),
|
|
}
|
|
)
|
|
|
|
tm.assert_frame_equal(result, expected)
|