You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
428 lines
13 KiB
428 lines
13 KiB
from datetime import datetime
|
|
import operator
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from pandas import (
|
|
Series,
|
|
_testing as tm,
|
|
)
|
|
|
|
|
|
def test_title(any_string_dtype):
|
|
s = Series(["FOO", "BAR", np.nan, "Blah", "blurg"], dtype=any_string_dtype)
|
|
result = s.str.title()
|
|
expected = Series(["Foo", "Bar", np.nan, "Blah", "Blurg"], dtype=any_string_dtype)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_title_mixed_object():
    """Check ``Series.str.title`` on an object Series with mixed element types.

    The expected values pin the bool, datetime, int and float entries to NaN
    and keep the ``None`` entry as ``None``.
    """
    raw = ["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0]
    want = Series(
        ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan],
        dtype=object,
    )
    got = Series(raw).str.title()
    tm.assert_almost_equal(got, want)
|
|
|
|
|
|
def test_lower_upper(any_string_dtype):
|
|
s = Series(["om", np.nan, "nom", "nom"], dtype=any_string_dtype)
|
|
|
|
result = s.str.upper()
|
|
expected = Series(["OM", np.nan, "NOM", "NOM"], dtype=any_string_dtype)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = result.str.lower()
|
|
tm.assert_series_equal(result, s)
|
|
|
|
|
|
def test_lower_upper_mixed_object():
    """Check ``str.upper`` and ``str.lower`` on an object Series of mixed types.

    Only the string entries are expected to be transformed; the expected
    values pin every other non-``None`` entry to NaN.
    """
    ser = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0])

    # Accessor method name -> values it should produce, checked in order.
    expectations = {
        "upper": ["A", np.nan, "B", np.nan, np.nan, "FOO", None, np.nan, np.nan],
        "lower": ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan],
    }
    for method, values in expectations.items():
        got = getattr(ser.str, method)()
        tm.assert_series_equal(got, Series(values, dtype=object))
|
|
|
|
|
|
@pytest.mark.parametrize(
    "data, expected",
    [
        (
            ["FOO", "BAR", np.nan, "Blah", "blurg"],
            ["Foo", "Bar", np.nan, "Blah", "Blurg"],
        ),
        (["a", "b", "c"], ["A", "B", "C"]),
        (["a b", "a bc. de"], ["A b", "A bc. de"]),
    ],
)
def test_capitalize(data, expected, any_string_dtype):
    """Check ``Series.str.capitalize`` for each parametrized input/output pair."""
    want = Series(expected, dtype=any_string_dtype)
    got = Series(data, dtype=any_string_dtype).str.capitalize()
    tm.assert_series_equal(got, want)
|
|
|
|
|
|
def test_capitalize_mixed_object():
    """Check ``Series.str.capitalize`` on an object Series of mixed types.

    String entries are capitalized; the expected values pin the bool,
    datetime, int and float entries to NaN and keep ``None`` as ``None``.
    """
    raw = ["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0]
    want = Series(
        ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan],
        dtype=object,
    )
    got = Series(raw).str.capitalize()
    tm.assert_series_equal(got, want)
|
|
|
|
|
|
def test_swapcase(any_string_dtype):
|
|
s = Series(["FOO", "BAR", np.nan, "Blah", "blurg"], dtype=any_string_dtype)
|
|
result = s.str.swapcase()
|
|
expected = Series(["foo", "bar", np.nan, "bLAH", "BLURG"], dtype=any_string_dtype)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_swapcase_mixed_object():
    """Check ``Series.str.swapcase`` on an object Series of mixed types.

    String entries have their case swapped; the expected values pin the bool,
    datetime, int and float entries to NaN and keep ``None`` as ``None``.
    """
    raw = ["FOO", np.nan, "bar", True, datetime.today(), "Blah", None, 1, 2.0]
    want = Series(
        ["foo", np.nan, "BAR", np.nan, np.nan, "bLAH", None, np.nan, np.nan],
        dtype=object,
    )
    got = Series(raw).str.swapcase()
    tm.assert_series_equal(got, want)
|
|
|
|
|
|
def test_casefold():
    """Check that ``Series.str.casefold`` folds "ß" to "ss" and keeps NaN."""
    # GH25405
    ser = Series(["ß", np.nan, "case", "ßd"])
    want = Series(["ss", np.nan, "case", "ssd"])

    tm.assert_series_equal(ser.str.casefold(), want)
|
|
|
|
|
|
def test_casemethods(any_string_dtype):
|
|
values = ["aaa", "bbb", "CCC", "Dddd", "eEEE"]
|
|
s = Series(values, dtype=any_string_dtype)
|
|
assert s.str.lower().tolist() == [v.lower() for v in values]
|
|
assert s.str.upper().tolist() == [v.upper() for v in values]
|
|
assert s.str.title().tolist() == [v.title() for v in values]
|
|
assert s.str.capitalize().tolist() == [v.capitalize() for v in values]
|
|
assert s.str.swapcase().tolist() == [v.swapcase() for v in values]
|
|
|
|
|
|
def test_pad(any_string_dtype):
|
|
s = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"], dtype=any_string_dtype)
|
|
|
|
result = s.str.pad(5, side="left")
|
|
expected = Series(
|
|
[" a", " b", np.nan, " c", np.nan, "eeeeee"], dtype=any_string_dtype
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = s.str.pad(5, side="right")
|
|
expected = Series(
|
|
["a ", "b ", np.nan, "c ", np.nan, "eeeeee"], dtype=any_string_dtype
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = s.str.pad(5, side="both")
|
|
expected = Series(
|
|
[" a ", " b ", np.nan, " c ", np.nan, "eeeeee"], dtype=any_string_dtype
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_pad_mixed_object():
    """Check ``Series.str.pad`` to width 5 on an object Series of mixed types.

    String entries are padded with spaces to five characters; the expected
    values pin the bool, datetime, int and float entries to NaN and keep
    ``None`` as ``None``.
    """
    ser = Series(["a", np.nan, "b", True, datetime.today(), "ee", None, 1, 2.0])

    # side -> expected values.  Padded strings are exactly 5 characters wide.
    # For side="both" the odd leftover space of "ee" goes on the left,
    # matching ``"ee".center(5)``.
    expected_by_side = {
        "left": [
            "    a", np.nan, "    b", np.nan, np.nan, "   ee", None, np.nan, np.nan
        ],
        "right": [
            "a    ", np.nan, "b    ", np.nan, np.nan, "ee   ", None, np.nan, np.nan
        ],
        "both": [
            "  a  ", np.nan, "  b  ", np.nan, np.nan, "  ee ", None, np.nan, np.nan
        ],
    }
    for side, values in expected_by_side.items():
        result = ser.str.pad(5, side=side)
        expected = Series(values, dtype=object)
        tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_pad_fillchar(any_string_dtype):
|
|
s = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"], dtype=any_string_dtype)
|
|
|
|
result = s.str.pad(5, side="left", fillchar="X")
|
|
expected = Series(
|
|
["XXXXa", "XXXXb", np.nan, "XXXXc", np.nan, "eeeeee"], dtype=any_string_dtype
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = s.str.pad(5, side="right", fillchar="X")
|
|
expected = Series(
|
|
["aXXXX", "bXXXX", np.nan, "cXXXX", np.nan, "eeeeee"], dtype=any_string_dtype
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = s.str.pad(5, side="both", fillchar="X")
|
|
expected = Series(
|
|
["XXaXX", "XXbXX", np.nan, "XXcXX", np.nan, "eeeeee"], dtype=any_string_dtype
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_pad_fillchar_bad_arg_raises(any_string_dtype):
    """Check that ``Series.str.pad`` raises ``TypeError`` for a bad ``fillchar``.

    Covers a two-character string and an integer.
    """
    ser = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"], dtype=any_string_dtype)

    # (invalid fillchar, error message it must trigger)
    bad_fillchars = (
        ("XY", "fillchar must be a character, not str"),
        (5, "fillchar must be a character, not int"),
    )
    for fillchar, msg in bad_fillchars:
        with pytest.raises(TypeError, match=msg):
            ser.str.pad(5, fillchar=fillchar)
|
|
|
|
|
|
@pytest.mark.parametrize("method_name", ["center", "ljust", "rjust", "zfill", "pad"])
def test_pad_width_bad_arg_raises(method_name, any_string_dtype):
    """Check that each padding method raises ``TypeError`` for a string width."""
    # see gh-13598
    ser = Series(["1", "22", "a", "bb"], dtype=any_string_dtype)
    accessor = ser.str

    with pytest.raises(TypeError, match="width must be of integer type, not str"):
        # Call the named accessor method with "f" as its width argument.
        getattr(accessor, method_name)("f")
|
|
|
|
|
|
def test_center_ljust_rjust(any_string_dtype):
|
|
s = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"], dtype=any_string_dtype)
|
|
|
|
result = s.str.center(5)
|
|
expected = Series(
|
|
[" a ", " b ", np.nan, " c ", np.nan, "eeeeee"], dtype=any_string_dtype
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = s.str.ljust(5)
|
|
expected = Series(
|
|
["a ", "b ", np.nan, "c ", np.nan, "eeeeee"], dtype=any_string_dtype
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = s.str.rjust(5)
|
|
expected = Series(
|
|
[" a", " b", np.nan, " c", np.nan, "eeeeee"], dtype=any_string_dtype
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_center_ljust_rjust_mixed_object():
    """Check ``center``/``ljust``/``rjust`` (width 5) on a mixed object Series.

    String entries are space-padded to five characters; the expected values
    pin the bool, datetime, int and float entries to NaN and keep ``None`` as
    ``None``.
    """
    ser = Series(["a", np.nan, "b", True, datetime.today(), "c", "eee", None, 1, 2.0])

    # method -> expected values.  Padded strings are exactly 5 characters wide.
    expected_by_method = {
        "center": [
            "  a  ",
            np.nan,
            "  b  ",
            np.nan,
            np.nan,
            "  c  ",
            " eee ",
            None,
            np.nan,
            np.nan,
        ],
        "ljust": [
            "a    ",
            np.nan,
            "b    ",
            np.nan,
            np.nan,
            "c    ",
            "eee  ",
            None,
            np.nan,
            np.nan,
        ],
        "rjust": [
            "    a",
            np.nan,
            "    b",
            np.nan,
            np.nan,
            "    c",
            "  eee",
            None,
            np.nan,
            np.nan,
        ],
    }
    for method, values in expected_by_method.items():
        result = getattr(ser.str, method)(5)
        expected = Series(values, dtype=object)
        tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_center_ljust_rjust_fillchar(any_string_dtype):
|
|
if any_string_dtype == "string[pyarrow_numpy]":
|
|
pytest.skip(
|
|
"Arrow logic is different, "
|
|
"see https://github.com/pandas-dev/pandas/pull/54533/files#r1299808126",
|
|
)
|
|
s = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype)
|
|
|
|
result = s.str.center(5, fillchar="X")
|
|
expected = Series(
|
|
["XXaXX", "XXbbX", "Xcccc", "ddddd", "eeeeee"], dtype=any_string_dtype
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
expected = np.array([v.center(5, "X") for v in np.array(s)], dtype=np.object_)
|
|
tm.assert_numpy_array_equal(np.array(result, dtype=np.object_), expected)
|
|
|
|
result = s.str.ljust(5, fillchar="X")
|
|
expected = Series(
|
|
["aXXXX", "bbXXX", "ccccX", "ddddd", "eeeeee"], dtype=any_string_dtype
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
expected = np.array([v.ljust(5, "X") for v in np.array(s)], dtype=np.object_)
|
|
tm.assert_numpy_array_equal(np.array(result, dtype=np.object_), expected)
|
|
|
|
result = s.str.rjust(5, fillchar="X")
|
|
expected = Series(
|
|
["XXXXa", "XXXbb", "Xcccc", "ddddd", "eeeeee"], dtype=any_string_dtype
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
expected = np.array([v.rjust(5, "X") for v in np.array(s)], dtype=np.object_)
|
|
tm.assert_numpy_array_equal(np.array(result, dtype=np.object_), expected)
|
|
|
|
|
|
def test_center_ljust_rjust_fillchar_bad_arg_raises(any_string_dtype):
    """Check that ``center``/``ljust``/``rjust`` raise for an invalid ``fillchar``.

    Covers a two-character string and an integer; each must raise
    ``TypeError`` with a message naming the offending type.
    """
    ser = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype)

    # If fillchar is not a character, normal str raises TypeError
    # 'aaa'.ljust(5, 'XY')
    # TypeError: must be char, not str
    template = "fillchar must be a character, not {dtype}"

    # All three methods with the string fillchar first, then with the integer.
    for bad_fillchar, type_name in (("XY", "str"), (1, "int")):
        for method in ("center", "ljust", "rjust"):
            with pytest.raises(TypeError, match=template.format(dtype=type_name)):
                getattr(ser.str, method)(5, fillchar=bad_fillchar)
|
|
|
|
|
|
def test_zfill(any_string_dtype):
|
|
s = Series(["1", "22", "aaa", "333", "45678"], dtype=any_string_dtype)
|
|
|
|
result = s.str.zfill(5)
|
|
expected = Series(
|
|
["00001", "00022", "00aaa", "00333", "45678"], dtype=any_string_dtype
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
expected = np.array([v.zfill(5) for v in np.array(s)], dtype=np.object_)
|
|
tm.assert_numpy_array_equal(np.array(result, dtype=np.object_), expected)
|
|
|
|
result = s.str.zfill(3)
|
|
expected = Series(["001", "022", "aaa", "333", "45678"], dtype=any_string_dtype)
|
|
tm.assert_series_equal(result, expected)
|
|
expected = np.array([v.zfill(3) for v in np.array(s)], dtype=np.object_)
|
|
tm.assert_numpy_array_equal(np.array(result, dtype=np.object_), expected)
|
|
|
|
s = Series(["1", np.nan, "aaa", np.nan, "45678"], dtype=any_string_dtype)
|
|
result = s.str.zfill(5)
|
|
expected = Series(
|
|
["00001", np.nan, "00aaa", np.nan, "45678"], dtype=any_string_dtype
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_wrap(any_string_dtype):
|
|
# test values are: two words less than width, two words equal to width,
|
|
# two words greater than width, one word less than width, one word
|
|
# equal to width, one word greater than width, multiple tokens with
|
|
# trailing whitespace equal to width
|
|
s = Series(
|
|
[
|
|
"hello world",
|
|
"hello world!",
|
|
"hello world!!",
|
|
"abcdefabcde",
|
|
"abcdefabcdef",
|
|
"abcdefabcdefa",
|
|
"ab ab ab ab ",
|
|
"ab ab ab ab a",
|
|
"\t",
|
|
],
|
|
dtype=any_string_dtype,
|
|
)
|
|
|
|
# expected values
|
|
expected = Series(
|
|
[
|
|
"hello world",
|
|
"hello world!",
|
|
"hello\nworld!!",
|
|
"abcdefabcde",
|
|
"abcdefabcdef",
|
|
"abcdefabcdef\na",
|
|
"ab ab ab ab",
|
|
"ab ab ab ab\na",
|
|
"",
|
|
],
|
|
dtype=any_string_dtype,
|
|
)
|
|
|
|
result = s.str.wrap(12, break_long_words=True)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_wrap_unicode(any_string_dtype):
|
|
# test with pre and post whitespace (non-unicode), NaN, and non-ascii Unicode
|
|
s = Series(
|
|
[" pre ", np.nan, "\xac\u20ac\U00008000 abadcafe"], dtype=any_string_dtype
|
|
)
|
|
expected = Series(
|
|
[" pre", np.nan, "\xac\u20ac\U00008000 ab\nadcafe"], dtype=any_string_dtype
|
|
)
|
|
result = s.str.wrap(6)
|
|
tm.assert_series_equal(result, expected)
|