You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

735 lines
23 KiB

from datetime import datetime
import re
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
Series,
_testing as tm,
)
from pandas.tests.strings import (
_convert_na_value,
object_pyarrow_numpy,
)
@pytest.mark.parametrize("method", ["split", "rsplit"])
def test_split(any_string_dtype, method):
values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
result = getattr(values.str, method)("_")
exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
exp = _convert_na_value(values, exp)
tm.assert_series_equal(result, exp)
@pytest.mark.parametrize("method", ["split", "rsplit"])
def test_split_more_than_one_char(any_string_dtype, method):
# more than one char
values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"], dtype=any_string_dtype)
result = getattr(values.str, method)("__")
exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
exp = _convert_na_value(values, exp)
tm.assert_series_equal(result, exp)
result = getattr(values.str, method)("__", expand=False)
tm.assert_series_equal(result, exp)
def test_split_more_regex_split(any_string_dtype):
# regex split
values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype)
result = values.str.split("[,_]")
exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
exp = _convert_na_value(values, exp)
tm.assert_series_equal(result, exp)
def test_split_regex(any_string_dtype):
# GH 43563
# explicit regex = True split
values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype)
result = values.str.split(r"\.jpg", regex=True)
exp = Series([["xxxjpgzzz", ""]])
tm.assert_series_equal(result, exp)
def test_split_regex_explicit(any_string_dtype):
# explicit regex = True split with compiled regex
regex_pat = re.compile(r".jpg")
values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype)
result = values.str.split(regex_pat)
exp = Series([["xx", "zzz", ""]])
tm.assert_series_equal(result, exp)
# explicit regex = False split
result = values.str.split(r"\.jpg", regex=False)
exp = Series([["xxxjpgzzz.jpg"]])
tm.assert_series_equal(result, exp)
# non explicit regex split, pattern length == 1
result = values.str.split(r".")
exp = Series([["xxxjpgzzz", "jpg"]])
tm.assert_series_equal(result, exp)
# non explicit regex split, pattern length != 1
result = values.str.split(r".jpg")
exp = Series([["xx", "zzz", ""]])
tm.assert_series_equal(result, exp)
# regex=False with pattern compiled regex raises error
with pytest.raises(
ValueError,
match="Cannot use a compiled regex as replacement pattern with regex=False",
):
values.str.split(regex_pat, regex=False)
@pytest.mark.parametrize("expand", [None, False])
@pytest.mark.parametrize("method", ["split", "rsplit"])
def test_split_object_mixed(expand, method):
mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0])
result = getattr(mixed.str, method)("_", expand=expand)
exp = Series(
[
["a", "b", "c"],
np.nan,
["d", "e", "f"],
np.nan,
np.nan,
None,
np.nan,
np.nan,
]
)
assert isinstance(result, Series)
tm.assert_almost_equal(result, exp)
@pytest.mark.parametrize("method", ["split", "rsplit"])
@pytest.mark.parametrize("n", [None, 0])
def test_split_n(any_string_dtype, method, n):
s = Series(["a b", pd.NA, "b c"], dtype=any_string_dtype)
expected = Series([["a", "b"], pd.NA, ["b", "c"]])
result = getattr(s.str, method)(" ", n=n)
expected = _convert_na_value(s, expected)
tm.assert_series_equal(result, expected)
def test_rsplit(any_string_dtype):
# regex split is not supported by rsplit
values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype)
result = values.str.rsplit("[,_]")
exp = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]])
exp = _convert_na_value(values, exp)
tm.assert_series_equal(result, exp)
def test_rsplit_max_number(any_string_dtype):
# setting max number of splits, make sure it's from reverse
values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
result = values.str.rsplit("_", n=1)
exp = Series([["a_b", "c"], ["c_d", "e"], np.nan, ["f_g", "h"]])
exp = _convert_na_value(values, exp)
tm.assert_series_equal(result, exp)
def test_split_blank_string(any_string_dtype):
# expand blank split GH 20067
values = Series([""], name="test", dtype=any_string_dtype)
result = values.str.split(expand=True)
exp = DataFrame([[]], dtype=any_string_dtype) # NOTE: this is NOT an empty df
tm.assert_frame_equal(result, exp)
def test_split_blank_string_with_non_empty(any_string_dtype):
values = Series(["a b c", "a b", "", " "], name="test", dtype=any_string_dtype)
result = values.str.split(expand=True)
exp = DataFrame(
[
["a", "b", "c"],
["a", "b", None],
[None, None, None],
[None, None, None],
],
dtype=any_string_dtype,
)
tm.assert_frame_equal(result, exp)
@pytest.mark.parametrize("method", ["split", "rsplit"])
def test_split_noargs(any_string_dtype, method):
# #1859
s = Series(["Wes McKinney", "Travis Oliphant"], dtype=any_string_dtype)
result = getattr(s.str, method)()
expected = ["Travis", "Oliphant"]
assert result[1] == expected
@pytest.mark.parametrize(
"data, pat",
[
(["bd asdf jfg", "kjasdflqw asdfnfk"], None),
(["bd asdf jfg", "kjasdflqw asdfnfk"], "asdf"),
(["bd_asdf_jfg", "kjasdflqw_asdfnfk"], "_"),
],
)
@pytest.mark.parametrize("n", [-1, 0])
def test_split_maxsplit(data, pat, any_string_dtype, n):
# re.split 0, str.split -1
s = Series(data, dtype=any_string_dtype)
result = s.str.split(pat=pat, n=n)
xp = s.str.split(pat=pat)
tm.assert_series_equal(result, xp)
@pytest.mark.parametrize(
"data, pat, expected",
[
(
["split once", "split once too!"],
None,
Series({0: ["split", "once"], 1: ["split", "once too!"]}),
),
(
["split_once", "split_once_too!"],
"_",
Series({0: ["split", "once"], 1: ["split", "once_too!"]}),
),
],
)
def test_split_no_pat_with_nonzero_n(data, pat, expected, any_string_dtype):
s = Series(data, dtype=any_string_dtype)
result = s.str.split(pat=pat, n=1)
tm.assert_series_equal(expected, result, check_index_type=False)
def test_split_to_dataframe_no_splits(any_string_dtype):
s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)
result = s.str.split("_", expand=True)
exp = DataFrame({0: Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)})
tm.assert_frame_equal(result, exp)
def test_split_to_dataframe(any_string_dtype):
s = Series(["some_equal_splits", "with_no_nans"], dtype=any_string_dtype)
result = s.str.split("_", expand=True)
exp = DataFrame(
{0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]},
dtype=any_string_dtype,
)
tm.assert_frame_equal(result, exp)
def test_split_to_dataframe_unequal_splits(any_string_dtype):
s = Series(
["some_unequal_splits", "one_of_these_things_is_not"], dtype=any_string_dtype
)
result = s.str.split("_", expand=True)
exp = DataFrame(
{
0: ["some", "one"],
1: ["unequal", "of"],
2: ["splits", "these"],
3: [None, "things"],
4: [None, "is"],
5: [None, "not"],
},
dtype=any_string_dtype,
)
tm.assert_frame_equal(result, exp)
def test_split_to_dataframe_with_index(any_string_dtype):
s = Series(
["some_splits", "with_index"], index=["preserve", "me"], dtype=any_string_dtype
)
result = s.str.split("_", expand=True)
exp = DataFrame(
{0: ["some", "with"], 1: ["splits", "index"]},
index=["preserve", "me"],
dtype=any_string_dtype,
)
tm.assert_frame_equal(result, exp)
with pytest.raises(ValueError, match="expand must be"):
s.str.split("_", expand="not_a_boolean")
def test_split_to_multiindex_expand_no_splits():
# https://github.com/pandas-dev/pandas/issues/23677
idx = Index(["nosplit", "alsonosplit", np.nan])
result = idx.str.split("_", expand=True)
exp = idx
tm.assert_index_equal(result, exp)
assert result.nlevels == 1
def test_split_to_multiindex_expand():
idx = Index(["some_equal_splits", "with_no_nans", np.nan, None])
result = idx.str.split("_", expand=True)
exp = MultiIndex.from_tuples(
[
("some", "equal", "splits"),
("with", "no", "nans"),
[np.nan, np.nan, np.nan],
[None, None, None],
]
)
tm.assert_index_equal(result, exp)
assert result.nlevels == 3
def test_split_to_multiindex_expand_unequal_splits():
idx = Index(["some_unequal_splits", "one_of_these_things_is_not", np.nan, None])
result = idx.str.split("_", expand=True)
exp = MultiIndex.from_tuples(
[
("some", "unequal", "splits", np.nan, np.nan, np.nan),
("one", "of", "these", "things", "is", "not"),
(np.nan, np.nan, np.nan, np.nan, np.nan, np.nan),
(None, None, None, None, None, None),
]
)
tm.assert_index_equal(result, exp)
assert result.nlevels == 6
with pytest.raises(ValueError, match="expand must be"):
idx.str.split("_", expand="not_a_boolean")
def test_rsplit_to_dataframe_expand_no_splits(any_string_dtype):
s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)
result = s.str.rsplit("_", expand=True)
exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}, dtype=any_string_dtype)
tm.assert_frame_equal(result, exp)
def test_rsplit_to_dataframe_expand(any_string_dtype):
s = Series(["some_equal_splits", "with_no_nans"], dtype=any_string_dtype)
result = s.str.rsplit("_", expand=True)
exp = DataFrame(
{0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]},
dtype=any_string_dtype,
)
tm.assert_frame_equal(result, exp)
result = s.str.rsplit("_", expand=True, n=2)
exp = DataFrame(
{0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]},
dtype=any_string_dtype,
)
tm.assert_frame_equal(result, exp)
result = s.str.rsplit("_", expand=True, n=1)
exp = DataFrame(
{0: ["some_equal", "with_no"], 1: ["splits", "nans"]}, dtype=any_string_dtype
)
tm.assert_frame_equal(result, exp)
def test_rsplit_to_dataframe_expand_with_index(any_string_dtype):
s = Series(
["some_splits", "with_index"], index=["preserve", "me"], dtype=any_string_dtype
)
result = s.str.rsplit("_", expand=True)
exp = DataFrame(
{0: ["some", "with"], 1: ["splits", "index"]},
index=["preserve", "me"],
dtype=any_string_dtype,
)
tm.assert_frame_equal(result, exp)
def test_rsplit_to_multiindex_expand_no_split():
idx = Index(["nosplit", "alsonosplit"])
result = idx.str.rsplit("_", expand=True)
exp = idx
tm.assert_index_equal(result, exp)
assert result.nlevels == 1
def test_rsplit_to_multiindex_expand():
idx = Index(["some_equal_splits", "with_no_nans"])
result = idx.str.rsplit("_", expand=True)
exp = MultiIndex.from_tuples([("some", "equal", "splits"), ("with", "no", "nans")])
tm.assert_index_equal(result, exp)
assert result.nlevels == 3
def test_rsplit_to_multiindex_expand_n():
idx = Index(["some_equal_splits", "with_no_nans"])
result = idx.str.rsplit("_", expand=True, n=1)
exp = MultiIndex.from_tuples([("some_equal", "splits"), ("with_no", "nans")])
tm.assert_index_equal(result, exp)
assert result.nlevels == 2
def test_split_nan_expand(any_string_dtype):
# gh-18450
s = Series(["foo,bar,baz", np.nan], dtype=any_string_dtype)
result = s.str.split(",", expand=True)
exp = DataFrame(
[["foo", "bar", "baz"], [np.nan, np.nan, np.nan]], dtype=any_string_dtype
)
tm.assert_frame_equal(result, exp)
# check that these are actually np.nan/pd.NA and not None
# TODO see GH 18463
# tm.assert_frame_equal does not differentiate
if any_string_dtype in object_pyarrow_numpy:
assert all(np.isnan(x) for x in result.iloc[1])
else:
assert all(x is pd.NA for x in result.iloc[1])
def test_split_with_name_series(any_string_dtype):
# GH 12617
# should preserve name
s = Series(["a,b", "c,d"], name="xxx", dtype=any_string_dtype)
res = s.str.split(",")
exp = Series([["a", "b"], ["c", "d"]], name="xxx")
tm.assert_series_equal(res, exp)
res = s.str.split(",", expand=True)
exp = DataFrame([["a", "b"], ["c", "d"]], dtype=any_string_dtype)
tm.assert_frame_equal(res, exp)
def test_split_with_name_index():
# GH 12617
idx = Index(["a,b", "c,d"], name="xxx")
res = idx.str.split(",")
exp = Index([["a", "b"], ["c", "d"]], name="xxx")
assert res.nlevels == 1
tm.assert_index_equal(res, exp)
res = idx.str.split(",", expand=True)
exp = MultiIndex.from_tuples([("a", "b"), ("c", "d")])
assert res.nlevels == 2
tm.assert_index_equal(res, exp)
@pytest.mark.parametrize(
"method, exp",
[
[
"partition",
[
("a", "__", "b__c"),
("c", "__", "d__e"),
np.nan,
("f", "__", "g__h"),
None,
],
],
[
"rpartition",
[
("a__b", "__", "c"),
("c__d", "__", "e"),
np.nan,
("f__g", "__", "h"),
None,
],
],
],
)
def test_partition_series_more_than_one_char(method, exp, any_string_dtype):
# https://github.com/pandas-dev/pandas/issues/23558
# more than one char
s = Series(["a__b__c", "c__d__e", np.nan, "f__g__h", None], dtype=any_string_dtype)
result = getattr(s.str, method)("__", expand=False)
expected = Series(exp)
expected = _convert_na_value(s, expected)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"method, exp",
[
[
"partition",
[("a", " ", "b c"), ("c", " ", "d e"), np.nan, ("f", " ", "g h"), None],
],
[
"rpartition",
[("a b", " ", "c"), ("c d", " ", "e"), np.nan, ("f g", " ", "h"), None],
],
],
)
def test_partition_series_none(any_string_dtype, method, exp):
# https://github.com/pandas-dev/pandas/issues/23558
# None
s = Series(["a b c", "c d e", np.nan, "f g h", None], dtype=any_string_dtype)
result = getattr(s.str, method)(expand=False)
expected = Series(exp)
expected = _convert_na_value(s, expected)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"method, exp",
[
[
"partition",
[("abc", "", ""), ("cde", "", ""), np.nan, ("fgh", "", ""), None],
],
[
"rpartition",
[("", "", "abc"), ("", "", "cde"), np.nan, ("", "", "fgh"), None],
],
],
)
def test_partition_series_not_split(any_string_dtype, method, exp):
# https://github.com/pandas-dev/pandas/issues/23558
# Not split
s = Series(["abc", "cde", np.nan, "fgh", None], dtype=any_string_dtype)
result = getattr(s.str, method)("_", expand=False)
expected = Series(exp)
expected = _convert_na_value(s, expected)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"method, exp",
[
[
"partition",
[("a", "_", "b_c"), ("c", "_", "d_e"), np.nan, ("f", "_", "g_h")],
],
[
"rpartition",
[("a_b", "_", "c"), ("c_d", "_", "e"), np.nan, ("f_g", "_", "h")],
],
],
)
def test_partition_series_unicode(any_string_dtype, method, exp):
# https://github.com/pandas-dev/pandas/issues/23558
# unicode
s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
result = getattr(s.str, method)("_", expand=False)
expected = Series(exp)
expected = _convert_na_value(s, expected)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("method", ["partition", "rpartition"])
def test_partition_series_stdlib(any_string_dtype, method):
# https://github.com/pandas-dev/pandas/issues/23558
# compare to standard lib
s = Series(["A_B_C", "B_C_D", "E_F_G", "EFGHEF"], dtype=any_string_dtype)
result = getattr(s.str, method)("_", expand=False).tolist()
assert result == [getattr(v, method)("_") for v in s]
@pytest.mark.parametrize(
"method, expand, exp, exp_levels",
[
[
"partition",
False,
np.array(
[("a", "_", "b_c"), ("c", "_", "d_e"), ("f", "_", "g_h"), np.nan, None],
dtype=object,
),
1,
],
[
"rpartition",
False,
np.array(
[("a_b", "_", "c"), ("c_d", "_", "e"), ("f_g", "_", "h"), np.nan, None],
dtype=object,
),
1,
],
],
)
def test_partition_index(method, expand, exp, exp_levels):
# https://github.com/pandas-dev/pandas/issues/23558
values = Index(["a_b_c", "c_d_e", "f_g_h", np.nan, None])
result = getattr(values.str, method)("_", expand=expand)
exp = Index(exp)
tm.assert_index_equal(result, exp)
assert result.nlevels == exp_levels
@pytest.mark.parametrize(
"method, exp",
[
[
"partition",
{
0: ["a", "c", np.nan, "f", None],
1: ["_", "_", np.nan, "_", None],
2: ["b_c", "d_e", np.nan, "g_h", None],
},
],
[
"rpartition",
{
0: ["a_b", "c_d", np.nan, "f_g", None],
1: ["_", "_", np.nan, "_", None],
2: ["c", "e", np.nan, "h", None],
},
],
],
)
def test_partition_to_dataframe(any_string_dtype, method, exp):
# https://github.com/pandas-dev/pandas/issues/23558
s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None], dtype=any_string_dtype)
result = getattr(s.str, method)("_")
expected = DataFrame(
exp,
dtype=any_string_dtype,
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"method, exp",
[
[
"partition",
{
0: ["a", "c", np.nan, "f", None],
1: ["_", "_", np.nan, "_", None],
2: ["b_c", "d_e", np.nan, "g_h", None],
},
],
[
"rpartition",
{
0: ["a_b", "c_d", np.nan, "f_g", None],
1: ["_", "_", np.nan, "_", None],
2: ["c", "e", np.nan, "h", None],
},
],
],
)
def test_partition_to_dataframe_from_series(any_string_dtype, method, exp):
# https://github.com/pandas-dev/pandas/issues/23558
s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None], dtype=any_string_dtype)
result = getattr(s.str, method)("_", expand=True)
expected = DataFrame(
exp,
dtype=any_string_dtype,
)
tm.assert_frame_equal(result, expected)
def test_partition_with_name(any_string_dtype):
# GH 12617
s = Series(["a,b", "c,d"], name="xxx", dtype=any_string_dtype)
result = s.str.partition(",")
expected = DataFrame(
{0: ["a", "c"], 1: [",", ","], 2: ["b", "d"]}, dtype=any_string_dtype
)
tm.assert_frame_equal(result, expected)
def test_partition_with_name_expand(any_string_dtype):
# GH 12617
# should preserve name
s = Series(["a,b", "c,d"], name="xxx", dtype=any_string_dtype)
result = s.str.partition(",", expand=False)
expected = Series([("a", ",", "b"), ("c", ",", "d")], name="xxx")
tm.assert_series_equal(result, expected)
def test_partition_index_with_name():
idx = Index(["a,b", "c,d"], name="xxx")
result = idx.str.partition(",")
expected = MultiIndex.from_tuples([("a", ",", "b"), ("c", ",", "d")])
assert result.nlevels == 3
tm.assert_index_equal(result, expected)
def test_partition_index_with_name_expand_false():
idx = Index(["a,b", "c,d"], name="xxx")
# should preserve name
result = idx.str.partition(",", expand=False)
expected = Index(np.array([("a", ",", "b"), ("c", ",", "d")]), name="xxx")
assert result.nlevels == 1
tm.assert_index_equal(result, expected)
@pytest.mark.parametrize("method", ["partition", "rpartition"])
def test_partition_sep_kwarg(any_string_dtype, method):
# GH 22676; depr kwarg "pat" in favor of "sep"
s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
expected = getattr(s.str, method)(sep="_")
result = getattr(s.str, method)("_")
tm.assert_frame_equal(result, expected)
def test_get():
ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
result = ser.str.split("_").str.get(1)
expected = Series(["b", "d", np.nan, "g"], dtype=object)
tm.assert_series_equal(result, expected)
def test_get_mixed_object():
ser = Series(["a_b_c", np.nan, "c_d_e", True, datetime.today(), None, 1, 2.0])
result = ser.str.split("_").str.get(1)
expected = Series(
["b", np.nan, "d", np.nan, np.nan, None, np.nan, np.nan], dtype=object
)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("idx", [2, -3])
def test_get_bounds(idx):
ser = Series(["1_2_3_4_5", "6_7_8_9_10", "11_12"])
result = ser.str.split("_").str.get(idx)
expected = Series(["3", "8", np.nan], dtype=object)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"idx, exp", [[2, [3, 3, np.nan, "b"]], [-1, [3, 3, np.nan, np.nan]]]
)
def test_get_complex(idx, exp):
# GH 20671, getting value not in dict raising `KeyError`
ser = Series([(1, 2, 3), [1, 2, 3], {1, 2, 3}, {1: "a", 2: "b", 3: "c"}])
result = ser.str.get(idx)
expected = Series(exp)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("to_type", [tuple, list, np.array])
def test_get_complex_nested(to_type):
ser = Series([to_type([to_type([1, 2])])])
result = ser.str.get(0)
expected = Series([to_type([1, 2])])
tm.assert_series_equal(result, expected)
result = ser.str.get(1)
expected = Series([np.nan])
tm.assert_series_equal(result, expected)
def test_get_strings(any_string_dtype):
ser = Series(["a", "ab", np.nan, "abc"], dtype=any_string_dtype)
result = ser.str.get(2)
expected = Series([np.nan, np.nan, np.nan, "c"], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)