You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
2098 lines
60 KiB
2098 lines
60 KiB
from __future__ import annotations
|
|
|
|
from io import (
|
|
BytesIO,
|
|
StringIO,
|
|
)
|
|
from lzma import LZMAError
|
|
import os
|
|
from tarfile import ReadError
|
|
from urllib.error import HTTPError
|
|
from xml.etree.ElementTree import ParseError
|
|
from zipfile import BadZipFile
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from pandas.compat._optional import import_optional_dependency
|
|
from pandas.errors import (
|
|
EmptyDataError,
|
|
ParserError,
|
|
)
|
|
import pandas.util._test_decorators as td
|
|
|
|
import pandas as pd
|
|
from pandas import (
|
|
NA,
|
|
DataFrame,
|
|
Series,
|
|
)
|
|
import pandas._testing as tm
|
|
from pandas.core.arrays import (
|
|
ArrowStringArray,
|
|
StringArray,
|
|
)
|
|
from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
|
|
|
|
from pandas.io.common import get_handle
|
|
from pandas.io.xml import read_xml
|
|
|
|
# CHECK LIST
|
|
|
|
# [x] - ValueError: "Values for parser can only be lxml or etree."
|
|
|
|
# etree
|
|
# [X] - ImportError: "lxml not found, please install or use the etree parser."
|
|
# [X] - TypeError: "expected str, bytes or os.PathLike object, not NoneType"
|
|
# [X] - ValueError: "Either element or attributes can be parsed not both."
|
|
# [X] - ValueError: "xpath does not return any nodes..."
|
|
# [X] - SyntaxError: "You have used an incorrect or unsupported XPath"
|
|
# [X] - ValueError: "names does not match length of child elements in xpath."
|
|
# [X] - TypeError: "...is not a valid type for names"
|
|
# [X] - ValueError: "To use stylesheet, you need lxml installed..."
|
|
# [] - URLError: (GENERAL ERROR WITH HTTPError AS SUBCLASS)
|
|
# [X] - HTTPError: "HTTP Error 404: Not Found"
|
|
# [] - OSError: (GENERAL ERROR WITH FileNotFoundError AS SUBCLASS)
|
|
# [X] - FileNotFoundError: "No such file or directory"
|
|
# [] - ParseError (FAILSAFE CATCH ALL FOR VERY COMPLEX XML)
|
|
# [X] - UnicodeDecodeError: "'utf-8' codec can't decode byte 0xe9..."
|
|
# [X] - UnicodeError: "UTF-16 stream does not start with BOM"
|
|
# [X] - BadZipFile: "File is not a zip file"
|
|
# [X] - OSError: "Invalid data stream"
|
|
# [X] - LZMAError: "Input format not supported by decoder"
|
|
# [X] - ValueError: "Unrecognized compression type"
|
|
# [X] - PermissionError: "Forbidden"
|
|
|
|
# lxml
|
|
# [X] - ValueError: "Either element or attributes can be parsed not both."
|
|
# [X] - AttributeError: "__enter__"
|
|
# [X] - XSLTApplyError: "Cannot resolve URI"
|
|
# [X] - XSLTParseError: "document is not a stylesheet"
|
|
# [X] - ValueError: "xpath does not return any nodes."
|
|
# [X] - XPathEvalError: "Invalid expression"
|
|
# [] - XPathSyntaxError: (OLD VERSION IN lxml FOR XPATH ERRORS)
|
|
# [X] - TypeError: "empty namespace prefix is not supported in XPath"
|
|
# [X] - ValueError: "names does not match length of child elements in xpath."
|
|
# [X] - TypeError: "...is not a valid type for names"
|
|
# [X] - LookupError: "unknown encoding"
|
|
# [] - URLError: (USUALLY DUE TO NETWORKING)
|
|
# [X] - HTTPError: "HTTP Error 404: Not Found"
|
|
# [X] - OSError: "failed to load external entity"
|
|
# [X] - XMLSyntaxError: "Start tag expected, '<' not found"
|
|
# [] - ParserError: (FAILSAFE CATCH ALL FOR VERY COMPLEX XML)
|
|
# [X] - ValueError: "Values for parser can only be lxml or etree."
|
|
# [X] - UnicodeDecodeError: "'utf-8' codec can't decode byte 0xe9..."
|
|
# [X] - UnicodeError: "UTF-16 stream does not start with BOM"
|
|
# [X] - BadZipFile: "File is not a zip file"
|
|
# [X] - OSError: "Invalid data stream"
|
|
# [X] - LZMAError: "Input format not supported by decoder"
|
|
# [X] - ValueError: "Unrecognized compression type"
|
|
# [X] - PermissionError: "Forbidden"
|
|
|
|
# Reference frame of three shapes; the middle ``sides`` entry is NaN.
geom_df = DataFrame(
    dict(
        shape=["square", "circle", "triangle"],
        degrees=[360, 360, 180],
        sides=[4, np.nan, 3],
    )
)
|
|
|
|
# Shapes document whose elements sit in an unprefixed (default) namespace.
# The line continuation keeps the XML declaration at the very start of the text.
xml_default_nmsp = """\
<?xml version='1.0' encoding='utf-8'?>
<data xmlns="http://example.com">
  <row>
    <shape>square</shape>
    <degrees>360</degrees>
    <sides>4</sides>
  </row>
  <row>
    <shape>circle</shape>
    <degrees>360</degrees>
    <sides/>
  </row>
  <row>
    <shape>triangle</shape>
    <degrees>180</degrees>
    <sides>3</sides>
  </row>
</data>"""
|
|
|
|
# Same shapes document, but every element carries the explicit ``doc`` prefix.
# The line continuation keeps the XML declaration at the very start of the text.
xml_prefix_nmsp = """\
<?xml version='1.0' encoding='utf-8'?>
<doc:data xmlns:doc="http://example.com">
  <doc:row>
    <doc:shape>square</doc:shape>
    <doc:degrees>360</doc:degrees>
    <doc:sides>4.0</doc:sides>
  </doc:row>
  <doc:row>
    <doc:shape>circle</doc:shape>
    <doc:degrees>360</doc:degrees>
    <doc:sides/>
  </doc:row>
  <doc:row>
    <doc:shape>triangle</doc:shape>
    <doc:degrees>180</doc:degrees>
    <doc:sides>3.0</doc:sides>
  </doc:row>
</doc:data>"""
|
|
|
|
|
|
# Expected frame for the KML rail-line tests: five placemarks, each with a
# space-separated "lon,lat,alt" coordinate string.
df_kml = DataFrame(
    {
        "id": {n: f"ID_{n + 1:05d}" for n in range(5)},
        "name": {
            0: "Blue Line (Forest Park)",
            **dict.fromkeys(range(1, 5), "Red, Purple Line"),
        },
        "styleUrl": dict.fromkeys(range(5), "#LineStyle01"),
        "extrude": dict.fromkeys(range(5), 0),
        "altitudeMode": dict.fromkeys(range(5), "clampedToGround"),
        "coordinates": dict(
            enumerate(
                " ".join(points)
                for points in (
                    (
                        "-87.77678526964958,41.8708863930319,0",
                        "-87.77826234150609,41.87097820122218,0",
                        "-87.78251583439344,41.87130129991005,0",
                        "-87.78418294588424,41.87145055520308,0",
                        "-87.7872369165933,41.8717239119163,0",
                        "-87.79160214925886,41.87210797280065,0",
                    ),
                    (
                        "-87.65758750947528,41.96427269188822,0",
                        "-87.65802133507393,41.96581929055245,0",
                        "-87.65819033925305,41.96621846093642,0",
                        "-87.6583189819129,41.96650362897086,0",
                        "-87.65835858701473,41.96669002089185,0",
                        "-87.65838428411853,41.96688150295095,0",
                        "-87.65842208882658,41.96745896091846,0",
                        "-87.65846556843937,41.9683761425439,0",
                        "-87.65849296214573,41.96913893870342,0",
                    ),
                    (
                        "-87.65492939166126,41.95377494531437,0",
                        "-87.65557043199591,41.95376544118533,0",
                        "-87.65606302030132,41.95376391658746,0",
                        "-87.65623502146268,41.95377379126367,0",
                        "-87.65634748981634,41.95380103566435,0",
                        "-87.65646537904269,41.95387703994676,0",
                        "-87.65656532461145,41.95396622645799,0",
                        "-87.65664760856414,41.95404201996044,0",
                        "-87.65671750555913,41.95416647054043,0",
                        "-87.65673983607117,41.95429949810849,0",
                        "-87.65673866475777,41.95441024240925,0",
                        "-87.6567690255541,41.95490657227902,0",
                        "-87.65683672482363,41.95692259283837,0",
                        "-87.6568900886376,41.95861070983142,0",
                        "-87.65699865558875,41.96181418669004,0",
                        "-87.65756347177603,41.96397045777844,0",
                        "-87.65758750947528,41.96427269188822,0",
                    ),
                    (
                        "-87.65362593118043,41.94742799535678,0",
                        "-87.65363554415794,41.94819886386848,0",
                        "-87.6536456393239,41.95059994675451,0",
                        "-87.65365831235026,41.95108288489359,0",
                        "-87.6536604873874,41.9519954657554,0",
                        "-87.65362592053201,41.95245597302328,0",
                        "-87.65367158496069,41.95311153649393,0",
                        "-87.65368468595476,41.9533202828916,0",
                        "-87.65369271253692,41.95343095587119,0",
                        "-87.65373335834569,41.95351536301472,0",
                        "-87.65378605844126,41.95358212680591,0",
                        "-87.65385067928185,41.95364452823767,0",
                        "-87.6539390793817,41.95370263886964,0",
                        "-87.6540786298351,41.95373403675265,0",
                        "-87.65430648647626,41.9537535411832,0",
                        "-87.65492939166126,41.95377494531437,0",
                    ),
                    (
                        "-87.65345391792157,41.94217681262115,0",
                        "-87.65342448305786,41.94237224420864,0",
                        "-87.65339745703922,41.94268217746244,0",
                        "-87.65337753982941,41.94288140770284,0",
                        "-87.65336256753105,41.94317369618263,0",
                        "-87.65338799707138,41.94357253961736,0",
                        "-87.65340240886648,41.94389158188269,0",
                        "-87.65341837392448,41.94406444407721,0",
                        "-87.65342275247338,41.94421065714904,0",
                        "-87.65347469646018,41.94434829382345,0",
                        "-87.65351486483024,41.94447699917548,0",
                        "-87.65353483605053,41.9453896864472,0",
                        "-87.65361975532807,41.94689193720703,0",
                        "-87.65362593118043,41.94742799535678,0",
                    ),
                )
            )
        ),
    }
)
|
|
|
|
|
|
def test_literal_xml_deprecation():
    """Passing a raw XML string straight to ``read_xml`` warns (GH 53809)."""
    pytest.importorskip("lxml")

    expected_message = (
        "Passing literal xml to 'read_xml' is deprecated and "
        "will be removed in a future version. To read from a "
        "literal string, wrap it in a 'StringIO' object."
    )
    warning_check = tm.assert_produces_warning(FutureWarning, match=expected_message)
    with warning_check:
        read_xml(xml_default_nmsp)
|
|
|
|
|
|
@pytest.fixture(params=["rb", "r"])
def mode(request):
    """File-open mode for the current run: binary ("rb") or text ("r")."""
    open_mode = request.param
    return open_mode
|
|
|
|
|
|
@pytest.fixture(
    params=[
        pytest.param("lxml", marks=td.skip_if_no("lxml")),
        "etree",
    ]
)
def parser(request):
    """Parser name for the current run; the lxml case carries a skip mark."""
    parser_name = request.param
    return parser_name
|
|
|
|
|
|
def read_xml_iterparse(data, **kwargs):
    """Write ``data`` to a temporary file and parse that file with ``read_xml``.

    Parameters
    ----------
    data : str
        XML text written to the temporary file as UTF-8.
    **kwargs
        Forwarded unchanged to ``read_xml``.

    Returns
    -------
    Whatever ``read_xml`` returns for the temporary file.
    """
    with tm.ensure_clean() as tmp_path:
        # The file must be closed before it is read back.
        with open(tmp_path, "w", encoding="utf-8") as sink:
            sink.write(data)
        parsed = read_xml(tmp_path, **kwargs)
    return parsed
|
|
|
|
|
|
def read_xml_iterparse_comp(comp_path, compression_only, **kwargs):
    """Copy a (possibly compressed) XML file to a plain temp file and parse it.

    Parameters
    ----------
    comp_path
        Path handed to ``get_handle`` and opened in text mode.
    compression_only
        Passed to ``get_handle`` as its ``compression`` argument.
    **kwargs
        Forwarded unchanged to ``read_xml``.

    Returns
    -------
    Whatever ``read_xml`` returns for the temporary file.
    """
    with get_handle(comp_path, "r", compression=compression_only) as source:
        with tm.ensure_clean() as tmp_path:
            # Close the temp file before handing its path to the parser.
            with open(tmp_path, "w", encoding="utf-8") as sink:
                sink.write(source.handle.read())
            parsed = read_xml(tmp_path, **kwargs)
    return parsed
|
|
|
|
|
|
# FILE / URL
|
|
|
|
|
|
def test_parser_consistency_file(xml_books):
    """lxml and etree agree on the books file, via XPath and via iterparse."""
    pytest.importorskip("lxml")

    def _iterparse_read(parser_name):
        # A fresh iterparse mapping is built for every call.
        return read_xml(
            xml_books,
            parser=parser_name,
            iterparse={"book": ["category", "title", "year", "author", "price"]},
        )

    xpath_lxml = read_xml(xml_books, parser="lxml")
    xpath_etree = read_xml(xml_books, parser="etree")
    iter_lxml = _iterparse_read("lxml")
    iter_etree = _iterparse_read("etree")

    tm.assert_frame_equal(xpath_lxml, xpath_etree)
    tm.assert_frame_equal(xpath_lxml, iter_lxml)
    tm.assert_frame_equal(iter_lxml, iter_etree)
|
|
|
|
|
|
@pytest.mark.network
@pytest.mark.single_cpu
def test_parser_consistency_url(parser, httpserver):
    """XPath parsing of a served URL matches iterparse of the same payload.

    The document is published on the local ``httpserver`` fixture and the
    XPath read goes through ``httpserver.url``, so the URL code path is
    actually exercised (previously the served content was never requested).
    """
    httpserver.serve_content(content=xml_default_nmsp)

    df_xpath = read_xml(httpserver.url, parser=parser)

    # NOTE(review): iterparse is documented for local, fully extracted sources,
    # so this side keeps reading the identical payload from an in-memory buffer.
    df_iter = read_xml(
        BytesIO(xml_default_nmsp.encode()),
        parser=parser,
        iterparse={"row": ["shape", "degrees", "sides"]},
    )

    tm.assert_frame_equal(df_xpath, df_iter)
|
|
|
|
|
|
def test_file_like(xml_books, parser, mode):
    """``read_xml`` accepts an already-open file object (text or binary)."""
    # An encoding is only passed to ``open`` in text mode.
    text_encoding = "utf-8" if mode == "r" else None
    with open(xml_books, mode, encoding=text_encoding) as handle:
        result = read_xml(handle, parser=parser)

    expected = DataFrame(
        dict(
            category=["cooking", "children", "web"],
            title=["Everyday Italian", "Harry Potter", "Learning XML"],
            author=["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            year=[2005, 2005, 2003],
            price=[30.00, 29.99, 39.95],
        )
    )
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_file_io(xml_books, parser, mode):
    """File contents wrapped in ``BytesIO``/``StringIO`` parse to the books frame."""
    text_encoding = "utf-8" if mode == "r" else None
    with open(xml_books, mode, encoding=text_encoding) as handle:
        contents = handle.read()

    # Pick the in-memory buffer type that matches what was read.
    if isinstance(contents, bytes):
        buffer = BytesIO(contents)
    else:
        buffer = StringIO(contents)
    result = read_xml(buffer, parser=parser)

    expected = DataFrame(
        dict(
            category=["cooking", "children", "web"],
            title=["Everyday Italian", "Harry Potter", "Learning XML"],
            author=["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            year=[2005, 2005, 2003],
            price=[30.00, 29.99, 39.95],
        )
    )
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_file_buffered_reader_string(xml_books, parser, mode):
    """File contents, decoded if read as bytes, parse from a ``StringIO``."""
    text_encoding = "utf-8" if mode == "r" else None
    with open(xml_books, mode, encoding=text_encoding) as handle:
        contents = handle.read()

    # Only the two known modes are wrapped; bytes are decoded first.
    if mode in ("rb", "r"):
        contents = StringIO(contents.decode() if mode == "rb" else contents)

    result = read_xml(contents, parser=parser)

    expected = DataFrame(
        dict(
            category=["cooking", "children", "web"],
            title=["Everyday Italian", "Harry Potter", "Learning XML"],
            author=["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            year=[2005, 2005, 2003],
            price=[30.00, 29.99, 39.95],
        )
    )
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_file_buffered_reader_no_xml_declaration(xml_books, parser, mode):
    """The books file still parses after its first line has been skipped."""
    text_encoding = "utf-8" if mode == "r" else None
    with open(xml_books, mode, encoding=text_encoding) as handle:
        next(handle)  # discard the first line before reading the rest
        remainder = handle.read()

    # Only the two known modes are wrapped; bytes are decoded first.
    if mode in ("rb", "r"):
        remainder = StringIO(remainder.decode() if mode == "rb" else remainder)

    result = read_xml(remainder, parser=parser)

    expected = DataFrame(
        dict(
            category=["cooking", "children", "web"],
            title=["Everyday Italian", "Harry Potter", "Learning XML"],
            author=["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            year=[2005, 2005, 2003],
            price=[30.00, 29.99, 39.95],
        )
    )
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_string_charset(parser):
    """A document with a non-ASCII root tag parses from a ``StringIO``."""
    document = StringIO("<中文標籤><row><c1>1</c1><c2>2</c2></row></中文標籤>")
    result = read_xml(document, parser=parser)

    expected = DataFrame({"c1": 1, "c2": 2}, index=[0])
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_file_charset(xml_doc_ch_utf, parser):
    """The Chinese-text fixture file parses into the expected three columns."""
    result = read_xml(xml_doc_ch_utf, parser=parser)

    questions = [
        "問 若箇是邪而言破邪 何者是正而道(Sorry, this is Big5 only)申正",
        "問 既破有得申無得 亦應但破性執申假名以不",
        "問 既破性申假 亦應但破有申無 若有無兩洗 亦應性假雙破耶",
    ]
    answers = [
        # Two adjacent literals form one value that ends in newline/tab text.
        (
            "答 邪既無量 正亦多途 大略為言不出二種 謂"
            "有得與無得 有得是邪須破 無得是正須申\n\t\t故"
        ),
        None,
        "答 不例 有無皆是性 所以須雙破 既分性假異 故有破不破",
    ]
    extra = [
        None,
        "答 性執是有得 假名是無得 今破有得申無得 即是破性執申假名也",
        None,
    ]
    expected = DataFrame({"問": questions, "答": answers, "a": extra})

    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_file_handle_close(xml_books, parser):
    """The source handle is still open after ``read_xml`` parses a copy of it."""
    with open(xml_books, "rb") as source:
        payload = BytesIO(source.read())
        read_xml(payload, parser=parser)

        # Checked inside the ``with`` block, before it closes the handle.
        assert not source.closed
|
|
|
|
|
|
@pytest.mark.parametrize("val", ["", b""])
def test_empty_string_lxml(val):
    """Empty text or bytes input makes the lxml parser raise ``XMLSyntaxError``."""
    lxml_etree = pytest.importorskip("lxml.etree")

    accepted_messages = (
        "Document is empty",
        # Seen on Mac with lxml 4.91
        r"None \(line 0\)",
    )
    msg = "|".join(accepted_messages)

    # Wrap the empty value in the buffer type that matches it.
    buffer = StringIO(val) if isinstance(val, str) else BytesIO(val)
    with pytest.raises(lxml_etree.XMLSyntaxError, match=msg):
        read_xml(buffer, parser="lxml")
|
|
|
|
|
|
@pytest.mark.parametrize("val", ["", b""])
def test_empty_string_etree(val):
    """Empty text or bytes input makes the etree parser raise ``ParseError``."""
    # Wrap the empty value in the buffer type that matches it.
    buffer = StringIO(val) if isinstance(val, str) else BytesIO(val)
    with pytest.raises(ParseError, match="no element found"):
        read_xml(buffer, parser="etree")
|
|
|
|
|
|
def test_wrong_file_path(parser):
    """Expects the literal-XML ``FutureWarning`` to be raised for this path string."""
    filename = os.path.join("data", "html", "books.xml")
    expected_message = (
        "Passing literal xml to 'read_xml' is deprecated and "
        "will be removed in a future version. To read from a "
        "literal string, wrap it in a 'StringIO' object."
    )

    with pytest.raises(FutureWarning, match=expected_message):
        read_xml(filename, parser=parser)
|
|
|
|
|
|
@pytest.mark.network
@pytest.mark.single_cpu
def test_url(httpserver, xml_file):
    """A document served over HTTP parses with an XPath child-count predicate."""
    pytest.importorskip("lxml")

    with open(xml_file, encoding="utf-8") as handle:
        payload = handle.read()
    httpserver.serve_content(content=payload)

    result = read_xml(httpserver.url, xpath=".//book[count(*)=4]")

    expected = DataFrame(
        dict(
            category=["cooking", "children", "web"],
            title=["Everyday Italian", "Harry Potter", "Learning XML"],
            author=["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            year=[2005, 2005, 2003],
            price=[30.00, 29.99, 39.95],
        )
    )
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.network
@pytest.mark.single_cpu
def test_wrong_url(parser, httpserver):
    """A 404 response from the server surfaces as ``HTTPError``."""
    httpserver.serve_content("NOT FOUND", code=404)

    expected_error = pytest.raises(HTTPError, match=("HTTP Error 404: NOT FOUND"))
    with expected_error:
        read_xml(httpserver.url, xpath=".//book[count(*)=4]", parser=parser)
|
|
|
|
|
|
# CONTENT
|
|
|
|
|
|
def test_whitespace(parser):
    """Whitespace around attribute values and element text is kept verbatim.

    The XML document is generated from the expected values, so the whitespace
    being asserted cannot drift from the whitespace in the document (for
    example when an editor or copy/paste step normalises indentation).
    """
    sides = [" 4 ", " 0 ", " 3 "]
    shapes = ["\n square\n ", "\n circle\n ", "\n triangle\n "]
    degrees = ["\t360\t", "\t360\t", "\t180\t"]

    # One <row> per shape; each value is interpolated exactly as expected back.
    rows = "".join(
        f'  <row sides="{side}">\n'
        f"    <shape>{shape}</shape>\n"
        f"    <degrees>{degree}</degrees>\n"
        "  </row>\n"
        for side, shape, degree in zip(sides, shapes, degrees)
    )
    xml = f"\n<data>\n{rows}</data>"

    df_xpath = read_xml(StringIO(xml), parser=parser, dtype="string")

    df_iter = read_xml_iterparse(
        xml,
        parser=parser,
        iterparse={"row": ["sides", "shape", "degrees"]},
        dtype="string",
    )

    df_expected = DataFrame(
        {"sides": sides, "shape": shapes, "degrees": degrees},
        dtype="string",
    )

    tm.assert_frame_equal(df_xpath, df_expected)
    tm.assert_frame_equal(df_iter, df_expected)
|
|
|
|
|
|
# XPATH
|
|
|
|
|
|
def test_empty_xpath_lxml(xml_books):
    """Expects ``ValueError`` from lxml for the ``.//python`` XPath on the books file."""
    pytest.importorskip("lxml")

    expected_error = pytest.raises(
        ValueError, match=("xpath does not return any nodes")
    )
    with expected_error:
        read_xml(xml_books, xpath=".//python", parser="lxml")
|
|
|
|
|
|
def test_bad_xpath_etree(xml_books):
    """Expects ``SyntaxError`` from etree for the XPath ``.//[book]``."""
    expected_message = "You have used an incorrect or unsupported XPath"
    with pytest.raises(SyntaxError, match=(expected_message)):
        read_xml(xml_books, xpath=".//[book]", parser="etree")
|
|
|
|
|
|
def test_bad_xpath_lxml(xml_books):
    """Expects ``XPathEvalError`` from lxml for the XPath ``.//[book]``."""
    lxml_etree = pytest.importorskip("lxml.etree")

    expected_error = pytest.raises(
        lxml_etree.XPathEvalError, match=("Invalid expression")
    )
    with expected_error:
        read_xml(xml_books, xpath=".//[book]", parser="lxml")
|
|
|
|
|
|
# NAMESPACE
|
|
|
|
|
|
def test_default_namespace(parser):
    """A default-namespace document parses via a mapped prefix and via iterparse."""
    xpath_result = read_xml(
        StringIO(xml_default_nmsp),
        xpath=".//ns:row",
        namespaces={"ns": "http://example.com"},
        parser=parser,
    )
    iter_result = read_xml_iterparse(
        xml_default_nmsp,
        parser=parser,
        iterparse={"row": ["shape", "degrees", "sides"]},
    )

    expected = DataFrame(
        dict(
            shape=["square", "circle", "triangle"],
            degrees=[360, 360, 180],
            sides=[4.0, float("nan"), 3.0],
        )
    )
    for frame in (xpath_result, iter_result):
        tm.assert_frame_equal(frame, expected)
|
|
|
|
|
|
def test_prefix_namespace(parser):
    """A prefixed-namespace document parses via XPath and via iterparse."""
    xpath_result = read_xml(
        StringIO(xml_prefix_nmsp),
        xpath=".//doc:row",
        namespaces={"doc": "http://example.com"},
        parser=parser,
    )
    iter_result = read_xml_iterparse(
        xml_prefix_nmsp,
        parser=parser,
        iterparse={"row": ["shape", "degrees", "sides"]},
    )

    expected = DataFrame(
        dict(
            shape=["square", "circle", "triangle"],
            degrees=[360, 360, 180],
            sides=[4.0, float("nan"), 3.0],
        )
    )
    for frame in (xpath_result, iter_result):
        tm.assert_frame_equal(frame, expected)
|
|
|
|
|
|
def test_consistency_default_namespace():
    """lxml and etree agree on the default-namespace document.

    Each parser is given a different prefix name mapped to the same URI.
    """
    pytest.importorskip("lxml")
    uri = "http://example.com"

    from_lxml = read_xml(
        StringIO(xml_default_nmsp),
        xpath=".//ns:row",
        namespaces={"ns": uri},
        parser="lxml",
    )
    from_etree = read_xml(
        StringIO(xml_default_nmsp),
        xpath=".//doc:row",
        namespaces={"doc": uri},
        parser="etree",
    )

    tm.assert_frame_equal(from_lxml, from_etree)
|
|
|
|
|
|
def test_consistency_prefix_namespace():
    """lxml and etree agree on the prefixed-namespace document."""
    pytest.importorskip("lxml")

    def _read(parser_name):
        # Fresh buffer and namespace mapping for every parser.
        return read_xml(
            StringIO(xml_prefix_nmsp),
            xpath=".//doc:row",
            namespaces={"doc": "http://example.com"},
            parser=parser_name,
        )

    from_lxml = _read("lxml")
    from_etree = _read("etree")

    tm.assert_frame_equal(from_lxml, from_etree)
|
|
|
|
|
|
# PREFIX
|
|
|
|
|
|
def test_missing_prefix_with_default_namespace(xml_books, parser):
    """Expects ``ValueError`` for the unprefixed XPath ``.//Placemark``."""
    expected_error = pytest.raises(
        ValueError, match=("xpath does not return any nodes")
    )
    with expected_error:
        read_xml(xml_books, xpath=".//Placemark", parser=parser)
|
|
|
|
|
|
def test_missing_prefix_definition_etree(kml_cta_rail_lines):
    """Expects ``SyntaxError`` from etree when ``kml:`` is used with no mapping."""
    expected_message = "you used an undeclared namespace prefix"
    with pytest.raises(SyntaxError, match=(expected_message)):
        read_xml(kml_cta_rail_lines, xpath=".//kml:Placemark", parser="etree")
|
|
|
|
|
|
def test_missing_prefix_definition_lxml(kml_cta_rail_lines):
    """Expects ``XPathEvalError`` from lxml when ``kml:`` is used with no mapping."""
    lxml_etree = pytest.importorskip("lxml.etree")

    expected_error = pytest.raises(
        lxml_etree.XPathEvalError, match=("Undefined namespace prefix")
    )
    with expected_error:
        read_xml(kml_cta_rail_lines, xpath=".//kml:Placemark", parser="lxml")
|
|
|
|
|
|
@pytest.mark.parametrize("key", ["", None])
def test_none_namespace_prefix(key):
    """Expects ``TypeError`` from lxml when the namespace prefix is ``""`` or ``None``."""
    pytest.importorskip("lxml")

    bad_namespaces = {key: "http://www.opengis.net/kml/2.2"}
    expected_message = "empty namespace prefix is not supported in XPath"

    with pytest.raises(TypeError, match=(expected_message)):
        read_xml(
            StringIO(xml_default_nmsp),
            xpath=".//kml:Placemark",
            namespaces=bad_namespaces,
            parser="lxml",
        )
|
|
|
|
|
|
# ELEMS AND ATTRS
|
|
|
|
|
|
def test_file_elems_and_attrs(xml_books, parser):
    """Default parsing and iterparse both give all five columns for the books file."""
    xpath_result = read_xml(xml_books, parser=parser)
    iter_result = read_xml(
        xml_books,
        parser=parser,
        iterparse={"book": ["category", "title", "author", "year", "price"]},
    )

    expected = DataFrame(
        dict(
            category=["cooking", "children", "web"],
            title=["Everyday Italian", "Harry Potter", "Learning XML"],
            author=["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            year=[2005, 2005, 2003],
            price=[30.00, 29.99, 39.95],
        )
    )
    for frame in (xpath_result, iter_result):
        tm.assert_frame_equal(frame, expected)
|
|
|
|
|
|
def test_file_only_attrs(xml_books, parser):
    """``attrs_only=True`` and an iterparse of ``category`` both give one column."""
    xpath_result = read_xml(xml_books, attrs_only=True, parser=parser)
    iter_result = read_xml(
        xml_books,
        parser=parser,
        iterparse={"book": ["category"]},
    )

    expected = DataFrame({"category": ["cooking", "children", "web"]})
    for frame in (xpath_result, iter_result):
        tm.assert_frame_equal(frame, expected)
|
|
|
|
|
|
def test_file_only_elems(xml_books, parser):
    """``elems_only=True`` and an iterparse of the child elements agree."""
    xpath_result = read_xml(xml_books, elems_only=True, parser=parser)
    iter_result = read_xml(
        xml_books,
        parser=parser,
        iterparse={"book": ["title", "author", "year", "price"]},
    )

    expected = DataFrame(
        dict(
            title=["Everyday Italian", "Harry Potter", "Learning XML"],
            author=["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            year=[2005, 2005, 2003],
            price=[30.00, 29.99, 39.95],
        )
    )
    for frame in (xpath_result, iter_result):
        tm.assert_frame_equal(frame, expected)
|
|
|
|
|
|
def test_elem_and_attrs_only(kml_cta_rail_lines, parser):
    """Setting both ``elems_only`` and ``attrs_only`` raises ``ValueError``."""
    expected_message = "Either element or attributes can be parsed not both"
    with pytest.raises(ValueError, match=(expected_message)):
        read_xml(
            kml_cta_rail_lines,
            elems_only=True,
            attrs_only=True,
            parser=parser,
        )
|
|
|
|
|
|
def test_empty_attrs_only(parser):
    """``attrs_only`` on ``./row`` raises when the rows carry no attributes."""
    document = """
      <data>
        <row>
          <shape sides="4">square</shape>
          <degrees>360</degrees>
        </row>
        <row>
          <shape sides="0">circle</shape>
          <degrees>360</degrees>
        </row>
        <row>
          <shape sides="3">triangle</shape>
          <degrees>180</degrees>
        </row>
      </data>"""

    expected_error = pytest.raises(
        ValueError,
        match=("xpath does not return any nodes or attributes"),
    )
    with expected_error:
        read_xml(StringIO(document), xpath="./row", attrs_only=True, parser=parser)
|
|
|
|
|
|
def test_empty_elems_only(parser):
    """``elems_only`` on ``./row`` raises when the rows have no child elements."""
    document = """
      <data>
        <row sides="4" shape="square" degrees="360"/>
        <row sides="0" shape="circle" degrees="360"/>
        <row sides="3" shape="triangle" degrees="180"/>
      </data>"""

    expected_error = pytest.raises(
        ValueError,
        match=("xpath does not return any nodes or attributes"),
    )
    with expected_error:
        read_xml(StringIO(document), xpath="./row", elems_only=True, parser=parser)
|
|
|
|
|
|
def test_attribute_centric_xml():
    """lxml and etree agree on an attribute-only document, XPath and iterparse."""
    pytest.importorskip("lxml")
    document = """\
<?xml version="1.0" encoding="UTF-8"?>
<TrainSchedule>
      <Stations>
         <station Name="Manhattan" coords="31,460,195,498"/>
         <station Name="Laraway Road" coords="63,409,194,455"/>
         <station Name="179th St (Orland Park)" coords="0,364,110,395"/>
         <station Name="153rd St (Orland Park)" coords="7,333,113,362"/>
         <station Name="143rd St (Orland Park)" coords="17,297,115,330"/>
         <station Name="Palos Park" coords="128,281,239,303"/>
         <station Name="Palos Heights" coords="148,257,283,279"/>
         <station Name="Worth" coords="170,230,248,255"/>
         <station Name="Chicago Ridge" coords="70,187,208,214"/>
         <station Name="Oak Lawn" coords="166,159,266,185"/>
         <station Name="Ashburn" coords="197,133,336,157"/>
         <station Name="Wrightwood" coords="219,106,340,133"/>
         <station Name="Chicago Union Sta" coords="220,0,360,43"/>
      </Stations>
</TrainSchedule>"""

    # No ``parser`` argument is passed for the lxml-side reads.
    xpath_lxml = read_xml(StringIO(document), xpath=".//station")
    xpath_etree = read_xml(StringIO(document), xpath=".//station", parser="etree")

    iter_lxml = read_xml_iterparse(
        document, iterparse={"station": ["Name", "coords"]}
    )
    iter_etree = read_xml_iterparse(
        document, parser="etree", iterparse={"station": ["Name", "coords"]}
    )

    tm.assert_frame_equal(xpath_lxml, xpath_etree)
    tm.assert_frame_equal(iter_lxml, iter_etree)
|
|
|
|
|
|
# NAMES
|
|
|
|
|
|
def test_names_option_output(xml_books, parser):
    """``names`` renames the five book columns, for XPath and for iterparse."""
    xpath_result = read_xml(
        xml_books,
        names=["Col1", "Col2", "Col3", "Col4", "Col5"],
        parser=parser,
    )
    iter_result = read_xml(
        xml_books,
        parser=parser,
        names=["Col1", "Col2", "Col3", "Col4", "Col5"],
        iterparse={"book": ["category", "title", "author", "year", "price"]},
    )

    expected = DataFrame(
        dict(
            Col1=["cooking", "children", "web"],
            Col2=["Everyday Italian", "Harry Potter", "Learning XML"],
            Col3=["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            Col4=[2005, 2005, 2003],
            Col5=[30.00, 29.99, 39.95],
        )
    )
    for frame in (xpath_result, iter_result):
        tm.assert_frame_equal(frame, expected)
|
|
|
|
|
|
def test_repeat_names(parser):
    """An attribute and a child element both named ``type`` get distinct names."""
    document = """\
<shapes>
  <shape type="2D">
    <name>circle</name>
    <type>curved</type>
  </shape>
  <shape type="3D">
    <name>sphere</name>
    <type>curved</type>
  </shape>
</shapes>"""

    xpath_result = read_xml(
        StringIO(document),
        xpath=".//shape",
        parser=parser,
        names=["type_dim", "shape", "type_edge"],
    )
    iter_result = read_xml_iterparse(
        document,
        parser=parser,
        iterparse={"shape": ["type", "name", "type"]},
        names=["type_dim", "shape", "type_edge"],
    )

    expected = DataFrame(
        dict(
            type_dim=["2D", "3D"],
            shape=["circle", "sphere"],
            type_edge=["curved", "curved"],
        )
    )
    for frame in (xpath_result, iter_result):
        tm.assert_frame_equal(frame, expected)
|
|
|
|
|
|
def test_repeat_values_new_names(parser):
    """Renaming columns works when two columns hold identical text values."""
    document = """\
<shapes>
  <shape>
    <name>rectangle</name>
    <family>rectangle</family>
  </shape>
  <shape>
    <name>square</name>
    <family>rectangle</family>
  </shape>
  <shape>
    <name>ellipse</name>
    <family>ellipse</family>
  </shape>
  <shape>
    <name>circle</name>
    <family>ellipse</family>
  </shape>
</shapes>"""

    xpath_result = read_xml(
        StringIO(document),
        xpath=".//shape",
        parser=parser,
        names=["name", "group"],
    )
    iter_result = read_xml_iterparse(
        document,
        parser=parser,
        iterparse={"shape": ["name", "family"]},
        names=["name", "group"],
    )

    expected = DataFrame(
        dict(
            name=["rectangle", "square", "ellipse", "circle"],
            group=["rectangle", "rectangle", "ellipse", "ellipse"],
        )
    )
    for frame in (xpath_result, iter_result):
        tm.assert_frame_equal(frame, expected)
|
|
|
|
|
|
def test_repeat_elements(parser):
    """Four repeated ``value`` children per shape map onto four named columns."""
    document = """\
<shapes>
  <shape>
    <value item="name">circle</value>
    <value item="family">ellipse</value>
    <value item="degrees">360</value>
    <value item="sides">0</value>
  </shape>
  <shape>
    <value item="name">triangle</value>
    <value item="family">polygon</value>
    <value item="degrees">180</value>
    <value item="sides">3</value>
  </shape>
  <shape>
    <value item="name">square</value>
    <value item="family">polygon</value>
    <value item="degrees">360</value>
    <value item="sides">4</value>
  </shape>
</shapes>"""

    xpath_result = read_xml(
        StringIO(document),
        xpath=".//shape",
        parser=parser,
        names=["name", "family", "degrees", "sides"],
    )
    iter_result = read_xml_iterparse(
        document,
        parser=parser,
        iterparse={"shape": ["value", "value", "value", "value"]},
        names=["name", "family", "degrees", "sides"],
    )

    expected = DataFrame(
        dict(
            name=["circle", "triangle", "square"],
            family=["ellipse", "polygon", "polygon"],
            degrees=[360, 180, 360],
            sides=[0, 3, 4],
        )
    )
    for frame in (xpath_result, iter_result):
        tm.assert_frame_equal(frame, expected)
|
|
|
|
|
|
def test_names_option_wrong_length(xml_books, parser):
    """Three names for the books file raise ``ValueError``."""
    too_few_names = ["Col1", "Col2", "Col3"]
    with pytest.raises(ValueError, match=("names does not match length")):
        read_xml(xml_books, names=too_few_names, parser=parser)
|
|
|
|
|
|
def test_names_option_wrong_type(xml_books, parser):
    """A single comma-separated string for ``names`` raises ``TypeError``."""
    names_as_string = "Col1, Col2, Col3"
    with pytest.raises(TypeError, match=("is not a valid type for names")):
        read_xml(xml_books, names=names_as_string, parser=parser)
|
|
|
|
|
|
# ENCODING
|
|
|
|
|
|
def test_wrong_encoding(xml_baby_names, parser):
    """Reading the baby-names file with no ``encoding`` raises a UTF-8 decode error."""
    expected_error = pytest.raises(
        UnicodeDecodeError, match=("'utf-8' codec can't decode")
    )
    with expected_error:
        read_xml(xml_baby_names, parser=parser)
|
|
|
|
|
|
def test_utf16_encoding(xml_baby_names, parser):
    """Reading the baby-names file as UTF-16 raises ``UnicodeError``."""
    # Either message is accepted.
    accepted_messages = "|".join(
        [
            "UTF-16 stream does not start with BOM",
            "'utf-16-le' codec can't decode byte",
        ]
    )
    with pytest.raises(UnicodeError, match=(accepted_messages)):
        read_xml(xml_baby_names, encoding="UTF-16", parser=parser)
|
|
|
|
|
|
def test_unknown_encoding(xml_baby_names, parser):
    """The encoding name ``UFT-8`` raises ``LookupError``."""
    bad_encoding = "UFT-8"
    with pytest.raises(LookupError, match=("unknown encoding: UFT-8")):
        read_xml(xml_baby_names, encoding=bad_encoding, parser=parser)
|
|
|
|
|
|
def test_ascii_encoding(xml_baby_names, parser):
    """Reading the baby-names file as ASCII raises ``UnicodeDecodeError``."""
    expected_error = pytest.raises(
        UnicodeDecodeError, match=("'ascii' codec can't decode byte")
    )
    with expected_error:
        read_xml(xml_baby_names, encoding="ascii", parser=parser)
|
|
|
|
|
|
def test_parser_consistency_with_encoding(xml_baby_names):
    """With ISO-8859-1 given, both parsers agree for XPath and for iterparse."""
    pytest.importorskip("lxml")

    xpath_lxml = read_xml(xml_baby_names, parser="lxml", encoding="ISO-8859-1")
    # The etree XPath read spells the encoding name in lower case.
    xpath_etree = read_xml(xml_baby_names, parser="etree", encoding="iso-8859-1")

    def _iterparse_read(parser_name):
        return read_xml(
            xml_baby_names,
            parser=parser_name,
            encoding="ISO-8859-1",
            iterparse={"row": ["rank", "malename", "femalename"]},
        )

    iter_lxml = _iterparse_read("lxml")
    iter_etree = _iterparse_read("etree")

    tm.assert_frame_equal(xpath_lxml, xpath_etree)
    tm.assert_frame_equal(xpath_etree, iter_etree)
    tm.assert_frame_equal(iter_lxml, iter_etree)
|
|
|
|
|
|
def test_wrong_encoding_for_lxml():
    """``encoding=None`` with the lxml parser raises ``TypeError`` (GH#45133)."""
    pytest.importorskip("lxml")
    document = """<data>
  <row>
    <a>c</a>
  </row>
</data>
"""
    expected_error = pytest.raises(TypeError, match="encoding None")
    with expected_error:
        read_xml(StringIO(document), parser="lxml", encoding=None)
|
|
|
|
|
|
def test_none_encoding_etree():
    """With etree, ``encoding=None`` on a ``StringIO`` input still parses."""
    # GH#45133
    xml_text = """<data>
<row>
<a>c</a>
</row>
</data>
"""
    expected = DataFrame({"a": ["c"]})
    result = read_xml(StringIO(xml_text), parser="etree", encoding=None)
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
# PARSER
|
|
|
|
|
|
@td.skip_if_installed("lxml")
def test_default_parser_no_lxml(xml_books):
    """Without lxml installed, the default parser raises ``ImportError``."""
    expected_msg = "lxml not found, please install or use the etree parser."
    with pytest.raises(ImportError, match=expected_msg):
        read_xml(xml_books)
|
|
|
|
|
|
def test_wrong_parser(xml_books):
    """An unsupported ``parser`` value ("bs4") raises ``ValueError``."""
    expected_msg = "Values for parser can only be lxml or etree."
    with pytest.raises(ValueError, match=expected_msg):
        read_xml(xml_books, parser="bs4")
|
|
|
|
|
|
# STYLESHEET
|
|
|
|
|
|
def test_stylesheet_file(kml_cta_rail_lines, xsl_flatten_doc):
    """A stylesheet given as a path and an iterparse read both match ``df_kml``."""
    pytest.importorskip("lxml")
    kml_namespaces = {"k": "http://www.opengis.net/kml/2.2"}
    placemark_fields = [
        "id",
        "name",
        "styleUrl",
        "extrude",
        "altitudeMode",
        "coordinates",
    ]

    df_style = read_xml(
        kml_cta_rail_lines,
        xpath=".//k:Placemark",
        namespaces=kml_namespaces,
        stylesheet=xsl_flatten_doc,
    )
    df_iter = read_xml(kml_cta_rail_lines, iterparse={"Placemark": placemark_fields})

    for frame in (df_style, df_iter):
        tm.assert_frame_equal(df_kml, frame)
|
|
|
|
|
|
def test_stylesheet_file_like(kml_cta_rail_lines, xsl_flatten_doc, mode):
    """A stylesheet passed as an open file handle yields ``df_kml``."""
    pytest.importorskip("lxml")
    # Only text mode takes an explicit encoding; binary mode passes None.
    text_encoding = "utf-8" if mode == "r" else None
    kml_namespaces = {"k": "http://www.opengis.net/kml/2.2"}

    with open(xsl_flatten_doc, mode, encoding=text_encoding) as f:
        df_style = read_xml(
            kml_cta_rail_lines,
            xpath=".//k:Placemark",
            namespaces=kml_namespaces,
            stylesheet=f,
        )

    tm.assert_frame_equal(df_kml, df_style)
|
|
|
|
|
|
def test_stylesheet_io(kml_cta_rail_lines, xsl_flatten_doc, mode):
    """A stylesheet passed as an in-memory ``BytesIO``/``StringIO`` yields ``df_kml``."""
    # note: By default the bodies of untyped functions are not checked,
    # consider using --check-untyped-defs
    pytest.importorskip("lxml")
    xsl_obj: BytesIO | StringIO  # type: ignore[annotation-unchecked]

    text_encoding = "utf-8" if mode == "r" else None
    # Binary reads are wrapped in BytesIO, text reads in StringIO.
    buffer_cls = BytesIO if mode == "rb" else StringIO
    with open(xsl_flatten_doc, mode, encoding=text_encoding) as f:
        xsl_obj = buffer_cls(f.read())

    df_style = read_xml(
        kml_cta_rail_lines,
        xpath=".//k:Placemark",
        namespaces={"k": "http://www.opengis.net/kml/2.2"},
        stylesheet=xsl_obj,
    )

    tm.assert_frame_equal(df_kml, df_style)
|
|
|
|
|
|
def test_stylesheet_buffered_reader(kml_cta_rail_lines, xsl_flatten_doc, mode):
    """A stylesheet passed as the raw content read from a file yields ``df_kml``."""
    pytest.importorskip("lxml")
    text_encoding = "utf-8" if mode == "r" else None
    with open(xsl_flatten_doc, mode, encoding=text_encoding) as f:
        # str in text mode, bytes in binary mode
        xsl_content = f.read()

    kml_namespaces = {"k": "http://www.opengis.net/kml/2.2"}
    df_style = read_xml(
        kml_cta_rail_lines,
        xpath=".//k:Placemark",
        namespaces=kml_namespaces,
        stylesheet=xsl_content,
    )

    tm.assert_frame_equal(df_kml, df_style)
|
|
|
|
|
|
def test_style_charset():
    """A stylesheet that renames a non-ASCII root tag leaves the parsed frame unchanged."""
    pytest.importorskip("lxml")
    xml = "<中文標籤><row><c1>1</c1><c2>2</c2></row></中文標籤>"

    # Identity transform plus one template rewriting the root element's tag.
    xsl = """\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output omit-xml-declaration="yes" indent="yes"/>
<xsl:strip-space elements="*"/>

<xsl:template match="node()|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
</xsl:copy>
</xsl:template>

<xsl:template match="中文標籤">
<根>
<xsl:apply-templates />
</根>
</xsl:template>

</xsl:stylesheet>"""

    plain_frame = read_xml(StringIO(xml))
    styled_frame = read_xml(StringIO(xml), stylesheet=xsl)

    tm.assert_frame_equal(plain_frame, styled_frame)
|
|
|
|
|
|
def test_not_stylesheet(kml_cta_rail_lines, xml_books):
    """Passing a plain XML document as the stylesheet raises ``XSLTParseError``."""
    lxml_etree = pytest.importorskip("lxml.etree")

    expected_msg = "document is not a stylesheet"
    with pytest.raises(lxml_etree.XSLTParseError, match=expected_msg):
        read_xml(kml_cta_rail_lines, stylesheet=xml_books)
|
|
|
|
|
|
def test_incorrect_xsl_syntax(kml_cta_rail_lines):
    """A stylesheet whose root element is self-closed raises ``XMLSyntaxError``."""
    lxml_etree = pytest.importorskip("lxml.etree")

    # The opening <xsl:stylesheet .../> tag is self-closed, so the rest of the
    # document is trailing content.
    xsl = """\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:k="http://www.opengis.net/kml/2.2"/>
<xsl:output method="xml" omit-xml-declaration="yes"
cdata-section-elements="k:description" indent="yes"/>
<xsl:strip-space elements="*"/>

<xsl:template match="node()|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
</xsl:copy>
</xsl:template>

<xsl:template match="k:MultiGeometry|k:LineString">
<xsl:apply-templates select='*'/>
</xsl:template>

<xsl:template match="k:description|k:Snippet|k:Style"/>
</xsl:stylesheet>"""

    expected_msg = "Extra content at the end of the document"
    with pytest.raises(lxml_etree.XMLSyntaxError, match=expected_msg):
        read_xml(kml_cta_rail_lines, stylesheet=xsl)
|
|
|
|
|
|
def test_incorrect_xsl_eval(kml_cta_rail_lines):
    """A stylesheet with the match pattern ``node(*)`` raises ``XSLTParseError``."""
    lxml_etree = pytest.importorskip("lxml.etree")

    xsl = """\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:k="http://www.opengis.net/kml/2.2">
<xsl:output method="xml" omit-xml-declaration="yes"
cdata-section-elements="k:description" indent="yes"/>
<xsl:strip-space elements="*"/>

<xsl:template match="node(*)|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
</xsl:copy>
</xsl:template>

<xsl:template match="k:MultiGeometry|k:LineString">
<xsl:apply-templates select='*'/>
</xsl:template>

<xsl:template match="k:description|k:Snippet|k:Style"/>
</xsl:stylesheet>"""

    expected_msg = "failed to compile"
    with pytest.raises(lxml_etree.XSLTParseError, match=expected_msg):
        read_xml(kml_cta_rail_lines, stylesheet=xsl)
|
|
|
|
|
|
def test_incorrect_xsl_apply(kml_cta_rail_lines):
    """A stylesheet referencing ``document('non_existent.xml')`` raises ``XSLTApplyError``."""
    lxml_etree = pytest.importorskip("lxml.etree")

    xsl = """\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="xml" encoding="utf-8" indent="yes" />
<xsl:strip-space elements="*"/>

<xsl:template match="@*|node()">
<xsl:copy>
<xsl:copy-of select="document('non_existent.xml')/*"/>
</xsl:copy>
</xsl:template>
</xsl:stylesheet>"""

    expected_msg = "Cannot resolve URI"
    with pytest.raises(lxml_etree.XSLTApplyError, match=expected_msg):
        read_xml(kml_cta_rail_lines, stylesheet=xsl)
|
|
|
|
|
|
def test_wrong_stylesheet(kml_cta_rail_lines, xml_data_path):
    """Using ``flatten.xsl`` under ``xml_data_path`` as stylesheet raises ``XMLSyntaxError``."""
    lxml_etree = pytest.importorskip("lxml.etree")

    stylesheet_path = xml_data_path / "flatten.xsl"
    expected_msg = "Start tag expected, '<' not found"

    with pytest.raises(lxml_etree.XMLSyntaxError, match=expected_msg):
        read_xml(kml_cta_rail_lines, stylesheet=stylesheet_path)
|
|
|
|
|
|
def test_stylesheet_file_close(kml_cta_rail_lines, xsl_flatten_doc, mode):
    """The source file handle is still open after an in-memory copy is used as stylesheet."""
    # note: By default the bodies of untyped functions are not checked,
    # consider using --check-untyped-defs
    pytest.importorskip("lxml")
    xsl_obj: BytesIO | StringIO  # type: ignore[annotation-unchecked]

    text_encoding = "utf-8" if mode == "r" else None
    buffer_cls = BytesIO if mode == "rb" else StringIO

    with open(xsl_flatten_doc, mode, encoding=text_encoding) as f:
        xsl_obj = buffer_cls(f.read())

        read_xml(kml_cta_rail_lines, stylesheet=xsl_obj)

        # Checked inside the ``with`` block, i.e. before the handle is released.
        assert not f.closed
|
|
|
|
|
|
def test_stylesheet_with_etree(kml_cta_rail_lines, xsl_flatten_doc):
    """Combining ``parser="etree"`` with a stylesheet raises ``ValueError``."""
    pytest.importorskip("lxml")
    expected_msg = "To use stylesheet, you need lxml installed"
    with pytest.raises(ValueError, match=expected_msg):
        read_xml(kml_cta_rail_lines, parser="etree", stylesheet=xsl_flatten_doc)
|
|
|
|
|
|
@pytest.mark.parametrize("val", ["", b""])
def test_empty_stylesheet(val):
    """An empty str/bytes stylesheet surfaces the literal-xml ``FutureWarning``."""
    pytest.importorskip("lxml")
    kml = os.path.join("data", "xml", "cta_rail_lines.kml")
    msg = (
        "Passing literal xml to 'read_xml' is deprecated and "
        "will be removed in a future version. To read from a "
        "literal string, wrap it in a 'StringIO' object."
    )

    # The warning is caught as an exception via ``pytest.raises``.
    with pytest.raises(FutureWarning, match=msg):
        read_xml(kml, stylesheet=val)
|
|
|
|
|
|
# ITERPARSE
|
|
def test_file_like_iterparse(xml_books, parser, mode):
    """iterparse over an open file handle; text mode with lxml raises ``TypeError``."""
    book_fields = ["category", "title", "year", "author", "price"]
    text_encoding = "utf-8" if mode == "r" else None

    with open(xml_books, mode, encoding=text_encoding) as f:
        if mode == "r" and parser == "lxml":
            # This combination is expected to be rejected rather than parsed.
            expected_msg = "reading file objects must return bytes objects"
            with pytest.raises(TypeError, match=expected_msg):
                read_xml(f, parser=parser, iterparse={"book": book_fields})
            return None

        df_filelike = read_xml(f, parser=parser, iterparse={"book": book_fields})

    df_expected = DataFrame(
        {
            "category": ["cooking", "children", "web"],
            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            "year": [2005, 2005, 2003],
            "price": [30.00, 29.99, 39.95],
        }
    )

    tm.assert_frame_equal(df_filelike, df_expected)
|
|
|
|
|
|
def test_file_io_iterparse(xml_books, parser, mode):
    """iterparse over an in-memory buffer; ``StringIO`` with lxml raises ``TypeError``."""
    book_fields = ["category", "title", "year", "author", "price"]
    buffer_cls = StringIO if mode == "r" else BytesIO
    text_encoding = "utf-8" if mode == "r" else None

    # One ``with`` statement, nested left to right: file first, then the buffer.
    with open(xml_books, mode, encoding=text_encoding) as f, buffer_cls(
        f.read()
    ) as b:
        if mode == "r" and parser == "lxml":
            expected_msg = "reading file objects must return bytes objects"
            with pytest.raises(TypeError, match=expected_msg):
                read_xml(b, parser=parser, iterparse={"book": book_fields})
            return None

        df_fileio = read_xml(b, parser=parser, iterparse={"book": book_fields})

    df_expected = DataFrame(
        {
            "category": ["cooking", "children", "web"],
            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            "year": [2005, 2005, 2003],
            "price": [30.00, 29.99, 39.95],
        }
    )

    tm.assert_frame_equal(df_fileio, df_expected)
|
|
|
|
|
|
@pytest.mark.network
@pytest.mark.single_cpu
def test_url_path_error(parser, httpserver, xml_file):
    """iterparse against a URL raises ``ParserError``."""
    expected_msg = "iterparse is designed for large XML files"
    row_fields = ["shape", "degrees", "sides", "date"]

    with open(xml_file, encoding="utf-8") as f:
        httpserver.serve_content(content=f.read())
        with pytest.raises(ParserError, match=expected_msg):
            read_xml(
                httpserver.url,
                parser=parser,
                iterparse={"row": row_fields},
            )
|
|
|
|
|
|
def test_compression_error(parser, compression_only):
    """iterparse against a compressed file raises ``ParserError``."""
    expected_msg = "iterparse is designed for large XML files"
    row_fields = ["shape", "degrees", "sides", "date"]

    with tm.ensure_clean(filename="geom_xml.zip") as path:
        geom_df.to_xml(path, parser=parser, compression=compression_only)

        with pytest.raises(ParserError, match=expected_msg):
            read_xml(
                path,
                parser=parser,
                iterparse={"row": row_fields},
                compression=compression_only,
            )
|
|
|
|
|
|
def test_wrong_dict_type(xml_books, parser):
    """Passing a list (not a dict) as ``iterparse`` raises ``TypeError``."""
    not_a_dict = ["category", "title", "year", "author", "price"]
    expected_msg = "list is not a valid type for iterparse"
    with pytest.raises(TypeError, match=expected_msg):
        read_xml(xml_books, parser=parser, iterparse=not_a_dict)
|
|
|
|
|
|
def test_wrong_dict_value(xml_books, parser):
    """An ``iterparse`` dict whose value is a str (not a list) raises ``TypeError``."""
    expected_msg = "<class 'str'> is not a valid type for value in iterparse"
    with pytest.raises(TypeError, match=expected_msg):
        read_xml(xml_books, parser=parser, iterparse={"book": "category"})
|
|
|
|
|
|
def test_bad_xml(parser):
    """A document with several top-level ``<row>`` elements raises ``SyntaxError``."""
    # No single root element: three sibling <row> elements follow the declaration.
    bad_xml = """\
<?xml version='1.0' encoding='utf-8'?>
<row>
<shape>square</shape>
<degrees>00360</degrees>
<sides>4.0</sides>
<date>2020-01-01</date>
</row>
<row>
<shape>circle</shape>
<degrees>00360</degrees>
<sides/>
<date>2021-01-01</date>
</row>
<row>
<shape>triangle</shape>
<degrees>00180</degrees>
<sides>3.0</sides>
<date>2022-01-01</date>
</row>
"""
    # Two alternative messages are accepted, joined as one regex.
    expected_msg = (
        "Extra content at the end of the document|"
        "junk after document element"
    )

    with tm.ensure_clean(filename="bad.xml") as path:
        with open(path, "w", encoding="utf-8") as f:
            f.write(bad_xml)

        with pytest.raises(SyntaxError, match=expected_msg):
            read_xml(
                path,
                parser=parser,
                parse_dates=["date"],
                iterparse={"row": ["shape", "degrees", "sides", "date"]},
            )
|
|
|
|
|
|
def test_comment(parser):
    """XML comments before, inside and after the root do not affect the parsed rows."""
    xml = """\
<!-- comment before root -->
<shapes>
<!-- comment within root -->
<shape>
<name>circle</name>
<type>2D</type>
</shape>
<shape>
<name>sphere</name>
<type>3D</type>
<!-- comment within child -->
</shape>
<!-- comment within root -->
</shapes>
<!-- comment after root -->"""

    df_xpath = read_xml(StringIO(xml), xpath=".//shape", parser=parser)
    df_iter = read_xml_iterparse(
        xml, parser=parser, iterparse={"shape": ["name", "type"]}
    )

    df_expected = DataFrame({"name": ["circle", "sphere"], "type": ["2D", "3D"]})

    for frame in (df_xpath, df_iter):
        tm.assert_frame_equal(frame, df_expected)
|
|
|
|
|
|
def test_dtd(parser):
    """An inline DOCTYPE declaration does not affect the parsed rows."""
    xml = """\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE non-profits [
<!ELEMENT shapes (shape*) >
<!ELEMENT shape ( name, type )>
<!ELEMENT name (#PCDATA)>
]>
<shapes>
<shape>
<name>circle</name>
<type>2D</type>
</shape>
<shape>
<name>sphere</name>
<type>3D</type>
</shape>
</shapes>"""

    df_xpath = read_xml(StringIO(xml), xpath=".//shape", parser=parser)
    df_iter = read_xml_iterparse(
        xml, parser=parser, iterparse={"shape": ["name", "type"]}
    )

    df_expected = DataFrame({"name": ["circle", "sphere"], "type": ["2D", "3D"]})

    for frame in (df_xpath, df_iter):
        tm.assert_frame_equal(frame, df_expected)
|
|
|
|
|
|
def test_processing_instruction(parser):
    """Processing instructions ahead of the root do not affect the parsed rows."""
    xml = """\
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="style.xsl"?>
<?display table-view?>
<?sort alpha-ascending?>
<?textinfo whitespace is allowed ?>
<?elementnames <shape>, <name>, <type> ?>
<shapes>
<shape>
<name>circle</name>
<type>2D</type>
</shape>
<shape>
<name>sphere</name>
<type>3D</type>
</shape>
</shapes>"""

    df_xpath = read_xml(StringIO(xml), xpath=".//shape", parser=parser)
    df_iter = read_xml_iterparse(
        xml, parser=parser, iterparse={"shape": ["name", "type"]}
    )

    df_expected = DataFrame({"name": ["circle", "sphere"], "type": ["2D", "3D"]})

    for frame in (df_xpath, df_iter):
        tm.assert_frame_equal(frame, df_expected)
|
|
|
|
|
|
def test_no_result(xml_books, parser):
    """An iterparse key ("node") that selects nothing raises ``ParserError``."""
    expected_msg = "No result from selected items in iterparse."
    unmatched_spec = {"node": ["attr1", "elem1", "elem2", "elem3"]}
    with pytest.raises(ParserError, match=expected_msg):
        read_xml(xml_books, parser=parser, iterparse=unmatched_spec)
|
|
|
|
|
|
def test_empty_data(xml_books, parser):
    """An iterparse spec on "book" with unmatched field names raises ``EmptyDataError``."""
    expected_msg = "No columns to parse from file"
    unmatched_fields = {"book": ["attr1", "elem1", "elem2", "elem3"]}
    with pytest.raises(EmptyDataError, match=expected_msg):
        read_xml(xml_books, parser=parser, iterparse=unmatched_fields)
|
|
|
|
|
|
def test_online_stylesheet():
    """Transform a CD catalog to an HTML table and read its first five data rows."""
    pytest.importorskip("lxml")
    xml = """\
<?xml version="1.0" encoding="UTF-8"?>
<catalog>
<cd>
<title>Empire Burlesque</title>
<artist>Bob Dylan</artist>
<country>USA</country>
<company>Columbia</company>
<price>10.90</price>
<year>1985</year>
</cd>
<cd>
<title>Hide your heart</title>
<artist>Bonnie Tyler</artist>
<country>UK</country>
<company>CBS Records</company>
<price>9.90</price>
<year>1988</year>
</cd>
<cd>
<title>Greatest Hits</title>
<artist>Dolly Parton</artist>
<country>USA</country>
<company>RCA</company>
<price>9.90</price>
<year>1982</year>
</cd>
<cd>
<title>Still got the blues</title>
<artist>Gary Moore</artist>
<country>UK</country>
<company>Virgin records</company>
<price>10.20</price>
<year>1990</year>
</cd>
<cd>
<title>Eros</title>
<artist>Eros Ramazzotti</artist>
<country>EU</country>
<company>BMG</company>
<price>9.90</price>
<year>1997</year>
</cd>
<cd>
<title>One night only</title>
<artist>Bee Gees</artist>
<country>UK</country>
<company>Polydor</company>
<price>10.90</price>
<year>1998</year>
</cd>
<cd>
<title>Sylvias Mother</title>
<artist>Dr.Hook</artist>
<country>UK</country>
<company>CBS</company>
<price>8.10</price>
<year>1973</year>
</cd>
<cd>
<title>Maggie May</title>
<artist>Rod Stewart</artist>
<country>UK</country>
<company>Pickwick</company>
<price>8.50</price>
<year>1990</year>
</cd>
<cd>
<title>Romanza</title>
<artist>Andrea Bocelli</artist>
<country>EU</country>
<company>Polydor</company>
<price>10.80</price>
<year>1996</year>
</cd>
<cd>
<title>When a man loves a woman</title>
<artist>Percy Sledge</artist>
<country>USA</country>
<company>Atlantic</company>
<price>8.70</price>
<year>1987</year>
</cd>
<cd>
<title>Black angel</title>
<artist>Savage Rose</artist>
<country>EU</country>
<company>Mega</company>
<price>10.90</price>
<year>1995</year>
</cd>
<cd>
<title>1999 Grammy Nominees</title>
<artist>Many</artist>
<country>USA</country>
<company>Grammy</company>
<price>10.20</price>
<year>1999</year>
</cd>
<cd>
<title>For the good times</title>
<artist>Kenny Rogers</artist>
<country>UK</country>
<company>Mucik Master</company>
<price>8.70</price>
<year>1995</year>
</cd>
<cd>
<title>Big Willie style</title>
<artist>Will Smith</artist>
<country>USA</country>
<company>Columbia</company>
<price>9.90</price>
<year>1997</year>
</cd>
<cd>
<title>Tupelo Honey</title>
<artist>Van Morrison</artist>
<country>UK</country>
<company>Polydor</company>
<price>8.20</price>
<year>1971</year>
</cd>
<cd>
<title>Soulsville</title>
<artist>Jorn Hoel</artist>
<country>Norway</country>
<company>WEA</company>
<price>7.90</price>
<year>1996</year>
</cd>
<cd>
<title>The very best of</title>
<artist>Cat Stevens</artist>
<country>UK</country>
<company>Island</company>
<price>8.90</price>
<year>1990</year>
</cd>
<cd>
<title>Stop</title>
<artist>Sam Brown</artist>
<country>UK</country>
<company>A and M</company>
<price>8.90</price>
<year>1988</year>
</cd>
<cd>
<title>Bridge of Spies</title>
<artist>T`Pau</artist>
<country>UK</country>
<company>Siren</company>
<price>7.90</price>
<year>1987</year>
</cd>
<cd>
<title>Private Dancer</title>
<artist>Tina Turner</artist>
<country>UK</country>
<company>Capitol</company>
<price>8.90</price>
<year>1983</year>
</cd>
<cd>
<title>Midt om natten</title>
<artist>Kim Larsen</artist>
<country>EU</country>
<company>Medley</company>
<price>7.80</price>
<year>1983</year>
</cd>
<cd>
<title>Pavarotti Gala Concert</title>
<artist>Luciano Pavarotti</artist>
<country>UK</country>
<company>DECCA</company>
<price>9.90</price>
<year>1991</year>
</cd>
<cd>
<title>The dock of the bay</title>
<artist>Otis Redding</artist>
<country>USA</country>
<COMPANY>Stax Records</COMPANY>
<PRICE>7.90</PRICE>
<YEAR>1968</YEAR>
</cd>
<cd>
<title>Picture book</title>
<artist>Simply Red</artist>
<country>EU</country>
<company>Elektra</company>
<price>7.20</price>
<year>1985</year>
</cd>
<cd>
<title>Red</title>
<artist>The Communards</artist>
<country>UK</country>
<company>London</company>
<price>7.80</price>
<year>1987</year>
</cd>
<cd>
<title>Unchain my heart</title>
<artist>Joe Cocker</artist>
<country>USA</country>
<company>EMI</company>
<price>8.20</price>
<year>1987</year>
</cd>
</catalog>
"""
    xsl = """\
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:template match="/">
<html>
<body>
<h2>My CD Collection</h2>
<table border="1">
<tr bgcolor="#9acd32">
<th style="text-align:left">Title</th>
<th style="text-align:left">Artist</th>
</tr>
<xsl:for-each select="catalog/cd">
<tr>
<td><xsl:value-of select="title"/></td>
<td><xsl:value-of select="artist"/></td>
</tr>
</xsl:for-each>
</table>
</body>
</html>
</xsl:template>
</xsl:stylesheet>
"""

    # The xpath keeps <tr> rows that contain <td> cells, up to position 6.
    df_xsl = read_xml(
        StringIO(xml),
        xpath=".//tr[td and position() <= 6]",
        names=["title", "artist"],
        stylesheet=xsl,
    )

    expected_titles = [
        "Empire Burlesque",
        "Hide your heart",
        "Greatest Hits",
        "Still got the blues",
        "Eros",
    ]
    expected_artists = [
        "Bob Dylan",
        "Bonnie Tyler",
        "Dolly Parton",
        "Gary Moore",
        "Eros Ramazzotti",
    ]
    # ``dict(enumerate(...))`` builds {0: ..., 1: ..., ...} position-keyed mappings.
    df_expected = DataFrame(
        {
            "title": dict(enumerate(expected_titles)),
            "artist": dict(enumerate(expected_artists)),
        }
    )

    tm.assert_frame_equal(df_expected, df_xsl)
|
|
|
|
|
|
# COMPRESSION
|
|
|
|
|
|
def test_compression_read(parser, compression_only):
    """``geom_df`` written with compression reads back equal via xpath and iterparse."""
    row_fields = ["shape", "degrees", "sides"]

    with tm.ensure_clean() as comp_path:
        geom_df.to_xml(
            comp_path, index=False, parser=parser, compression=compression_only
        )

        df_xpath = read_xml(comp_path, parser=parser, compression=compression_only)
        df_iter = read_xml_iterparse_comp(
            comp_path,
            compression_only,
            parser=parser,
            iterparse={"row": row_fields},
            compression=compression_only,
        )

    for frame in (df_xpath, df_iter):
        tm.assert_frame_equal(frame, geom_df)
|
|
|
|
|
|
def test_wrong_compression(parser, compression, compression_only):
    """Reading with a different codec than the file was written with raises an error."""
    written_with, read_with = compression, compression_only

    if written_with == read_with:
        pytest.skip(f"{written_with} == {read_with}")

    # Expected (exception class, message) per codec used for *reading*.
    expected_failures = {
        "bz2": (OSError, "Invalid data stream"),
        "gzip": (OSError, "Not a gzipped file"),
        "zip": (BadZipFile, "File is not a zip file"),
        "tar": (ReadError, "file could not be opened successfully"),
    }
    # Optional codecs are only registered when their module can be imported.
    zstd = import_optional_dependency("zstandard", errors="ignore")
    if zstd is not None:
        expected_failures["zstd"] = (zstd.ZstdError, "Unknown frame descriptor")
    if import_optional_dependency("lzma", errors="ignore") is not None:
        expected_failures["xz"] = (LZMAError, "Input format not supported by decoder")
    error_cls, error_str = expected_failures[read_with]

    with tm.ensure_clean() as path:
        geom_df.to_xml(path, parser=parser, compression=written_with)

        with pytest.raises(error_cls, match=error_str):
            read_xml(path, parser=parser, compression=read_with)
|
|
|
|
|
|
def test_unsuported_compression(parser):
    """An unknown ``compression`` value ("7z") raises ``ValueError``."""
    expected_msg = "Unrecognized compression type"
    # One ``with`` statement, entered left to right like the nested form.
    with pytest.raises(ValueError, match=expected_msg), tm.ensure_clean() as path:
        read_xml(path, parser=parser, compression="7z")
|
|
|
|
|
|
# STORAGE OPTIONS
|
|
|
|
|
|
@pytest.mark.network
@pytest.mark.single_cpu
def test_s3_parser_consistency(s3_public_bucket_with_data, s3so):
    """lxml and etree return equal frames for ``books.xml`` read from the S3 bucket."""
    for required in ("s3fs", "lxml"):
        pytest.importorskip(required)

    bucket_name = s3_public_bucket_with_data.name
    s3_url = f"s3://{bucket_name}/books.xml"

    df_lxml = read_xml(s3_url, parser="lxml", storage_options=s3so)
    df_etree = read_xml(s3_url, parser="etree", storage_options=s3so)

    tm.assert_frame_equal(df_lxml, df_etree)
|
|
|
|
|
|
def test_read_xml_nullable_dtypes(
    parser, string_storage, dtype_backend, using_infer_string
):
    """``read_xml`` with ``dtype_backend`` returns the expected nullable/arrow dtypes."""
    # GH#50500
    data = """<?xml version='1.0' encoding='utf-8'?>
<data xmlns="http://example.com">
<row>
<a>x</a>
<b>1</b>
<c>4.0</c>
<d>x</d>
<e>2</e>
<f>4.0</f>
<g></g>
<h>True</h>
<i>False</i>
</row>
<row>
<a>y</a>
<b>2</b>
<c>5.0</c>
<d></d>
<e></e>
<f></f>
<g></g>
<h>False</h>
<i></i>
</row>
</data>"""

    # Choose the expected array type for the two string columns ("a" and "d").
    if using_infer_string:
        pa = pytest.importorskip("pyarrow")
        strings_full = ArrowStringArrayNumpySemantics(pa.array(["x", "y"]))
        strings_with_na = ArrowStringArrayNumpySemantics(pa.array(["x", None]))

    elif string_storage == "python":
        strings_full = StringArray(np.array(["x", "y"], dtype=np.object_))
        strings_with_na = StringArray(np.array(["x", NA], dtype=np.object_))

    elif dtype_backend == "pyarrow":
        pa = pytest.importorskip("pyarrow")
        from pandas.arrays import ArrowExtensionArray

        strings_full = ArrowExtensionArray(pa.array(["x", "y"]))
        strings_with_na = ArrowExtensionArray(pa.array(["x", None]))

    else:
        pa = pytest.importorskip("pyarrow")
        strings_full = ArrowStringArray(pa.array(["x", "y"]))
        strings_with_na = ArrowStringArray(pa.array(["x", None]))

    with pd.option_context("mode.string_storage", string_storage):
        result = read_xml(StringIO(data), parser=parser, dtype_backend=dtype_backend)

    expected = DataFrame(
        {
            "a": strings_full,
            "b": Series([1, 2], dtype="Int64"),
            "c": Series([4.0, 5.0], dtype="Float64"),
            "d": strings_with_na,
            "e": Series([2, NA], dtype="Int64"),
            "f": Series([4.0, NA], dtype="Float64"),
            "g": Series([NA, NA], dtype="Int64"),
            "h": Series([True, False], dtype="boolean"),
            "i": Series([False, NA], dtype="boolean"),
        }
    )

    if dtype_backend == "pyarrow":
        pa = pytest.importorskip("pyarrow")
        from pandas.arrays import ArrowExtensionArray

        # Re-express every expected column as an arrow-backed array.
        arrow_columns = {
            col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
            for col in expected.columns
        }
        expected = DataFrame(arrow_columns)
        # Column "g" is replaced with an array built from two ``None`` values.
        expected["g"] = ArrowExtensionArray(pa.array([None, None]))

    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_invalid_dtype_backend():
    """An unsupported ``dtype_backend`` ("numpy") raises ``ValueError``."""
    expected_msg = (
        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
        "'pyarrow' are allowed."
    )
    with pytest.raises(ValueError, match=expected_msg):
        read_xml("test", dtype_backend="numpy")
|