Skip to content

Commit 8cb1f22

Browse files
authored
BUG: iterparse on read_xml ignores repeated elements (#51355)
BUG: iterparse on read_xml ignores repeated elements
1 parent 9764f3d commit 8cb1f22

File tree

3 files changed

+57
-2
lines changed

3 files changed

+57
-2
lines changed

doc/source/whatsnew/v2.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1304,6 +1304,7 @@ I/O
13041304
- Bug in :meth:`DataFrame.to_dict` not converting ``NA`` to ``None`` (:issue:`50795`)
13051305
- Bug in :meth:`DataFrame.to_json` where it would segfault when failing to encode a string (:issue:`50307`)
13061306
- Bug in :func:`read_xml` where file-like objects failed when iterparse is used (:issue:`50641`)
1307+
- Bug in :func:`read_xml` where repeated elements were ignored when iterparse is used (:issue:`51183`)
13071308

13081309
Period
13091310
^^^^^^

pandas/io/xml.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,10 @@ def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
337337
"local disk and not as compressed files or online sources."
338338
)
339339

340+
iterparse_repeats = len(self.iterparse[row_node]) != len(
341+
set(self.iterparse[row_node])
342+
)
343+
340344
for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
341345
curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag
342346

@@ -345,12 +349,13 @@ def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
345349
row = {}
346350

347351
if row is not None:
348-
if self.names:
352+
if self.names and iterparse_repeats:
349353
for col, nm in zip(self.iterparse[row_node], self.names):
350354
if curr_elem == col:
351355
elem_val = elem.text.strip() if elem.text else None
352-
if row.get(nm) != elem_val and nm not in row:
356+
if elem_val not in row.values() and nm not in row:
353357
row[nm] = elem_val
358+
354359
if col in elem.attrib:
355360
if elem.attrib[col] not in row.values() and nm not in row:
356361
row[nm] = elem.attrib[col]

pandas/tests/io/xml/test_xml.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -948,6 +948,55 @@ def test_repeat_values_new_names(parser):
948948
tm.assert_frame_equal(df_iter, df_expected)
949949

950950

951+
def test_repeat_elements(parser):
952+
xml = """\
953+
<shapes>
954+
<shape>
955+
<value item="name">circle</value>
956+
<value item="family">ellipse</value>
957+
<value item="degrees">360</value>
958+
<value item="sides">0</value>
959+
</shape>
960+
<shape>
961+
<value item="name">triangle</value>
962+
<value item="family">polygon</value>
963+
<value item="degrees">180</value>
964+
<value item="sides">3</value>
965+
</shape>
966+
<shape>
967+
<value item="name">square</value>
968+
<value item="family">polygon</value>
969+
<value item="degrees">360</value>
970+
<value item="sides">4</value>
971+
</shape>
972+
</shapes>"""
973+
df_xpath = read_xml(
974+
xml,
975+
xpath=".//shape",
976+
parser=parser,
977+
names=["name", "family", "degrees", "sides"],
978+
)
979+
980+
df_iter = read_xml_iterparse(
981+
xml,
982+
parser=parser,
983+
iterparse={"shape": ["value", "value", "value", "value"]},
984+
names=["name", "family", "degrees", "sides"],
985+
)
986+
987+
df_expected = DataFrame(
988+
{
989+
"name": ["circle", "triangle", "square"],
990+
"family": ["ellipse", "polygon", "polygon"],
991+
"degrees": [360, 180, 360],
992+
"sides": [0, 3, 4],
993+
}
994+
)
995+
996+
tm.assert_frame_equal(df_xpath, df_expected)
997+
tm.assert_frame_equal(df_iter, df_expected)
998+
999+
9511000
def test_names_option_wrong_length(datapath, parser):
9521001
filename = datapath("io", "data", "xml", "books.xml")
9531002

0 commit comments

Comments
 (0)