Skip to content

Commit

Permalink
Fix donut token2json multiline (#30300)
Browse files: browse the repository at this point in the history
* Fix multiline processing

* Update test for token2json
  • Loading branch information
qubvel authored and ydshieh committed Apr 23, 2024
1 parent 4fea8a9 commit 1db6a56
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 1 deletion.
4 changes: 3 additions & 1 deletion src/transformers/models/donut/processing_donut.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,9 @@ def token2json(self, tokens, is_inner_value=False, added_vocab=None):
end_token = end_token.group()
start_token_escaped = re.escape(start_token)
end_token_escaped = re.escape(end_token)
- content = re.search(f"{start_token_escaped}(.*?){end_token_escaped}", tokens, re.IGNORECASE)
+ content = re.search(
+ f"{start_token_escaped}(.*?){end_token_escaped}", tokens, re.IGNORECASE | re.DOTALL
+ )
if content is not None:
content = content.group(1).strip()
if r"<s_" in content and r"</s_" in content: # non-leaf node
Expand Down
4 changes: 4 additions & 0 deletions tests/models/donut/test_processing_donut.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,17 @@ def test_token2json(self):
"zip": "30301",
"phone": "123-4567",
"nicknames": [{"nickname": "Johnny"}, {"nickname": "JD"}],
"multiline": "text\nwith\nnewlines",
"empty": "",
}

sequence = (
"<s_name>John Doe</s_name><s_age>99</s_age><s_city>Atlanta</s_city>"
"<s_state>GA</s_state><s_zip>30301</s_zip><s_phone>123-4567</s_phone>"
"<s_nicknames><s_nickname>Johnny</s_nickname>"
"<sep/><s_nickname>JD</s_nickname></s_nicknames>"
"<s_multiline>text\nwith\nnewlines</s_multiline>"
"<s_empty></s_empty>"
)
actual_json = self.processor.token2json(sequence)

Expand Down

0 comments on commit 1db6a56

Please sign in to comment.