Skip to content

Commit c20cd7c

Browse files
committed
When rebuilding the sentence text in ssurgeon, use the parent token's SpaceAfter=No field for a word that is part of a multi-word token (MWT)
1 parent 218d7d3 commit c20cd7c

File tree

2 files changed

+11
-4
lines changed

2 files changed

+11
-4
lines changed

stanza/server/ssurgeon.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,8 @@ def convert_response_to_doc(doc, semgrex_response):
146146
mwt_tokens = []
147147
for word_start_idx, word in enumerate(tokens):
148148
if not word["is_first_mwt"]:
149+
if word["is_mwt"]:
150+
word[MISC] = None
149151
mwt_tokens.append(word)
150152
continue
151153
word_end_idx = word_start_idx + 1
@@ -159,6 +161,7 @@ def convert_response_to_doc(doc, semgrex_response):
159161
# use the SpaceAfter=No (or not) from the last word in the token
160162
MISC: tokens[word_end_idx-1][MISC],
161163
}
164+
word[MISC] = None
162165
mwt_tokens.append(mwt_token_entry)
163166
mwt_tokens.append(word)
164167

@@ -167,7 +170,11 @@ def convert_response_to_doc(doc, semgrex_response):
167170

168171
# TODO: look at word.parent to see if it is part of an MWT
169172
# once that's done, the beginning words of an MWT do not need SpaceAfter=No any more (it is implied)
170-
word_text = [word.text if (word_idx == len(sentence.words) - 1 or (word.misc and "SpaceAfter=No" in word.misc)) else word.text + " "
173+
word_text = [word.text if (word_idx == len(sentence.words) - 1 or
174+
(word.misc and "SpaceAfter=No" in word.misc) or
175+
word.id != word.parent.id[-1] or
176+
(word.parent.misc and "SpaceAfter=No" in word.parent.misc))
177+
else word.text + " "
171178
for word_idx, word in enumerate(sentence.words)]
172179
sentence_text = "".join(word_text)
173180

stanza/tests/server/test_ssurgeon.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ def test_ssurgeon_different_length():
106106
# text = It's not yours!
107107
# comment = negation
108108
1-2 It's _ _ _ _ _ _ _ _
109-
1 It it PRON PRP Number=Sing|Person=2|PronType=Prs 4 nsubj _ SpaceAfter=No
109+
1 It it PRON PRP Number=Sing|Person=2|PronType=Prs 4 nsubj _ _
110110
2 's be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 4 cop _ _
111111
3 not not PART RB Polarity=Neg 4 advmod _ _
112112
4 yours yours PRON PRP Gender=Neut|Number=Sing|Person=2|Poss=Yes|PronType=Prs 0 root _ SpaceAfter=No
@@ -150,8 +150,8 @@ def test_ssurgeon_become_mwt():
150150
2 of of ADP IN _ 4 case _ _
151151
3 “ " PUNCT `` _ 4 punct _ SpaceAfter=No
152152
4-5 NCRC4ME’s _ _ _ _ _ _ _ SpaceAfter=No
153-
4 NCRC4ME NCRC4ME PROPN NNP Number=Sing 1 compound _ SpaceAfter=No
154-
5 ’s 's PART POS _ 4 case _ SpaceAfter=No
153+
4 NCRC4ME NCRC4ME PROPN NNP Number=Sing 1 compound _ _
154+
5 ’s 's PART POS _ 4 case _ _
155155
6 ” " PUNCT '' _ 4 punct _ _
156156
"""
157157

0 commit comments

Comments
 (0)