-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Fixed the TextTransform bug where chargrams where being computed differently when using with/without word tokenizer. #548
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
868a72f
f482eea
64390ee
2201433
d1dd161
f458cdb
90cdf2a
3ed6e87
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -262,6 +262,30 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV | |
view = new ConcatTransform(h, new ConcatTransform.Arguments() { Column = xfCols }, view); | ||
} | ||
|
||
if (tparams.NeedsNormalizeTransform) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @justinormont, I moved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know if there are averse effects of running If you make me a build, I'll run a manual regression test. Unfortunately we don't currently have running nightly regression tests to tell us if the text datasets decreased in their core metrics. #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't like this change. It's far from clear to me that if someone asks to remove stopwords that they meant for the content of stopwords to be retained in the chargrams. At the very least this ought to be a configurable option. In reply to: 204000786 [](ancestors = 204000786) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @TomFinley, Can you elaborate more on this? Your point is not clear to me. Don't you see it good to have In reply to: 204201054 [](ancestors = 204201054,204000786) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry thought you were saying something else for some reason got confused. #Resolved |
||
{ | ||
var xfCols = new TextNormalizerCol[textCols.Length]; | ||
string[] dstCols = new string[textCols.Length]; | ||
for (int i = 0; i < textCols.Length; i++) | ||
{ | ||
dstCols[i] = GenerateColumnName(view.Schema, textCols[i], "TextNormalizer"); | ||
tempCols.Add(dstCols[i]); | ||
xfCols[i] = new TextNormalizerCol() { Source = textCols[i], Name = dstCols[i] }; | ||
} | ||
|
||
view = new TextNormalizerTransform(h, | ||
new TextNormalizerArgs() | ||
{ | ||
Column = xfCols, | ||
KeepDiacritics = tparams.KeepDiacritics, | ||
KeepNumbers = tparams.KeepNumbers, | ||
KeepPunctuations = tparams.KeepPunctuations, | ||
TextCase = tparams.TextCase | ||
}, view); | ||
|
||
textCols = dstCols; | ||
} | ||
|
||
if (tparams.NeedsWordTokenizationTransform) | ||
{ | ||
var xfCols = new DelimitedTokenizeTransform.Column[textCols.Length]; | ||
|
@@ -281,34 +305,6 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV | |
view = new DelimitedTokenizeTransform(h, new DelimitedTokenizeTransform.Arguments() { Column = xfCols }, view); | ||
} | ||
|
||
if (tparams.NeedsNormalizeTransform) | ||
{ | ||
string[] srcCols = wordTokCols == null ? textCols : wordTokCols; | ||
var xfCols = new TextNormalizerCol[srcCols.Length]; | ||
string[] dstCols = new string[srcCols.Length]; | ||
for (int i = 0; i < srcCols.Length; i++) | ||
{ | ||
dstCols[i] = GenerateColumnName(view.Schema, srcCols[i], "TextNormalizer"); | ||
tempCols.Add(dstCols[i]); | ||
xfCols[i] = new TextNormalizerCol() { Source = srcCols[i], Name = dstCols[i] }; | ||
} | ||
|
||
view = new TextNormalizerTransform(h, | ||
new TextNormalizerArgs() | ||
{ | ||
Column = xfCols, | ||
KeepDiacritics = tparams.KeepDiacritics, | ||
KeepNumbers = tparams.KeepNumbers, | ||
KeepPunctuations = tparams.KeepPunctuations, | ||
TextCase = tparams.TextCase | ||
}, view); | ||
|
||
if (wordTokCols != null) | ||
wordTokCols = dstCols; | ||
else | ||
textCols = dstCols; | ||
} | ||
|
||
if (tparams.NeedsRemoveStopwordsTransform) | ||
{ | ||
Contracts.Assert(wordTokCols != null, "StopWords transform requires that word tokenization has been applied to the input text."); | ||
|
@@ -360,7 +356,7 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV | |
if (tparams.CharExtractorFactory != null) | ||
{ | ||
{ | ||
var srcCols = wordTokCols ?? textCols; | ||
var srcCols = tparams.NeedsRemoveStopwordsTransform ? wordTokCols : textCols; | ||
charTokCols = new string[srcCols.Length]; | ||
var xfCols = new CharTokenizeTransform.Column[srcCols.Length]; | ||
for (int i = 0; i < srcCols.Length; i++) | ||
|
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See other examples. You will see that we keep commented out the version strings from old versions, to track them and so we know why we bumped the version each time. This is important: we have live code in our deserializers that is meant to handle those version bumps, and for that reason we like to have documentation on what exactly changed, so that when we have tests on this or that version in our deserializers, we know why they changed. #Resolved