Skip to content
This repository was archived by the owner on Nov 16, 2023. It is now read-only.

fix default char extractor for NgramFeaturizer #9

Merged
merged 5 commits into from
Oct 24, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -347,3 +347,4 @@ _doc_report.txt
data.csv
data.txt

/build/TestCoverageReport
8 changes: 6 additions & 2 deletions build.cmd
Original file line number Diff line number Diff line change
Expand Up @@ -299,9 +299,13 @@ set TestsPath1=%PackagePath%\tests
set TestsPath2=%__currentScriptDir%src\python\tests
set ReportPath=%__currentScriptDir%build\TestCoverageReport
call "%PythonExe%" -m pytest --verbose --maxfail=1000 --capture=sys "%TestsPath1%" --cov="%PackagePath%" --cov-report term-missing --cov-report html:"%ReportPath%"
if errorlevel 1 (
goto :Exit_Error
)
call "%PythonExe%" -m pytest --verbose --maxfail=1000 --capture=sys "%TestsPath2%" --cov="%PackagePath%" --cov-report term-missing --cov-report html:"%ReportPath%"
goto :Exit_Success

if errorlevel 1 (
goto :Exit_Error
)

:Exit_Success
endlocal
Expand Down
2 changes: 1 addition & 1 deletion src/python/docs/sphinx/installationguide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ Installation Guide
Supported Platforms
-------------------

Release 0.6.0:
Release 0.6:
* Windows 10, Ubuntu 14.04, Ubuntu 16.04, CentOS 7, RHEL 7, Mac OS 10.11, 10.12, 10.13


Expand Down
2 changes: 1 addition & 1 deletion src/python/nimbusml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Microsoft Machine Learning for Python
"""

__version__ = '0.6.0'
__version__ = '0.6.1'

# CoreCLR version of MicrosoftML is built on Windows.
# But file permissions are not preserved when it's copied to Linux.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -227,9 +227,12 @@ def __init__(
dictionary=None,
word_feature_extractor=Ngram(
max_num_terms=[10000000]),
char_feature_extractor=None,
vector_normalizer='L2',
columns=None,
char_feature_extractor=Ngram(
ngram_length=3,
all_lengths=False,
max_num_terms=[10000000]),
vector_normalizer='L2',
columns=None,
**params):

if columns:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,8 +206,11 @@ def __init__(
dictionary=None,
word_feature_extractor=n_gram(
max_num_terms=[10000000]),
char_feature_extractor=None,
vector_normalizer='L2',
char_feature_extractor=n_gram(
ngram_length=3,
all_lengths=False,
max_num_terms=[10000000]),
vector_normalizer='L2',
**params):
BasePipelineItem.__init__(
self, type='transform', **params)
Expand Down
78 changes: 39 additions & 39 deletions src/python/nimbusml/tests/data_type/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
from nimbusml import Pipeline
from nimbusml.ensemble import LightGbmClassifier
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.internal.entrypoints._ngramextractor_ngram import n_gram
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_array_almost_equal


def transform_data(data=None, datatype=None):
Expand All @@ -34,7 +34,7 @@ def train_data_type_single(
"Talk about second",
"Thrid one",
"Final example."]
model = NGramFeaturizer(word_feature_extractor=n_gram())
model = NGramFeaturizer()
data_with_new_type = transform_data(data, fit_X_type)
model.fit(data_with_new_type)
test_data_with_new_type = transform_data(data, predict_X_type)
Expand All @@ -49,7 +49,7 @@ def train_data_type_ppl(fit_X_type=None, fit_Y_type=None, predict_X_type=None):
"Final example."]
label = [1, 0, 1, 1]
model = Pipeline([
NGramFeaturizer(word_feature_extractor=n_gram()),
NGramFeaturizer(),
LightGbmClassifier(min_data_per_leaf=1, n_thread=1)
])
data_with_new_type = transform_data(data, fit_X_type)
Expand All @@ -66,127 +66,127 @@ class TestTextDataType(unittest.TestCase):
def test_check_text_datatype_single_list_list_series(self):
result = train_data_type_single("list", "list", "series")
assert len(result) == 4
assert len(result.columns) == 11
assert len(result.columns) == 66
assert all([col.startswith('F0') for col in result.columns])

def test_check_text_datatype_single_series_list_series(self):
result = train_data_type_single("series", "list", "series")
assert len(result) == 4
assert len(result.columns) == 11
assert len(result.columns) == 66
assert all([col.startswith('F0') for col in result.columns])

def test_check_text_datatype_single_series_list_list(self):
result = train_data_type_single("series", "list", "list")
assert len(result) == 4
assert len(result.columns) == 11
assert len(result.columns) == 66
assert all([col.startswith('F0') for col in result.columns])

def test_check_text_datatype_single_array_list_series(self):
result = train_data_type_single("array", "list", "series")
assert len(result) == 4
assert len(result.columns) == 11
assert len(result.columns) == 66
assert all([col.startswith('F0') for col in result.columns])

def test_check_text_datatype_single_series_array_dataframe(self):
result = train_data_type_single("series", "array", "dataframe")
assert len(result) == 4
assert len(result.columns) == 11
assert len(result.columns) == 66
assert all([col.startswith('F0') for col in result.columns])

def test_check_text_datatype_single_array_series_series(self):
result = train_data_type_single("array", "series", "series")
assert len(result) == 4
assert len(result.columns) == 11
assert len(result.columns) == 66
assert all([col.startswith('F0') for col in result.columns])

def test_check_text_datatype_single_dataframe_list_series(self):
result = train_data_type_single("dataframe", "list", "series")
assert len(result) == 4
assert len(result.columns) == 11
assert len(result.columns) == 66
assert all([col.startswith('F0') for col in result.columns])

def test_check_text_datatype_single_series_series_dataframe(self):
result = train_data_type_single("series", "series", "dataframe")
assert len(result) == 4
assert len(result.columns) == 11
assert len(result.columns) == 66
assert all([col.startswith('F0') for col in result.columns])

def test_check_text_datatype_single_dataframe_series_list(self):
result = train_data_type_single("dataframe", "series", "list")
assert len(result) == 4
assert len(result.columns) == 11
assert len(result.columns) == 66
assert all([col.startswith('F0') for col in result.columns])

def test_check_text_datatype_ppl_series_list_array(self):
result, scores, metrics = train_data_type_ppl(
"series", "list", "array")
assert len(result) == 4
assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])

def test_check_text_datatype_ppl_list_series_dataframe(self):
result, scores, metrics = train_data_type_ppl(
"list", "series", "dataframe")
assert len(result) == 4
assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])

def test_check_text_datatype_ppl_list_list_series(self):
result, scores, metrics = train_data_type_ppl("list", "list", "series")
assert len(result) == 4
assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])

def test_check_text_datatype_ppl_array_series_array(self):
result, scores, metrics = train_data_type_ppl(
"array", "series", "array")
assert len(result) == 4
assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])

def test_check_text_datatype_ppl_series_array_dataframe(self):
result, scores, metrics = train_data_type_ppl(
"series", "array", "dataframe")
assert len(result) == 4
assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])

def test_check_text_datatype_ppl_array_series_list(self):
result, scores, metrics = train_data_type_ppl(
"array", "series", "list")
assert len(result) == 4
assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])

def test_check_text_datatype_ppl_dataframe_list_series(self):
result, scores, metrics = train_data_type_ppl(
"dataframe", "list", "series")
assert len(result) == 4
assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])

def test_check_text_datatype_ppl_series_series_dataframe(self):
result, scores, metrics = train_data_type_ppl(
"series", "series", "dataframe")
assert len(result) == 4
assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])

def test_check_text_datatype_ppl_dataframe_series_series(self):
result, scores, metrics = train_data_type_ppl(
"dataframe", "series", "series")
assert len(result) == 4
assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])


if __name__ == '__main__':
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def test_ngramfeaturizer(self):
X_train = texttransform.fit_transform(X_train[:100])
sum = X_train.iloc[:].sum().sum()
print(sum)
assert_equal(sum, 4594, "sum of all features is incorrect!")
assert_equal(sum, 30513, "sum of all features is incorrect!")


if __name__ == '__main__':
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def test_word_embedding_example(self):
])

features = pipeline.fit_transform(data)
assert features.shape == (248, 409)
assert features.shape == (248, 802)

# TODO: fix ssl issue on test centos7 & ubuntu14 boxes.
# Test works on ubuntu16.
Expand Down Expand Up @@ -127,7 +127,7 @@ def test_word_embedding_example2(self):
])

features = pipeline.fit_transform(data)
assert features.shape == (248, 409)
assert features.shape == (248, 802)
assert 'features_TransformedText.94' in list(features.columns)

# TODO: fix ssl issue on test centos7 & ubuntu14 boxes.
Expand Down Expand Up @@ -166,7 +166,7 @@ def test_word_embedding_example_dict_same_name(self):
])

features = pipeline.fit_transform(data)
assert features.shape == (248, 409)
assert features.shape == (248, 802)

@unittest.skip('System.ArgumentOutOfRangeException')
def test_word_embedding_example_dict_newname(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def test_ngramfeaturizer(self):
textt = NGramFeaturizer(word_feature_extractor=n_gram()) << 'review'
X = textt.fit_transform(X)

assert X.shape == (25, 21)
assert X.shape == (25, 116)

mymodel = LogisticRegressionBinaryClassifier().fit(X, y, verbose=0)
X_test = textt.transform(test_reviews)
Expand Down Expand Up @@ -180,7 +180,7 @@ def test_ngramfeaturizer_syntax_dict(self):
'outg': ['review']}
X = textt.fit_transform(X)

assert X.shape == (25, 22)
assert X.shape == (25, 117)
# columns ordering changed between 0.22 and 0.23
assert 'review' in (X.columns[0], X.columns[-1])
X = X.drop('review', axis=1)
Expand All @@ -204,7 +204,7 @@ def test_ngramfeaturizer_single(self):
columns={'features': ['id', 'education']})

features = xf.fit_transform(data)
assert features.shape == (248, 259)
assert features.shape == (248, 652)

def test_ngramfeaturizer_multi(self):

Expand Down
2 changes: 1 addition & 1 deletion src/python/nimbusml/tests/test_syntax_onehotvectorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,4 +146,4 @@ def test_syntax9_multiple_inputs(self):
ng4 = NGramFeaturizer(word_feature_extractor=n_gram()) << {
'out1': ['education1', 'education2']}
output4 = ng4.fit_transform(X)
assert output4.shape == (5, 7)
assert output4.shape == (5, 13)
2 changes: 1 addition & 1 deletion src/python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
# Versions should comply with PEP440. For a discussion on
# single-sourcing the version across setup.py and the project code, see
# https://packaging.python.org/en/latest/single_source_version.html
version='0.6.0',
version='0.6.1',

description='NimbusML',
long_description=long_description,
Expand Down
4 changes: 3 additions & 1 deletion src/python/tools/code_fixer.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,9 @@
('name=name,', 'output=output,')],
'NGramFeaturizer': [(NG_1, NG_1_correct),
('word_feature_extractor = n_gram',
'word_feature_extractor = Ngram')],
'word_feature_extractor = Ngram'),
('char_feature_extractor = n_gram',
'char_feature_extractor = Ngram')],
'CountSelector': ('count = 0,', 'count = 1.0,'),
'OneClassSvmAnomalyDetector': (
'label_column=label_column,', 'label_column=None,'),
Expand Down
8 changes: 1 addition & 7 deletions src/python/tools/manifest_diff.json
Original file line number Diff line number Diff line change
Expand Up @@ -682,13 +682,7 @@
"Name": "Transforms.TextFeaturizer",
"NewName": "NGramFeaturizer",
"Module": "feature_extraction.text",
"Type": "Transform",
"Inputs": [
{
"Name": "CharFeatureExtractor",
"Default": null
}
]
"Type": "Transform"
},
{
"Name": "Transforms.WordEmbeddings",
Expand Down
2 changes: 1 addition & 1 deletion version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.6.0
0.6.1