Skip to content
This repository has been archived by the owner on Nov 16, 2023. It is now read-only.

Modifications to support scripted temp/docs merging. #361

Merged
merged 2 commits on Nov 13, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 21 additions & 75 deletions src/python/nimbusml/datasets/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Licensed under the MIT License.
# --------------------------------------------------------------------------------------------
"""
Datasets used in MicrosoftML unittests.
Datasets used in MicrosoftML unittests.
"""
import copy
import os
Expand All @@ -15,6 +15,8 @@

__all__ = ["get_dataset", "available_datasets"]

DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')


class DataSet:
"""
Expand Down Expand Up @@ -175,11 +177,7 @@ def load(self):
# isCase ~ age + parity + education + spontaneous + induced
# education age parity induced case spontaneous stratum
# pooled.stratum
this = os.path.join(
os.path.dirname(__file__),
"data",
"gplv2",
"infert.csv")
this = os.path.join(DATA_DIR, "gplv2", "infert.csv")
self.__dict__['_data'] = pandas.read_csv(this)
self.__dict__['case'] = self._data["case"]
self._finalize()
Expand Down Expand Up @@ -229,11 +227,7 @@ def as_filepath(self):
"""
Return file name.
"""
return os.path.join(
os.path.dirname(__file__),
"data",
"gplv2",
"infert.csv")
return os.path.join(DATA_DIR, "gplv2", "infert.csv")


class DataSetAirQuality(DataSet):
Expand Down Expand Up @@ -262,11 +256,7 @@ def load(self):
# isCase ~ age + parity + education + spontaneous + induced
# education age parity induced case spontaneous stratum
# pooled.stratum
this = os.path.join(
os.path.dirname(__file__),
"data",
"gplv2",
"airquality.csv")
this = os.path.join(DATA_DIR, "gplv2", "airquality.csv")
self.__dict__['_data'] = pandas.read_csv(this)
self._finalize()

Expand Down Expand Up @@ -294,11 +284,7 @@ def as_filepath(self):
"""
Return file name.
"""
return os.path.join(
os.path.dirname(__file__),
"data",
"gplv2",
"airquality.csv")
return os.path.join(DATA_DIR, "gplv2", "airquality.csv")


class Topics(DataSet):
Expand All @@ -324,8 +310,7 @@ def as_filepath(self):
"""
Return file name.
"""
return os.path.join(os.path.dirname(__file__), "data",
"topics.csv")
return os.path.join(DATA_DIR, "topics.csv")


class Timeseries(DataSet):
Expand All @@ -351,10 +336,7 @@ def as_filepath(self):
"""
Return file name.
"""
return os.path.join(
os.path.dirname(__file__),
"data",
"timeseries.csv")
return os.path.join(DATA_DIR, "timeseries.csv")


class WikiDetox_Train(DataSet):
Expand All @@ -379,10 +361,7 @@ def as_filepath(self):
"""
Return file name.
"""
return os.path.join(
os.path.dirname(__file__),
"data",
"train-250.wikipedia.sample.tsv")
return os.path.join(DATA_DIR, "train-250.wikipedia.sample.tsv")


class WikiDetox_Test(DataSet):
Expand All @@ -407,10 +386,7 @@ def as_filepath(self):
"""
Return file name.
"""
return os.path.join(
os.path.dirname(__file__),
"data",
"test.wikipedia.sample.tsv")
return os.path.join(DATA_DIR, "test.wikipedia.sample.tsv")


class FS_Train(DataSet):
Expand All @@ -435,10 +411,7 @@ def as_filepath(self):
"""
Return file name.
"""
return os.path.join(
os.path.dirname(__file__),
"data",
"train_fs.csv")
return os.path.join(DATA_DIR, "train_fs.csv")


class FS_Test(DataSet):
Expand All @@ -463,10 +436,7 @@ def as_filepath(self):
"""
Return file name.
"""
return os.path.join(
os.path.dirname(__file__),
"data",
"test_fs.csv")
return os.path.join(DATA_DIR, "test_fs.csv")


class MSLTR_Train(DataSet):
Expand All @@ -492,10 +462,7 @@ def as_filepath(self):
"""
Return file name.
"""
return os.path.join(
os.path.dirname(__file__),
"data",
"train-msltr.sample.csv")
return os.path.join(DATA_DIR, "train-msltr.sample.csv")


class MSLTR_Test(DataSet):
Expand All @@ -521,10 +488,7 @@ def as_filepath(self):
"""
Return file name.
"""
return os.path.join(
os.path.dirname(__file__),
"data",
"test-msltr.sample.csv")
return os.path.join(DATA_DIR, "test-msltr.sample.csv")


class Uci_Train(DataSet):
Expand All @@ -548,10 +512,7 @@ def as_filepath(self):
"""
Return file name.
"""
return os.path.join(
os.path.dirname(__file__),
"data",
"train-500.uciadult.sample.csv")
return os.path.join(DATA_DIR, "train-500.uciadult.sample.csv")


class Uci_Test(DataSet):
Expand All @@ -575,10 +536,7 @@ def as_filepath(self):
"""
Return file name.
"""
return os.path.join(
os.path.dirname(__file__),
"data",
"test-100.uciadult.sample.csv")
return os.path.join(DATA_DIR, "test-100.uciadult.sample.csv")


class Generated_Twitter_Train(DataSet):
Expand All @@ -603,10 +561,7 @@ def as_filepath(self):
"""
Return file name.
"""
return os.path.join(
os.path.dirname(__file__),
"data",
"train-twitter.gen-sample.tsv")
return os.path.join(DATA_DIR, "train-twitter.gen-sample.tsv")


class Generated_Twitter_Test(DataSet):
Expand All @@ -631,10 +586,7 @@ def as_filepath(self):
"""
Return file name.
"""
return os.path.join(
os.path.dirname(__file__),
"data",
"test-twitter.gen-sample.tsv")
return os.path.join(DATA_DIR, "test-twitter.gen-sample.tsv")


class Generated_Ticket_Train(DataSet):
Expand All @@ -659,10 +611,7 @@ def as_filepath(self):
"""
Return file name.
"""
return os.path.join(
os.path.dirname(__file__),
"data",
"train-ticketchoice.csv")
return os.path.join(DATA_DIR, "train-ticketchoice.csv")


class Generated_Ticket_Test(DataSet):
Expand All @@ -687,10 +636,7 @@ def as_filepath(self):
"""
Return file name.
"""
return os.path.join(
os.path.dirname(__file__),
"data",
"test-ticketchoice.csv")
return os.path.join(DATA_DIR, "test-ticketchoice.csv")


_datasets = dict(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
# WordEmbedding: pre-trained transform to generate word embeddings
import pandas
from nimbusml import Pipeline
from nimbusml.feature_extraction.text import WordEmbedding
from nimbusml.feature_extraction.text.ngramfeaturizer import NGramFeaturizer
from nimbusml.feature_extraction.text import NGramFeaturizer, WordEmbedding
from nimbusml.feature_extraction.text.extractor import Ngram

# create the data
Expand All @@ -22,5 +21,12 @@
])
y = pipeline.fit_transform(customer_reviews)

# view the review embeddings
# print(y.head())
# view a small subset of the review embeddings
print(y.iloc[:5, -3:])
# review_TransformedText.147 review_TransformedText.148 review_TransformedText.149
# 0 1.918661 -0.714531 3.062141
# 1 1.891922 -0.248650 1.706620
# 2 1.601611 0.309785 3.379576
# 3 1.970666 1.477450 3.110802
# 4 2.521791 0.122538 3.129919