Added datasets 61-90 from DWHC #106

Merged: 2 commits, Jan 6, 2023
31 changes: 30 additions & 1 deletion docs/datasets.rst
@@ -79,7 +79,36 @@ The Torchhd library provides many popular built-in datasets to work with.
Mammographic
Miniboone
MolecBiolPromoter

MolecBiolSplice
Monks1
Monks2
Monks3
Mushroom
Musk1
Musk2
Nursery
OocytesMerlucciusNucleus4d
OocytesMerlucciusStates2f
OocytesTrisopterusNucleus2f
OocytesTrisopterusStates5b
Optical
Ozone
PageBlocks
Parkinsons
Pendigits
Pima
PittsburgBridgesMaterial
PittsburgBridgesRelL
PittsburgBridgesSpan
PittsburgBridgesTOrD
PittsburgBridgesType
Planning
PlantMargin
PlantShape
PlantTexture
PostOperative
PrimaryTumor
Ringnorm

Base classes
------------------------
61 changes: 60 additions & 1 deletion torchhd/datasets/__init__.py
@@ -69,7 +69,36 @@
from torchhd.datasets.mammographic import Mammographic
from torchhd.datasets.miniboone import Miniboone
from torchhd.datasets.molec_biol_promoter import MolecBiolPromoter

from torchhd.datasets.molec_biol_splice import MolecBiolSplice
from torchhd.datasets.monks_1 import Monks1
from torchhd.datasets.monks_2 import Monks2
from torchhd.datasets.monks_3 import Monks3
from torchhd.datasets.mushroom import Mushroom
from torchhd.datasets.musk_1 import Musk1
from torchhd.datasets.musk_2 import Musk2
from torchhd.datasets.nursery import Nursery
from torchhd.datasets.oocytes_merluccius_nucleus_4d import OocytesMerlucciusNucleus4d
from torchhd.datasets.oocytes_merluccius_states_2f import OocytesMerlucciusStates2f
from torchhd.datasets.oocytes_trisopterus_nucleus_2f import OocytesTrisopterusNucleus2f
from torchhd.datasets.oocytes_trisopterus_states_5b import OocytesTrisopterusStates5b
from torchhd.datasets.optical import Optical
from torchhd.datasets.ozone import Ozone
from torchhd.datasets.page_blocks import PageBlocks
from torchhd.datasets.parkinsons import Parkinsons
from torchhd.datasets.pendigits import Pendigits
from torchhd.datasets.pima import Pima
from torchhd.datasets.pittsburg_bridges_material import PittsburgBridgesMaterial
from torchhd.datasets.pittsburg_bridges_rel_l import PittsburgBridgesRelL
from torchhd.datasets.pittsburg_bridges_span import PittsburgBridgesSpan
from torchhd.datasets.pittsburg_bridges_t_or_d import PittsburgBridgesTOrD
from torchhd.datasets.pittsburg_bridges_type import PittsburgBridgesType
from torchhd.datasets.planning import Planning
from torchhd.datasets.plant_margin import PlantMargin
from torchhd.datasets.plant_shape import PlantShape
from torchhd.datasets.plant_texture import PlantTexture
from torchhd.datasets.post_operative import PostOperative
from torchhd.datasets.primary_tumor import PrimaryTumor
from torchhd.datasets.ringnorm import Ringnorm

__all__ = [
"BeijingAirQuality",
@@ -143,4 +172,34 @@
"Mammographic",
"Miniboone",
"MolecBiolPromoter",
"MolecBiolSplice",
"Monks1",
"Monks2",
"Monks3",
"Mushroom",
"Musk1",
"Musk2",
"Nursery",
"OocytesMerlucciusNucleus4d",
"OocytesMerlucciusStates2f",
"OocytesTrisopterusNucleus2f",
"OocytesTrisopterusStates5b",
"Optical",
"Ozone",
"PageBlocks",
"Parkinsons",
"Pendigits",
"Pima",
"PittsburgBridgesMaterial",
"PittsburgBridgesRelL",
"PittsburgBridgesSpan",
"PittsburgBridgesTOrD",
"PittsburgBridgesType",
"Planning",
"PlantMargin",
"PlantShape",
"PlantTexture",
"PostOperative",
"PrimaryTumor",
"Ringnorm",
]
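Because each new module is re-exported here and listed in __all__, the classes can be imported directly from torchhd.datasets. A minimal sketch of that usage; only the import path and class names come from this diff, the rest is illustrative:

# The new classes are importable from the package namespace added above.
from torchhd.datasets import MolecBiolSplice, Monks1, Nursery

# __all__ also exposes them to star-imports and documentation tooling.
import torchhd.datasets as hd_datasets
print("Ringnorm" in hd_datasets.__all__)  # True with this change applied
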
32 changes: 32 additions & 0 deletions torchhd/datasets/molec_biol_splice.py
@@ -0,0 +1,32 @@
from typing import List
from torchhd.datasets import DatasetFourFold


class MolecBiolSplice(DatasetFourFold):
"""`Molecular Biology (Splice-junction Gene Sequences) <https://archive.ics.uci.edu/ml/datasets/Molecular+Biology+(Splice-junction+Gene+Sequences)>`_ dataset.

Args:
root (string): Root directory containing the files of the dataset.
train (bool, optional): If True, returns the training (sub)set from the file storing training data, as further determined by the ``fold`` and ``hyper_search`` arguments.
Otherwise returns a subset of the training dataset: if hyperparameter search is performed (``hyper_search = True``), the subset used for that search is returned;
if not (``hyper_search = False``), the subset specified in ``conxuntos_kfold.dat`` is returned when the fold number is valid, otherwise an error is issued.
fold (int, optional): Specifies which fold number to use. The default value of -1 returns all the training data from the corresponding file.
Values between 0 and 3 specify which fold in ``conxuntos_kfold.dat`` to use. Relevant only if ``hyper_search`` is set to False and ``0 <= fold <= 3``.
Indices in even rows (zero indexing) of ``conxuntos_kfold.dat`` correspond to train subsets while indices in odd rows correspond to test subsets.
hyper_search (bool, optional): If True, creates the dataset using the indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``)
while the second row corresponds to test indices (used if ``train = False``).
transform (callable, optional): A function/transform that takes in a torch.FloatTensor
and returns a transformed version.
target_transform (callable, optional): A function/transform that takes in the
target and transforms it.
download (bool, optional): If True, downloads the dataset from the internet and
puts it in the root directory. If the dataset is already downloaded, it is not
downloaded again.
"""

name = "molec-biol-splice"
classes: List[str] = [
"EI",
"IE",
"N",
]
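A hedged usage sketch for the DatasetFourFold-based classes, based only on the constructor arguments documented above; the assumption that indexing returns a (features, target) pair is not confirmed by this diff:

from torchhd.datasets import MolecBiolSplice

# fold=-1 (the default) loads all training data from the file.
train_ds = MolecBiolSplice(root="data", train=True, download=True)

# Folds 0..3 select the splits defined in conxuntos_kfold.dat.
fold0_train = MolecBiolSplice(root="data", train=True, fold=0)
fold0_test = MolecBiolSplice(root="data", train=False, fold=0)

# Assumption: items are (features, target) pairs, as in torchvision-style datasets.
features, target = train_ds[0]
print(features.shape, target)
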
27 changes: 27 additions & 0 deletions torchhd/datasets/monks_1.py
@@ -0,0 +1,27 @@
from typing import List
from torchhd.datasets import DatasetTrainTest


class Monks1(DatasetTrainTest):
"""`MONK's Problems <https://archive.ics.uci.edu/ml/datasets/MONK%27s+Problems>`_ dataset.

Args:
root (string): Root directory containing the files of the dataset.
train (bool, optional): If True, returns the training (sub)set from the file storing training data, as further determined by the ``hyper_search`` argument.
Otherwise, if hyperparameter search is performed (``hyper_search = True``), returns a subset of the training dataset; if not (``hyper_search = False``), returns the test set.
hyper_search (bool, optional): If True, creates the dataset using the indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``)
while the second row corresponds to test indices (used if ``train = False``).
transform (callable, optional): A function/transform that takes in a torch.FloatTensor
and returns a transformed version.
target_transform (callable, optional): A function/transform that takes in the
target and transforms it.
download (bool, optional): If True, downloads the dataset from the internet and
puts it in the root directory. If the dataset is already downloaded, it is not
downloaded again.
"""

name = "monks-1"
classes: List[str] = [
"0",
"1",
]
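For the DatasetTrainTest-based MONK's problems, the split is controlled by train and hyper_search as described in the docstring; a sketch under those assumptions:

from torchhd.datasets import Monks1

# Regular train/test files.
train_ds = Monks1(root="data", train=True, download=True)
test_ds = Monks1(root="data", train=False)

# Hyperparameter-search split from conxuntos.dat:
# row 0 supplies the train indices, row 1 the test indices.
search_train = Monks1(root="data", train=True, hyper_search=True)
search_val = Monks1(root="data", train=False, hyper_search=True)
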
27 changes: 27 additions & 0 deletions torchhd/datasets/monks_2.py
@@ -0,0 +1,27 @@
from typing import List
from torchhd.datasets import DatasetTrainTest


class Monks2(DatasetTrainTest):
"""`MONK's Problems <https://archive.ics.uci.edu/ml/datasets/MONK%27s+Problems>`_ dataset.

Args:
root (string): Root directory containing the files of the dataset.
train (bool, optional): If True, returns the training (sub)set from the file storing training data, as further determined by the ``hyper_search`` argument.
Otherwise, if hyperparameter search is performed (``hyper_search = True``), returns a subset of the training dataset; if not (``hyper_search = False``), returns the test set.
hyper_search (bool, optional): If True, creates the dataset using the indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``)
while the second row corresponds to test indices (used if ``train = False``).
transform (callable, optional): A function/transform that takes in a torch.FloatTensor
and returns a transformed version.
target_transform (callable, optional): A function/transform that takes in the
target and transforms it.
download (bool, optional): If True, downloads the dataset from the internet and
puts it in the root directory. If the dataset is already downloaded, it is not
downloaded again.
"""

name = "monks-2"
classes: List[str] = [
"0",
"1",
]
27 changes: 27 additions & 0 deletions torchhd/datasets/monks_3.py
@@ -0,0 +1,27 @@
from typing import List
from torchhd.datasets import DatasetTrainTest


class Monks3(DatasetTrainTest):
"""`MONK's Problems <https://archive.ics.uci.edu/ml/datasets/MONK%27s+Problems>`_ dataset.

Args:
root (string): Root directory containing the files of the dataset.
train (bool, optional): If True, returns the training (sub)set from the file storing training data, as further determined by the ``hyper_search`` argument.
Otherwise, if hyperparameter search is performed (``hyper_search = True``), returns a subset of the training dataset; if not (``hyper_search = False``), returns the test set.
hyper_search (bool, optional): If True, creates the dataset using the indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``)
while the second row corresponds to test indices (used if ``train = False``).
transform (callable, optional): A function/transform that takes in a torch.FloatTensor
and returns a transformed version.
target_transform (callable, optional): A function/transform that takes in the
target and transforms it.
download (bool, optional): If True, downloads the dataset from the internet and
puts it in the root directory. If the dataset is already downloaded, it is not
downloaded again.
"""

name = "monks-3"
classes: List[str] = [
"0",
"1",
]
31 changes: 31 additions & 0 deletions torchhd/datasets/mushroom.py
@@ -0,0 +1,31 @@
from typing import List
from torchhd.datasets import DatasetFourFold


class Mushroom(DatasetFourFold):
"""`Mushroom <https://archive.ics.uci.edu/ml/datasets/mushroom>`_ dataset.

Args:
root (string): Root directory containing the files of the dataset.
train (bool, optional): If True, returns the training (sub)set from the file storing training data, as further determined by the ``fold`` and ``hyper_search`` arguments.
Otherwise returns a subset of the training dataset: if hyperparameter search is performed (``hyper_search = True``), the subset used for that search is returned;
if not (``hyper_search = False``), the subset specified in ``conxuntos_kfold.dat`` is returned when the fold number is valid, otherwise an error is issued.
fold (int, optional): Specifies which fold number to use. The default value of -1 returns all the training data from the corresponding file.
Values between 0 and 3 specify which fold in ``conxuntos_kfold.dat`` to use. Relevant only if ``hyper_search`` is set to False and ``0 <= fold <= 3``.
Indices in even rows (zero indexing) of ``conxuntos_kfold.dat`` correspond to train subsets while indices in odd rows correspond to test subsets.
hyper_search (bool, optional): If True, creates the dataset using the indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``)
while the second row corresponds to test indices (used if ``train = False``).
transform (callable, optional): A function/transform that takes in a torch.FloatTensor
and returns a transformed version.
target_transform (callable, optional): A function/transform that takes in the
target and transforms it.
download (bool, optional): If True, downloads the dataset from the internet and
puts it in the root directory. If the dataset is already downloaded, it is not
downloaded again.
"""

name = "mushroom"
classes: List[str] = [
"edible",
"poisonous",
]
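The fold argument also supports a small cross-validation loop over the four predefined folds. A sketch assuming the datasets implement __len__ (reasonable for PyTorch-style datasets, but not shown in this diff):

from torchhd.datasets import Mushroom

# Iterate over the four folds defined in conxuntos_kfold.dat.
for fold in range(4):
    fold_train = Mushroom(root="data", train=True, fold=fold, download=True)
    fold_test = Mushroom(root="data", train=False, fold=fold)
    print(f"fold {fold}: {len(fold_train)} train / {len(fold_test)} test samples")
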
31 changes: 31 additions & 0 deletions torchhd/datasets/musk_1.py
@@ -0,0 +1,31 @@
from typing import List
from torchhd.datasets import DatasetFourFold


class Musk1(DatasetFourFold):
"""`Musk (Version 1) <https://archive.ics.uci.edu/ml/datasets/Musk+(Version+1)>`_ dataset.

Args:
root (string): Root directory containing the files of the dataset.
train (bool, optional): If True, returns the training (sub)set from the file storing training data, as further determined by the ``fold`` and ``hyper_search`` arguments.
Otherwise returns a subset of the training dataset: if hyperparameter search is performed (``hyper_search = True``), the subset used for that search is returned;
if not (``hyper_search = False``), the subset specified in ``conxuntos_kfold.dat`` is returned when the fold number is valid, otherwise an error is issued.
fold (int, optional): Specifies which fold number to use. The default value of -1 returns all the training data from the corresponding file.
Values between 0 and 3 specify which fold in ``conxuntos_kfold.dat`` to use. Relevant only if ``hyper_search`` is set to False and ``0 <= fold <= 3``.
Indices in even rows (zero indexing) of ``conxuntos_kfold.dat`` correspond to train subsets while indices in odd rows correspond to test subsets.
hyper_search (bool, optional): If True, creates the dataset using the indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``)
while the second row corresponds to test indices (used if ``train = False``).
transform (callable, optional): A function/transform that takes in a torch.FloatTensor
and returns a transformed version.
target_transform (callable, optional): A function/transform that takes in the
target and transforms it.
download (bool, optional): If True, downloads the dataset from the internet and
puts it in the root directory. If the dataset is already downloaded, it is not
downloaded again.
"""

name = "musk-1"
classes: List[str] = [
"non-musk",
"musk",
]
31 changes: 31 additions & 0 deletions torchhd/datasets/musk_2.py
@@ -0,0 +1,31 @@
from typing import List
from torchhd.datasets import DatasetFourFold


class Musk2(DatasetFourFold):
"""`Musk (Version 2) <https://archive.ics.uci.edu/ml/datasets/Musk+(Version+2)>`_ dataset.

Args:
root (string): Root directory containing the files of the dataset.
train (bool, optional): If True, returns the training (sub)set from the file storing training data, as further determined by the ``fold`` and ``hyper_search`` arguments.
Otherwise returns a subset of the training dataset: if hyperparameter search is performed (``hyper_search = True``), the subset used for that search is returned;
if not (``hyper_search = False``), the subset specified in ``conxuntos_kfold.dat`` is returned when the fold number is valid, otherwise an error is issued.
fold (int, optional): Specifies which fold number to use. The default value of -1 returns all the training data from the corresponding file.
Values between 0 and 3 specify which fold in ``conxuntos_kfold.dat`` to use. Relevant only if ``hyper_search`` is set to False and ``0 <= fold <= 3``.
Indices in even rows (zero indexing) of ``conxuntos_kfold.dat`` correspond to train subsets while indices in odd rows correspond to test subsets.
hyper_search (bool, optional): If True, creates the dataset using the indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``)
while the second row corresponds to test indices (used if ``train = False``).
transform (callable, optional): A function/transform that takes in a torch.FloatTensor
and returns a transformed version.
target_transform (callable, optional): A function/transform that takes in the
target and transforms it.
download (bool, optional): If True, downloads the dataset from the internet and
puts it in the root directory. If the dataset is already downloaded, it is not
downloaded again.
"""

name = "musk-2"
classes: List[str] = [
"non-musk",
"musk",
]
34 changes: 34 additions & 0 deletions torchhd/datasets/nursery.py
@@ -0,0 +1,34 @@
from typing import List
from torchhd.datasets import DatasetFourFold


class Nursery(DatasetFourFold):
"""`Nursery <https://archive.ics.uci.edu/ml/datasets/nursery>`_ dataset.

Args:
root (string): Root directory containing the files of the dataset.
train (bool, optional): If True, returns the training (sub)set from the file storing training data, as further determined by the ``fold`` and ``hyper_search`` arguments.
Otherwise returns a subset of the training dataset: if hyperparameter search is performed (``hyper_search = True``), the subset used for that search is returned;
if not (``hyper_search = False``), the subset specified in ``conxuntos_kfold.dat`` is returned when the fold number is valid, otherwise an error is issued.
fold (int, optional): Specifies which fold number to use. The default value of -1 returns all the training data from the corresponding file.
Values between 0 and 3 specify which fold in ``conxuntos_kfold.dat`` to use. Relevant only if ``hyper_search`` is set to False and ``0 <= fold <= 3``.
Indices in even rows (zero indexing) of ``conxuntos_kfold.dat`` correspond to train subsets while indices in odd rows correspond to test subsets.
hyper_search (bool, optional): If True, creates the dataset using the indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``)
while the second row corresponds to test indices (used if ``train = False``).
transform (callable, optional): A function/transform that takes in a torch.FloatTensor
and returns a transformed version.
target_transform (callable, optional): A function/transform that takes in the
target and transforms it.
download (bool, optional): If True, downloads the dataset from the internet and
puts it in the root directory. If the dataset is already downloaded, it is not
downloaded again.
"""

name = "nursery"
classes: List[str] = [
"not_recom",
"recommend",
"very_recom",
"priority",
"spec_prior",
]
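Finally, the transform hook documented above composes naturally with a standard DataLoader. A sketch; the per-sample normalization and the (sample, target) item format are illustrative assumptions:

import torch
from torch.utils.data import DataLoader
from torchhd.datasets import Nursery

# Per the docstring, transform receives a torch.FloatTensor for each sample.
def normalize(x: torch.Tensor) -> torch.Tensor:
    return (x - x.mean()) / (x.std() + 1e-8)

ds = Nursery(root="data", train=True, download=True, transform=normalize)
loader = DataLoader(ds, batch_size=32, shuffle=True)  # assumes (sample, target) items

# The class attribute lists the five target labels in index order.
print(Nursery.classes)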