diff --git a/.gitignore b/.gitignore index 20f7b1a6c0..94921d7f58 100644 --- a/.gitignore +++ b/.gitignore @@ -60,8 +60,6 @@ gen ### Images template # JPEG -*.jpg -*.jpeg *.jpe *.jif *.jfif diff --git a/docs/source/best_practice/sklearn/heading_of_pca.jpg b/docs/source/best_practice/sklearn/heading_of_pca.jpg new file mode 100644 index 0000000000..4633db738a Binary files /dev/null and b/docs/source/best_practice/sklearn/heading_of_pca.jpg differ diff --git a/docs/source/best_practice/sklearn/index.rst b/docs/source/best_practice/sklearn/index.rst new file mode 100644 index 0000000000..f439eff058 --- /dev/null +++ b/docs/source/best_practice/sklearn/index.rst @@ -0,0 +1,35 @@ +Apply into Scikit-Learn +=========================== + +Actually, ``TreeValue`` can be used in practice not only with ``numpy`` or ``torch``, but also with other libraries such as ``scikit-learn``. +In the following part, a demo of applying PCA to tree-structured arrays is shown. + +In the field of traditional machine learning, PCA (Principal Component Analysis) is often used to preprocess data +by normalizing the data range and reducing the dimensionality of the data, so as to reduce the complexity +of the input data and improve the efficiency and quality of machine learning, as illustrated in the following image: + +.. figure:: heading_of_pca.jpg + :alt: PCA Principle + + PCA in a nutshell. Source: Lavrenko and Sutton 2011, slide 13. + +In the scikit-learn library, the PCA class is provided to support this function, and the function ``fit_transform`` +can be used to simplify the data. For a set of ``np.array`` format data that presents a tree structure, +we can support the tree structure simply by wrapping the function ``fit_transform``. +The specific code is as follows: + +.. literalinclude:: sklearn.demo.py + :language: python + :linenos: + +The output should be: + +.. 
literalinclude:: sklearn.demo.py.txt + :language: text + :linenos: + +For further information, see the links below: + +* `Official documentation of PCA in scikit-learn <https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html>`_ +* `Details of PCA <https://en.wikipedia.org/wiki/Principal_component_analysis>`_ + diff --git a/docs/source/best_practice/sklearn/sklearn.demo.py b/docs/source/best_practice/sklearn/sklearn.demo.py new file mode 100644 index 0000000000..093d37f10b --- /dev/null +++ b/docs/source/best_practice/sklearn/sklearn.demo.py @@ -0,0 +1,20 @@ +import numpy as np +from sklearn.decomposition import PCA + +from treevalue import FastTreeValue + +fit_transform = FastTreeValue.func()(lambda x: PCA(min(*x.shape)).fit_transform(x)) + +if __name__ == '__main__': + data = FastTreeValue({ + 'a': np.random.randint(-5, 15, (4, 3)), + 'x': { + 'c': np.random.randint(-15, 5, (5, 4)), + } + }) + print("Original int data:") + print(data) + + pdata = fit_transform(data) + print("Fit transformed data:") + print(pdata) diff --git a/docs/source/index.rst b/docs/source/index.rst index a7b078ca63..d84cd86b3e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -26,6 +26,7 @@ structure processing when the calculation is tree-based. :caption: Best Practice best_practice/numpy/index + best_practice/sklearn/index .. toctree:: :maxdepth: 2 diff --git a/requirements-doc.txt b/requirements-doc.txt index 04c64a0122..0b603d05a3 100644 --- a/requirements-doc.txt +++ b/requirements-doc.txt @@ -7,4 +7,5 @@ packaging sphinx-multiversion~=0.2.4 where~=1.0.2 numpy>=1.19,<2 -easydict>=1.7,<2 \ No newline at end of file +easydict>=1.7,<2 +scikit-learn>=0.24.2 \ No newline at end of file