Commit a4f9f88: project init
thompson0012 committed Oct 21, 2021 (1 parent: cef14b3)
Showing 51 changed files with 3,690 additions and 0 deletions.
Binary file added assets/icon.afdesign
Binary file not shown.
Binary file added assets/icon.png
1 change: 1 addition & 0 deletions assets/icon.svg
Binary file added assets/img-s3-infographic-analytics.png
1,240 changes: 1,240 additions & 0 deletions examples/forecast at scale.ipynb

Large diffs are not rendered by default.

328 changes: 328 additions & 0 deletions examples/test_pyod.ipynb
@@ -0,0 +1,328 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"id": "c9ddbf1d-90eb-4c1b-b9c2-16d54d9acdbd",
"metadata": {},
"outputs": [],
"source": [
"from pyod.models.xgbod import XGBOD\n",
"from pyod.models.knn import KNN\n",
"from pyod.models.cof import COF\n",
"from pyod.models.hbos import HBOS\n",
"from pyod.models.pca import PCA\n",
"from pyod.models.iforest import IForest\n",
"from pyod.models.lof import LOF\n",
"from pyod.models.suod import SUOD"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "99020d10-2eb4-47ac-8a40-4a29c002a772",
"metadata": {},
"outputs": [],
"source": [
"from pyod.models.combination import aom, moa, average, maximization\n",
"from pyod.utils.data import generate_data"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "c5505809-24a9-4f04-a44a-187bd1a8deb4",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from jax import numpy as jnp"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "03a45c07-5e03-4c9c-9591-90a7fcc53c29",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cardio.mat does not exist. Use generated data\n",
"Combining 20 kNN detectors\n",
"Combination by Average ROC:0.9999, precision @ rank n:0.9756\n",
"Combination by Maximization ROC:0.9999, precision @ rank n:0.9756\n",
"Combination by Median ROC:0.9999, precision @ rank n:0.9756\n",
"Combination by AOM ROC:0.9999, precision @ rank n:0.9756\n",
"Combination by MOA ROC:0.9999, precision @ rank n:0.9756\n"
]
}
],
"source": [
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"from scipy.io import loadmat\n",
"\n",
"from pyod.models.knn import KNN\n",
"from pyod.models.combination import aom, moa, average, maximization, median\n",
"from pyod.utils.utility import standardizer\n",
"from pyod.utils.data import generate_data\n",
"from pyod.utils.data import evaluate_print\n",
"import os\n",
"import sys\n",
"\n",
"\n",
"# Define data file and read X and y\n",
"# Generate some data if the source data is missing\n",
"mat_file = 'cardio.mat'\n",
"try:\n",
" mat = loadmat(os.path.join('data', mat_file))\n",
"\n",
"except TypeError:\n",
" print('{data_file} does not exist. Use generated data'.format(\n",
" data_file=mat_file))\n",
" X, y = generate_data(train_only=True) # load data\n",
"except IOError:\n",
" print('{data_file} does not exist. Use generated data'.format(\n",
" data_file=mat_file))\n",
" X, y = generate_data(train_only=True) # load data\n",
"else:\n",
" X = mat['X']\n",
" y = mat['y'].ravel()\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)\n",
"\n",
"# standardizing data for processing\n",
"X_train_norm, X_test_norm = standardizer(X_train, X_test)\n",
"\n",
"n_clf = 20 # number of base detectors\n",
"\n",
"# Initialize 20 base detectors for combination\n",
"k_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140,\n",
" 150, 160, 170, 180, 190, 200]\n",
"\n",
"train_scores = np.zeros([X_train.shape[0], n_clf])\n",
"test_scores = np.zeros([X_test.shape[0], n_clf])\n",
"\n",
"print('Combining {n_clf} kNN detectors'.format(n_clf=n_clf))\n",
"\n",
"for i in range(n_clf):\n",
" k = k_list[i]\n",
"\n",
" clf = KNN(n_neighbors=k, method='largest')\n",
" clf.fit(X_train_norm)\n",
"\n",
" train_scores[:, i] = clf.decision_scores_\n",
" test_scores[:, i] = clf.decision_function(X_test_norm)\n",
"\n",
"# Decision scores have to be normalized before combination\n",
"train_scores_norm, test_scores_norm = standardizer(train_scores,\n",
" test_scores)\n",
"# Combination by average\n",
"y_by_average = average(test_scores_norm)\n",
"evaluate_print('Combination by Average', y_test, y_by_average)\n",
"\n",
"# Combination by max\n",
"y_by_maximization = maximization(test_scores_norm)\n",
"evaluate_print('Combination by Maximization', y_test, y_by_maximization)\n",
"\n",
"# Combination by max\n",
"y_by_maximization = median(test_scores_norm)\n",
"evaluate_print('Combination by Median', y_test, y_by_maximization)\n",
"\n",
"# Combination by aom\n",
"y_by_aom = aom(test_scores_norm, n_buckets=5)\n",
"evaluate_print('Combination by AOM', y_test, y_by_aom)\n",
"\n",
"# Combination by moa\n",
"y_by_moa = moa(test_scores_norm, n_buckets=5)\n",
"evaluate_print('Combination by MOA', y_test, y_by_moa)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "2e0b42bf-2874-4c97-8982-4e71d1c2e7e7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[-0.36218036] -0.36218036368474915\n"
]
}
],
"source": [
"print(average(test_scores_norm[0:1]), np.mean(test_scores_norm[0]))"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "c652fe02-0915-4ab5-8ebc-aa6d718690d7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[-0.3424924]\n"
]
}
],
"source": [
"print(maximization(test_scores_norm[0:1]))"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "ed319e46-eedf-4662-ac42-af81b9c8d0db",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([-0.3503167])"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"aom(test_scores_norm[0:1])"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "edeff1ce-20f5-4862-a12b-e60085dc2f3a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([-0.35627745])"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"moa(test_scores_norm[0:1])"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "1561191b-0f4b-4eae-9df0-8d70c4bf03fe",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.utils import shuffle"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "3421e4e1-704d-4bd9-af2f-904443808785",
"metadata": {},
"outputs": [],
"source": [
"shuffled_list = shuffle(list(range(0,20,1)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c804008b-19d3-4a5e-b4c3-6e7227a4e45a",
"metadata": {},
"outputs": [],
"source": [
"# average == averaging all estimators scores\n",
"# median == take median of all estimators scores\n",
"# average of maximization == shufflely divide into n groups, and take each groups maximum scores, finally averaging them\n",
"# maximization of average == shufflely divide into n groups, and take each groups average scores, finally maximizing them"
]
},
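{
"cell_type": "markdown",
"id": "aom-moa-sketch-note",
"metadata": {},
"source": [
"A minimal NumPy sketch of the bucket logic described above, using a hypothetical `combine_sketch` helper (not pyod's implementation): randomly partition the detector columns into `n_buckets` groups, reduce within each group, then reduce across the groups."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aom-moa-sketch-code",
"metadata": {},
"outputs": [],
"source": [
"def combine_sketch(scores, n_buckets=5, inner=np.max, outer=np.mean, seed=0):\n",
"    # scores: (n_samples, n_detectors) matrix of standardized scores\n",
"    rng = np.random.default_rng(seed)\n",
"    buckets = np.array_split(rng.permutation(scores.shape[1]), n_buckets)\n",
"    # reduce within each bucket, then across the buckets\n",
"    per_bucket = np.stack([inner(scores[:, b], axis=1) for b in buckets])\n",
"    return outer(per_bucket, axis=0)\n",
"\n",
"# aom-like: max within buckets, mean across; moa-like: mean within, max across\n",
"combine_sketch(test_scores_norm, inner=np.max, outer=np.mean)[:3]"
]
},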
{
"cell_type": "code",
"execution_count": 1,
"id": "5a2c61db-e888-4510-8e0e-21a76a7d95b9",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d094a09a-d59e-4f26-a46e-5d50e83c314b",
"metadata": {},
"outputs": [],
"source": [
"t = np.random.random_sample(240)*100"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "8d9686b0-a446-4f8b-9f63-2dfb7174425e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(60.316, 65.267], (94.973, 99.924], (55.365, 60.316], (15.756, 20.707], (50.413, 55.365], ..., (35.56, 40.511], (70.218, 75.169], (94.973, 99.924], (10.805, 15.756], (80.12, 85.071]]\n",
"Length: 240\n",
"Categories (20, interval[float64, right]): [(0.803, 5.854] < (5.854, 10.805] < (10.805, 15.756] < (15.756, 20.707] ... (80.12, 85.071] < (85.071, 90.022] < (90.022, 94.973] < (94.973, 99.924]]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.cut(t, bins=20)"
]
},
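{
"cell_type": "markdown",
"id": "bin-rarity-note",
"metadata": {},
"source": [
"As an aside, the `pd.cut` bins above can drive a crude histogram-based rarity score, loosely in the spirit of the HBOS detector imported earlier (a hypothetical sketch, not pyod's HBOS): points falling in low-frequency bins get higher scores."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bin-rarity-code",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch: score each point by the negative log frequency of its bin\n",
"codes = pd.cut(t, bins=20, labels=False)  # bin index per point\n",
"counts = np.bincount(codes, minlength=20)  # points per bin\n",
"rarity = -np.log(counts[codes] / len(t))  # rare bins -> high scores\n",
"t[np.argsort(rarity)[-5:]]  # the five most 'outlying' values"
]
},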
{
"cell_type": "code",
"execution_count": null,
"id": "d5b1b080-67f2-48bd-be7c-80e0c8c8cca8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "pyemits-BM0BzTys-py3.8",
"language": "python",
"name": "pyemits-bm0bztys-py3.8"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
11 changes: 11 additions & 0 deletions pyemits/__init__.py
@@ -0,0 +1,11 @@
"""
modules must be registered here so that they can be called through the
package namespace, in the same way pandas exposes pd.DataFrame and pd.to_datetime
"""

from pyemits import common
from pyemits import core
from pyemits import evaluation
from pyemits import forecast
from pyemits import io
from pyemits.common import utils
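# With the imports above, callers get dotted access without extra imports,
# e.g. `import pyemits` followed by `pyemits.common.utils`.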
3 changes: 3 additions & 0 deletions pyemits/common/__init__.py
@@ -0,0 +1,3 @@
"""
fundamental layer, commonly used throughout the package
"""