Added initial support for making predictions with SageMaker

opensearch-project · Jul 19, 2022 · 3644f84 · 3644f84
1 parent bfcd903
commit 3644f84
Show file tree

Hide file tree

Showing 5 changed files with 43 additions and 2 deletions.
diff --git a/eland/__init__.py b/eland/__init__.py
@@ -31,6 +31,7 @@
 from .index import Index
 from .ndframe import NDFrame
 from .series import Series
+from .sagemaker_tools import make_sagemaker_prediction
 
 __all__ = [
     "DataFrame",
@@ -41,4 +42,5 @@
     "eland_to_pandas",
     "csv_to_eland",
     "SortOrder",
+    "make_sagemaker_prediction"
 ]
diff --git a/eland/sagemaker_tools.py b/eland/sagemaker_tools.py
@@ -0,0 +1,37 @@
+import json
+
+import numpy as np
+from eland import DataFrame
+from typing import List, Optional
+
+from sagemaker import RealTimePredictor
+
+
+def make_sagemaker_prediction(endpoint_name: str,
+                              data: DataFrame,
+                              column_order: Optional[List[str]] = None
+                              ) -> np.array:
+    """
+    Send an eland DataFrame to a deployed SageMaker endpoint as CSV and return the parsed predictions.
+
+    Parameters
+    ----------
+    endpoint_name: name of the deployed SageMaker endpoint to invoke.
+    data: eland DataFrame holding the model input. It is serialized with ``to_csv`` without header or index,
+        so it must match the input datatypes of the model and have the correct number of columns.
+    column_order: optional list of column names giving the order in which columns are fed to the model; applied
+        by indexing ``data`` with it. Expected to be a permutation of the DataFrame's column names.
+
+    Returns
+    -------
+    numpy array built from the ``'probabilities'`` entry of the endpoint's UTF-8 encoded JSON response.
+    """
+    predictor = RealTimePredictor(endpoint=endpoint_name, content_type='text/csv')
+
+    test_data = data
+    if column_order is not None:
+        test_data = test_data[column_order]
+    # The CSV payload carries no header row, so column position is the only thing the endpoint sees.
+    preds = predictor.predict(test_data.to_csv(header=False, index=False))
+    preds = np.array(json.loads(preds.decode('utf-8'))['probabilities'])
+    return preds
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -5,6 +5,8 @@ elasticsearch>=8,<9
 pandas>=1.2,<2
 matplotlib<4
 numpy<2
+opensearch-py>=2
+sagemaker>=1.72,<2
 tqdm<5
 
 #

diff --git a/requirements.txt b/requirements.txt
@@ -5,4 +5,5 @@ elasticsearch>=8,<9
 pandas>=1.2,<2
 matplotlib<4
 numpy<2
-opensearch-py>=2
+opensearch-py>=2
+sagemaker>=1.72,<2
diff --git a/tests/dataframe/test_es_query_pytest.py b/tests/dataframe/test_es_query_pytest.py
@@ -46,7 +46,6 @@ def test_es_query_allows_query_in_dict(self):
         assert len(left) > 0
         assert_eland_frame_equal(left, right)
 
-    # @pytest.mark.skip(reason="OpenSearch currently does not support geosearch")
     def test_es_query_geo_location(self):
         df = self.ed_ecommerce()
         cur_nearby = df.es_query(