WIP - avoid cudf imports

nv-morpheus · rapids-bot · Oct 18, 2024 · Aug 21, 2024 · Aug 21, 2024 · Aug 22, 2024
commit 6e155a99b3633129f9e6298ee1c2a2629cbe5c90
@@ -23,13 +23,13 @@
 from datetime import timedelta
 from urllib.parse import urlparse
 
-import pandas as pd
 import requests
 import requests_cache
 
 from morpheus.messages import MessageMeta
 from morpheus.utils.type_aliases import DataFrameType
 from morpheus.utils.type_aliases import DataFrameTypeStr
+from morpheus.utils.type_utils import get_df_class
 
 logger = logging.getLogger(__name__)
 
@@ -142,7 +142,7 @@ def __init__(self,
         self._run_indefinitely = run_indefinitely
         self._interval_secs = interval_secs
         self._interval_td = timedelta(seconds=self._interval_secs)
-        self._df_type = df_type
+        self._df_class: type[DataFrameType] = get_df_class(df_type)
 
         self._enable_cache = enable_cache
 
@@ -351,7 +351,7 @@ def fetch_dataframes(self):
 
         Yeilds
         ------
-        cudf.DataFrame
+        DataFrameType
             A DataFrame containing feed entry data.
 
         Raises
@@ -376,14 +376,14 @@ def fetch_dataframes(self):
                         entry_accumulator.append(entry)
 
                         if self._batch_size > 0 and len(entry_accumulator) >= self._batch_size:
-                            yield cudf.DataFrame(entry_accumulator)
+                            yield self._df_class(entry_accumulator)
                             entry_accumulator.clear()
 
             self._previous_entries = current_entries
 
             # Yield any remaining entries.
             if entry_accumulator:
-                yield cudf.DataFrame(entry_accumulator)
+                yield self._df_class(entry_accumulator)
             else:
                 logger.debug("No new entries found.")
 

@@ -16,10 +16,11 @@
 import os
 import typing
 
-import cudf
-
 import morpheus
 from morpheus.parsers.event_parser import EventParser
+from morpheus.utils.type_aliases import DataFrameType
+from morpheus.utils.type_aliases import SeriesType
+from morpheus.utils.type_utils import get_df_pkg_from_obj
 
 log = logging.getLogger(__name__)
 
@@ -41,17 +42,17 @@ def __init__(self, interested_eventcodes=None):
         self._event_regex = self._load_regex_yaml(regex_filepath)
         EventParser.__init__(self, self.get_columns(), self.EVENT_NAME)
 
-    def parse(self, text: cudf.Series) -> cudf.Series:
+    def parse(self, text: SeriesType) -> DataFrameType:
         """Parses the Windows raw event.
 
         Parameters
         ----------
-        text : cudf.Series
+        text : SeriesType
             Raw event log text to be parsed
 
         Returns
         -------
-        cudf.DataFrame
+        DataFrameType
             Parsed logs dataframe
         """
         # Clean raw data to be consistent.
@@ -65,23 +66,25 @@ def parse(self, text: cudf.Series) -> cudf.Series:
                 temp = self.parse_raw_event(input_chunk, self._event_regex[eventcode])
                 if not temp.empty:
                     output_chunks.append(temp)
-        parsed_dataframe = cudf.concat(output_chunks)
+
+        df_pkg = get_df_pkg_from_obj(text)
+        parsed_dataframe = df_pkg.concat(output_chunks)
         # Replace null values with empty.
         parsed_dataframe = parsed_dataframe.fillna("")
         return parsed_dataframe
 
-    def clean_raw_data(self, text: cudf.Series) -> cudf.Series:
+    def clean_raw_data(self, text: SeriesType) -> SeriesType:
         """
         Lower casing and replacing escape characters.
 
         Parameters
         ----------
-        text : cudf.Series
+        text : SeriesType
             Raw event log text to be clean
 
         Returns
         -------
-        cudf.Series
+        SeriesType
             Clean raw event log text
         """
         text = (text.str.lower().str.replace("\\\\t", "").str.replace("\\\\r", "").str.replace("\\\\n", "|"))

@@ -12,7 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import cudf
+from morpheus.utils.type_aliases import DataFrameType
+from morpheus.utils.type_aliases import DataFrameTypeStr
+from morpheus.utils.type_utils import get_df_pkg
 
 TYPE_DICT = {
     "bool": "bool",
@@ -36,7 +38,7 @@
 }
 
 
-def parse(filepath: str) -> cudf.DataFrame:
+def parse(filepath: str, df_type: DataFrameTypeStr = "cudf") -> DataFrameType:
     """
     Parse Zeek log file and return cuDF dataframe. Uses header comments to get column names/types
     and configure parser.
@@ -45,20 +47,23 @@ def parse(filepath: str) -> cudf.DataFrame:
     ----------
     filepath : str
         File path of Zeek log file
+    df_type : DataFrameTypeStr, default 'cudf'
+        Type of dataframe to return, either 'cudf' or 'pandas'.
 
     Returns
     -------
-    cudf.DataFrame
+    DataFrameType
         Parsed Zeek log dataframe
     """
-    header_gdf = cudf.read_csv(filepath, names=["line"], nrows=8)
+    df_pkg = get_df_pkg(df_type)
+    header_gdf = df_pkg.read_csv(filepath, names=["line"], nrows=8)
     lines_gdf = header_gdf["line"].str.split()
 
     column_names = lines_gdf.iloc[6][1:]
     column_types = lines_gdf.iloc[7][1:]
     column_dtypes = list(map(lambda x: TYPE_DICT.get(x, "str"), column_types))
 
-    log_gdf = cudf.read_csv(
+    log_gdf = df_pkg.read_csv(
         filepath,
         delimiter="\t",
         dtype=column_dtypes,

diff --git a/python/morpheus/morpheus/service/vdb/faiss_vdb_service.py b/python/morpheus/morpheus/service/vdb/faiss_vdb_service.py
@@ -17,12 +17,9 @@
 import time
 import typing
 
-import pandas as pd
-
-import cudf
-
 from morpheus.service.vdb.vector_db_service import VectorDBResourceService
 from morpheus.service.vdb.vector_db_service import VectorDBService
+from morpheus.utils.type_aliases import DataFrameType
 
 logger = logging.getLogger(__name__)
 
@@ -81,13 +78,13 @@ def insert(self, data: list[list] | list[dict], **kwargs) -> dict:
         """
         raise NotImplementedError("Insert operation is not supported in FAISS")
 
-    def insert_dataframe(self, df: typing.Union[cudf.DataFrame, pd.DataFrame], **kwargs) -> dict:
+    def insert_dataframe(self, df: DataFrameType, **kwargs) -> dict:
         """
         Insert a dataframe entires into the vector database.
 
         Parameters
         ----------
-        df : typing.Union[cudf.DataFrame, pd.DataFrame]
+        df : DataFrameType
             Dataframe to be inserted into the collection.
         **kwargs
             Extra keyword arguments specific to the vector database implementation.
@@ -368,19 +365,15 @@ def create(self, name: str, overwrite: bool = False, **kwargs):
         """
         raise NotImplementedError("create operation is not supported in FAISS")
 
-    def create_from_dataframe(self,
-                              name: str,
-                              df: typing.Union[cudf.DataFrame, pd.DataFrame],
-                              overwrite: bool = False,
-                              **kwargs) -> None:
+    def create_from_dataframe(self, name: str, df: DataFrameType, overwrite: bool = False, **kwargs) -> None:
         """
         Create collections in the vector database.
 
         Parameters
         ----------
         name : str
             Name of the collection.
-        df : Union[cudf.DataFrame, pd.DataFrame]
+        df : DataFrameType
             The dataframe to create the collection from.
         overwrite : bool, optional
             Whether to overwrite the collection if it already exists. Default is False.
@@ -416,16 +409,15 @@ def insert(self, name: str, data: list[list] | list[dict], **kwargs) -> dict[str
 
         raise NotImplementedError("create_from_dataframe operation is not supported in FAISS")
 
-    def insert_dataframe(self, name: str, df: typing.Union[cudf.DataFrame, pd.DataFrame],
-                         **kwargs) -> dict[str, typing.Any]:
+    def insert_dataframe(self, name: str, df: DataFrameType, **kwargs) -> dict[str, typing.Any]:
         """
         Converts dataframe to rows and insert to the vector database.
 
         Parameters
         ----------
         name : str
             Name of the collection to be inserted.
-        df : typing.Union[cudf.DataFrame, pd.DataFrame]
+        df : DataFrameType
             Dataframe to be inserted in the collection.
         **kwargs
             Additional keyword arguments containing collection configuration.

diff --git a/python/morpheus/morpheus/service/vdb/milvus_vector_db_service.py b/python/morpheus/morpheus/service/vdb/milvus_vector_db_service.py
@@ -20,13 +20,12 @@
 import typing
 from functools import wraps
 
-import cudf
-
 from morpheus.io.utils import cudf_string_cols_exceed_max_bytes
 from morpheus.io.utils import truncate_string_cols_by_bytes
 from morpheus.service.vdb.vector_db_service import VectorDBResourceService
 from morpheus.service.vdb.vector_db_service import VectorDBService
 from morpheus.utils.type_aliases import DataFrameType
+from morpheus.utils.type_utils import is_cudf_type
 
 logger = logging.getLogger(__name__)
 
@@ -327,7 +326,7 @@ def insert_dataframe(self, df: DataFrameType, **kwargs: dict[str, typing.Any]) -
                 logger.info("Skipped checking 'None' in the field: %s, with datatype: %s", field_name, dtype)
 
         needs_truncate = self._truncate_long_strings
-        if needs_truncate and isinstance(df, cudf.DataFrame):
+        if needs_truncate and is_cudf_type(df):
             # Cudf specific optimization, we can avoid a costly call to truncate_string_cols_by_bytes if all of the
             # string columns are already below the max length
             needs_truncate = cudf_string_cols_exceed_max_bytes(df, self._fields_max_length)
@@ -336,7 +335,7 @@ def insert_dataframe(self, df: DataFrameType, **kwargs: dict[str, typing.Any]) -
         column_names = [field.name for field in self._fields if not field.auto_id]
 
         collection_df = df[column_names]
-        if isinstance(collection_df, cudf.DataFrame):
+        if is_cudf_type(collection_df):
             collection_df = collection_df.to_pandas()
 
         if needs_truncate:
@@ -728,7 +727,7 @@ def _build_schema_conf(self, df: DataFrameType) -> list[dict]:
         # Always add a primary key
         fields.append({"name": "pk", "dtype": pymilvus.DataType.INT64, "is_primary": True, "auto_id": True})
 
-        if isinstance(df, cudf.DataFrame):
+        if is_cudf_type(df):
             df = df.to_pandas()
 
         # Loop over all of the columns of the first row and build the schema

diff --git a/python/morpheus/morpheus/service/vdb/vector_db_service.py b/python/morpheus/morpheus/service/vdb/vector_db_service.py
@@ -17,9 +17,7 @@
 from abc import ABC
 from abc import abstractmethod
 
-import pandas as pd
-
-import cudf
+from morpheus.utils.type_aliases import DataFrameType
 
 logger = logging.getLogger(__name__)
 
@@ -50,13 +48,13 @@ def insert(self, data: list[list] | list[dict], **kwargs: dict[str, typing.Any])
         pass
 
     @abstractmethod
-    def insert_dataframe(self, df: typing.Union[cudf.DataFrame, pd.DataFrame], **kwargs: dict[str, typing.Any]) -> dict:
+    def insert_dataframe(self, df: DataFrameType, **kwargs: dict[str, typing.Any]) -> dict:
         """
         Insert a dataframe into the vector database.
 
         Parameters
         ----------
-        df : typing.Union[cudf.DataFrame, pd.DataFrame]
+        df : DataFrameType
             Dataframe to be inserted into the resource.
         **kwargs : dict[str, typing.Any]
             Extra keyword arguments specific to the vector database implementation.
@@ -241,18 +239,15 @@ def insert(self, name: str, data: list[list] | list[dict], **kwargs: dict[str, t
         pass
 
     @abstractmethod
-    def insert_dataframe(self,
-                         name: str,
-                         df: typing.Union[cudf.DataFrame, pd.DataFrame],
-                         **kwargs: dict[str, typing.Any]) -> dict:
+    def insert_dataframe(self, name: str, df: DataFrameType, **kwargs: dict[str, typing.Any]) -> dict:
         """
         Converts dataframe to rows and insert into the vector database resource.
 
         Parameters
         ----------
         name : str
             Name of the resource to be inserted.
-        df : typing.Union[cudf.DataFrame, pd.DataFrame]
+        df : DataFrameType
             Dataframe to be inserted.
         **kwargs : dict[str, typing.Any]
             Additional keyword arguments containing collection configuration.
@@ -391,7 +386,7 @@ def create(self, name: str, overwrite: bool = False, **kwargs: dict[str, typing.
     @abstractmethod
     def create_from_dataframe(self,
                               name: str,
-                              df: typing.Union[cudf.DataFrame, pd.DataFrame],
+                              df: DataFrameType,
                               overwrite: bool = False,
                               **kwargs: dict[str, typing.Any]) -> None:
         """
@@ -401,7 +396,7 @@ def create_from_dataframe(self,
         ----------
         name : str
             Name of the resource.
-        df : Union[cudf.DataFrame, pd.DataFrame]
+        df : DataFrameType
             The dataframe to create the resource from.
         overwrite : bool, optional
             Whether to overwrite the resource if it already exists. Default is False.

@@ -23,6 +23,7 @@
 from morpheus.pipeline.preallocator_mixin import PreallocatorMixin
 from morpheus.pipeline.single_output_source import SingleOutputSource
 from morpheus.pipeline.stage_schema import StageSchema
+from morpheus.utils.type_utils import exec_mode_to_df_type_str
 
 logger = logging.getLogger(__name__)
 
@@ -82,7 +83,8 @@ def __init__(self,
                                          strip_markup=strip_markup,
                                          stop_after=stop_after,
                                          interval_secs=interval_secs,
-                                         should_stop_fn=self.is_stop_requested)
+                                         should_stop_fn=self.is_stop_requested,
+                                         df_type=exec_mode_to_df_type_str(c.execution_mode))
 
     @property
     def name(self) -> str:

@@ -241,3 +241,14 @@ def is_cudf_type(obj: typing.Any) -> bool:
     Check if a given object (DataFrame, Series, RangeIndex etc...) is a cuDF type.
     """
     return "cudf" in str(type(obj))
+
+
+def get_df_pkg_from_obj(obj: typing.Any) -> types.ModuleType:
+    """
+    Return the DataFrame package for the given object: `cudf` for cuDF objects, otherwise `pandas`.
+    """
+    if is_cudf_type(obj):
+        import cudf
+        return cudf
+
+    return pd
diff --git a/tests/test_windows_event_parser.py b/tests/test_windows_event_parser.py
@@ -630,6 +630,7 @@ def test_windows_event_parser():
         test_logs = fh.readlines()
     test_input = cudf.Series(test_logs)
     test_output_df = wep.parse(test_input)
+
     for parsed_rec in test_output_df.to_records():
         eventcode = parsed_rec["eventcode"]
         validate_func = VALIDATE_DICT.get(eventcode, unknown_record_type)