microsoft · petebryan · Sep 20, 2022 · Sep 17, 2022 · Sep 19, 2022 · Sep 19, 2022
diff --git a/docs/source/visualization/ProcessTree.rst b/docs/source/visualization/ProcessTree.rst
@@ -201,13 +201,15 @@ data (pd.DataFrame)
    DataFrame containing one or more Process Trees. This should be the
    output of ``build_process_tree`` described above.
 
-schema (ProcSchema, optional)
+schema (Dict | ProcSchema, optional)
    The data schema to use for the data set, by default None. If None
    the schema is inferred. A schema object maps generic field names
    (e.g. ``process_name``) on to a data-specific name (e.g. ``exe``
    in the case of Linux audit data). This is usually not required
    since the function will try to infer the schema from fields in the
    input DataFrame.
+   This can be supplied as a ``ProcSchema`` instance or as a dictionary
+   containing at least the mandatory schema mappings.
 
 output_var (str, optional)
    Output variable for selected items in the tree, by default None.
@@ -719,11 +721,11 @@ are shown below.
 ===================  =====================  =====================  ===========================
 Generic name         Win 4688 schema        Linux auditd schema    MDE schema
 ===================  =====================  =====================  ===========================
-time_stamp           TimeGenerated          TimeGenerated          CreatedProcessCreationTime
-process_name         NewProcessName         exe                    CreatedProcessName
-process_id           NewProcessId           pid                    CreatedProcessId
+\*time_stamp         TimeGenerated          TimeGenerated          CreatedProcessCreationTime
+\*process_name       NewProcessName         exe                    CreatedProcessName
+\*process_id         NewProcessId           pid                    CreatedProcessId
 parent_name          ParentProcessName      *(not used)*           ParentProcessName
-parent_id            ProcessId              ppid                   CreatedProcessParentId
+\*parent_id          ProcessId              ppid                   CreatedProcessParentId
 logon_id             SubjectLogonId         ses                    InitiatingProcessLogonId
 target_logon_id      TargetLogonId          *(not used)*           LogonId
 cmd_line             CommandLine            cmdline                CreatedProcessCommandLine
@@ -733,6 +735,9 @@ host_name_column     Computer               Computer               ComputerDnsNa
 event_id_column      EventID                EventType              *(not used)*
 ===================  =====================  =====================  ===========================
 
+\* indicates a mandatory field. You must supply mappings from your
+source data for these items. Others are optional but will provide more
+information to the user in the plotted tree.
 
 If your schema differs from, but is similar to one of the built-in
 schema mappings you can adapt one of these or supply a custom schema
@@ -783,11 +788,39 @@ You can also supply a schema as a Python ``dict``, with the keys
 being the generic internal name and the values, the names of the columns
 in the input data. Both keys and values are strings except where
 otherwise indicated above.
-
-The ``time_stamp`` column **must** be a pandas Timestamp (Python datetime)
-type. If your data is in another format (e.g. Unix timestamp or date string)
-you should
-convert this before trying to use the process tree tools. The example
+Use :py:meth:`~msticpy.transform.proc_tree_schema.ProcSchema.blank_schema_dict`
+to get a blank schema dictionary.
+
+.. code:: python3
+
+   from msticpy.transform.proc_tree_schema import ProcSchema
+   ProcSchema.blank_schema_dict()
+
+.. parsed-literal::
+
+   {'process_name': 'required',
+   'process_id': 'required',
+   'parent_id': 'required',
+   'time_stamp': 'required',
+   'cmd_line': None,
+   'path_separator': None,
+   'user_name': None,
+   'logon_id': None,
+   'host_name_column': None,
+   'parent_name': None,
+   'target_logon_id': None,
+   'user_id': None,
+   'event_id_column': None,
+   'event_id_identifier': None}
+
+
+The ``time_stamp`` column **should** be a pandas ``Timestamp`` (Python ``datetime``)
+type. If your data is in another format (e.g. Unix timestamp or date string),
+the process tree module will try to convert it before building the process tree
+plot. This uses ``pandas`` to convert the column to a native ``Timestamp``.
+
+If the auto-conversion does not work, convert the timestamp field before
+trying to use the process tree tools. The example
 below shows extracting the timestamp from the auditd ``mssg_id`` field.
 
 

diff --git a/msticpy/transform/proc_tree_build_mde.py b/msticpy/transform/proc_tree_build_mde.py
@@ -321,7 +321,7 @@ def _map_columns(
 
 
 def convert_mde_schema_to_internal(
-    data: pd.DataFrame, schema: ProcSchema
+    data: pd.DataFrame, schema: ProcSchema, **kwargs
 ) -> pd.DataFrame:
     """
     Convert DeviceProcessEvents schema data to internal MDE schema.
@@ -360,5 +360,14 @@ def convert_mde_schema_to_internal(
     data["InitiatingProcessFolderPath"] = data.InitiatingProcessFolderPath.apply(
         lambda x: x.rsplit("\\", maxsplit=1)[0]
     )
+    # re-write any field name references in kwargs
+    plot_args = kwargs.pop("plot_args", {})
+    for arg_name, arg_value in plot_args.items():
+        if isinstance(arg_value, str) and arg_value in _SENTINEL_MDE_MAP:
+            plot_args[arg_name] = _SENTINEL_MDE_MAP[arg_value]
+        if isinstance(arg_value, list):
+            plot_args[arg_name] = [
+                _SENTINEL_MDE_MAP.get(field, field) for field in arg_value
+            ]
 
     return data.rename(columns=_SENTINEL_MDE_MAP)
diff --git a/msticpy/transform/proc_tree_build_winlx.py b/msticpy/transform/proc_tree_build_winlx.py
@@ -10,6 +10,7 @@
 from .._version import VERSION
 from ..common.data_utils import ensure_df_datetimes
 from .proc_tree_schema import ColNames as Col
+from .proc_tree_schema import ProcSchema
 
 __version__ = VERSION
 __author__ = "Ian Hellen"
@@ -50,7 +51,7 @@ def extract_process_tree(
 
     """
     # Clean data
-    procs_cln = _clean_proc_data(procs, schema)
+    procs_cln, schema = _clean_proc_data(procs, schema)
 
     # Merge parent-child
     merged_procs = _merge_parent_by_time(procs_cln, schema)
@@ -97,22 +98,30 @@ def _clean_proc_data(
     # Convert any numeric schema cols to str types
     procs_cln = _num_cols_to_str(procs_cln, schema)
 
-    procs_cln[Col.EffectiveLogonId] = procs_cln[schema.logon_id]
-    # Create effective logon Id for Windows, if the TargetLogonId is not 0x0
-    if schema.target_logon_id:
-        has_tgt_logonid = (procs_cln[schema.target_logon_id] != "0x0") & (
-            procs_cln[schema.target_logon_id].notna()
-        )
-        procs_cln.loc[has_tgt_logonid, Col.EffectiveLogonId] = procs_cln[
-            schema.target_logon_id
-        ]
+    if schema.logon_id not in procs_cln.columns:
+        schema = ProcSchema(**(attr.asdict(schema)))
+        schema.logon_id = None  # type: ignore
+
+    if schema.logon_id:
+        procs_cln[Col.EffectiveLogonId] = procs_cln[schema.logon_id]
+        # Create effective logon Id for Windows, if the TargetLogonId is not 0x0
+        if schema.target_logon_id:
+            has_tgt_logonid = (procs_cln[schema.target_logon_id] != "0x0") & (
+                procs_cln[schema.target_logon_id].notna()
+            )
+            procs_cln.loc[has_tgt_logonid, Col.EffectiveLogonId] = procs_cln[
+                schema.target_logon_id
+            ]
+    else:
+        procs_cln[Col.EffectiveLogonId] = 0
+
     procs_cln[Col.new_process_lc] = procs_cln[schema.process_name].str.lower()
     if schema.parent_name:
         no_pproc = procs_cln[schema.parent_name] == ""
         procs_cln.loc[no_pproc, schema.parent_name] = "unknown"
         procs_cln[Col.parent_proc_lc] = procs_cln[schema.parent_name].str.lower()
     procs_cln[Col.source_index] = procs_cln.index
-    return procs_cln
+    return procs_cln, schema
 
 
 def _num_cols_to_str(
@@ -216,38 +225,46 @@ def _extract_inferred_parents(
         # from the merged data for ALL processes
         merged_procs[parent_col_name] = merged_procs[f"{schema.process_name}_par"]
         merged_procs[Col.parent_proc_lc] = merged_procs[Col.new_process_lc_par]
-    merged_procs.loc[root_procs_crit, Col.EffectiveLogonId_par] = merged_procs[
-        schema.logon_id
-    ]
+    if schema.logon_id:
+        merged_procs.loc[root_procs_crit, Col.EffectiveLogonId_par] = merged_procs[
+            schema.logon_id
+        ]
+    else:
+        merged_procs.loc[root_procs_crit, Col.EffectiveLogonId_par] = 0
     merged_procs.loc[root_procs_crit, Col.timestamp_orig_par] = time_zero
 
-    # Extract synthentic rows for the parents of root processes
+    # Extract synthetic rows for the parents of root processes
     parent_cols = [
-        schema.host_name_column,
+        *([schema.host_name] if schema.host_name else []),
         schema.parent_id,
         Col.EffectiveLogonId_par,
         parent_col_name,
         Col.parent_proc_lc,
+        *([schema.event_id_column] if schema.event_id_column else []),
     ]
-    if schema.event_id_column:
-        parent_cols.append(schema.event_id_column)
+    rename_par_cols = {
+        schema.parent_id: schema.process_id,
+        schema.parent_name: schema.process_name,
+        Col.parent_proc_lc: Col.new_process_lc,
+    }
+    if schema.logon_id:
+        rename_par_cols[Col.EffectiveLogonId_par] = schema.logon_id
     inferred_parents = (
         merged_procs[root_procs_crit][parent_cols]
-        .rename(
-            columns={
-                schema.parent_id: schema.process_id,
-                schema.parent_name: schema.process_name,
-                Col.parent_proc_lc: Col.new_process_lc,
-                Col.EffectiveLogonId_par: schema.logon_id,
-            }
-        )
-        .assign(time_stamp=time_zero, EffectiveLogonId=merged_procs[schema.logon_id])
+        .rename(columns=rename_par_cols)
+        .assign(time_stamp=time_zero)
         .rename(columns={"time_stamp": schema.time_stamp})
-        .drop_duplicates()
     )
+    if schema.logon_id:
+        inferred_parents[Col.EffectiveLogonId] = merged_procs[schema.logon_id]
+    else:
+        inferred_parents[Col.EffectiveLogonId] = 0
 
     return pd.concat(
-        [merged_procs, inferred_parents], ignore_index=True, axis=0, sort=False
+        [merged_procs, inferred_parents.drop_duplicates()],
+        ignore_index=True,
+        axis=0,
+        sort=False,
     )
 
 

diff --git a/msticpy/transform/proc_tree_builder.py b/msticpy/transform/proc_tree_builder.py
@@ -34,6 +34,7 @@ def build_process_tree(
     schema: Union[ProcSchema, Dict[str, Any]] = None,
     show_summary: bool = False,
     debug: bool = False,
+    **kwargs,
 ) -> pd.DataFrame:
     """
     Build process trees from the process events.
@@ -63,12 +64,12 @@ def build_process_tree(
     ProcSchema
 
     """
+    if isinstance(schema, dict):
+        schema = ProcSchema(**schema)
     # If schema is none, infer schema from columns
     if not schema or schema == MDE_INT_EVENT_SCH:
         # Special case for MDE - since there are two possible schemas
         schema = infer_schema(procs)
-    if isinstance(schema, dict):
-        schema = ProcSchema(**schema)
 
     if not schema:
         raise TypeError(
@@ -78,7 +79,9 @@ def build_process_tree(
         )
 
     if schema == MDE_EVENT_SCH:
-        procs = mde.convert_mde_schema_to_internal(procs, schema=MDE_EVENT_SCH)
+        procs = mde.convert_mde_schema_to_internal(
+            procs, schema=MDE_EVENT_SCH, plot_args=kwargs.pop("plot_args", {})
+        )
         schema = MDE_INT_EVENT_SCH
     if schema == MDE_INT_EVENT_SCH:
         extr_proc_tree = mde.extract_process_tree(procs, debug=debug)
@@ -91,7 +94,7 @@ def build_process_tree(
 
     if show_summary:
         print(get_summary_info(proc_tree))
-    return proc_tree
+    return proc_tree.sort_values(by=["path", schema.time_stamp], ascending=True)
 
 
 def infer_schema(data: Union[pd.DataFrame, pd.Series]) -> Optional[ProcSchema]:
@@ -110,11 +113,11 @@ def infer_schema(data: Union[pd.DataFrame, pd.Series]) -> Optional[ProcSchema]:
 
     """
     src_cols = data.columns if isinstance(data, pd.DataFrame) else data.index
-    schema_matches = {}
-    for schema in SUPPORTED_SCHEMAS:
-        matching_cols = set(src_cols) & set(schema.columns)
-        schema_matches[len(matching_cols)] = schema
-    if max(schema_matches) > 5:
+    schema_matches = {
+        len(set(src_cols) & set(schema.columns)): schema for schema in SUPPORTED_SCHEMAS
+    }
+
+    if max(schema_matches) >= 4:
         return schema_matches[max(schema_matches)]
     return None
 

diff --git a/msticpy/transform/proc_tree_schema.py b/msticpy/transform/proc_tree_schema.py
@@ -42,12 +42,12 @@ class ProcSchema:
     process_name: str
     process_id: str
     parent_id: str
-    logon_id: str
-    cmd_line: str
-    user_name: str
-    path_separator: str
-    host_name_column: str
-    time_stamp: str = "TimeGenerated"
+    time_stamp: str
+    cmd_line: Optional[str] = None
+    path_separator: str = "\\"
+    user_name: Optional[str] = None
+    logon_id: Optional[str] = None
+    host_name_column: Optional[str] = None
     parent_name: Optional[str] = None
     target_logon_id: Optional[str] = None
     user_id: Optional[str] = None
@@ -72,11 +72,8 @@ def required_columns(self):
             "process_name",
             "process_id",
             "parent_id",
-            "logon_id",
             "cmd_line",
-            "user_name",
             "path_separator",
-            "host_name_column",
             "time_stamp",
         ]
 
@@ -102,6 +99,11 @@ def get_df_cols(self, data: pd.DataFrame):
         """Return the subset of columns that are present in `data`."""
         return [col for col in self.columns if col in data.columns]
 
+    @property
+    def host_name(self) -> Optional[str]:
+        """Return host name column."""
+        return self.host_name_column
+
     @property
     def event_type_col(self) -> str:
         """
@@ -146,6 +148,16 @@ def event_filter(self) -> Any:
             "Unknown schema - there is no value for the 'event_id_identifier' in the schema."
         )
 
+    @classmethod
+    def blank_schema_dict(cls) -> Dict[str, Any]:
+        """Return blank schema dictionary."""
+        return {
+            field: "required"
+            if (attrib.default or attrib.default == attr.NOTHING)
+            else None
+            for field, attrib in attr.fields_dict(cls).items()
+        }
+
 
 WIN_EVENT_SCH = ProcSchema(
     time_stamp="TimeGenerated",