Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 43 additions & 10 deletions docs/source/visualization/ProcessTree.rst
Original file line number Diff line number Diff line change
Expand Up @@ -201,13 +201,15 @@ data (pd.DataFrame)
DataFrame containing one or more Process Trees. This should be the
output of ``build_process_tree`` described above.

schema (ProcSchema, optional)
schema (Dict | ProcSchema, optional)
The data schema to use for the data set, by default None. If None
the schema is inferred. A schema object maps generic field names
(e.g. ``process_name``) on to a data-specific name (e.g. ``exe``
in the case of Linux audit data). This is usually not required
since the function will try to infer the schema from fields in the
input DataFrame.
This can be supplied as a ``ProcSchema`` instance or as a dictionary
containing at least the required schema mappings.

output_var (str, optional)
Output variable for selected items in the tree, by default None.
Expand Down Expand Up @@ -719,11 +721,11 @@ are shown below.
=================== ===================== ===================== ===========================
Generic name Win 4688 schema Linux auditd schema MDE schema
=================== ===================== ===================== ===========================
time_stamp TimeGenerated TimeGenerated CreatedProcessCreationTime
process_name NewProcessName exe CreatedProcessName
process_id NewProcessId pid CreatedProcessId
\*time_stamp TimeGenerated TimeGenerated CreatedProcessCreationTime
\*process_name NewProcessName exe CreatedProcessName
\*process_id NewProcessId pid CreatedProcessId
parent_name ParentProcessName *(not used)* ParentProcessName
parent_id ProcessId ppid CreatedProcessParentId
\*parent_id ProcessId ppid CreatedProcessParentId
logon_id SubjectLogonId ses InitiatingProcessLogonId
target_logon_id TargetLogonId *(not used)* LogonId
cmd_line CommandLine cmdline CreatedProcessCommandLine
Expand All @@ -733,6 +735,9 @@ host_name_column Computer Computer ComputerDnsNa
event_id_column EventID EventType *(not used)*
=================== ===================== ===================== ===========================

\* indicates a mandatory field. You must supply mappings from your
source data for these items. Others are optional but will provide more
information to the user in the plotted tree.

If your schema differs from, but is similar to, one of the built-in
schema mappings, you can adapt one of these or supply a custom schema
Expand Down Expand Up @@ -783,11 +788,39 @@ You can also supply a schema as a Python ``dict``, with the keys
being the generic internal name and the values, the names of the columns
in the input data. Both keys and values are strings except where
otherwise indicated above.

The ``time_stamp`` column **must** be a pandas Timestamp (Python datetime)
type. If your data is in another format (e.g. Unix timestamp or date string)
you should
convert this before trying to use the process tree tools. The example
Use :py:meth:`~msticpy.transform.proc_tree_schema.ProcSchema.blank_schema_dict`
to get a blank schema dictionary.

.. code:: python3

from msticpy.transform.proc_tree_schema import ProcSchema
ProcSchema.blank_schema_dict()

.. parsed-literal::

{'process_name': 'required',
'process_id': 'required',
'parent_id': 'required',
'time_stamp': 'required',
'cmd_line': None,
'path_separator': None,
'user_name': None,
'logon_id': None,
'host_name_column': None,
'parent_name': None,
'target_logon_id': None,
'user_id': None,
'event_id_column': None,
'event_id_identifier': None}


The ``time_stamp`` column **should** be a pandas ``Timestamp`` (Python ``datetime``)
type. If your data is in another format (e.g. Unix timestamp or date string),
the process tree module will try to convert it before building the process tree
plot. This uses ``pandas`` to convert the values to its native ``Timestamp`` type.

If the auto-conversion does not work, convert the timestamp field before
trying to use the process tree tools. The example
below shows extracting the timestamp from the auditd ``mssg_id`` field.


Expand Down
11 changes: 10 additions & 1 deletion msticpy/transform/proc_tree_build_mde.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ def _map_columns(


def convert_mde_schema_to_internal(
data: pd.DataFrame, schema: ProcSchema
data: pd.DataFrame, schema: ProcSchema, **kwargs
) -> pd.DataFrame:
"""
Convert DeviceProcessEvents schema data to internal MDE schema.
Expand Down Expand Up @@ -360,5 +360,14 @@ def convert_mde_schema_to_internal(
data["InitiatingProcessFolderPath"] = data.InitiatingProcessFolderPath.apply(
lambda x: x.rsplit("\\", maxsplit=1)[0]
)
# re-write any field name references in kwargs
plot_args = kwargs.pop("plot_args", {})
for arg_name, arg_value in plot_args.items():
if isinstance(arg_value, str) and arg_value in _SENTINEL_MDE_MAP:
plot_args[arg_name] = _SENTINEL_MDE_MAP[arg_value]
if isinstance(arg_value, list):
plot_args[arg_name] = [
_SENTINEL_MDE_MAP.get(field, field) for field in arg_value
]

return data.rename(columns=_SENTINEL_MDE_MAP)
75 changes: 46 additions & 29 deletions msticpy/transform/proc_tree_build_winlx.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .._version import VERSION
from ..common.data_utils import ensure_df_datetimes
from .proc_tree_schema import ColNames as Col
from .proc_tree_schema import ProcSchema

__version__ = VERSION
__author__ = "Ian Hellen"
Expand Down Expand Up @@ -50,7 +51,7 @@ def extract_process_tree(

"""
# Clean data
procs_cln = _clean_proc_data(procs, schema)
procs_cln, schema = _clean_proc_data(procs, schema)

# Merge parent-child
merged_procs = _merge_parent_by_time(procs_cln, schema)
Expand Down Expand Up @@ -97,22 +98,30 @@ def _clean_proc_data(
# Convert any numeric schema cols to str types
procs_cln = _num_cols_to_str(procs_cln, schema)

procs_cln[Col.EffectiveLogonId] = procs_cln[schema.logon_id]
# Create effective logon Id for Windows, if the TargetLogonId is not 0x0
if schema.target_logon_id:
has_tgt_logonid = (procs_cln[schema.target_logon_id] != "0x0") & (
procs_cln[schema.target_logon_id].notna()
)
procs_cln.loc[has_tgt_logonid, Col.EffectiveLogonId] = procs_cln[
schema.target_logon_id
]
if schema.logon_id not in procs_cln.columns:
schema = ProcSchema(**(attr.asdict(schema)))
schema.logon_id = None # type: ignore

if schema.logon_id:
procs_cln[Col.EffectiveLogonId] = procs_cln[schema.logon_id]
# Create effective logon Id for Windows, if the TargetLogonId is not 0x0
if schema.target_logon_id:
has_tgt_logonid = (procs_cln[schema.target_logon_id] != "0x0") & (
procs_cln[schema.target_logon_id].notna()
)
procs_cln.loc[has_tgt_logonid, Col.EffectiveLogonId] = procs_cln[
schema.target_logon_id
]
else:
procs_cln[Col.EffectiveLogonId] = 0

procs_cln[Col.new_process_lc] = procs_cln[schema.process_name].str.lower()
if schema.parent_name:
no_pproc = procs_cln[schema.parent_name] == ""
procs_cln.loc[no_pproc, schema.parent_name] = "unknown"
procs_cln[Col.parent_proc_lc] = procs_cln[schema.parent_name].str.lower()
procs_cln[Col.source_index] = procs_cln.index
return procs_cln
return procs_cln, schema


def _num_cols_to_str(
Expand Down Expand Up @@ -216,38 +225,46 @@ def _extract_inferred_parents(
# from the merged data for ALL processes
merged_procs[parent_col_name] = merged_procs[f"{schema.process_name}_par"]
merged_procs[Col.parent_proc_lc] = merged_procs[Col.new_process_lc_par]
merged_procs.loc[root_procs_crit, Col.EffectiveLogonId_par] = merged_procs[
schema.logon_id
]
if schema.logon_id:
merged_procs.loc[root_procs_crit, Col.EffectiveLogonId_par] = merged_procs[
schema.logon_id
]
else:
merged_procs.loc[root_procs_crit, Col.EffectiveLogonId_par] = 0
merged_procs.loc[root_procs_crit, Col.timestamp_orig_par] = time_zero

# Extract synthentic rows for the parents of root processes
# Extract synthetic rows for the parents of root processes
parent_cols = [
schema.host_name_column,
*([schema.host_name] if schema.host_name else []),
schema.parent_id,
Col.EffectiveLogonId_par,
parent_col_name,
Col.parent_proc_lc,
*([schema.event_id_column] if schema.event_id_column else []),
]
if schema.event_id_column:
parent_cols.append(schema.event_id_column)
rename_par_cols = {
schema.parent_id: schema.process_id,
schema.parent_name: schema.process_name,
Col.parent_proc_lc: Col.new_process_lc,
}
if schema.logon_id:
rename_par_cols[Col.EffectiveLogonId_par] = schema.logon_id
inferred_parents = (
merged_procs[root_procs_crit][parent_cols]
.rename(
columns={
schema.parent_id: schema.process_id,
schema.parent_name: schema.process_name,
Col.parent_proc_lc: Col.new_process_lc,
Col.EffectiveLogonId_par: schema.logon_id,
}
)
.assign(time_stamp=time_zero, EffectiveLogonId=merged_procs[schema.logon_id])
.rename(columns=rename_par_cols)
.assign(time_stamp=time_zero)
.rename(columns={"time_stamp": schema.time_stamp})
.drop_duplicates()
)
if schema.logon_id:
inferred_parents[Col.EffectiveLogonId] = merged_procs[schema.logon_id]
else:
inferred_parents[Col.EffectiveLogonId] = 0

return pd.concat(
[merged_procs, inferred_parents], ignore_index=True, axis=0, sort=False
[merged_procs, inferred_parents.drop_duplicates()],
ignore_index=True,
axis=0,
sort=False,
)


Expand Down
21 changes: 12 additions & 9 deletions msticpy/transform/proc_tree_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def build_process_tree(
schema: Union[ProcSchema, Dict[str, Any]] = None,
show_summary: bool = False,
debug: bool = False,
**kwargs,
) -> pd.DataFrame:
"""
Build process trees from the process events.
Expand Down Expand Up @@ -63,12 +64,12 @@ def build_process_tree(
ProcSchema

"""
if isinstance(schema, dict):
schema = ProcSchema(**schema)
# If schema is none, infer schema from columns
if not schema or schema == MDE_INT_EVENT_SCH:
# Special case for MDE - since there are two possible schemas
schema = infer_schema(procs)
if isinstance(schema, dict):
schema = ProcSchema(**schema)

if not schema:
raise TypeError(
Expand All @@ -78,7 +79,9 @@ def build_process_tree(
)

if schema == MDE_EVENT_SCH:
procs = mde.convert_mde_schema_to_internal(procs, schema=MDE_EVENT_SCH)
procs = mde.convert_mde_schema_to_internal(
procs, schema=MDE_EVENT_SCH, plot_args=kwargs.pop("plot_args", {})
)
schema = MDE_INT_EVENT_SCH
if schema == MDE_INT_EVENT_SCH:
extr_proc_tree = mde.extract_process_tree(procs, debug=debug)
Expand All @@ -91,7 +94,7 @@ def build_process_tree(

if show_summary:
print(get_summary_info(proc_tree))
return proc_tree
return proc_tree.sort_values(by=["path", schema.time_stamp], ascending=True)


def infer_schema(data: Union[pd.DataFrame, pd.Series]) -> Optional[ProcSchema]:
Expand All @@ -110,11 +113,11 @@ def infer_schema(data: Union[pd.DataFrame, pd.Series]) -> Optional[ProcSchema]:

"""
src_cols = data.columns if isinstance(data, pd.DataFrame) else data.index
schema_matches = {}
for schema in SUPPORTED_SCHEMAS:
matching_cols = set(src_cols) & set(schema.columns)
schema_matches[len(matching_cols)] = schema
if max(schema_matches) > 5:
schema_matches = {
len(set(src_cols) & set(schema.columns)): schema for schema in SUPPORTED_SCHEMAS
}

if max(schema_matches) >= 4:
return schema_matches[max(schema_matches)]
return None

Expand Down
30 changes: 21 additions & 9 deletions msticpy/transform/proc_tree_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,12 @@ class ProcSchema:
process_name: str
process_id: str
parent_id: str
logon_id: str
cmd_line: str
user_name: str
path_separator: str
host_name_column: str
time_stamp: str = "TimeGenerated"
time_stamp: str
cmd_line: Optional[str] = None
path_separator: str = "\\"
user_name: Optional[str] = None
logon_id: Optional[str] = None
host_name_column: Optional[str] = None
parent_name: Optional[str] = None
target_logon_id: Optional[str] = None
user_id: Optional[str] = None
Expand All @@ -72,11 +72,8 @@ def required_columns(self):
"process_name",
"process_id",
"parent_id",
"logon_id",
"cmd_line",
"user_name",
"path_separator",
"host_name_column",
"time_stamp",
]

Expand All @@ -102,6 +99,11 @@ def get_df_cols(self, data: pd.DataFrame):
"""Return the subset of columns that are present in `data`."""
return [col for col in self.columns if col in data.columns]

@property
def host_name(self) -> Optional[str]:
"""
Return host name column.

This returns the value of the `host_name_column` attribute,
which may be None.
"""
return self.host_name_column

@property
def event_type_col(self) -> str:
"""
Expand Down Expand Up @@ -146,6 +148,16 @@ def event_filter(self) -> Any:
"Unknown schema - there is no value for the 'event_id_identifier' in the schema."
)

@classmethod
def blank_schema_dict(cls) -> Dict[str, Any]:
"""
Return blank schema dictionary.

Returns
-------
Dict[str, Any]
One entry for each item in `attr.fields_dict(cls)`. The value is
the string "required" when the field's default is truthy or is
equal to `attr.NOTHING`, otherwise None.

"""
# NOTE(review): a field with a truthy default (not only a field with
# no default) is also reported as "required" - confirm this is intended.
return {
field: "required"
if (attrib.default or attrib.default == attr.NOTHING)
else None
for field, attrib in attr.fields_dict(cls).items()
}


WIN_EVENT_SCH = ProcSchema(
time_stamp="TimeGenerated",
Expand Down
Loading