OpenAdaptAI · abrichr · Jun 9, 2023 · May 30, 2023 · May 30, 2023 · Jun 1, 2023
diff --git a/.gitignore b/.gitignore
@@ -13,12 +13,14 @@ cache
 
 # db
 *.db
+*.db-journal
 
 # VSCode
 .VSCode
+.vsCode
 
 # Generated performance charts
 performance
 
 # Generated when adding editable dependencies in requirements.txt (-e)
-src
+src
diff --git a/README.md b/README.md
@@ -31,6 +31,7 @@ source .venv/bin/activate
 pip install wheel
 pip install -r requirements.txt
 pip install -e .
+python -m spacy download en_core_web_trf
 alembic upgrade head
 pytest
 ```

diff --git a/assets/test_scrub_image.png b/assets/test_scrub_image.png
diff --git a/openadapt/config.py b/openadapt/config.py
@@ -1,3 +1,14 @@
+"""Script containing configurations for the openadapt application.
+
+Usage:
+
+    from openadapt import config
+    ...
+    config.<setting>
+    ...
+
+"""
+
 import multiprocessing
 import os
 import pathlib
@@ -13,12 +24,16 @@
     "DB_ECHO": False,
     "DB_FNAME": "openadapt.db",
     "OPENAI_API_KEY": "<set your api key in .env>",
-    #"OPENAI_MODEL_NAME": "gpt-4",
+    # "OPENAI_MODEL_NAME": "gpt-4",
     "OPENAI_MODEL_NAME": "gpt-3.5-turbo",
     # may incur significant performance penalty
     "RECORD_READ_ACTIVE_ELEMENT_STATE": False,
     # TODO: remove?
     "REPLAY_STRIP_ELEMENT_STATE": True,
+    # ACTION EVENT CONFIGURATIONS
+    "ACTION_TEXT_SEP": "-",
+    "ACTION_TEXT_NAME_PREFIX": "<",
+    "ACTION_TEXT_NAME_SUFFIX": ">"
 }
 
 
@@ -38,8 +53,61 @@ def getenv_fallback(var_name):
 ROOT_DIRPATH = pathlib.Path(__file__).parent.parent.resolve()
 DB_FPATH = ROOT_DIRPATH / DB_FNAME
 DB_URL = f"sqlite:///{DB_FPATH}"
+DIRNAME_PERFORMANCE_PLOTS = "performance"
 
 if multiprocessing.current_process().name == "MainProcess":
     for key, val in locals().items():
         if not key.startswith("_") and key.isupper():
             logger.info(f"{key}={val}")
+
+
+# SCRUBBING CONFIGURATIONS
+SCRUB_ENABLED = True
+SCRUB_CHAR = "*"
+SCRUB_LANGUAGE = "en"
+SCRUB_CONFIG_TRF = {
+    "nlp_engine_name": "spacy",
+    "models": [
+        {
+        "lang_code": "en", 
+        "model_name": "en_core_web_trf"
+        }
+    ],
+}
+DEFAULT_SCRUB_FILL_COLOR = (255,)
+SCRUB_IGNORE_ENTITIES = [
+    # 'US_PASSPORT',
+    # 'US_DRIVER_LICENSE',
+    # 'CRYPTO',
+    # 'UK_NHS',
+    # 'PERSON',
+    # 'CREDIT_CARD',
+    # 'US_BANK_NUMBER',
+    # 'PHONE_NUMBER',
+    # 'US_ITIN',
+    # 'AU_ABN',
+    "DATE_TIME",
+    # 'NRP',
+    # 'SG_NRIC_FIN',
+    # 'AU_ACN',
+    # 'IP_ADDRESS',
+    # 'EMAIL_ADDRESS',
+    "URL",
+    # 'IBAN_CODE',
+    # 'AU_TFN',
+    # 'LOCATION',
+    # 'AU_MEDICARE',
+    # 'US_SSN',
+    # 'MEDICAL_LICENSE'
+]
+SCRUB_KEYS_HTML = [
+    "text",
+    "canonical_text",
+    "title",
+    "state",
+    "task_description",
+    "key_char",
+    "canonical_key_char",
+    "key_vk",
+    "children",
+]
diff --git a/openadapt/models.py b/openadapt/models.py
@@ -6,7 +6,7 @@
 import numpy as np
 import sqlalchemy as sa
 
-from openadapt import db, utils, window
+from openadapt import config, db, utils, window
 
 
 # https://groups.google.com/g/sqlalchemy/c/wlr7sShU6-k
@@ -133,9 +133,9 @@ def canonical_key(self):
         )
 
     def _text(self, canonical=False):
-        sep = self._text_sep
-        name_prefix = self._text_name_prefix
-        name_suffix = self._text_name_suffix
+        sep = config.ACTION_TEXT_SEP
+        name_prefix = config.ACTION_TEXT_NAME_PREFIX
+        name_suffix = config.ACTION_TEXT_NAME_SUFFIX
         if canonical:
             key_attr = self.canonical_key
             key_name_attr = self.canonical_key_name
@@ -201,10 +201,6 @@ def __str__(self):
         rval = " ".join(attrs)
         return rval
 
-    _text_sep = "-"
-    _text_name_prefix = "<"
-    _text_name_suffix = ">"
-
     @classmethod
     def from_children(cls, children_dicts):
         children = [

diff --git a/openadapt/record.py b/openadapt/record.py
@@ -22,7 +22,7 @@
 import fire
 import mss.tools
 
-from openadapt import config, crud, utils, window
+from openadapt import config, crud, scrub, utils, window
 
 
 EVENT_TYPES = ("screen", "action", "window")
@@ -389,7 +389,7 @@ def read_window_events(
             #   File "...\env\lib\site-packages\loguru\_logger.py", line 1964, in _log
             #       for handler in core.handlers.values):
             #   RuntimeError: dictionary changed size during iteration
-            _window_data = dict(window_data)
+            _window_data = dict(window_data)  # copy so pop() leaves window_data intact
             _window_data.pop("state")
             logger.info(f"{_window_data=}")
         if window_data != prev_window_data:
@@ -524,8 +524,7 @@ def record(
     """
 
     utils.configure_logging(logger, LOG_LEVEL)
-
-    logger.info(f"{task_description=}")
+    logger.info(f"{scrub.scrub_text(task_description)=}")
 
     recording = create_recording(task_description)
     recording_timestamp = recording.timestamp