Skip to content

Commit d784d37

Browse files
authored
Merge pull request #211 from KrishPatel13/feature/scrub-final
feat(scrub): Scrubbing and Fix Atomacos
2 parents 45b77ce + 5ceea9a commit d784d37

File tree

11 files changed

+665
-31
lines changed

11 files changed

+665
-31
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,14 @@ cache
1313

1414
# db
1515
*.db
16+
*.db-journal
1617

1718
# VSCode
1819
.VSCode
20+
.vsCode
1921

2022
# Generated performance charts
2123
performance
2224

2325
# Generated when adding editable dependencies in requirements.txt (-e)
24-
src
26+
src

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ source .venv/bin/activate
3131
pip install wheel
3232
pip install -r requirements.txt
3333
pip install -e .
34+
python -m spacy download en_core_web_trf
3435
alembic upgrade head
3536
pytest
3637
```

assets/test_scrub_image.png

49 KB
Loading

openadapt/config.py

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,14 @@
1+
"""Module containing configuration settings for the openadapt application.
2+
3+
Usage:
4+
5+
from openadapt import config
6+
...
7+
config.<setting>
8+
...
9+
10+
"""
11+
112
import multiprocessing
213
import os
314
import pathlib
@@ -13,12 +24,16 @@
1324
"DB_ECHO": False,
1425
"DB_FNAME": "openadapt.db",
1526
"OPENAI_API_KEY": "<set your api key in .env>",
16-
#"OPENAI_MODEL_NAME": "gpt-4",
27+
# "OPENAI_MODEL_NAME": "gpt-4",
1728
"OPENAI_MODEL_NAME": "gpt-3.5-turbo",
1829
# may incur significant performance penalty
1930
"RECORD_READ_ACTIVE_ELEMENT_STATE": False,
2031
# TODO: remove?
2132
"REPLAY_STRIP_ELEMENT_STATE": True,
33+
# ACTION EVENT CONFIGURATIONS
34+
"ACTION_TEXT_SEP": "-",
35+
"ACTION_TEXT_NAME_PREFIX": "<",
36+
"ACTION_TEXT_NAME_SUFFIX": ">"
2237
}
2338

2439

@@ -38,8 +53,61 @@ def getenv_fallback(var_name):
3853
ROOT_DIRPATH = pathlib.Path(__file__).parent.parent.resolve()
3954
DB_FPATH = ROOT_DIRPATH / DB_FNAME
4055
DB_URL = f"sqlite:///{DB_FPATH}"
56+
DIRNAME_PERFORMANCE_PLOTS = "performance"
4157

4258
if multiprocessing.current_process().name == "MainProcess":
4359
for key, val in locals().items():
4460
if not key.startswith("_") and key.isupper():
4561
logger.info(f"{key}={val}")
62+
63+
64+
# SCRUBBING CONFIGURATIONS
65+
SCRUB_ENABLED = True
66+
SCRUB_CHAR = "*"
67+
SCRUB_LANGUAGE = "en"
68+
SCRUB_CONFIG_TRF = {
69+
"nlp_engine_name": "spacy",
70+
"models": [
71+
{
72+
"lang_code": "en",
73+
"model_name": "en_core_web_trf"
74+
}
75+
],
76+
}
77+
DEFAULT_SCRUB_FILL_COLOR = (255,)
78+
SCRUB_IGNORE_ENTITIES = [
79+
# 'US_PASSPORT',
80+
# 'US_DRIVER_LICENSE',
81+
# 'CRYPTO',
82+
# 'UK_NHS',
83+
# 'PERSON',
84+
# 'CREDIT_CARD',
85+
# 'US_BANK_NUMBER',
86+
# 'PHONE_NUMBER',
87+
# 'US_ITIN',
88+
# 'AU_ABN',
89+
"DATE_TIME",
90+
# 'NRP',
91+
# 'SG_NRIC_FIN',
92+
# 'AU_ACN',
93+
# 'IP_ADDRESS',
94+
# 'EMAIL_ADDRESS',
95+
"URL",
96+
# 'IBAN_CODE',
97+
# 'AU_TFN',
98+
# 'LOCATION',
99+
# 'AU_MEDICARE',
100+
# 'US_SSN',
101+
# 'MEDICAL_LICENSE'
102+
]
103+
SCRUB_KEYS_HTML = [
104+
"text",
105+
"canonical_text",
106+
"title",
107+
"state",
108+
"task_description",
109+
"key_char",
110+
"canonical_key_char",
111+
"key_vk",
112+
"children",
113+
]

openadapt/models.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import numpy as np
77
import sqlalchemy as sa
88

9-
from openadapt import db, utils, window
9+
from openadapt import config, db, utils, window
1010

1111

1212
# https://groups.google.com/g/sqlalchemy/c/wlr7sShU6-k
@@ -133,9 +133,9 @@ def canonical_key(self):
133133
)
134134

135135
def _text(self, canonical=False):
136-
sep = self._text_sep
137-
name_prefix = self._text_name_prefix
138-
name_suffix = self._text_name_suffix
136+
sep = config.ACTION_TEXT_SEP
137+
name_prefix = config.ACTION_TEXT_NAME_PREFIX
138+
name_suffix = config.ACTION_TEXT_NAME_SUFFIX
139139
if canonical:
140140
key_attr = self.canonical_key
141141
key_name_attr = self.canonical_key_name
@@ -201,10 +201,6 @@ def __str__(self):
201201
rval = " ".join(attrs)
202202
return rval
203203

204-
_text_sep = "-"
205-
_text_name_prefix = "<"
206-
_text_name_suffix = ">"
207-
208204
@classmethod
209205
def from_children(cls, children_dicts):
210206
children = [

openadapt/record.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
import fire
2323
import mss.tools
2424

25-
from openadapt import config, crud, utils, window
25+
from openadapt import config, crud, scrub, utils, window
2626

2727

2828
EVENT_TYPES = ("screen", "action", "window")
@@ -389,7 +389,7 @@ def read_window_events(
389389
# File "...\env\lib\site-packages\loguru\_logger.py", line 1964, in _log
390390
# for handler in core.handlers.values():
391391
# RuntimeError: dictionary changed size during iteration
392-
_window_data = dict(window_data)
392+
_window_data = window_data
393393
_window_data.pop("state")
394394
logger.info(f"{_window_data=}")
395395
if window_data != prev_window_data:
@@ -524,8 +524,7 @@ def record(
524524
"""
525525

526526
utils.configure_logging(logger, LOG_LEVEL)
527-
528-
logger.info(f"{task_description=}")
527+
logger.info(f"{scrub.scrub_text(task_description)=}")
529528

530529
recording = create_recording(task_description)
531530
recording_timestamp = recording.timestamp

0 commit comments

Comments
 (0)