Skip to content

Commit

Permalink
DB mode minor issues patches (#179)
Browse files Browse the repository at this point in the history
  • Loading branch information
jpwchang authored Sep 29, 2022
1 parent ac4785d commit 82659a8
Show file tree
Hide file tree
Showing 8 changed files with 187 additions and 79 deletions.
12 changes: 12 additions & 0 deletions convokit/model/convoKitIndex.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,18 @@ def __init__(
self.type_check = True # toggle-able to enable/disable type checks on metadata additions
self.lock_metadata_deletion = {"utterance": True, "conversation": True, "speaker": True}

def create_new_index(self, obj_type: str, key: str):
    """
    Create a new entry in the obj_type index with a blank type list,
    representing an "Any" type which might be later refined.

    If the key is already present in the obj_type index, the existing
    entry (and whatever type list it holds) is left untouched.

    :param obj_type: utterance, conversation, or speaker
    :param key: string name of the index entry to create
    """
    # Guard first so an already-registered type list is never reset to blank.
    if key in self.indices[obj_type]:
        return
    self.indices[obj_type][key] = []

def update_index(self, obj_type: str, key: str, class_type: str):
"""
Append the class_type to the index
Expand Down
51 changes: 37 additions & 14 deletions convokit/model/convoKitMeta.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,23 @@ class ConvoKitMeta(MutableMapping, dict):
ConvoKitMeta is a dictlike object that stores the metadata attributes of a corpus component
"""

def __init__(self, owner, convokit_index, obj_type):
def __init__(self, owner, convokit_index, obj_type, overwrite=False):
self.owner = owner # Corpus or CorpusComponent
self.index: ConvoKitIndex = convokit_index
self.obj_type = obj_type

self._get_storage().initialize_data_for_component("meta", self.storage_key)
self._get_storage().initialize_data_for_component(
"meta", self.storage_key, overwrite=overwrite
)

@property
def storage_key(self) -> str:
    """
    Key used to address this object's metadata in storage: the object type
    and the owner's id, joined by an underscore.
    """
    return "{}_{}".format(self.obj_type, self.owner.id)

def __getitem__(self, item):
item_type = self.index.get_index(self.obj_type).get(item, None)
return self._get_storage().get_data("meta", self.storage_key, item, item_type)
return self._get_storage().get_data(
"meta", self.storage_key, item, self.index.get_index(self.obj_type)
)

def _get_storage(self):
# special case for Corpus meta since that's the only time owner is not a CorpusComponent
Expand All @@ -44,12 +47,15 @@ def _get_storage(self):

@staticmethod
def _check_type_and_update_index(index, obj_type, key, value):
if not isinstance(value, type(None)): # do nothing to index if value is None
if key not in index.indices[obj_type]:
if key not in index.indices[obj_type]:
if isinstance(value, type(None)): # new entry with None type means can't infer type yet
index.create_new_index(obj_type, key=key)
else:
type_ = _optimized_type_check(value)
index.update_index(obj_type, key=key, class_type=type_)
else:
# entry exists
else:
# entry exists
if not isinstance(value, type(None)): # do not update index if value is None
if index.get_index(obj_type)[key] != ["bin"]: # if "bin" do no further checks
if str(type(value)) not in index.get_index(obj_type)[key]:
new_type = _optimized_type_check(value)
Expand All @@ -66,8 +72,9 @@ def __setitem__(self, key, value):

if self.index.type_check:
ConvoKitMeta._check_type_and_update_index(self.index, self.obj_type, key, value)
item_type = self.index.get_index(self.obj_type).get(key, None)
self._get_storage().update_data("meta", self.storage_key, key, value, item_type)
self._get_storage().update_data(
"meta", self.storage_key, key, value, self.index.get_index(self.obj_type)
)

def __delitem__(self, key):
if self.obj_type == "corpus":
Expand All @@ -87,19 +94,35 @@ def __delitem__(self, key):
self._get_storage().delete_data("meta", self.storage_key, key)

def __iter__(self):
return self._get_storage().get_data("meta", self.storage_key).__iter__()
return (
self._get_storage()
.get_data("meta", self.storage_key, index=self.index.get_index(self.obj_type))
.__iter__()
)

def __len__(self):
return self._get_storage().get_data("meta", self.storage_key).__len__()
return (
self._get_storage()
.get_data("meta", self.storage_key, index=self.index.get_index(self.obj_type))
.__len__()
)

def __contains__(self, x):
return self._get_storage().get_data("meta", self.storage_key).__contains__(x)
return (
self._get_storage()
.get_data("meta", self.storage_key, index=self.index.get_index(self.obj_type))
.__contains__(x)
)

def __repr__(self) -> str:
    """Return ``ConvoKitMeta(...)`` wrapping the repr of this object's to_dict() output."""
    contents = self.to_dict().__repr__()
    return "ConvoKitMeta({})".format(contents)

def to_dict(self):
return dict(self._get_storage().get_data("meta", self.storage_key))
return dict(
self._get_storage().get_data(
"meta", self.storage_key, index=self.index.get_index(self.obj_type)
)
)

def reinitialize_from(self, other: Union["ConvoKitMeta", dict]):
"""
Expand Down
16 changes: 12 additions & 4 deletions convokit/model/corpusComponent.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def __init__(

if meta is None:
meta = dict()
self.meta = self.init_meta(meta)
self._meta = self.init_meta(meta)

def get_owner(self):
    """Return the value stored in ``self._owner`` (None for an ownerless, standalone component)."""
    return self._owner
Expand Down Expand Up @@ -64,11 +64,11 @@ def set_owner(self, owner):
previous_owner.storage.delete_data("meta", self.meta.storage_key)
else:
del self._temp_storage
self.meta = self.init_meta(meta_vals)
self._meta = self.init_meta(meta_vals)

owner = property(get_owner, set_owner)

def init_meta(self, meta):
def init_meta(self, meta, overwrite=False):
if self._owner is None:
# ConvoKitMeta instances are not allowed for ownerless (standalone)
# components since they must be backed by a StorageManager. In this
Expand All @@ -79,7 +79,7 @@ def init_meta(self, meta):
else:
if isinstance(meta, ConvoKitMeta) and meta.owner is self._owner:
return meta
ck_meta = ConvoKitMeta(self, self.owner.meta_index, self.obj_type)
ck_meta = ConvoKitMeta(self, self.owner.meta_index, self.obj_type, overwrite=overwrite)
for key, value in meta.items():
ck_meta[key] = value
return ck_meta
Expand All @@ -100,6 +100,14 @@ def set_id(self, value):

id = property(get_id, set_id)

def get_meta(self):
    """Return the metadata object currently held in ``self._meta``."""
    return self._meta

def set_meta(self, new_meta):
    """
    Replace this component's metadata with new_meta.

    The new value is not stored as-is: it is passed through init_meta with
    overwrite=True, and whatever init_meta returns becomes ``self._meta``.

    :param new_meta: the replacement metadata, in any form init_meta accepts
    """
    rebuilt = self.init_meta(new_meta, overwrite=True)
    self._meta = rebuilt

# expose the accessor pair as the ``meta`` attribute
meta = property(fget=get_meta, fset=set_meta)

def get_data(self, property_name):
if self._owner is None:
return self._temp_storage[property_name]
Expand Down
65 changes: 51 additions & 14 deletions convokit/model/corpus_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def unpack_binary_data_for_utts(utterances, filename, utterance_index, exclude_m
:return:
"""
for field, field_types in utterance_index.items():
if field_types[0] == "bin" and field not in exclude_meta:
if len(field_types) > 0 and field_types[0] == "bin" and field not in exclude_meta:
with open(os.path.join(filename, field + "-bin.p"), "rb") as f:
l_bin = pickle.load(f)
for i, ut in enumerate(utterances):
Expand Down Expand Up @@ -211,7 +211,7 @@ def unpack_binary_data(filename, objs_data, object_index, obj_type, exclude_meta
"""
# unpack speaker meta
for field, field_types in object_index.items():
if field_types[0] == "bin" and field not in exclude_meta:
if len(field_types) > 0 and field_types[0] == "bin" and field not in exclude_meta:
with open(os.path.join(filename, field + "-{}-bin.p".format(obj_type)), "rb") as f:
l_bin = pickle.load(f)
for obj, data in objs_data.items():
Expand Down Expand Up @@ -501,7 +501,7 @@ def dump_helper_bin(d: ConvoKitMeta, d_bin: Dict, fields_to_skip=None) -> Dict:
if k in fields_to_skip:
continue
try:
if obj_idx[k][0] == "bin":
if len(obj_idx[k]) > 0 and obj_idx[k][0] == "bin":
d_out[k] = "{}{}{}".format(BIN_DELIM_L, len(d_bin[k]), BIN_DELIM_R)
d_bin[k].append(v)
else:
Expand Down Expand Up @@ -589,17 +589,26 @@ def load_binary_metadata(filename, index, exclude_meta=None):
if meta_type == ["bin"] and (
exclude_meta is None or meta_key not in exclude_meta[component_type]
):
# filename format differs for utterances versus everything else
filename_suffix = (
"-bin.p"
if component_type == "utterance"
else "-{}-bin.p".format(component_type)
)
try:
with open(
os.path.join(filename, meta_key + "-{}-bin.p".format(component_type)), "rb"
) as f:
with open(os.path.join(filename, meta_key + filename_suffix), "rb") as f:
l_bin = pickle.load(f)
binary_data[component_type][meta_key] = l_bin
except FileNotFoundError:
warn(
f"Metadata field {meta_key} is specified to have binary type but no saved binary data was found. This field will be skipped."
)
return binary_data
# update the exclude_meta list to force this field to get skipped
# in the subsequent corpus loading logic
if exclude_meta is None:
exclude_meta = defaultdict(list)
exclude_meta[component_type].append(meta_key)
return binary_data, exclude_meta


def load_jsonlist_to_db(
Expand Down Expand Up @@ -661,9 +670,9 @@ def load_jsonlist_to_db(
del utt_meta[exclude_key]
if bin_meta is not None:
for key, bin_list in bin_meta.items():
bin_locator = utt_meta[key]
bin_locator = utt_meta.get(key, None)
if (
type(bin_locator) == "str"
type(bin_locator) == str
and bin_locator.startswith(BIN_DELIM_L)
and bin_locator.endswith(BIN_DELIM_R)
):
Expand Down Expand Up @@ -718,9 +727,9 @@ def load_json_to_db(
)
if bin_meta is not None:
for key, bin_list in bin_meta.items():
bin_locator = meta[key]
bin_locator = meta.get(key, None)
if (
type(bin_locator) == "str"
type(bin_locator) == str
and bin_locator.startswith(BIN_DELIM_L)
and bin_locator.endswith(BIN_DELIM_R)
):
Expand All @@ -745,9 +754,9 @@ def load_corpus_info_to_db(filename, db, collection_prefix, exclude_meta=None, b
corpus_meta = {k: v for k, v in json.load(f).items() if k not in exclude_meta}
if bin_meta is not None:
for key, bin_list in bin_meta.items():
bin_locator = corpus_meta[key]
bin_locator = corpus_meta.get(key, None)
if (
type(bin_locator) == "str"
type(bin_locator) == str
and bin_locator.startswith(BIN_DELIM_L)
and bin_locator.endswith(BIN_DELIM_R)
):
Expand All @@ -758,6 +767,15 @@ def load_corpus_info_to_db(filename, db, collection_prefix, exclude_meta=None, b
)


def clean_up_excluded_meta(meta_index, exclude_meta):
    """
    Remove excluded metadata from the metadata index

    :param meta_index: index object; its del_from_index(component_type, key)
        is called once per excluded key
    :param exclude_meta: mapping from component type to the iterable of
        metadata keys excluded for that component type
    """
    # Flatten the mapping lazily into (component type, key) pairs so that
    # deletions happen in the same order as a nested loop would produce.
    excluded_pairs = (
        (component_type, key)
        for component_type, excluded_keys in exclude_meta.items()
        for key in excluded_keys
    )
    for component_type, key in excluded_pairs:
        meta_index.del_from_index(component_type, key)


def populate_db_from_file(
filename,
db,
Expand All @@ -775,7 +793,7 @@ def populate_db_from_file(
used by a DBStorageManager, sourcing data from the valid ConvoKit Corpus
data pointed to by the filename parameter.
"""
binary_meta = load_binary_metadata(
binary_meta, updated_exclude_meta = load_binary_metadata(
filename,
meta_index,
{
Expand All @@ -786,6 +804,14 @@ def populate_db_from_file(
},
)

# exclusion lists may have changed if errors were encountered while loading
# the binary metadata
if updated_exclude_meta is not None:
exclude_utterance_meta = updated_exclude_meta["utterance"]
exclude_conversation_meta = updated_exclude_meta["conversation"]
exclude_speaker_meta = updated_exclude_meta["speaker"]
exclude_overall_meta = updated_exclude_meta["corpus"]

# first load the utterance data
inserted_utt_ids = load_jsonlist_to_db(
os.path.join(filename, "utterances.jsonl"),
Expand All @@ -811,6 +837,17 @@ def populate_db_from_file(
filename, db, collection_prefix, exclude_overall_meta, binary_meta["corpus"]
)

# make sure skipped metadata isn't kept in the final index
clean_up_excluded_meta(
meta_index,
{
"utterance": exclude_utterance_meta,
"conversation": exclude_conversation_meta,
"speaker": exclude_speaker_meta,
"corpus": exclude_overall_meta,
},
)

return inserted_utt_ids


Expand Down
Loading

0 comments on commit 82659a8

Please sign in to comment.