11
11
# See the License for the specific language governing permissions and
12
12
# limitations under the License.
13
13
14
-
15
- from typing import List , Iterator , Tuple , Optional , Any
14
+ from typing import Dict , List , Iterator , Tuple , Optional , Any
16
15
import uuid
17
16
from bisect import bisect_left
18
17
from heapq import heappush , heappop
18
+ from sortedcontainers import SortedList
19
19
20
20
from forte .utils import get_class
21
21
from forte .data .base_store import BaseStore
@@ -29,7 +29,9 @@ class DataStore(BaseStore):
29
29
# TODO: temporarily disable this for development purposes.
30
30
# pylint: disable=pointless-string-statement
31
31
32
- def __init__ (self , onto_file_path : Optional [str ] = None ):
32
+ def __init__ (
33
+ self , onto_file_path : Optional [str ] = None , dynamically_add_type = True
34
+ ):
33
35
r"""An implementation of the data store object that mainly uses
34
36
primitive types. This class will be used as the internal data
35
37
representation behind data pack. The usage of primitive types provides
@@ -121,15 +123,23 @@ def __init__(self, onto_file_path: Optional[str] = None):
121
123
onto_file_path (str, optional): the path to the ontology file.
122
124
"""
123
125
super ().__init__ ()
124
- self .onto_file_path = onto_file_path
126
+
127
+ if onto_file_path is None and not dynamically_add_type :
128
+ raise RuntimeError (
129
+ "DataStore is initialized with no existing types. Setting"
130
+ "dynamically_add_type to False without providing onto_file_path"
131
+ "will lead to no usable type in DataStore."
132
+ )
133
+ self ._onto_file_path = onto_file_path
134
+ self ._dynamically_add_type = dynamically_add_type
125
135
126
136
"""
127
137
The ``_type_attributes`` is a private dictionary that provides
128
138
``type_name``, their parent entry, and the order of corresponding attributes.
129
139
The keys are fully qualified names of every type; The value is a dictionary with
130
140
two keys. Key ``attribute`` provides an inner dictionary with all valid attributes
131
- for this type and the indices of attributes among these lists. Key ``parent_entry ``
132
- is a string representing the direct parent of this type.
141
+ for this type and the indices of attributes among these lists. Key ``parent_class ``
142
+ is a string representing the ancestors of this type.
133
143
134
144
This structure is supposed to be built dynamically. When a user adds new entries,
135
145
data_store will check unknown types and add them to ``_type_attributes``.
@@ -144,20 +154,20 @@ def __init__(self, onto_file_path: Optional[str] = None):
144
154
# "attributes": {"pos": 4, "ud_xpos": 5,
145
155
# "lemma": 6, "chunk": 7, "ner": 8, "sense": 9,
146
156
# "is_root": 10, "ud_features": 11, "ud_misc": 12},
147
- # "parent_entry ": "forte.data.ontology.top.Annotation", },
157
+ # "parent_class ": "forte.data.ontology.top.Annotation", },
148
158
# "ft.onto.base_ontology.Document": {
149
159
# "attributes": {"document_class": 4,
150
160
# "sentiment": 5, "classifications": 6},
151
- # "parent_entry ": "forte.data.ontology.top.Annotation", },
161
+ # "parent_class ": "forte.data.ontology.top.Annotation", },
152
162
# "ft.onto.base_ontology.Sentence": {
153
163
# "attributes": {"speaker": 4,
154
164
# "part_id": 5, "sentiment": 6,
155
165
# "classification": 7, "classifications": 8},
156
- # "parent_entry ": "forte.data.ontology.top.Annotation", }
166
+ # "parent_class ": "forte.data.ontology.top.Annotation", }
157
167
# }
158
168
"""
159
169
self ._type_attributes : dict = {}
160
- if self .onto_file_path :
170
+ if self ._onto_file_path :
161
171
self ._parse_onto_file ()
162
172
163
173
"""
@@ -190,6 +200,104 @@ def _new_tid(self) -> int:
190
200
r"""This function generates a new ``tid`` for an entry."""
191
201
return uuid .uuid4 ().int
192
202
203
+ def _get_type_info (self , type_name : str ) -> Dict [str , Any ]:
204
+ """
205
+ Get the dictionary containing type information from ``self._type_attributes``.
206
+ If the ``type_name`` does not currently exist and dynamic import is enabled,
207
+ this function will add a new key-value pair into ``self._type_attributes``. The
208
+ value consists of a full attribute-to-index dictionary and an empty parent set.
209
+
210
+ This function returns a dictionary containing an attribute dict and a set of parent
211
+ entries of the given type. For example:
212
+
213
+ .. code-block:: python
214
+
215
+ "ft.onto.base_ontology.Sentence": {
216
+ "attributes": {
217
+ "speaker": 4,
218
+ "part_id": 5,
219
+ "sentiment": 6,
220
+ "classification": 7,
221
+ "classifications": 8,
222
+ },
223
+ "parent_class": set(),
224
+ }
225
+
226
+ Args:
227
+ type_name (str): The fully qualified type name of a type.
228
+ Returns:
229
+ attr_dict (dict): The dictionary containing an attribute dict and a set of parent
230
+ entries of the given type.
231
+ Raises:
232
+ ValueError: When the type is not provided by ontology file and
233
+ dynamic import is disabled.
234
+ """
235
+ # check if type is in dictionary
236
+ if type_name in self ._type_attributes :
237
+ return self ._type_attributes [type_name ]
238
+ if not self ._dynamically_add_type :
239
+ raise ValueError (
240
+ f"{ type_name } is not an existing type in current data store."
241
+ f"Dynamically add type is disabled."
242
+ f"Set dynamically_add_type=True if you need to use types other than"
243
+ f"types specified in the ontology file."
244
+ )
245
+ # get attribute dictionary
246
+ attributes = self ._get_entry_attributes_by_class (type_name )
247
+
248
+ attr_dict = {}
249
+ attr_idx = constants .ENTRY_TYPE_INDEX + 1
250
+ for attr_name in attributes :
251
+ attr_dict [attr_name ] = attr_idx
252
+ attr_idx += 1
253
+
254
+ new_entry_info = {
255
+ "attributes" : attr_dict ,
256
+ "parent_class" : set (),
257
+ }
258
+ self ._type_attributes [type_name ] = new_entry_info
259
+
260
+ return new_entry_info
261
+
262
+ def _get_type_attribute_dict (self , type_name : str ) -> Dict [str , int ]:
263
+ """Get the attribute dict of an entry type. The attribute dict maps
264
+ attribute names to consecutive integers used as indices. For example:
265
+ .. code-block:: python
266
+
267
+ "attributes": {
268
+ "speaker": 4,
269
+ "part_id": 5,
270
+ "sentiment": 6,
271
+ "classification": 7,
272
+ "classifications": 8,
273
+ },
274
+
275
+ Args:
276
+ type_name (str): The fully qualified type name of a type.
277
+ Returns:
278
+ attr_dict (dict): The attribute-to-index dictionary of an entry.
279
+ """
280
+ return self ._get_type_info (type_name )["attributes" ]
281
+
282
+ def _get_type_parent (self , type_name : str ) -> str :
283
+ """Get a set of parent names of an entry type. The set is a subset of all
284
+ ancestors of the given type.
285
+ Args:
286
+ type_name (str): The fully qualified type name of a type.
287
+ Returns:
288
+ parent_class (set): The set of parent entry names of an entry.
289
+ """
290
+ return self ._get_type_info (type_name )["parent_class" ]
291
+
292
+ def _num_attributes_for_type (self , type_name : str ) -> int :
293
+ """Get the length of the attribute dict of an entry type.
294
+ Args:
295
+ type_name (str): The fully qualified type name of the new entry.
296
+ Returns:
297
+ num_attributes (int): The number of attributes of the entry type.
298
+ """
299
+ return len (self ._get_type_attribute_dict (type_name ))
300
+
193
301
def _new_annotation (self , type_name : str , begin : int , end : int ) -> List :
194
302
r"""This function generates a new annotation with default fields.
195
303
All default fields are filled with None.
@@ -207,8 +315,10 @@ def _new_annotation(self, type_name: str, begin: int, end: int) -> List:
207
315
208
316
tid : int = self ._new_tid ()
209
317
entry : List [Any ]
318
+
210
319
entry = [begin , end , tid , type_name ]
211
- entry += len (self ._type_attributes [type_name ]) * [None ]
320
+ entry += self ._num_attributes_for_type (type_name ) * [None ]
321
+
212
322
return entry
213
323
214
324
def _new_link (
@@ -230,8 +340,10 @@ def _new_link(
230
340
231
341
tid : int = self ._new_tid ()
232
342
entry : List [Any ]
343
+
233
344
entry = [parent_tid , child_tid , tid , type_name ]
234
- entry += len (self ._type_attributes [type_name ]) * [None ]
345
+ entry += self ._num_attributes_for_type (type_name ) * [None ]
346
+
235
347
return entry
236
348
237
349
def _new_group (self , type_name : str , member_type : str ) -> List :
@@ -249,21 +361,22 @@ def _new_group(self, type_name: str, member_type: str) -> List:
249
361
"""
250
362
251
363
tid : int = self ._new_tid ()
364
+
252
365
entry = [member_type , [], tid , type_name ]
253
- entry += len (self ._type_attributes [type_name ]) * [None ]
366
+ entry += self ._num_attributes_for_type (type_name ) * [None ]
367
+
254
368
return entry
255
369
256
370
def _is_annotation (self , type_name : str ) -> bool :
257
371
r"""This function takes a type_id and returns whether a type
258
372
is an annotation type or not.
259
-
260
373
Args:
261
374
type_name (str): The name of type in `self.__elements`.
262
-
263
375
Returns:
264
376
A boolean value indicating whether ``type_name`` belongs to an annotation
265
377
type or not.
266
378
"""
379
+ # TODO: use is_subclass() in DataStore to replace this
267
380
entry_class = get_class (type_name )
268
381
return issubclass (entry_class , (Annotation , AudioAnnotation ))
269
382
@@ -286,7 +399,15 @@ def add_annotation_raw(self, type_name: str, begin: int, end: int) -> int:
286
399
# annotation type entry data with default fields.
287
400
# A reference to the entry should be stored in both self.__elements and
288
401
# self.__entry_dict.
289
- raise NotImplementedError
402
+ entry = self ._new_annotation (type_name , begin , end )
403
+ try :
404
+ self .__elements [type_name ].add (entry )
405
+ except KeyError :
406
+ self .__elements [type_name ] = SortedList (key = lambda s : (s [0 ], s [1 ]))
407
+ self .__elements [type_name ].add (entry )
408
+ tid = entry [constants .TID_INDEX ]
409
+ self .__entry_dict [tid ] = entry
410
+ return tid
290
411
291
412
def add_link_raw (
292
413
self , type_name : str , parent_tid : int , child_tid : int
@@ -340,16 +461,17 @@ def set_attribute(self, tid: int, attr_name: str, attr_value: Any):
340
461
KeyError: when ``tid`` or ``attr_name`` is not found.
341
462
"""
342
463
try :
343
- entry_type = self .__entry_dict [tid ][constants .ENTRY_TYPE_INDEX ]
464
+ entry = self .__entry_dict [tid ]
465
+ entry_type = entry [constants .ENTRY_TYPE_INDEX ]
344
466
except KeyError as e :
345
467
raise KeyError (f"Entry with tid { tid } not found." ) from e
346
468
347
469
try :
348
- attr_id = self ._type_attributes [ entry_type ] [attr_name ]
470
+ attr_id = self ._get_type_attribute_dict ( entry_type ) [attr_name ]
349
471
except KeyError as e :
350
472
raise KeyError (f"{ entry_type } has no { attr_name } attribute." ) from e
351
473
352
- self . _set_attr ( tid , attr_id , attr_value )
474
+ entry [ attr_id ] = attr_value
353
475
354
476
def _set_attr (self , tid : int , attr_id : int , attr_value : Any ):
355
477
r"""This function locates the entry data with ``tid`` and sets its
@@ -381,16 +503,17 @@ def get_attribute(self, tid: int, attr_name: str) -> Any:
381
503
KeyError: when ``tid`` or ``attr_name`` is not found.
382
504
"""
383
505
try :
384
- entry_type = self .__entry_dict [tid ][constants .ENTRY_TYPE_INDEX ]
506
+ entry = self .__entry_dict [tid ]
507
+ entry_type = entry [constants .ENTRY_TYPE_INDEX ]
385
508
except KeyError as e :
386
509
raise KeyError (f"Entry with tid { tid } not found." ) from e
387
510
388
511
try :
389
- attr_id = self ._type_attributes [ entry_type ] [attr_name ]
512
+ attr_id = self ._get_type_attribute_dict ( entry_type ) [attr_name ]
390
513
except KeyError as e :
391
514
raise KeyError (f"{ entry_type } has no { attr_name } attribute." ) from e
392
515
393
- return self . _get_attr ( tid , attr_id )
516
+ return entry [ attr_id ]
394
517
395
518
def _get_attr (self , tid : int , attr_id : int ) -> Any :
396
519
r"""This function locates the entry data with ``tid`` and gets the value
@@ -439,7 +562,7 @@ def delete_entry(self, tid: int):
439
562
if self ._is_annotation (type_name ):
440
563
entry_index = bisect_left (target_list , entry_data )
441
564
else : # if it's group or link, use the index in entry_list
442
- entry_index = entry_data [- 1 ]
565
+ entry_index = entry_data [constants . ENTRY_INDEX_INDEX ]
443
566
444
567
if (
445
568
entry_index >= len (target_list )
@@ -455,8 +578,6 @@ def delete_entry(self, tid: int):
455
578
def _delete_entry_by_loc (self , type_name : str , index_id : int ):
456
579
r"""It removes an entry of `index_id` by taking both the `type_id`
457
580
and `index_id`. Called by `delete_entry()`.
458
- This function will raise an IndexError if the `type_id` or `index_id`
459
- is invalid.
460
581
461
582
Args:
462
583
type_id (int): The index of the list in ``self.__elements``.
@@ -769,7 +890,7 @@ def _parse_onto_file(self):
769
890
A user can use classes both in the ontology specification file and their parent
770
891
entries' paths.
771
892
"""
772
- if self .onto_file_path is None :
893
+ if self ._onto_file_path is None :
773
894
return
774
895
raise NotImplementedError
775
896
0 commit comments