diffgram · PJEstrada · Sep 3, 2021 · Sep 3, 2021 · Sep 3, 2021 · Sep 3, 2021
diff --git a/sdk/diffgram/core/core.py b/sdk/diffgram/core/core.py
@@ -53,6 +53,8 @@ def __init__(
 			project_string_id = project_string_id,
 			client_id = client_id, 
 			client_secret = client_secret)
+		self.client_id = client_id
+		self.client_secret = client_secret
 		self.file = FileConstructor(self)
 		self.train = Train(self)
 		self.job = Job(self)

diff --git a/sdk/diffgram/core/diffgram_dataset_iterator.py b/sdk/diffgram/core/diffgram_dataset_iterator.py
@@ -1,18 +1,29 @@
 from PIL import Image, ImageDraw
 from imageio import imread
 import numpy as np
+import traceback
+import sys
+from threading import Thread
+from concurrent.futures import ThreadPoolExecutor
+
 
 class DiffgramDatasetIterator:
 
-    def __init__(self, project, diffgram_file_id_list, validate_ids = True):
+    def __init__(self, project,
+                 diffgram_file_id_list,
+                 validate_ids = True,
+                 max_size_cache = 1073741824,
+                 max_num_concurrent_fetches = 25):
         """
 
         :param project (sdk.core.core.Project): A Project object from the Diffgram SDK
         :param diffgram_file_list (list): An arbitrary number of file ID's from Diffgram.
         """
         self.diffgram_file_id_list = diffgram_file_id_list
-
+        self.max_size_cache = max_size_cache
+        self.pool = ThreadPoolExecutor(max_num_concurrent_fetches)
         self.project = project
+        self.file_cache = {}
         self._internal_file_list = []
         if validate_ids:
             self.__validate_file_ids()
@@ -25,22 +36,58 @@ def __iter__(self):
     def __len__(self):
         return len(self.diffgram_file_id_list)
 
-    def __getitem__(self, idx):
-        diffgram_file = self.project.file.get_by_id(self.diffgram_file_id_list[idx], with_instances = True)
+    def save_file_in_cache(self, idx, instance_data):  # Store instance_data under idx, evicting old entries first if needed
+        # Evict when the cache dict exceeds max_size_cache bytes (1GB default). NOTE: sys.getsizeof is shallow, cached values are not counted.
+        if sys.getsizeof(self.file_cache) > self.max_size_cache:
+            keys = list(self.file_cache.keys())
+            oldest_keys = keys[:10]  # Dicts preserve insertion order, so the first 10 keys are the oldest
+            for k in oldest_keys:
+                self.file_cache.pop(k, None)  # Tolerate a key that another pool thread already evicted
+
+        self.file_cache[idx] = instance_data
+
+    def get_next_n_items(self, idx, num_items = 25):
+        """
+            Schedule background fetches for the items after idx so they land in the cache.
+        :param idx: index of the item that was just requested
+        :param num_items: size of the look-ahead window, counted from idx (clamped to the list length)
+        :return: True once the fetches have been submitted to the thread pool
+        """
+        latest_index = idx + num_items
+        if latest_index >= len(self.diffgram_file_id_list):
+            latest_index = len(self.diffgram_file_id_list)
+
+        for i in range(idx + 1, latest_index):
+            self.pool.submit(self.__get_file_data_for_index, i)  # Pass the int itself: submit() forwards positional args as-is
+        return True
+
+    def __get_file_data_for_index(self, idx):  # Fetch the file at position idx, build its instance data and cache it
+        diffgram_file = self.project.file.get_by_id(self.diffgram_file_id_list[idx], with_instances = True, use_session = False)
         instance_data = self.get_file_instances(diffgram_file)
+        self.save_file_in_cache(idx, instance_data)  # Cached so a later __getitem__/__next__ for idx can skip the fetch
         return instance_data
 
+    def __getitem__(self, idx):  # Return the item at idx, served from the cache when present
+        cached = self.file_cache.get(idx)  # Single lookup: a pool thread may evict idx between two get() calls
+        if cached:
+            return cached
+        result = self.__get_file_data_for_index(idx)
+        # Cache miss: warm the cache in the background for the items that follow this one
+        self.get_next_n_items(idx, num_items = 25)
+
+        return result
+
     def __next__(self):
-        file_id = self.diffgram_file_id_list[self.current_file_index]
-        diffgram_file = self.project.file.get_by_id(file_id, with_instances = True)
-        instance_data = self.get_file_instances(diffgram_file)
+        instance_data = self.file_cache.get(self.current_file_index)  # Prefer an entry already stored in the cache
+        if not instance_data:  # Cache miss: fetch now; both paths fall through so the index always advances
+            instance_data = self.__get_file_data_for_index(self.current_file_index)
         self.current_file_index += 1
         return instance_data
 
     def __validate_file_ids(self):
         if not self.diffgram_file_id_list:
             return
-        result = self.project.file.file_list_exists(self.diffgram_file_id_list)
+        result = self.project.file.file_list_exists(self.diffgram_file_id_list, use_session = False)
         if not result:
             raise Exception(
                 'Some file IDs do not belong to the project. Please provide only files from the same project.')
@@ -56,7 +103,9 @@ def get_image_data(self, diffgram_file):
                     if i < MAX_RETRIES - 1:
                         continue
                     else:
-                        raise e
+                        print('Fetch Image Failed: Diffgram File ID: {}'.format(diffgram_file.id))
+                        print(traceback.format_exc())
+                        return None
             return image
         else:
             raise Exception('Pytorch datasets only support images. Please provide only file_ids from images')

diff --git a/sdk/diffgram/file/file_constructor.py b/sdk/diffgram/file/file_constructor.py
@@ -4,7 +4,8 @@
 from diffgram.job.job import Job
 import json
 import os
-
+import requests
+from requests.auth import HTTPDigestAuth
 
 class FileConstructor():
     """
@@ -401,7 +402,7 @@ def get_file_list(self, id_list: list, with_instances: bool = False):
 
         raise NotImplementedError
 
-    def file_list_exists(self, id_list):
+    def file_list_exists(self, id_list, use_session = True):
         """
             Verifies that the given ID list exists inside the project.
         :param id_list:
@@ -413,10 +414,16 @@ def file_list_exists(self, id_list):
         spec_dict = {
             'file_id_list': id_list
         }
-        response = self.client.session.post(
-            self.client.host + url,
-            json = spec_dict)
-
+        if use_session:
+            response = self.client.session.post(
+                self.client.host + url,
+                json = spec_dict)
+        else:
+            response = requests.post(
+                url = self.client.host + url,
+                json = spec_dict,
+                auth = HTTPDigestAuth(self.client.client_id, self.client.client_secret)
+            )
         self.client.handle_errors(response)
 
         response_json = response.json()
@@ -428,7 +435,8 @@ def file_list_exists(self, id_list):
 
     def get_by_id(self,
                   id: int,
-                  with_instances: bool = False):
+                  with_instances: bool = False,
+                  use_session = True):
         """
         returns Diffgram File object
         """
@@ -450,9 +458,15 @@ def get_by_id(self,
             }
             file_response_key = 'file_serialized'
 
-        response = self.client.session.post(
-            self.client.host + endpoint,
-            json = spec_dict)
+        if use_session:
+            response = self.client.session.post(
+                self.client.host + endpoint,
+                json = spec_dict)
+        else:
+            # Add Auth
+            response = requests.post(self.client.host + endpoint,
+                          json = spec_dict,
+                          auth = HTTPDigestAuth(self.client.client_id, self.client.client_secret))
 
         self.client.handle_errors(response)
 

diff --git a/sdk/diffgram/file/view.py b/sdk/diffgram/file/view.py
@@ -1,56 +1,60 @@
+import requests
+from requests.auth import HTTPDigestAuth
 
 
 def get_file_id():
-	"""
-	Get Project file id
+    """
+    Get Project file id
 
-	Arguments
-		project string id
-		working directory?
-		filename??
+    Arguments
+        project string id
+        working directory?
+        filename??
 
-	Future
-		How are we handling video with this?
-		API method for this?
+    Future
+        How are we handling video with this?
+        API method for this?
 
-	"""
-	pass
+    """
+    pass  # Stub: no lookup is implemented, the function takes no arguments and returns None
 
 
+def get_label_file_dict(self, use_session = True):
+    """
+    Get Project label file id dict for project
 
-def get_label_file_dict(self):
-	"""
-	Get Project label file id dict for project
+    Arguments
+        self, use_session (True: send via self.session; False: standalone digest-auth request)
 
-	Arguments
-		self
-
-	Expects
-		self.project_string_id
-		self.directory_id 
+    Expects
+        self.project_string_id
+        self.directory_id
 
-	Returns
-		sets self.name_to_file_id to the dict returned
+    Returns
+        sets self.name_to_file_id to the dict returned; raises Exception on a missing/invalid project string or a failed response
 
-	"""
-	if self.project_string_id is None:
-		raise Exception("No project string." + \
-						"Set a project string using .auth()")
+    """
+    if self.project_string_id is None:
+        raise Exception("No project string. " + \
+                        "Set a project string using .auth()")
 
-	if type(self.project_string_id) != str:
-		raise Exception("project_string_id must be of type String")
+    if not isinstance(self.project_string_id, str):
+        raise Exception("project_string_id must be of type String")
 
-	endpoint = "/api/v1/project/" + self.project_string_id + \
-			   "/labels/view/name_to_file_id"
+    endpoint = "/api/v1/project/" + self.project_string_id + \
+               "/labels/view/name_to_file_id"
+    if use_session:
+        response = self.session.get(self.host + endpoint)
+    else:
+        # No shared session: authenticate this single request with the client credentials
+        response = requests.get(self.host + endpoint,
+                                headers = {'directory_id': str(self.directory_id)},
+                                auth = HTTPDigestAuth(self.client_id, self.client_secret))
 
-	response = self.session.get(self.host + endpoint)
-
-	self.handle_errors(response)
-
-	data = response.json()
-
-	if data["log"]["success"] == True:
-		self.name_to_file_id = data["name_to_file_id"]
-	else:
-		raise Exception(data["log"]["errors"])
+    self.handle_errors(response)
 
+    data = response.json()
+    if data["log"]["success"] == True:
+        self.name_to_file_id = data["name_to_file_id"]
+    else:
+        raise Exception(data["log"]["errors"])
diff --git a/sdk/diffgram/pytorch_diffgram/diffgram_pytorch_dataset.py b/sdk/diffgram/pytorch_diffgram/diffgram_pytorch_dataset.py
@@ -28,9 +28,9 @@ def __get_next_page_of_data(self):
     def __getitem__(self, idx):
         if torch.is_tensor(idx):
             idx = idx.tolist()
-        diffgram_file = self.project.file.get_by_id(self.diffgram_file_id_list[idx], with_instances = True)
 
-        sample = self.get_file_instances(diffgram_file)
+        sample = super().__getitem__(idx)
+
         if 'x_min_list' in sample:
             sample['x_min_list'] = torch.Tensor(sample['x_min_list'])
         if 'x_max_list' in sample:

diff --git a/sdk/diffgram/tensorflow_diffgram/diffgram_tensorflow_dataset.py b/sdk/diffgram/tensorflow_diffgram/diffgram_tensorflow_dataset.py
@@ -54,18 +54,15 @@ def __getitem__(self, idx):
         return tf_example
 
     def get_tf_train_example(self, idx):
-        file_id = self.diffgram_file_id_list[idx]
-        diffgram_file = self.project.file.get_by_id(file_id, with_instances = True)
-        image = self.get_image_data(diffgram_file)
-        instance_data = self.get_file_instances(diffgram_file)
+        instance_data = super().__getitem__(idx)
         filename, file_extension = os.path.splitext(instance_data['diffgram_file'].image['original_filename'])
         label_names_bytes = [x.encode() for x in instance_data['label_name_list']]
         tf_example_dict = {
             'image/height': self.int64_feature(instance_data['diffgram_file'].image['height']),
             'image/width': self.int64_feature(instance_data['diffgram_file'].image['width']),
             'image/filename': self.bytes_feature(filename.encode()),
             'image/source_id': self.bytes_feature(filename.encode()),
-            'image/encoded': self.bytes_feature(image.tobytes()),
+            'image/encoded': self.bytes_feature(instance_data['image'].tobytes()),
             'image/format': self.bytes_feature(file_extension.encode()),
             'image/object/bbox/xmin': self.float_list_feature(instance_data['x_min_list']),
             'image/object/bbox/xmax': self.float_list_feature(instance_data['x_max_list']),