Merge pull request #134 from trytoolchest/staging

jherr-dev · web-flow · commit a880da025d76 · 2022-03-31T17:16:00.000-05:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "toolchest-client"
-version = "0.9.0"
+version = "0.9.1"
 description = "Python client for Toolchest"
 authors = [
     "Bryce Cai <bcai@trytoolchest.com>",
diff --git a/tests/test_cellranger.py b/tests/test_cellranger.py
@@ -19,8 +19,10 @@ def test_cellranger_count_s3_inputs():
     output = toolchest.cellranger_count(
         inputs="s3://toolchest-integration-tests/cellranger/count/pbmc_1k_v3_fastqs_trimmed.tar.gz",
         database_name="GRCh38",
+        output_path=output_dir_path,
+        skip_decompression=True,
     )
-    verify_cellranger_count_outputs(output, output_dir_path)
+    verify_cellranger_count_outputs(output.output_path, output_dir_path)
 
 
 @pytest.mark.integration
@@ -40,11 +42,13 @@ def test_cellranger_count_local_inputs():
     output = toolchest.cellranger_count(
         inputs=input_dir_path,
         database_name="GRCh38",
+        output_path=output_dir_path,
+        skip_decompression=True,
     )
-    verify_cellranger_count_outputs(output, output_dir_path)
+    verify_cellranger_count_outputs(output.output_path, output_dir_path)
 
 
-def verify_cellranger_count_outputs(output, output_dir_path):
+def verify_cellranger_count_outputs(archive_path, output_dir_path):
     # Expected properties of outputs
     MIN_EXPECTED_ARCHIVE_SIZE = 34000000
     MAX_EXPECTED_ARCHIVE_SIZE = 38000000
@@ -54,12 +58,6 @@ def verify_cellranger_count_outputs(output, output_dir_path):
     EXPECTED_FILTERED_MATRIX_SIZE = 503956
 
     # Verify properties of packed archive
-    archive_path = f"{output_dir_path}output.tar.gz"
-    toolchest.download(
-        output_path=output_dir_path,
-        s3_uri=output.s3_uri,
-        skip_decompression=True,
-    )
     archive_size = os.path.getsize(archive_path)
     assert MIN_EXPECTED_ARCHIVE_SIZE <= archive_size <= MAX_EXPECTED_ARCHIVE_SIZE
 
diff --git a/tests/test_chaining.py b/tests/test_chaining.py
@@ -9,7 +9,7 @@
     toolchest.set_key(toolchest_api_key)
 
 SHI7_SINGLE_END_HASH = 1570879637
-SHOGUN_CHAINED_HASH = 33856653
+SHOGUN_CHAINED_HASH = 1708070294
 
 
 @pytest.mark.integration
@@ -29,7 +29,7 @@ def test_shi7_shogun_chaining():
     test_dir = "test_shi7_shogun_chaining"
     os.makedirs(f"./{test_dir}", exist_ok=True)
     output_dir_path = f"./{test_dir}/"
-    output_file_path_shogun = f"{output_dir_path}alignment.burst.b6"
+    output_file_path_shogun = f"{output_dir_path}alignment.bowtie2.sam"
 
     output_shi7 = toolchest.shi7(
         tool_args="-SE",
diff --git a/tests/test_shogun.py b/tests/test_shogun.py
@@ -19,7 +19,7 @@ def test_shogun_filter_and_align():
     os.makedirs(f"{test_dir}", exist_ok=True)
     input_file_path = f"./{test_dir}/combined_seqs_unfiltered.fna"
     output_file_path_filter = f"./{test_dir}/combined_seqs.filtered.fna"
-    output_file_path_align = f"./{test_dir}/alignment.burst.b6"
+    output_file_path_align = f"./{test_dir}/alignment.bowtie2.sam"
 
     s3.download_integration_test_input(
         s3_file_key="combined_seqs_unfiltered.fna",
@@ -40,5 +40,4 @@ def test_shogun_filter_and_align():
         inputs=output_file_path_filter,
         output_path=test_dir,
     )
-
-    assert hash.unordered(output_file_path_align) == 780853697
+    assert hash.unordered(output_file_path_align) == 1952162202
diff --git a/toolchest_client/__init__.py b/toolchest_client/__init__.py
@@ -4,6 +4,7 @@
 from dotenv import load_dotenv, find_dotenv
 import functools
 import sentry_sdk
+import os
 
 # set __version__ module
 try:
@@ -34,5 +35,6 @@
 sentry_sdk.init(
     "https://c7db7e7a4ac349cc974c55f1fcb7d2f7@o1171636.ingest.sentry.io/6271973",
 
-    traces_sample_rate=1.0
+    traces_sample_rate=1.0,
+    environment=os.getenv("DEPLOY_ENVIRONMENT", 'production')
 )
diff --git a/toolchest_client/api/output.py b/toolchest_client/api/output.py
@@ -42,11 +42,12 @@ def set_s3_uri(self, s3_uri):
     def set_output_path(self, output_path):
         self.output_path = output_path
 
-    def download(self, output_dir):
+    def download(self, output_dir, skip_decompression=False):
         self.output_path = download(
             output_path=output_dir,
             s3_uri=self.s3_uri,
             run_id=self.run_id,
+            skip_decompression=skip_decompression,
         )
         return self.output_path
 
diff --git a/toolchest_client/api/query.py b/toolchest_client/api/query.py
@@ -65,7 +65,7 @@ def __init__(self, stored_output=None, is_async=False, pipeline_segment_instance
     def run_query(self, tool_name, tool_version, input_prefix_mapping,
                   output_type, tool_args=None, database_name=None, database_version=None,
                   custom_database_path=None, output_name="output", output_primary_name=None,
-                  input_files=None, output_path=None, thread_statuses=None):
+                  input_files=None, output_path=None, skip_decompression=False, thread_statuses=None):
         """Executes a query to the Toolchest API.
 
         :param tool_name: Tool to be used.
@@ -80,6 +80,7 @@ def run_query(self, tool_name, tool_version, input_prefix_mapping,
         :param input_files: List of paths to be passed in as input.
         :param output_path: Path (client-side) where the output file will be downloaded.
         :param output_type: Type (e.g. GZ_TAR) of the output file
+        :param skip_decompression: Whether to skip decompression of the output file, if it is an archive
         :param thread_statuses: Statuses of all threads, shared between threads.
         """
         self.thread_name = threading.current_thread().getName()
@@ -129,7 +130,7 @@ def run_query(self, tool_name, tool_version, input_prefix_mapping,
 
         self._wait_for_job()
 
-        self._download(output_path, output_type)
+        self._download(output_path, output_type, skip_decompression)
 
         self.mark_as_failed = False
         self._update_status(Status.COMPLETE)
@@ -361,7 +362,7 @@ def _wait_for_job(self):
             leftover_delay = elapsed_time % self.WAIT_FOR_JOB_DELAY
             time.sleep(leftover_delay)
 
-    def _download(self, output_path, output_type):
+    def _download(self, output_path, output_type, skip_decompression):
         """Retrieves information needed for downloading. If ``output_path`` is given,
         downloads output to ``output_path`` and decompresses output archive, if necessary.
         """
@@ -375,6 +376,7 @@ def _download(self, output_path, output_type):
                     output_path=output_path,
                     output_file_keys=output_file_keys,
                     output_type=output_type,
+                    skip_decompression=skip_decompression,
                 )
                 self._update_status(Status.TRANSFERRED_TO_CLIENT)
         except ToolchestDownloadError as err:
diff --git a/toolchest_client/files/http.py b/toolchest_client/files/http.py
@@ -5,6 +5,7 @@
 Functions for handling files given by HTTP / HTTPS URLs.
 """
 from urllib.parse import urlparse
+from urllib3.exceptions import LocationParseError
 
 import requests
 from requests.exceptions import HTTPError, InvalidURL, InvalidSchema
@@ -28,7 +29,7 @@ def path_is_http_url(path):
     """
     try:
         get_http_url_file_size(get_url_with_protocol(path))
-    except (InvalidURL, HTTPError, InvalidSchema):
+    except (InvalidURL, HTTPError, InvalidSchema, LocationParseError):
         return False
 
     return True
diff --git a/toolchest_client/tools/shogun.py b/toolchest_client/tools/shogun.py
@@ -28,7 +28,7 @@ def __init__(self, tool_args, output_name, inputs, output_path,
             parallel_enabled=False,
             output_type=OutputType.GZ_TAR,
             output_is_directory=True,
-            output_names=["alignment.burst.b6"],
+            output_names=["alignment.bowtie2.sam"],
             **kwargs,
         )
 
diff --git a/toolchest_client/tools/tool.py b/toolchest_client/tools/tool.py
@@ -37,7 +37,7 @@ def __init__(self, tool_name, tool_version, tool_args, output_name,
                  max_input_bytes_per_file_parallel=FOUR_POINT_FIVE_GIGABYTES,
                  group_paired_ends=False, compress_inputs=False,
                  output_type=OutputType.FLAT_TEXT, output_is_directory=True,
-                 output_names=None, is_async=False):
+                 output_names=None, is_async=False, skip_decompression=False):
         self.tool_name = tool_name
         self.tool_version = tool_version
         self.tool_args = tool_args
@@ -76,6 +76,7 @@ def __init__(self, tool_name, tool_version, tool_args, output_name,
         self.thread_outputs = {}
         self.output_names = output_names or []
         self.is_async = is_async
+        self.skip_decompression = skip_decompression
         signal.signal(signal.SIGTERM, self._handle_termination)
         signal.signal(signal.SIGINT, self._handle_termination)
 
@@ -435,6 +436,7 @@ def run(self):
                 "input_prefix_mapping": self.input_prefix_mapping,
                 "output_path": temp_parallel_output_file_path if should_run_in_parallel else non_parallel_output_path,
                 "output_type": self.output_type,
+                "skip_decompression": self.skip_decompression,
             })
 
             # Add non-distinct dictionary for status updates

Original file line number	Diff line number	Diff line change
`@@ -28,7 +28,7 @@ def __init__(self, tool_args, output_name, inputs, output_path,`
`28`	`28`	`parallel_enabled=False,`
`29`	`29`	`output_type=OutputType.GZ_TAR,`
`30`	`30`	`output_is_directory=True,`
`31`		`- output_names=["alignment.burst.b6"],`
	`31`	`+ output_names=["alignment.bowtie2.sam"],`
`32`	`32`	`**kwargs,`
`33`	`33`	`)`
`34`	`34`