1 change: 0 additions & 1 deletion docs/index.rst
@@ -29,7 +29,6 @@ Tools
Toolchest currently supports the following tools:

* Bowtie2 (`bowtie2`)
-* Cutadapt (`cutadapt`)
* Kraken2 (`kraken2`)
* STAR (`STAR`)
* Unicycler (`unicycler`)
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "toolchest-client"
version = "0.7.30"
version = "0.7.39"
description = "Python client for Toolchest"
authors = [
"Bryce Cai <bcai@trytoolchest.com>",
@@ -23,6 +23,7 @@ packages = [
"Bug Tracker" = "https://github.com/trytoolchest/toolchest-client-python/issues"

[tool.poetry.dependencies]
+boto3 = "^1.18.29"
python = "^3.6"
requests = "^2.25.1"
python-dotenv = "^0.18.0"
@@ -31,7 +32,6 @@ importlib-metadata = { version = "~=1.0", python = "<3.8" }

[tool.poetry.dev-dependencies]
pytest = "^6.2.4"
-boto3 = "^1.18.29"

[[tool.poetry.source]]
name = "pypi-public"
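Moving boto3 from dev-dependencies to runtime dependencies lines up with the new S3 input support exercised in tests/test_kraken2.py below, presumably because the client now touches S3 at runtime. A minimal sketch of that usage, assuming a valid TOOLCHEST_API_KEY is configured; the S3 object is the public integration-test input used in the new test, and the output directory is illustrative:

```python
import toolchest_client as toolchest

# Pass an S3 URI directly as the input; results are written to the local output path.
toolchest.kraken2(
    inputs="s3://toolchest-integration-tests-public/synthetic_bacteroides_reads.fasta",
    output_path="./kraken2_output/",  # hypothetical local output directory
)
```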
23 changes: 23 additions & 0 deletions tests/test_kraken2.py
@@ -62,3 +62,26 @@ def test_kraken2_paired_end():

    # Kraken 2 paired-end is not completely deterministic, and consistently alternates between these two hashes
    assert hash.unordered(output_file_path) in [1076645572, 1174140935]

@pytest.mark.integration
def test_kraken2_s3():
    """
    Tests Kraken 2 with an example input in S3 against the std (v1) DB
    """
    test_dir = "test_kraken2_standard"
    os.makedirs(f"./{test_dir}", exist_ok=True)
    input_file_path = "./kraken_input.fasta"
    output_dir_path = f"./{test_dir}/"
    output_file_path = f"{output_dir_path}kraken2_output.txt"

    s3.download_integration_test_input(
        s3_file_key="synthetic_bacteroides_reads.fasta",
        output_file_path=input_file_path,
    )

    toolchest.kraken2(
        inputs="s3://toolchest-integration-tests-public/synthetic_bacteroides_reads.fasta",
        output_path=output_dir_path,
    )

    assert hash.unordered(output_file_path) == 886254946
12 changes: 6 additions & 6 deletions tests/test_shogun.py
@@ -15,11 +15,11 @@ def test_shogun_filter_and_align():
    Tests shogun (filter and align for simplicity) with a single R1 input
    """

test_dir = "test_shogun_filter_and_align"
os.makedirs(f"./{test_dir}", exist_ok=True)
test_dir = "./test_shogun_filter_and_align"
os.makedirs(f"{test_dir}", exist_ok=True)
input_file_path = f"./{test_dir}/combined_seqs_unfiltered.fna"
output_file_path_filter = f"./{test_dir}/combined_seqs_filtered.fna"
output_file_path_align = f"./{test_dir}/burst_output.b6"
output_file_path_filter = f"./{test_dir}/combined_seqs.filtered.fna"
output_file_path_align = f"./{test_dir}/alignment.burst.b6"

    s3.download_integration_test_input(
        s3_file_key="combined_seqs_unfiltered.fna",
@@ -29,15 +29,15 @@ def test_shogun_filter_and_align():
    toolchest.shogun_filter(
        tool_args="--alignment True",
        inputs=input_file_path,
-        output_path=output_file_path_filter,
+        output_path=test_dir,
    )

    assert hash.unordered(output_file_path_filter) == 510167908

    toolchest.shogun_align(
        tool_args="",
        inputs=output_file_path_filter,
-        output_path=output_file_path_align,
+        output_path=test_dir,
    )

    assert hash.unordered(output_file_path_align) == 780853697
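With this change, shogun_filter and shogun_align receive a directory as output_path instead of an explicit file path; the updated test expects the fixed file names combined_seqs.filtered.fna and alignment.burst.b6 inside that directory. A minimal sketch of the new calling pattern, assuming a local unfiltered FASTA input and an illustrative output directory:

```python
import toolchest_client as toolchest

out_dir = "./shogun_out"  # hypothetical output directory

# Filter step: writes combined_seqs.filtered.fna into out_dir (per the updated test).
toolchest.shogun_filter(
    tool_args="--alignment True",
    inputs="./combined_seqs_unfiltered.fna",
    output_path=out_dir,
)

# Align step: consumes the filtered file and writes alignment.burst.b6 into out_dir.
toolchest.shogun_align(
    tool_args="",
    inputs=f"{out_dir}/combined_seqs.filtered.fna",
    output_path=out_dir,
)
```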
96 changes: 96 additions & 0 deletions tests/test_star.py
@@ -0,0 +1,96 @@
import os
import pytest

from tests.util import s3
import toolchest_client as toolchest

toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
if toolchest_api_key:
    toolchest.set_key(toolchest_api_key)


@pytest.mark.integration
def test_star_grch38():
"""
Tests STAR against the grch38 database
"""
test_dir = "test_star_grch38"
os.makedirs(f"./{test_dir}", exist_ok=True)
input_file_path = "./small_star.fastq"
output_dir_path = f"./{test_dir}/"
output_file_path = f"{output_dir_path}Aligned.out.sam"

s3.download_integration_test_input(
s3_file_key="small_star_500k.fastq",
output_file_path=input_file_path,
)

toolchest.STAR(
read_one=input_file_path,
output_path=output_dir_path,
database_name="GRCh38",
)

# Because STAR is non-deterministic, verify that the number of bytes is in range
assert 185952744 <= os.path.getsize(output_file_path) <= 185952766


@pytest.mark.integration
def test_star_grch38_parallel():
"""
Tests STAR against the grch38 database, using parallel mode
"""
test_dir = "test_star_grch38_parallel"
os.makedirs(f"./{test_dir}", exist_ok=True)
input_file_path = "./large_star.fastq"
output_dir_path = f"./{test_dir}/"
output_file_path = f"{output_dir_path}Aligned.out.sam"

s3.download_integration_test_input(
s3_file_key="large_star_15GB.fastq",
output_file_path=input_file_path,
)

toolchest.STAR(
read_one=input_file_path,
output_path=output_file_path,
database_name="GRCh38",
parallelize=True,
)

# Because STAR is non-deterministic, verify that the number of bytes is in range
assert 33292990718 <= os.path.getsize(output_file_path) <= 33292994718


@pytest.mark.integration
def test_star_grch38_dangerous_arg():
"""
Tests STAR against the grch38 database, with a dangerous arg (changing functionality)
"""
test_dir = "test_star_grch38"
os.makedirs(f"./{test_dir}", exist_ok=True)
input_file_path = "./small_star.fastq"
output_dir_path = f"./{test_dir}/"
output_file_path = f"{output_dir_path}Aligned.out.bam"

s3.download_integration_test_input(
s3_file_key="small_star_500k.fastq",
output_file_path=input_file_path,
)

toolchest.STAR(
read_one=input_file_path,
output_path=output_dir_path,
database_name="GRCh38",
tool_args="--outSAMtype BAM Unsorted",
parallelize=True, # this should be deliberately ignored
)

# Because STAR is non-deterministic and BAMs are are compressed verify that the number of bytes is in range
assert 38236020 <= os.path.getsize(output_file_path) <= 38236030

# Make sure all non-parallel files exist as well
assert os.path.isfile(f"{output_dir_path}Log.final.out")
assert os.path.isfile(f"{output_dir_path}Log.out")
assert os.path.isfile(f"{output_dir_path}Log.progress.out")
assert os.path.isfile(f"{output_dir_path}SJ.out.tab")
2 changes: 1 addition & 1 deletion toolchest_client/__init__.py
@@ -21,4 +21,4 @@
from toolchest_client.api.auth import get_key, set_key
from toolchest_client.api.exceptions import ToolchestException, DataLimitError, ToolchestJobError
from toolchest_client.api.query import Query
-from .tools.api import bowtie2, cellranger_mkfastq, cutadapt, kraken2, shi7, shogun_align, shogun_filter, STAR, test, unicycler
+from .tools.api import bowtie2, cellranger_mkfastq, kraken2, shi7, shogun_align, shogun_filter, STAR, test, unicycler
2 changes: 2 additions & 0 deletions toolchest_client/api/exceptions.py
@@ -15,6 +15,8 @@ class ToolchestException(OSError):
class ToolchestKeyError(ToolchestException):
    """Invalid Toolchest auth key."""

+class ToolchestS3AccessError(ToolchestException):
+    """S3 input cannot be accessed by Toolchest."""

class DataLimitError(ToolchestException):
    """Data limit for Toolchest exceeded."""