Skip to content

Commit

Permalink
Merge pull request #15 from PROBIC/dev
Browse files Browse the repository at this point in the history
mGEMS-v1.1.0 (20 October 2021)
  • Loading branch information
tmaklin authored Oct 20, 2021
2 parents 014b166 + f8ffc0e commit be0377b
Show file tree
Hide file tree
Showing 12 changed files with 480 additions and 438 deletions.
94 changes: 58 additions & 36 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ else()
endif()

## mGEMS executable
add_executable(mGEMS ${CMAKE_CURRENT_SOURCE_DIR}/src/main.cpp)
add_executable(mGEMS ${CMAKE_CURRENT_SOURCE_DIR}/src/mGEMS.cpp)

## Check supported compression types
find_package(BZip2)
Expand All @@ -41,36 +41,9 @@ if (ZLIB_FOUND)
target_link_libraries(mGEMS ${ZLIB_LIBRARIES})
endif()

## Check dependencies and download them if not given
## telescope
if (DEFINED CMAKE_TELESCOPE_LIBRARY AND DEFINED CMAKE_TELESCOPE_HEADERS)
find_library(TELESCOPE NAMES telescope HINTS ${CMAKE_TELESCOPE_LIBRARY})
target_link_libraries(mGEMS ${TELESCOPE})
include_directories("${CMAKE_TELESCOPE_HEADERS}")
else()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/CMakeLists-telescope.txt.in ${CMAKE_BINARY_DIR}/external/telescope-download/CMakeLists.txt)
execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
RESULT_VARIABLE result
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/external/telescope-download )
if(result)
message(FATAL_ERROR "CMake step for telescope failed: ${result}")
endif()
execute_process(COMMAND ${CMAKE_COMMAND} --build .
RESULT_VARIABLE result
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/external/telescope-download )
if(result)
message(FATAL_ERROR "Build step for telescope failed: ${result}")
endif()
add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/external/telescope
${CMAKE_CURRENT_BINARY_DIR}/external/telescope/build)
include_directories(${CMAKE_CURRENT_BINARY_DIR}/external/telescope/include)
set_target_properties(telescope PROPERTIES EXCLUDE_FROM_ALL 1)
target_link_libraries(mGEMS libtelescope)
endif()

## bxzstr
if (DEFINED CMAKE_BXZSTR_HEADERS)
include_directories("${CMAKE_BXZSTR_HEADERS}")
message(STATUS "bxzstr headers provided in: ${CMAKE_BXZSTR_HEADERS}")
else()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/CMakeLists-bxzstr.txt.in ${CMAKE_BINARY_DIR}/external/bxzstr-download/CMakeLists.txt)
execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
Expand All @@ -85,12 +58,13 @@ else()
if(result)
message(FATAL_ERROR "Build step for bxzstr failed: ${result}")
endif()
include_directories(${CMAKE_CURRENT_BINARY_DIR}/external/bxzstr/include)
set(CMAKE_BXZSTR_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/external/bxzstr/include)
endif()
include_directories(${CMAKE_BXZSTR_HEADERS})

## cxxargs
if (DEFINED CMAKE_CXXARGS_HEADERS)
include_directories("${CMAKE_CXXARGS_HEADERS}")
message(STATUS "cxxargs headers provided in: ${CMAKE_CXXARGS_HEADERS}")
else()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/CMakeLists-cxxargs.txt.in ${CMAKE_BINARY_DIR}/external/cxxargs-download/CMakeLists.txt)
execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
Expand All @@ -105,8 +79,57 @@ else()
if(result)
message(FATAL_ERROR "Build step for cxxargs failed: ${result}")
endif()
include_directories(${CMAKE_CURRENT_BINARY_DIR}/external/cxxargs/include)
set(CMAKE_CXXARGS_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/external/cxxargs/include)
endif()
include_directories("${CMAKE_CXXARGS_HEADERS}")

## cxxio
## Resolve the cxxio include directory into CMAKE_CXXIO_HEADERS. A value
## that is already defined (e.g. passed with -D) is used as-is; otherwise
## the sources are obtained at configure time through a helper project
## generated from config/CMakeLists-cxxio.txt.in.
if (DEFINED CMAKE_CXXIO_HEADERS)
message(STATUS "cxxio headers provided in: ${CMAKE_CXXIO_HEADERS}")
else()
## Instantiate the helper project from its template. No @ONLY is given, so
## variable references inside the template are expanded here.
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/CMakeLists-cxxio.txt.in ${CMAKE_BINARY_DIR}/external/cxxio-download/CMakeLists.txt)
## Configure the helper project with the same generator as this build.
execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
RESULT_VARIABLE result
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/external/cxxio-download )
## A non-zero exit status aborts the whole configure run.
if(result)
message(FATAL_ERROR "CMake step for cxxio failed: ${result}")
endif()
## Build the helper project; presumably this is the step that actually
## fetches the cxxio sources -- TODO confirm against the template.
execute_process(COMMAND ${CMAKE_COMMAND} --build .
RESULT_VARIABLE result
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/external/cxxio-download )
if(result)
message(FATAL_ERROR "Build step for cxxio failed: ${result}")
endif()
## Point at the include directory of the sources fetched above.
set(CMAKE_CXXIO_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/external/cxxio/include)
endif()
## Directory-scoped include path, shared by both branches above.
include_directories("${CMAKE_CXXIO_HEADERS}")

## telescope
## Link mGEMS against the telescope library and add its headers to the
## include path. If both CMAKE_TELESCOPE_LIBRARY and CMAKE_TELESCOPE_HEADERS
## are defined, the pre-built library is looked up there; otherwise the
## sources are downloaded at configure time and added as a subdirectory.
## NOTE(review): the telescope download template references
## CMAKE_BXZSTR_HEADERS and CMAKE_CXXARGS_HEADERS, so this block presumably
## has to stay after the bxzstr and cxxargs blocks -- confirm before moving.
if (DEFINED CMAKE_TELESCOPE_LIBRARY AND DEFINED CMAKE_TELESCOPE_HEADERS)
  find_library(TELESCOPE NAMES telescope HINTS ${CMAKE_TELESCOPE_LIBRARY})
  ## find_library stores TELESCOPE-NOTFOUND when nothing matches; stop here
  ## with a message that names the offending hint instead of failing later
  ## with a generic NOTFOUND error.
  if (NOT TELESCOPE)
    message(FATAL_ERROR "Could not find the telescope library using CMAKE_TELESCOPE_LIBRARY: ${CMAKE_TELESCOPE_LIBRARY}")
  endif()
  target_link_libraries(mGEMS ${TELESCOPE})
else()
  ## Instantiate the helper download project from its template; variable
  ## references in the template are expanded by configure_file.
  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/CMakeLists-telescope.txt.in ${CMAKE_BINARY_DIR}/external/telescope-download/CMakeLists.txt)
  ## Configure the helper project with the same generator as this build.
  execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
    RESULT_VARIABLE result
    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/external/telescope-download )
  if(result)
    message(FATAL_ERROR "CMake step for telescope failed: ${result}")
  endif()
  ## Build the helper project (expected to fetch the telescope sources).
  execute_process(COMMAND ${CMAKE_COMMAND} --build .
    RESULT_VARIABLE result
    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/external/telescope-download )
  if(result)
    message(FATAL_ERROR "Build step for telescope failed: ${result}")
  endif()
  ## Pull the downloaded sources into this build as a subproject.
  add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/external/telescope
    ${CMAKE_CURRENT_BINARY_DIR}/external/telescope/build)
  ## NOTE(review): the property is set on target `telescope` while mGEMS
  ## links `libtelescope`; confirm both targets are defined by telescope.
  set_target_properties(telescope PROPERTIES EXCLUDE_FROM_ALL 1)
  target_link_libraries(mGEMS libtelescope)
  set(CMAKE_TELESCOPE_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/external/telescope/include)
endif()
## Directory-scoped include path, shared by both branches above.
include_directories(${CMAKE_TELESCOPE_HEADERS})

## Get version number from git tags
find_package(Git)
Expand All @@ -128,14 +151,13 @@ string(TIMESTAMP _BUILD_TIMESTAMP)
## Generate a version.h file containing build version and timestamp
configure_file(${CMAKE_SOURCE_DIR}/include/version.h.in ${CMAKE_BINARY_DIR}/include/version.h @ONLY)

## external/include has the version info
include_directories(include ${CMAKE_SOURCE_DIR}/external)
include_directories(${CMAKE_SOURCE_DIR}/include)

## mGEMS library
add_library(libmgems ${CMAKE_CURRENT_SOURCE_DIR}/src/mGEMS.cpp
add_library(libmgems
${CMAKE_CURRENT_SOURCE_DIR}/src/bin_reads.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/extract_bin.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/mGEMS.cpp)
)
set_target_properties(libmgems PROPERTIES OUTPUT_NAME mgems)

# Link libraries
Expand Down
49 changes: 32 additions & 17 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,42 +86,57 @@ mSWEEP_abundances.txt file in the mGEMS-out folder (compressed with
zlib).

### Advanced use
You can also extract the read-to-group assignments table that mGEMS
uses internally by adding the `--write-assignment-table` toggle to the
call to `mGEMS` or `mGEMS bin`:
```
mGEMS --groups group-3,group-4 -r reads_1.fastq.gz,reads_2.fastq.gz --themisto-alns pseudoalignments_1.aln.gz,pseudoalignments_2.aln.gz -o mGEMS-out --probs mSWEEP_probs.csv -a mSWEEP_abundances.txt --index themisto_index --write-assignment-table
```

... or bin and write only the reads that are assigned to "group-3" or
"group-4" by adding the '--groups group-3,group-4' flag
```
mGEMS --groups group-3,group-4 -r reads_1.fastq.gz,reads_2.fastq.gz --themisto-alns pseudoalignments_1.aln.gz,pseudoalignments_2.aln.gz -o mGEMS-out --probs mSWEEP_probs.csv -a mSWEEP_abundances.txt --index themisto_index
```

Alternatively, find and write only the read bins for "group-3" and
"group-4", skipping extracting the reads
... write the reads that pseudoaligned to a reference sequence but were not assigned to any group by adding the `--write-unassigned` flag:
```
mGEMS bin --groups group-3,group-4 --themisto-alns pseudoalignments_1.aln.gz,pseudoalignments_2.aln.gz -o mGEMS-out --probs mSWEEP_probs.csv -a mSWEEP_abundances.txt --index themisto_index
mGEMS --groups group-3,group-4 -r reads_1.fastq.gz,reads_2.fastq.gz --themisto-alns pseudoalignments_1.aln.gz,pseudoalignments_2.aln.gz -o mGEMS-out --probs mSWEEP_probs.csv -a mSWEEP_abundances.txt --index themisto_index --write-unassigned
```

Alternatively, find and write only the read bins for "group-3" and
"group-4", plus a bin for the reads that pseudoaligned but were not
assigned to any group, without extracting the reads
```
mGEMS bin --groups group-3,group-4 --themisto-alns pseudoalignments_1.aln.gz,pseudoalignments_2.aln.gz -o mGEMS-out --probs mSWEEP_probs.csv -a mSWEEP_abundances.txt --index themisto_index --write-unassigned
```

... and extract the reads when feeling like it
```
mGEMS extract --bins mGEMS-out/group-3.bin,mGEMS-out/group-4.bin -r
mGEMS extract --bins mGEMS-out/group-3.bin,mGEMS-out/group-4.bin,mGEMS-out/unassigned_reads.bin -r
reads_1.fastq.gz,reads_2.fastq.gz -o mGEMS-out
```

### Accepted input flags
mGEMS accepts the following input flags
```
-r Comma-separated list of input read(s).
--themisto-alns Comma-separated list of pseudoalignment file(s)
for the reads from themisto.
-o Output directory (must exist before running!).
--probs Comma-separated Posterior probability matrix (output from mSWEEP with
the --write-probs flag).
-a Relative abundance estimates from mSWEEP (tab-separated, 1st
column has the group names and 2nd column the estimates).
--index Themisto pseudoalignment index directory.
--groups (Optional) Which groups to extract from the input reads.
--min-abundance (Optional) Extract only groups that have a relative abundance higher than this value.
--compress (Optional) Toggle compressing the output files (default: compress)
-r Comma-separated list of input read(s).
--themisto-alns Comma-separated list of pseudoalignment file(s)
for the reads from themisto.
-o Output directory (must exist before running!).
--probs Comma-separated Posterior probability matrix (output from mSWEEP with
the --write-probs flag).
-a Relative abundance estimates from mSWEEP (tab-separated, 1st
column has the group names and 2nd column the estimates).
--index Themisto pseudoalignment index directory.
--groups (Optional) Which groups to extract from the input reads.
--min-abundance (Optional) Extract only groups that have a relative abundance higher than this value.
--compress (Optional) Toggle compressing the output files (default: compress)
--write-unassigned (Optional) Extract reads that pseudoaligned to a reference sequence but were not assigned to any group. (default: off)
--write-assignment-table (Optional) Write the read-to-group assignments table to `reads_to_groups.tsv` in the output directory. (default: off)
--unique-only (Optional) Write only the reads that are assigned to a single group.
```


## License
The source code from this project is subject to the terms of the MIT
license. A copy of the MIT license is supplied with the project, or
Expand Down
16 changes: 16 additions & 0 deletions config/CMakeLists-cxxio.txt.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Helper project that only downloads the cxxio sources.
# This file is a template: the parent build instantiates it with
# configure_file and then configures and builds the result at configure time.
cmake_minimum_required(VERSION 2.8.2)

# No languages are enabled (NONE); nothing is compiled by this project.
project(cxxio-get NONE)
include(ExternalProject)

ExternalProject_Add(cxxio-download
  # Where the sources come from and which revision is checked out.
  GIT_REPOSITORY    https://github.com/tmaklin/cxxio.git
  GIT_TAG           main
  # Where the checkout is placed.
  SOURCE_DIR        "${CMAKE_CURRENT_BINARY_DIR}/external/cxxio"
  BUILD_IN_SOURCE   1
  # Every step after the download is disabled with an empty command.
  UPDATE_COMMAND    ""
  CONFIGURE_COMMAND ""
  BUILD_COMMAND     ""
  TEST_COMMAND      ""
  INSTALL_COMMAND   ""
)
4 changes: 3 additions & 1 deletion config/CMakeLists-telescope.txt.in
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@ include(ExternalProject)

ExternalProject_Add(telescope-download
GIT_REPOSITORY https://github.com/tmaklin/telescope.git
GIT_TAG v0.2.0
GIT_TAG v0.2.1
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/telescope"
BUILD_IN_SOURCE 0
BUILD_COMMAND ""
CMAKE_ARGS -D CMAKE_BXZSTR_HEADERS=${CMAKE_BXZSTR_HEADERS}
-D CMAKE_CXXARGS_HEADERS=${CMAKE_CXXARGS_HEADERS}
INSTALL_COMMAND ""
TEST_COMMAND ""
UPDATE_COMMAND ""
Expand Down
55 changes: 49 additions & 6 deletions include/bin_reads.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,57 @@

#include "telescope.hpp"

#include "file.hpp"

namespace mGEMS {
uint32_t ReadAbundances(std::istream &stream, std::vector<long double> *abundances, std::vector<std::string> *groups);
void ConstructThresholds(const uint32_t num_ecs, const long double theta_frac, const std::vector<long double> &abundances, std::vector<long double> *thresholds);
std::vector<bool> AssignProbs(const std::vector<long double> &thresholds, std::istream &probs_file, std::vector<std::string> *target_groups, std::vector<std::vector<bool>> *assignments, const std::vector<std::vector<uint32_t>> &assigned_reads, std::vector<std::vector<uint32_t>> *bins);
void BinReads(const std::vector<std::vector<bool>> &assignments, const std::vector<bool> &groups_to_assign, const std::vector<std::vector<uint32_t>> &aligned_reads, std::vector<std::vector<uint32_t>> *assigned_reads);
// mGEMS::ReadAbundances
// Reads in the `_abundances.txt` file from mSWEEP.
// Input:
// `stream`: Stream pointing to the `_abundances.txt` file.
// Output:
// `abundances`: The relative abundances (2nd column in the file).
// `groups`: Names of the reference lineages (1st column).
void ReadAbundances(std::istream &stream, std::vector<long double> *abundances,
std::vector<std::string> *groups);

// mGEMS::WriteBin
// Writes the ids (i. e. line numbers divided by 4) of the reads in the bin.
// Input:
// `binned_reads`: The bin.
// `of`: Stream for the output.
void WriteBin(const std::vector<uint32_t> &binned_reads, std::ostream &of);

// mGEMS::WriteAssignments
// Writes the boolean read to group assignments matrix in tab-separated format.
// Input:
// `assignments_mat`: The matrix from mGEMS::Bin.
// `aln`: Themisto pseudoalignments.
// `of`: Stream for the output.
void WriteAssignments(const std::vector<std::vector<bool>> &assignments_mat,
const ThemistoAlignment &aln, std::ostream &of);

// mGEMS::Bin
// Returns a 2D vector that contains the ids (line numbers in the
// .fastq files divided by 4) of reads assigned to the groups
// that were requested.
// Input:
// `aln`: Pseudoalignments from Themisto.
// `abundances`: Relative abundances from mSWEEP.
// `theta_frac`: Tuning parameter for the thresholds.
// `single_only`: Only assign reads that are assigned to just a single lineage.
// `probs_file`: Read probability matrix (.probs file) from mSWEEP.
// `*target_groups`: Names of the groups that bins will be created for.
// Output:
// `*target_groups`: The names will be reordered to match the order of the bins.
// `*unassigned_bin`: Vector containing the ids of reads that were not assigned to any bin.
// `out_bins`: Vector containing the bins for the groups given in `*target_groups`.
// `*assignments_mat`: The read assignment matrix from AssignProbs.
std::vector<std::vector<uint32_t>> Bin(const ThemistoAlignment &aln,
const std::vector<long double> &abundances,
const long double theta_frac,
const bool single_only,
std::istream &probs_file,
std::vector<std::string> *target_groups,
std::vector<uint32_t> *unassigned_bin,
std::vector<std::vector<bool>> *assignments_mat);
}

#endif
16 changes: 14 additions & 2 deletions include/extract_bin.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,22 @@

#include <vector>

#include "file.hpp"
#include "cxxio.hpp"

namespace mGEMS {
void ExtractBin(const std::vector<uint32_t> &bin_assignments, std::vector<File::In> &in_strands, std::vector<File::Out> *out_strands);
// mGEMS::ExtractBin
// Extracts the reads assigned to a specific bin from the .fastq files.
// Input:
// `bin_assignments`: vector containing the ids of the reads assigned to this bin.
// `in_strands`: the input .fastq files (e. g. forward and reverse strands).
// `out_strands`: the output .fastq files.
void ExtractBin(const std::vector<uint32_t> &bin_assignments,
std::vector<cxxio::In> &in_strands,std::vector<cxxio::Out> *out_strands);

// mGEMS::ReadBin
// Reads in a bin that has been written to a file with mGEMS::WriteBin.
// Input:
// `stream`: Stream pointing to the bin file.
std::vector<uint32_t> ReadBin(std::istream &stream);
}

Expand Down
Loading

0 comments on commit be0377b

Please sign in to comment.