Skip to content

Commit

Permalink
Merge 'bamoutput' into master.
Browse files Browse the repository at this point in the history
  • Loading branch information
brianwalenz committed Jul 17, 2024
2 parents 7ca774a + b871d94 commit afd271c
Show file tree
Hide file tree
Showing 25 changed files with 1,685 additions and 1,167 deletions.
32 changes: 30 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,41 @@ Canu is a hierarchical assembly pipeline which runs in four steps:
* Conda: `conda install -c conda-forge -c bioconda -c defaults canu`
* Homebrew: `brew install brewsci/bio/canu`

* Alternatively, you can use the latest unreleased version from the source code. This version has not undergone the same testing as a release and so may have unknown bugs or issues generating sub-optimal assemblies. We recommend the release version for most users.
* Alternatively, you can use the latest unreleased version from the source
code. This version has not undergone the same testing as a release and so
may have unknown bugs or issues generating sub-optimal assemblies. We
recommend the release version for most users.

git clone https://github.com/marbl/canu.git
cd canu/src
make -j <number of threads>

* An *unsupported* Docker image made by Frank Förster is at https://hub.docker.com/r/greatfireball/canu/.
* FreeBSD generally requires libboost be installed from packages/ports. It
will compile with either clang (>= 14) or gcc (>= 9). It requires
openjdk18.

With clang, (default 14) needs libboost from ports.

gmake

With gcc (9+), can use the canu-supplied libboost or libboost from ports.

gmake CC=gcc9 CXX=g++9 BOOST=libboost # Canu-supplied boost
gmake CC=gcc9 CXX=g++9 # Ports/packages supplied boost

* MacOS Apple Silicon requires libboost, and either openjdk or oracle-jdk
to be installed from homebrew (preferred) or MacPorts. It will compile
with either clang (>=14) or gcc (>= 9) but WILL NOT compile with the
standard Xcode compiler.

make CC=gcc-11 CXX=g++-11 BOOST=libboost # Ports/packages supplied boost
make CC=gcc-11 CXX=g++-11 # Ports/packages supplied boost

* MacOS Intel is probably the same as Apple Silicon, but not tested.

* Linux does not need a system installed libboost.

* An *unsupported* Docker image made by Frank Förster is at https://hub.docker.com/r/greatfireball/canu/.

## Learn:

Expand Down
36 changes: 36 additions & 0 deletions scripts/build-all-compilers.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/sh

# Build Canu using all available and supported compilers.
# - does not disturb the usual build/ directory.
#
# Only works on FreeBSD; compiler names (and LDFLAGS) would
# need to be adjusted for other systems.

if [ ! -e ./Makefile ] ; then
echo ERROR: Must be run from within the src/ directory.
exit
fi

for v in 9 10 11 12 13 ; do
echo gcc-$v
st=$(date +%s)
rm -rf ../build-gcc$v
export LDFLAGS="-rpath /usr/local/lib/gcc$v"
gmake -j 999 DESTDIR=../build-gcc$v CC=gcc$v CXX=g++$v > ../build-gcc$v.out 2> ../build-gcc$v.err || echo " FAIL"
unset LDFLAGS
ed=$(date +%s)
ss=$(expr $ed - $st)
echo " $ss seconds"
done

for v in 10 11 12 13 14 15 16 17 ; do
echo llvm-$v
st=$(date +%s)
rm -rf ../build-llvm$v
gmake -j 999 DESTDIR=../build-llvm$v CC=clang$v CXX=clang++$v > ../build-llvm$v.out 2> ../build-llvm$v.err || echo " FAIL"
ed=$(date +%s)
ss=$(expr $ed - $st)
echo " $ss seconds"
done

exit
2 changes: 1 addition & 1 deletion src/main.mk
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ SOURCES := utility/src/align/align-ksw2-driver.C \
utility/src/kmers-v1/kmers-exact.C \
utility/src/kmers-v1/kmers-files.C \
utility/src/kmers-v1/kmers-histogram.C \
utility/src/kmers-v1/kmers-histogram-ploidy.C \
utility/src/kmers-v1/kmers-reader.C \
utility/src/kmers-v1/kmers-writer-block.C \
utility/src/kmers-v1/kmers-writer-stream.C \
Expand All @@ -65,7 +66,6 @@ SOURCES := utility/src/align/align-ksw2-driver.C \
utility/src/parasail/sg_qb_de_dispatch.c \
utility/src/parasail/sg_qe_db_dispatch.c \
utility/src/parasail/sg_qx_dispatch.c \
utility/src/parasail/sg_trace.c \
\
utility/src/sequence/dnaSeq-v1.C \
utility/src/sequence/bufSeqFile-v1.C \
Expand Down
3 changes: 2 additions & 1 deletion src/pipelines/canu.pl
Original file line number Diff line number Diff line change
Expand Up @@ -375,8 +375,9 @@
print STDERR "--\n";

checkJava();
checkMinimap($bin);
checkMinimap();
checkGnuplot();
checkSamtools();

checkParameters();
printHelp(); # Fail now if any complaints so far.
Expand Down
70 changes: 56 additions & 14 deletions src/pipelines/canu/Consensus.pm
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ sub utgcns ($$) {
print F " -T ../$asm.ctgStore 1 \\\n";
print F " -P \$jobid \\\n";
print F " -O ./ctgcns/\$jobid.cns.WORKING \\\n";
print F " -norealign \\\n" if ((getGlobal("cnsAlign") eq "0") &&
(getGlobal("bamOutput") eq "0"));
print F " -maxcoverage " . getGlobal('cnsMaxCoverage') . " \\\n";
print F " -e " . getGlobal("cnsErrorRate") . " \\\n";
print F " -em " . getGlobal("cnsErrorRate") . " \\\n";
Expand Down Expand Up @@ -371,16 +373,31 @@ sub consensusCheck ($) {



sub purgeFiles ($$$$$$) {
sub purgeFiles ($$) {
my $asm = shift @_;
my $tag = shift @_;
my $Ncns = shift @_;
my $Nfastq = shift @_;
my $Nlayout = shift @_;
my $Nlog = shift @_;
my ($Ncns, $Nfastq, $Nbam, $Nlayout, $Nlog) = (0, 0, 0, 0, 0);

my $path = "unitigging/5-consensus";

# Remove single-tig bam outputs. If the bam.list doesn't exist, then either
# there are no bam outputs, or merge/sort failed.

if (-e "$path/ctgcns.bam.list") {
open(F, "< $path/ctgcns.bam.list") or caExit("can't open '$path/ctgcns.bam.list' for reading: $!\n", undef);
while (<F>) {
chomp;
if (-e $_) {
$Nbam++;
unlink $_;
}
}
close(F);
}

# Remove utgcns outputs and logs.

unlink "$path/ctgcns.bam.list";
open(F, "< $path/$tag.files") or caExit("can't open '$path/$tag.files' for reading: $!\n", undef);
while (<F>) {
chomp;
Expand All @@ -401,6 +418,10 @@ sub purgeFiles ($$$$$$) {
$Nlayout++;
unlink "unitigging/$1/$ID4.layout";
}
if (-e "unitigging/$1/$ID4.bams") {
rmdir "unitigging/$1/$ID4.bams";
}

if (-e "unitigging/$1/consensus.$ID6.out") {
$Nlog++;
unlink "unitigging/$1/consensus.$ID6.out";
Expand All @@ -419,7 +440,7 @@ sub purgeFiles ($$$$$$) {
unlink "$path/$tag.files";
rmdir "$path/$tag";

return($Ncns, $Nfastq, $Nlayout, $Nlog);
return($Ncns, $Nfastq, $Nbam, $Nlayout, $Nlog);
}


Expand Down Expand Up @@ -457,28 +478,49 @@ sub consensusLoad ($) {
$cmd .= "> ./5-consensus/ctgcns.files.ctgStoreLoad.err 2>&1";

if (runCommand("unitigging", $cmd)) {
caExit("failed to load unitig consensus into ctgStore", "$path/ctgcns.files.ctgStoreLoad.err");
caExit("failed to load unitig consensus into ctgStore", "$path/ctgcns.files.ctgStoreLoad.err");
}
unlink "$path/ctgcns.files.ctgStoreLoad.err";

stashFile("unitigging/$asm.ctgStore/seqDB.v002.dat");
stashFile("unitigging/$asm.ctgStore/seqDB.v002.tig");
}

# Remvoe consensus outputs
# And extract bam records.

if ((getGlobal("bamOutput") eq "1") &&
(! fileExists("$asm.contigs.bam"))) {
my $samtools = getGlobal("samtools");

$cmd = "$bin/tgTigDisplay \\\n";
$cmd .= " -S ../$asm.seqStore \\\n";
$cmd .= " -b \\\n";
$cmd .= " -o ../$asm.contigs.unsorted.bam \\\n";
$cmd .= " -L ./5-consensus/ctgcns.files \\\n";
$cmd .= "> ./5-consensus/ctgcns.files.contigs.bam.err 2>&1";

if (runCommand("unitigging", $cmd)) {
caExit("failed to extract BAM records from ctgcns files", "$path/ctgcns.files.contigs.bam.err");
}
unlink "$path/ctgcns.files.contigs.bam.err";

if (runCommand(".", "$samtools sort --write-index --threads 2 -o ./$asm.contigs.bam ./$asm.contigs.unsorted.bam > ./ctgcns.bam.err 2>&1")) {
caExit("failed to sort BAM outputs from ./$asm.contigs.unsorted.bam to ./$asm.contigs.bam", "./ctgcns.bam.err");
}
unlink "./$asm.contigs.unsorted.bam";
unlink "./ctgcns.bam.err";
}

# Remove consensus outputs

if (-e "$path/ctgcns.files") {
print STDERR "-- Purging consensus output after loading to ctgStore.\n";

my $Ncns = 0;
my $Nfastq = 0;
my $Nlayout = 0;
my $Nlog = 0;

($Ncns, $Nfastq, $Nlayout, $Nlog) = purgeFiles($asm, "ctgcns", $Ncns, $Nfastq, $Nlayout, $Nlog);
my ($Ncns, $Nfastq, $Nbam, $Nlayout, $Nlog) = purgeFiles($asm, "ctgcns");

print STDERR "-- Purged $Ncns .cns outputs.\n" if ($Ncns > 0);
print STDERR "-- Purged $Nfastq .fastq outputs.\n" if ($Nfastq > 0);
print STDERR "-- Purged $Nbam .bam outputs.\n" if ($Nbam > 0);
print STDERR "-- Purged $Nlayout .layout outputs.\n" if ($Nlayout > 0);
print STDERR "-- Purged $Nlog .err log outputs.\n" if ($Nlog > 0);
}
Expand Down
46 changes: 45 additions & 1 deletion src/pipelines/canu/Defaults.pm
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ require Exporter;
checkJava
checkMinimap
checkGnuplot
checkSamtools
adjustMemoryValue
displayMemoryValue
adjustGenomeSize
Expand Down Expand Up @@ -817,6 +818,8 @@ sub setDefaults () {
setDefault("gnuplot", "gnuplot", "Path to the gnuplot executable");
setDefault("gnuplotImageFormat", undef, "Image format that gnuplot will generate. Default: based on gnuplot, 'png', 'svg' or 'gif'");

setDefault("samtools", "samtools", "Path to the samtools executable");

setDefault("stageDirectory", undef, "If set, copy heavily used data to this node-local location");
setDefault("preExec", undef, "A command line to run at the start of Canu execution scripts");

Expand Down Expand Up @@ -1003,6 +1006,10 @@ sub setDefaults () {
setDefault("cnsConsensus", "pbdagcon", "Which consensus algorithm to use; 'pbdagcon' (fast, reliable); 'utgcns' (multialignment output); 'quick' (single read mosaic); default 'pbdagcon'");
setDefault("cnsPartitions", 0, "Attempt to create this many consensus jobs; default '0' = based on the largest tig");

setDefault("cnsAlign", 1, "Realign reads to contigs after consensus");
setDefault("bamOutput", 1, "Output contigs as BAM (enables cnsAlign)");
#etDefault("fastaOutput", 1, "Output contigs as FASTA");

##### Correction Options

setDefault("corPartitions", undef, "Partition read correction into N jobs");
Expand Down Expand Up @@ -1115,7 +1122,7 @@ sub checkJava () {



sub checkMinimap ($) {
sub checkMinimap () {
my $minimap = getGlobal("minimap");
my $version = undef;

Expand Down Expand Up @@ -1291,6 +1298,43 @@ sub checkGnuplot () {



sub checkSamtools () {
my $samtools = getGlobal("samtools");
my $sversion = undef;
my $hversion = undef;

return if (getGlobal("bamOutput") ne "1");

if ($samtools =~ m/^\./) {
addCommandLineError("ERROR: path to samtools '$samtools' must not be a relative path.\n");
goto cleanupSamtools;
}

system("cd /tmp && $samtools --version > /tmp/samtools-$$.err 2>&1");

open(F, "< /tmp/samtools-$$.err");
while (<F>) {
$sversion = $1 if ($_ =~ m/samtools\s+(.*)\s*$/);
$hversion = $1 if ($_ =~ m/htslib\s+(.*)\s*$/);
}
close(F);

if (!defined($sversion)) {
addCommandLineError("ERROR: failed to run samtools using command '$samtools'\n");
addCommandLineError(" set samtools=/path/to/samtools to fix, or\n");
addCommandLineError(" set bamOutput=false to disable.\n");
goto cleanupSamtools;
}

print STDERR "-- Detected samtools version '$sversion' (from '$samtools').\n" if (!defined($hversion));
print STDERR "-- Detected samtools version '$sversion' / htslib version '$hversion' (from '$samtools').\n" if ( defined($hversion));

cleanupSamtools:
unlink "/tmp/samtools-$$.err";
}



# Converts number with units to gigabytes. If no units, gigabytes is assumed.
sub adjustMemoryValue ($) {
my $val = shift @_;
Expand Down
2 changes: 1 addition & 1 deletion src/seqrequester
Submodule seqrequester updated 3 files
+1 −1 README.md
+1 −1 src/main.mk
+1 −1 src/utility
5 changes: 5 additions & 0 deletions src/stores/sqCache.C
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,11 @@ sqCache::sqCache_loadReads(char const *filename) {

fprintf(stderr, "-- Loading reads from '%s': %8lu reads.\r", filename, _readsLen);

if (readFile == nullptr) {
fprintf(stderr, "-- Failed to open '%s' for reading.\n");
exit(1);
}

while (readFile->loadSequence(readSeq) == true) {
if ((_readsLen & 0x1ff) == 0x1ff)
fprintf(stderr, "-- Loading reads from '%s': %8lu reads.\r", filename, _readsLen);
Expand Down
37 changes: 20 additions & 17 deletions src/stores/tgStore.C
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
#include "tgStore.H"

uint32 MASRmagic = 0x5253414d; // 'MASR', as a big endian integer
uint32 MASRversion = 1;
uint32 MASRversion = 2;

#define MAX_VERS 1024 // Linked to 10 bits in the header file.

Expand Down Expand Up @@ -125,14 +125,6 @@ tgStore::tgStore(const char *path_,
assert(0);
break;
}


// Fail (again?) if there are no tigs loaded.

//if (_tigLen == 0) {
// fprintf(stderr, "tgStore::tgStore()-- ERROR, didn't find any tigs in the store. Correct version?\n");
// exit(1);
//}
}


Expand Down Expand Up @@ -643,18 +635,29 @@ tgStore::loadMASR(tgStoreEntry* &R, uint32& L, uint32& M, uint32 V) {
exit(1);
}

if (MASRversionInFile != MASRversion) {
fprintf(stderr, "tgStore::loadMASR()-- Failed to open '%s': version number mismatch; file=%d code=%d\n",
_name, MASRversionInFile, MASRversion);
exit(1);
}

// Check we're consistent.
if (L < MASRtotalInFile)
fprintf(stderr, "tgStore::loadMASR()-- '%s' has more tigs (" F_U32 ") than expected (" F_U32 ").\n",
_name, MASRtotalInFile, L), exit(1);

loadFromFile(R, "MASR", masrLen, F);
if (MASRversionInFile == 1) {
tgStoreEntry *Rv1 = new tgStoreEntry [M];
loadFromFile(Rv1, "MASR", masrLen, F);

for (uint32 ii=0; ii<masrLen; ii++)
R[ii] = Rv1[ii]; // A format conversion!

delete [] Rv1;
}

else if (MASRversionInFile == 2) {
loadFromFile(R, "MASR", masrLen, F);
}

else {
fprintf(stderr, "tgStore::loadMASR()-- Failed to open '%s': version number mismatch; file=%d code=%d\n",
_name, MASRversionInFile, MASRversion);
exit(1);
}

merylutil::closeFile(F, _name);
}
Expand Down
Loading

0 comments on commit afd271c

Please sign in to comment.