Sub commands #9

Merged
merged 18 commits into from
Mar 7, 2018
6 changes: 6 additions & 0 deletions .gitignore
@@ -1,4 +1,10 @@
*.fa
*.bai
*.crai
*.bam
*.vcf.gz
*.tbi
*.csi
*.vcf
*.bedpe
*.bam
1 change: 1 addition & 0 deletions Dockerfile
@@ -3,6 +3,7 @@ FROM ubuntu:16.04

COPY docker-build.sh .
COPY lumpy-smoother .
COPY smoove .

RUN bash docker-build.sh

78 changes: 47 additions & 31 deletions README.md
@@ -1,70 +1,86 @@
# lumpy-smoother
# smoove

`lumpy-smoother` simplifies and speeds running [lumpy](https://github.com/arq5x/lumpy-sv) and associated pre and post-processing steps.
`smoove` simplifies and speeds calling and genotyping SVs.

It requires:

+ [lumpy and lumpy\_filter](https://github.com/arq5x/lumpy-sv)

And optionally:

+ [cnvnator](https://github.com/abyzovlab/CNVnator): makes per-sample CNV calls that lumpy can use
+ [svtyper](https://github.com/hall-lab/svtyper): to genotype SVs
+ [svtools](https://github.com/hall-lab/svtools): required for large cohorts
+ [samtools](https://github.com/samtools/samtools): for CRAM support
+ [gsort](https://github.com/brentp/gsort): to sort final VCF
+ [bgzip+tabix](https://github.com/samtools/htslib): to compress and index final VCF
+ [mosdepth](https://github.com/brentp/mosdepth): extra filtering of split and discordant files for better scaling

Running `lumpy-smoother -h` will show which of these are found so they can be added to the PATH as needed.
Running `smoove` without any arguments will show which of these are found so they can be added to the PATH as needed.

`lumpy-smoother` will:
`smoove` will:

1. parallelize calls to `lumpy_filter` to extract split and discordant reads required by lumpy
2. further filter `lumpy_filter` calls to remove high-coverage, spurious regions and user-specified chroms like 'hs37d5';
it also removes reads that we've found are likely spurious signals.
After this, it removes singleton reads (whose mate was removed by one of the previous filters) from the discordant
bams. This makes `lumpy` much faster and less memory-hungry.
3. parallelize calling cnvnator if it is on the $PATH, including splitting the reference as it requires.
Calls to `lumpy_filter` and `cnvnator` share the same process pool for maximum efficiency.
4. calculate per-sample metrics for mean, standard deviation, and distribution of insert size as required by lumpy.
5. correct the reference allele (lumpy always puts 'N')
6. stream output of lumpy directly into multiple svtyper processes for parallel-by-region genotyping while lumpy is still running.
7. sort, compress, and index final VCF.
3. calculate per-sample metrics for mean, standard deviation, and distribution of insert size as required by lumpy.
4. correct the reference allele (lumpy always puts 'N')
5. stream output of lumpy directly into multiple svtyper processes for parallel-by-region genotyping while lumpy is still running.
6. sort, compress, and index final VCF.

# installation

I will release a binary shortly; meanwhile, you can get this and all dependencies via a (large) docker image:

```
docker pull brentp/lumpy-smoother
docker run -it brentp/lumpy-smoother -h
docker pull brentp/smoove
docker run -it brentp/smoove -h
```

# usage

run `lumpy-smoother -h` for full usage
## small cohorts (n < ~ 40)

For small cohorts it's possible to get a jointly-called, genotyped VCF in a single command.

```
lumpy-smoother \
-n my-project \ # arbitrary project name for file prefixes
-f $reference \
--outdir results \
--processes 10 \ # parallelize with this many processors.
-C hs37d5,GL000192.1 \ # comma-delimited list of chroms to exclude
--exclude low-complexity-regions.bed \ # see: https://github.com/hall-lab/speedseq/tree/master/annotations
data/*.bam # any number of BAM or CRAM files
smoove call --name my-cohort --fasta $fasta -p $threads --genotype /path/to/*.bam > /dev/null
```

# TODO
## large cohorts

+ [ ] annotate high-quality calls
+ [ ] (unlikely) isolate steps so that users can call, e.g.:
lumpy-smoother cnvs
lumpy-smoother filter
lumpy-smoother lumpy
lumpy-smoother call
For large cohorts the steps are:

1. For each sample, call and genotype variants:

```
smoove call --outdir results-smoove/ --name $sample --fasta $fasta -p $threads --genotype /path/to/$sample.bam > /dev/null
```

2. Get the union of sites across all samples:

```
# this will create ./merged.sites.vcf.gz
smoove merge --name merged -f $fasta --outdir ./ results-smoove/*.vcf.gz
```

# limitations
3. Genotype all samples at those sites:

this is limited to cohorts of ~30-40 samples.
```
# this writes a genotyped VCF for each sample into results-genotyped/
smoove genotype -p 1 --name $sample-joint --outdir results-genotyped/ --fasta $fasta --vcf merged.sites.vcf.gz /path/to/$sample.bam
```

4. Paste all the single-sample VCFs (each with the same number of variants) to get a single, squared, joint-called file.

```
smoove paste --name $cohort results-genotyped/*.vcf.gz
```

# TODO

+ [ ] annotate high-quality calls

# see also

107 changes: 107 additions & 0 deletions cmd/smoove/smoove.go
@@ -0,0 +1,107 @@
package main

import (
	"fmt"
	"io"
	"os"
	"sort"
	"strconv"

	"github.com/brentp/smoove"
	"github.com/brentp/smoove/cnvnator"
	"github.com/brentp/smoove/lumpy"
	"github.com/brentp/smoove/merge"
	"github.com/brentp/smoove/paste"
	"github.com/brentp/smoove/shared"
	"github.com/brentp/smoove/svtyper"
	"github.com/valyala/fasttemplate"
)

type progPair struct {
	help string
	main func()
}

var progs = map[string]progPair{
	"cnvnator": progPair{"call cnvnator and make bedpe files needed by lumpy", cnvnator.Main},
	"genotype": progPair{"parallelize svtyper on an input VCF", svtyper.Main},
	"call":     progPair{"call lumpy (and optionally svtyper) after filtering bams", lumpy.Main},
	"merge":    progPair{"merge and sort (using svtools) calls from multiple samples", merge.Main},
	"paste":    progPair{"square final calls from multiple samples (each with same number of variants)", paste.Main},
}

func Description() string {
	tmpl := `smoove version: {{version}}

smoove calls several programs. Those with 'Y' are found on your $PATH. Only those with '*' are required.

*[{{bgzip}}] bgzip [ sort -> (compress) -> index ]
*[{{gsort}}] gsort [(sort) -> compress -> index ]
*[{{tabix}}] tabix [ sort -> compress -> (index)]
*[{{lumpy}}] lumpy
*[{{lumpy_filter}}] lumpy_filter
*[{{samtools}}] samtools [only required for CRAM input]

 [{{cnvnator}}] cnvnator [per-sample CNV calls]
 [{{mosdepth}}] mosdepth [extra filtering of split and discordant files for better scaling]
 [{{svtyper}}] svtyper [required to genotype SV calls]
 [{{svtools}}] svtools [only needed for large cohorts]

Available sub-commands are below. Each can be run with -h for additional help.

`
	t := fasttemplate.New(tmpl, "{{", "}}")

	vars := map[string]interface{}{
		"version":      smoove.Version,
		"lumpy":        shared.HasProg("lumpy"),
		"cnvnator":     shared.HasProg("cnvnator"),
		"lumpy_filter": shared.HasProg("lumpy_filter"),
		"samtools":     shared.HasProg("samtools"),
		"mosdepth":     shared.HasProg("mosdepth"),
		"svtyper":      shared.HasProg("svtyper"),
		"svtools":      shared.HasProg("svtools"),
		"gsort":        shared.HasProg("gsort"),
		"bgzip":        shared.HasProg("bgzip"),
		"tabix":        shared.HasProg("tabix"),
	}
	return t.ExecuteString(vars)
}

func printProgs() {
	var wtr io.Writer = os.Stdout

	fmt.Fprint(wtr, Description())
	var keys []string
	l := 5
	for k := range progs {
		keys = append(keys, k)
		if len(k) > l {
			l = len(k)
		}
	}
	fmtr := "%-" + strconv.Itoa(l) + "s : %s\n"
	sort.Strings(keys)
	for _, k := range keys {
		fmt.Fprintf(wtr, fmtr, k, progs[k].help)
	}
	os.Exit(1)
}

func main() {
	if len(os.Args) < 2 {
		printProgs()
	}
	var p progPair
	var ok bool
	if p, ok = progs[os.Args[1]]; !ok {
		printProgs()
	}
	// remove the sub-command name from os.Args before handing off
	os.Args = append(os.Args[:1], os.Args[2:]...)
	p.main()
}