v1.2 update

mewbak · Apr 6, 2018 · 0f07796 · 0f07796
1 parent 5361efa
commit 0f07796
Show file tree

Hide file tree

Showing 89 changed files with 2,877 additions and 906 deletions.
diff --git a/.gitignore b/.gitignore
@@ -19,3 +19,4 @@ cppcheck-result
 /.gitignore
 /.cproject
 /.project
+/.pydevproject
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -52,7 +52,7 @@ find_package (Threads REQUIRED)
 set(Boost_USE_STATIC_LIBS        ON)  # only find static libs
 set(Boost_USE_MULTITHREADED      ON)
 set(Boost_USE_STATIC_RUNTIME     ON)
-find_package(Boost 1.5 COMPONENTS program_options filesystem system REQUIRED)
+find_package(Boost 1.5 COMPONENTS iostreams program_options filesystem system REQUIRED)
 
 include_directories(${ZLIB_INCLUDE_DIR})
 include_directories(${BZIP2_INCLUDE_DIR})

diff --git a/README.md b/README.md
@@ -624,6 +624,16 @@ It is extracted and re-organized from [an expected output](share/test-data/multi
 *    Some developer documentation about our code analysis and testing process can be found in 
     [doc/linting-and-testing.md](doc/linting-and-testing.md).
 
+*    Procedures for read level alignment validation 
+    [doc/validation-with-simulated-reads.md](doc/validation-with-simulated-reads.md).
+
+*    How we count reads for variants and paths
+    [doc/graph-counting.md](doc/graph-counting.md).
+
+*    Documentation of genotyping model parameters
+    [doc/genotyping-parameters.md](doc/genotyping-parameters.md).
+
+
 ### <a name='Links'></a>Links
 
 *   The [Illumina/Polaris](https://github.com/Illumina/Polaris) repository gives the

diff --git a/RELEASES.md b/RELEASES.md
@@ -1,5 +1,27 @@
 # Paragraph Release Notes / Change Log
 
+# Version 1.2
+
+| Date Y-m-d | Ticket  | Description                                                          |
+|------------|---------|----------------------------------------------------------------------|
+| 2018-04-05 | GT-429  | option to turn off exact and graph aligners in grmpy                 |
+| 2018-04-05 | GT-428  | upgrade htslib to version 1.8                                        |
+| 2018-04-04 | GT-427  | GT-427 multigrmpy to generate graph ID if vc2toparagraph does not provide it|
+| 2018-04-03 | GT-415  | GT-415 use parallel grmpy in multigrmpy                              |
+| 2018-03-22 | GT-414  | Use filter instead of mapping status to decide whether to graph align after kmer aligner|
+| 2018-03-20 | GT-412  | Kmer aligner produces fully clipped nodes at start or end of the alignment|
+| 2018-03-20 | GT-407  | Refined edge filter                                                  |
+| 2018-03-15 | GT-403  | New threading for grmpy                                              |
+| 2018-03-15 | GT-235  | Add documentation for read counting                                  |
+| 2018-03-14 | GT-406  | Combine source and sink softclips with adjacent node CIGAR in KmerAligner|
+| 2018-03-11 | GT-398  | Minor grmpy fix and documentation update                             |
+| 2018-03-07 | GT-394  | graph-level threading to improve efficiency with high-latency file systems|
+| 2018-01-03 | GT-393  | Command line option for simulated reads coverage                     |
+| 2018-01-03 | GT-392  | Reads sometimes out of order in simulated.bam                        |
+| 2018-03-05 | GT-396  | Add command line options to specify location of BAM index separately |
+| 2018-02-28 | GT-388  | Improved genotyping on variants with different breakpoint genotypes  |
+
+
 # Version 1.1
 
 | Date Y-m-d | Ticket  | Description                                                          |

diff --git a/doc/deletion-insertion.png b/doc/deletion-insertion.png
diff --git a/doc/filter-scheme.md b/doc/filter-scheme.md
@@ -0,0 +1,29 @@
+# Filters used in genotyper output
+
+## Variant level filters
+
+* **PASS**
+
+Variant PASS all filters
+
+* **CONFLICT**
+
+Variant has genotype conflicts in one or more breakpoints
+
+* **EXIST_BAD_BP**
+
+Varaint has one or more breakpoint that fails breakpoint-level filter
+
+* **ALL_BAD_BP**
+
+All breakpoints in this variant fail breakpoint-level filter
+
+* **MISSING**
+
+Variant has one or more breakpoints with no spanning read
+
+## Breakpoint level filters
+
+* **DEPTH**
+
+Total number of reads on this breakpoint (from all alleles) fail the coverage test
diff --git a/doc/genotyping-parameters.md b/doc/genotyping-parameters.md
@@ -0,0 +1,63 @@
+# How to set genotyping paramters externally
+
+This file shows how to set customized genotyping parameters externally in a JSON file.
+
+External genotyping parameter can be passed to multigrmpy.py and grmpy as JSON file with command line option `-G`
+
+Below includes all allowed parameter fields:
+
+```javascript
+{
+     // ploidy of samples
+    "ploidy": 2
+
+    // Minimal overlapping bases between a read and a mapped edge.
+    //      Used in estimating distribution parameter in genotyper
+    "min_overlap_bases": 16,
+
+
+    // Minimal depth p value for a breakpoint to pass depth test
+    "coverage_test_cutoff": 0.00001
+
+    // Allele names in graph(s).
+    //     If other alleles were observed in graph, they will be excluded from analysis.
+    "allele_names": [
+        "REF",
+        "ALT"
+    ],
+
+
+    // Error rate for each allele.
+    //     Alleles must be in the same orrder as "allele_names"
+    "allele_error_rates": [
+        0.04,
+        0.04
+    ],
+    // These two fields will be effective only if "allele_error_rates" doesn't exist.
+    "reference_allele_error_rate": 0.05, //  error rate of reference allele
+    "other_allele_error_rate": 0.05, // uniformed error rate of all alternative alleles
+
+
+    // Fraction of the non-reference allele in a reference-alternative heterozygotes.
+    //     Alleles must be in the same order as "allele_names"
+    "het_haplotype_fractions": [
+        0.5,
+        0.45
+    ],
+    // het_haplotype_fraction can be a float to represent a uniform fraction, such as:
+    // "het_haplotype_fractions": 0.5
+
+
+
+
+    // Fraction of each genotype in the population.
+    //     Genotype indices are zero-based, relative to the "allele_names" array
+    "genotype_fractions": {
+        "0/0": 0.5,
+        "0/1": 0.3,
+        "1/1": 0.2
+    },
+    // This field will be effective only if "genotype_fractions" doesn't exist
+    "other_genotype_fraction": 0.33, // uniform genotype fraction for each genotype
+}
+```
diff --git a/doc/graph-counting.md b/doc/graph-counting.md
@@ -0,0 +1,92 @@
+# Counting read support on graphs
+
+<!-- vscode-markdown-toc -->
+* [Introduction](#Introduction)
+* [Node counting](#Nodecounting)
+* [Edge counting](#Edgecounting)
+* [Path counting](#Pathcounting)
+
+<!-- vscode-markdown-toc-config
+	numbering=false
+	autoSave=true
+	/vscode-markdown-toc-config -->
+<!-- /vscode-markdown-toc -->
+
+## <a name='Introduction'></a>Introduction
+
+Reads mapped to a graph may support breakpoints, known small variants or haplotypes. For example, a SNP may be represented by two alternate single-basepair nodes, one for the reference nucleotide, and one for the alternate nucleotide, flanked by reference nodes left and right:
+
+![](snp.png)
+
+Reads supporting each of the two nodes can be used to establish which of the two SNP alleles are present in a sample. 
+
+Similarly, a deletion can be represented by an edge that skips a piece of reference sequence:
+
+![](deletion-insertion.png)
+
+
+In this document, we outline how in paragraph nodes, edges and path labels can be used to quantify read support for variants or alleles.
+
+Let *G = (N, E)* be a **sequence graph** with node set *N = \{ n<sub>1</sub>, n<sub>2</sub>, ..., n<sub>k</sub> \}* and edge set *E = \{ (f<sub>1</sub>, t<sub>1</sub>), (f<sub>2</sub>, t<sub>2</sub>), ... , (f<sub>j</sub>, t<sub>j</sub>) \}*. Edges are directed and tuple *(f, t)* represents and edge from node *n<sub>f</sub>* to node  *n<sub>t</sub>*. Each node *n<sub>i</sub>* has an associated sequence *S(i)*, and we assume that there are no cycles in *G*. 
+
+We define a **node path** as a sequence *\{ p<sub>1</sub>, p<sub>2</sub>, ... p<sub>l</sub> \}*, where the set of path nodes *\{ n<sub>p<sub>1</sub></sub>, n<sub>p<sub>2</sub></sub>, ..., n<sub>p<sub>l</sub></sub> \} &sub; N* where no node occurs twice (i.e. *p<sub>s</sub> &ne;p<sub>t</sub>* if *s &ne; t*), and all path edges *\{ (p<sub>1</sub>, p<sub>2</sub>), (p<sub>2</sub>, p<sub>3</sub>), ..., (p<sub>l-1</sub>, p<sub>l</sub>) \} &sub; E*. A **path** is a tuple *(q, r, P)* where *P* is a node path, *q* is a starting position in the sequence of the first node in *P* and *r* is an end position in the last node in *P*. When we don't specify *q* and *r* we implicitly assume that *q = 1* and *r = |n<sub>p<sub>l</sub></sub>|* (i.e. the first node and the last node of the path are fully contained in the path).
+
+Each read we align to a graph is assigned a single alignment path. Variants can be represented by single nodes or edges. Haplotypes can be represented either using single paths, or a set of multiple paths (a path family). In the following sections we discuss how support for nodes, edges, and paths may be quantified given a set of read alignment paths.
+
+## <a name='Nodecounting'></a>Node counting
+
+A read supports a node if its alignment overlaps the node with a minimum number of bases (typically 16 or the length of the node, whichever is smaller).
+
+![](graph-readcounting-nodes.png)
+
+Another thing we consider when deciding whether a read supports a node is base-identity: a read needs to have 90% of its bases mapped to a node match the node reference sequence in order to be counted in support of this node. The most extreme scenario would be a SNP node, where reads will only be counted if they match the SNP itself (due to the min(node length, 16bp) anchoring requirement).
+
+## <a name='Edgecounting'></a>Edge counting
+
+A read supports an edge if its alignment path contains the edge and anchors on both sides with a minimum number of bases.
+
+![](graph-readcounting-edges.png)
+
+## <a name='Pathcounting'></a>Path counting
+
+Let *P* be a path in our graph.
+
+![](graph-readcounting-paths.png)
+
+A read supports a path *P* in the graph if one of the following conditions are met:
+
+* the alignment path *Q* of the read fully contains path *P*,
+* path *P* fully containst the alignment path *Q*,
+* a suffix of *P* matches a prefix of *Q*,
+* a prefix of *P* matches a suffix of *Q*.
+
+Here are some examples:
+
+![](graph-readcounting-path-support.png)
+
+## <a name='PathFamilies'></a>Path Families
+
+A specific haplotype can be represented by a single path. However often we may be interested in grouping all haplotyes that contain certain alleles, without restricting the allele at other variants. As an example when working with a gene and pseudogene pair we may want to count any read with the gene disrupting variants towards the pseudogene, regardless of other variants modeled in the graph. 
+
+If enumerating all paths to be grouped explicitly becomes impractical, we can define an implicit set of paths, called a path family, *F* as a set of edges *\{ f<sub>1</sub>, f<sub>2</sub>, ... f<sub>l</sub> \}*. A path *P* is in the path family *F* iff at at least one node in *P* has an edge in *F* and for any node *p<sub>n</sub>* in *P*
+
+* No incoming edge into *p<sub>n</sub>* exists in *F* **or** *n == 0* **or** edge *(p<sub>n-1</sub>, p<sub>n</sub>)* is in *F*.
+* No outgoing edge from *p<sub>n</sub>* exists in *F* **or** * *n == |P|* 
+  (Number of nodes in *P*) **or** edge *(p<sub>n</sub>, p<sub>n+1</sub>)* is in *F*.
+
+I.e. informally, if a node has incoming or outgoing edges in *F*, then a path including the node must use one of those edges to belong to the part family. A path does not have to cover the entire path family (either all nodes or all edges), but may start in the middle or enter/exit through any edge into a node that has no incoming/outgoing edges in *F*.
+
+One way to represent a group of haplotypes with a path family is for *F* to contain the union of all the edges from the path corresponding to each haplotype.
+
+### Path Family counting
+
+A read is counted as consistent with a path family if the alignment path is in the path family. Hence a read can be consistent with several path families. To quantify specific support for a path family, we need to count reads that support only this path family and not other (e.g. only the pseudogene path family, not the gene). To this end paragraph reports the number of reads consistent with each subset of families defined on the graph. E.g. for a graph with two families *F<sub>1</sub>* and *F<sub>2</sub>*, it will report three counts, for  *{F<sub>1</sub>, F<sub>2</sub>}*, *F<sub>1</sub>* and  *F<sub>2</sub>*.
+
+### Implementation options
+Path families can be stored in the graph by tagging the edges with the ID all path families they are a part of. Path family counting can be implemented either by counting each read alignment path directly towards the subset of path families it matches.
+
+### Possible Extensions
+- Local loops could be supported by annotating an edge within *F* with a range of times it has to appear in *P'*. 
+
+
+
diff --git a/doc/graph-models.md b/doc/graph-models.md
@@ -203,16 +203,19 @@ error parameters for individual events as long as these have a high-enough
 population allele frequency.
 
 ## <a name='WholeVariantGenotyping'></a>Whole Variant Genotyping
-We use the above breakpoint genotyping model to determine the most likely 
-genotype for each breakpoint.
+We use the above breakpoint genotyping model to determine the most likely genotype for each breakpoint.
 
-To report a final genotype for the variant, we follow these conventions: 
+To report a final genotype for the variant, we follow these conventions:
 
-* If all breakpoints have the same most likely genotype, report this genotype as the overall variant genotype. The resulting filter value is *PASS*
-* If some breakpoints have homozygous reference genotype, and the other breakpoints have 
-another consistent non-reference genotype, report this non-reference genotype and label
-the filter as *CONFLICTS*.
-* If none of the above, the genotype is reported as *MISSING* (**./.**) and the filter is set to *CONFLICTS*.
+* If all breakpoints share the same genotype, report this genotype as the variant genotype. If all breakpoints pass the filters, the final variant filter will be *PASS*, otherwise the filter will have a flag of "EXIST_BAD_BP".
+
+* If there are conflicts in breakpoint genotypes, a filter flag "CONFLICT" will be set. And:
+
+  If there are one or more breakpoints that pass the filters, the variant genotype will be re-caculated from the total number of reads covering these passed breakpoints. An "EXIST_BAD_BP" filter flag will be attached.
+
+  If all breakpoints fail the filters, the variant genotype will be re-caculated from the total number of reads covering these breakpoints. An "ALL_BAD_BP" filter flag will be attached.
+
+Please refer to [filter-scheme](doc/filter-scheme.md) for all filter details.
 
 ## <a name='Example'></a>Example
 
@@ -274,10 +277,12 @@ Therefore, the maximum likelihood genotype of this breakpoint is REF/REF.
 
 ### <a name='WholeVariant'></a>Whole Variant
 
-As we have calculated above, genotype of BP1 is REF/REF. 
-If BP2 is also genotyped as REF/REF, then the variant genotype will be REF/REF. 
-The filter flag will be set as *PASS*.
+As we have calculated above, genotype of BP1 is REF/REF.
+
+If BP2 is also genotyped as REF/REF, then the variant genotype will be REF/REF.
+
+If BP1 and BP2 has different genotypes, a filter flag "CONFLICT" will be set. And:
+
+If they all pass the filters or all fail, the variant genotype will be re-caculated from the total number of reads covering the two breakpoints.
 
-If BP2 is not REF/REF and BP1 is the reference genotype, then the variant genotype will be the genotype of BP2. However, the filter flag will be set as *CONFLICTS* to indicate that not all breakpoints
-support this genotype. This may happen when for example when we type an event where the 
-start of the of the event is correct but the end is not.
+If one breakpoint passes the filters and the other one fails, the variant genotype will be set as the same genotype as the passed breakpoint.
diff --git a/doc/graph-readcounting-edges.png b/doc/graph-readcounting-edges.png
diff --git a/doc/graph-readcounting-nodes.png b/doc/graph-readcounting-nodes.png
diff --git a/doc/graph-readcounting-path-support.png b/doc/graph-readcounting-path-support.png
diff --git a/doc/graph-readcounting-paths.png b/doc/graph-readcounting-paths.png
diff --git a/doc/snp.png b/doc/snp.png
diff --git a/doc/validation-with-simulated-reads.md b/doc/validation-with-simulated-reads.md
@@ -0,0 +1,50 @@
+# Alignment validation with simulated reads
+
+The following describes steps required to assess accuracy of paragraph alignment using simulated data reads.
+
+## Generating reference sequences
+
+In order to simulate data reads, an input reference is required. `bin/graph-to-fasta` will produce a separate contig for each path in the supplied graphs:
+
+`/path/to/paragraph/bin/graph-to-fasta -r /path/to/full/genome.fa -g /path/to/graphs/*.json >graphs.fa`
+
+## Running EAGLE simulation to produce reads
+
+When building paragraph `configure --with-eagle=/path/to/EAGLE` can be used to enable paragraph/EAGLE integration. If this has not been done `libexec/simulate-reads.sh --with-eagle /path/to/EAGLE` can still be used to perform simulation. For example:
+
+`/path/to/paragraph/libexec/simulate-reads.sh --graph-genome graphs.fa --reference-genome=/path/to/whole/genome.fa --with-eagle /path/to/EAGLE`
+
+By default `simulate-reads.sh` will attempt to use all cores available on the node. Use `--jobs` to override.
+
+Once the `simulate-reads.sh` completes, the current folder will contain `simulated.bam` file with reads generated in such way that paragraph will load them when processing corresponding graph.json files.
+
+## Running validation
+
+It is important to run validation on a single thread. Otherwise each threads will produce a validation report which might not be a desired outcome.
+
+The following example runs validation for a combination of kmer and graph aligners:
+
+`/path/to/paragraph/bin/paragraph -r /path/to/whole/genome.fa -g /path/to/graphs/*.json -b simulated.bam -o aligned.json --exact no --graph-se yes --kmer no --validate --threads 1`
+
+Once paragraph completes, the log contains the validation summary table:
+
+```
+[2018-02-28 18:03:22.190] [paragraph] [info] [Done with alignment step 376 total aligned (exact: 0 / kmers: 0 / sw: 376) ; 2 were filtered]
+[2018-02-28 18:03:22.190] [paragraph] [info] [VALIDATION]       MAPQ    EmpMAPQ Wrong   Total
+[2018-02-28 18:03:22.190] [paragraph] [info] [VALIDATION]       unalgnd 0       0       0
+[2018-02-28 18:03:22.190] [paragraph] [info] [VALIDATION]       repeat  0       0       0
+[2018-02-28 18:03:22.190] [paragraph] [info] [VALIDATION]       60      60      0       376
+```
+
+The table columns have the following meaning:
+* MAPQ     MAPQ assigned by validation
+  * unalgnd unaligned reads
+  * repeat  reads that had more than one equivalent best candiate alignment
+  * 60      Uniquely aligned reads
+* EmpMAPQ  MAPQ computed using the following formula: -10 * log10(Wrong/Total)
+* Wrong    Number of alignments that don't support the path from which they have been simulated
+* Total    Total number of aligned reads
+
+# References
+
+EAGLE on github [https://github.com/sequencing/EAGLE](https://github.com/sequencing/EAGLE)
diff --git a/external/htslib-1.8.tar.gz b/external/htslib-1.8.tar.gz
diff --git a/external/htslib.tar.gz b/external/htslib.tar.gz
diff --git a/share/test-data/multiparagraph/expected.json b/share/test-data/multiparagraph/expected.json
@@ -249,9 +249,9 @@
                 "INS_RF:FWD": 1,
                 "INS_RF:READS": 1,
                 "INS_RF:REV": 0,
-                "LF_INS": 2,
-                "LF_INS:FWD": 2,
-                "LF_INS:READS": 2,
+                "LF_INS": 1,
+                "LF_INS:FWD": 1,
+                "LF_INS:READS": 1,
                 "LF_INS:REV": 0,
                 "LF_MID": 2,
                 "LF_MID:FWD": 2,
@@ -296,9 +296,9 @@
                     "total:REV": 0
                 },
                 "INS": {
-                    "total": 2,
-                    "total:FWD": 2,
-                    "total:READS": 2,
+                    "total": 1,
+                    "total:FWD": 1,
+                    "total:READS": 1,
                     "total:REV": 0
                 },
                 "REF": {
@@ -706,9 +706,9 @@
                 "INS_RF:FWD": 1,
                 "INS_RF:READS": 1,
                 "INS_RF:REV": 0,
-                "LF_INS": 2,
-                "LF_INS:FWD": 2,
-                "LF_INS:READS": 2,
+                "LF_INS": 1,
+                "LF_INS:FWD": 1,
+                "LF_INS:READS": 1,
                 "LF_INS:REV": 0,
                 "LF_RF": 2,
                 "LF_RF:FWD": 2,
@@ -739,9 +739,9 @@
             },
             "read_counts_by_sequence": {
                 "INS": {
-                    "total": 2,
-                    "total:FWD": 2,
-                    "total:READS": 2,
+                    "total": 1,
+                    "total:FWD": 1,
+                    "total:READS": 1,
                     "total:REV": 0
                 },
                 "REF": {