feat(junction-annotation): by default discard unannotated contigs (#96)

* feat: add `--discard-unannotated-contigs` flag to `junction-annotation` * chore: run `poetry lock` * feat: update behavior to that discussed offline BREAKING CHANGE: Always annotate as `unannotated_reference` in the junctions file. By default, exclude from the summary report. With flag, call them "novel" in the summary * style: whitespace * docs: fix bad hyperlink * docs: add new `-c` param to documentation * chore: changes requested by @claymcleod * chore: add `default=False` for explicitness (per @claymcleod suggestion) * chore: make defaults match
stjudecloud · Mar 22, 2023 · d2f2e18 · d2f2e18
1 parent 26242b2
commit d2f2e18
Show file tree

Hide file tree

Showing 6 changed files with 78 additions and 59 deletions.
diff --git a/docs/subcommands/encoding.md b/docs/subcommands/encoding.md
@@ -4,7 +4,7 @@ The `encoding` subcommand detects the likely PHRED score encoding schema used fo
 
 As a hypothetical example of a mis-translation; a FASTQ sequenced by an Illumina1.3 machine has PHRED quality scores ranging from 6-22. They would be encoded as ASCII values 70-86. If those ASCII values are not adjusted during BAM alignment, someone decoding the PHRED score according to the PHRED+33 specification would think the quality range for that sample was 37-53. This is erroneously high, and may convey undue confidence in the quality of the BAM.
 
-`ngsderive`'s encoding check implementation is based on details of the encoding schemes described [here][https://en.wikipedia.org/wiki/FASTQ_format#Encoding].
+`ngsderive`'s encoding check implementation is based on details of the encoding schemes described [here](https://en.wikipedia.org/wiki/FASTQ_format#Encoding).
 
 ## Limitations
 

diff --git a/docs/subcommands/junction_annotation.md b/docs/subcommands/junction_annotation.md
@@ -6,9 +6,10 @@ The following criteria are adjustable for junction detection and read filtering:
 
 | Option                               | Help                                                                                                  |
 | ------------------------------------ | ----------------------------------------------------------------------------------------------------- |
-| `-i`, `--min-intron`                 | Minimum intron length to be considered a junction (default: `50`)                                     |
-| `-q`, `--min-mapq`                   | Minimum read quality to be considered a supporting read (default: `30`)                               |
-| `-m`, `--min-reads`                  | Minimum number of reads supporting a junction for it to be reported (default: `2`)                    |
-| `-k`, `--fuzzy-junction-match-range` | Consider exonic starts/ends within `+-k` bases of a known intron start/end to be known (default: `0`) |
+| `-i`, `--min-intron`                            | Minimum intron length to be considered a junction (default: `50`)                                     |
+| `-q`, `--min-mapq`                              | Minimum read quality to be considered a supporting read (default: `30`)                               |
+| `-m`, `--min-reads`                             | Minimum number of reads supporting a junction for it to be reported (default: `2`)                    |
+| `-k`, `--fuzzy-junction-match-range`            | Consider exonic starts/ends within `+-k` bases of a known intron start/end to be known (default: `0`) |
+| `-c`, `--consider-unannotated-references-novel` | For the summary report, consider all events on unannotated reference sequences `complete_novel`. Default is to exclude them from the summary. Either way, they will be annotated as `unannotated_reference` in the junctions file. (default: False) |
 
-In the resulting `<basename>.junctions.tsv` files, note that the coordinates are 0-based, end-exclusive. Generation of these files can be disabled with `--disable-junction-files`. Instead only the summary file of junction and splice counts will be generated.
+In the resulting `<basename>.junctions.tsv` files, note that the coordinates are 0-based, end-exclusive. Generation of these files can be disabled with `--disable-junction-files`. Instead only the summary of junction and splice counts will be generated.
diff --git a/ngsderive/__main__.py b/ngsderive/__main__.py
@@ -94,8 +94,8 @@ class SaneFormatter(
         "--max-tries",
         type=int,
         default=3,
-        help="When inconclusive, the test will repeat until this many tries have been reached. \
-            Evidence of previous attempts is saved and reused, leading to a larger sample size with multiple attempts.",
+        help="When inconclusive, the test will repeat until this many tries have been reached. "
+             "Evidence of previous attempts is saved and reused, leading to a larger sample size with multiple attempts.",
     )
     strandedness.add_argument(
         "--max-iterations-per-try",
@@ -201,6 +201,15 @@ class SaneFormatter(
         help="Consider found splices within `+-k` bases of a known splice event annotated.",
         default=0,
     )
+    junction_annotation.add_argument(
+        "-c",
+        "--consider-unannotated-references-novel",
+        action="store_true",
+        default=False,
+        help="For the summary report, consider all events on unannotated reference sequences `complete_novel`. "
+             "Default is to exclude them from the summary. "
+             "Either way, they will be annotated as `unannotated_reference` in the junctions file.",
+    )
 
     args = parser.parse_args()
     if not args.subcommand:
@@ -310,6 +319,7 @@ def run():
             min_mapq=args.min_mapq,
             min_reads=args.min_reads,
             fuzzy_range=args.fuzzy_junction_match_range,
+            consider_unannotated_references_novel=args.consider_unannotated_references_novel,
             junction_dir=args.junction_files_dir,
             disable_junction_files=args.disable_junction_files,
         )
diff --git a/ngsderive/commands/junction_annotation.py b/ngsderive/commands/junction_annotation.py
@@ -30,6 +30,7 @@ def annotate_junctions(
     min_mapq,
     min_reads,
     fuzzy_range,
+    consider_unannotated_references_novel,
     junction_dir,
     disable_junction_files,
 ):
@@ -77,7 +78,7 @@ def annotate_junctions(
 
     for contig in samfile.references:
         num_too_few_reads = 0
-        logger.debug(f"Searching {contig} for splice junctions...")
+        logger.info(f"Searching {contig} for splice junctions...")
         found_introns = samfile.find_introns(
             [seg for seg in samfile.fetch(contig) if seg.mapping_quality >= min_mapq]
         )
@@ -97,14 +98,19 @@ def annotate_junctions(
         )
 
         if contig not in cache.exon_starts:
-            logger.info(f"{contig} not found in GFF. All events novel.")
-            annotation = "complete_novel"
+            logger.info(f"{contig} not found in GFF. All events marked `unannotated_reference`.")
+            annotation = "unannotated_reference"
+            if consider_unannotated_references_novel:
+                logger.info(f"Events being considered novel for summary report.")
+
             for intron_start, intron_end, num_reads in events:
                 if num_reads < min_reads:
                     num_too_few_reads += 1
                     continue
-                num_novel += 1
-                num_novel_spliced_reads += num_reads
+                if consider_unannotated_references_novel:
+                    num_novel += 1
+                    num_novel_spliced_reads += num_reads
+
                 if junction_file:
                     print(
                         "\t".join(
@@ -253,8 +259,9 @@ def main(
     outfile=sys.stdout,
     min_intron=50,
     min_mapq=30,
-    min_reads=1,
+    min_reads=2,
     fuzzy_range=0,
+    consider_unannotated_references_novel=False,
     junction_dir="./",
     disable_junction_files=False,
 ):
@@ -264,36 +271,26 @@ def main(
     logger.info("  - Minimum MAPQ: {}".format(min_mapq))
     logger.info("  - Minimum reads per junction: {}".format(min_reads))
     logger.info("  - Fuzzy junction range: +-{}".format(fuzzy_range))
+    logger.info("  - Consider unannotated references novel: {}".format(consider_unannotated_references_novel))
+    if not disable_junction_files:
+        logger.info("  - Junction file directory: {}".format(junction_dir))
+    else:
+        logger.info("  - Junction file directory: <disabled>")
 
-    logger.debug("Processing gene model...")
+    logger.info("Processing gene model...")
     gff = GFF(
         gene_model_file,
         feature_type="exon",
         dataframe_mode=False,
     )
     cache = JunctionCache(gff)
-    logger.debug("Done")
-
-    fieldnames = [
-        "File",
-        "total_junctions",
-        "total_splice_events",
-        "known_junctions",
-        "partial_novel_junctions",
-        "complete_novel_junctions",
-        "known_spliced_reads",
-        "partial_novel_spliced_reads",
-        "complete_novel_spliced_reads",
-    ]
-
-    writer = csv.DictWriter(outfile, fieldnames=fieldnames, delimiter="\t")
-    writer.writeheader()
-    outfile.flush()
+    logger.info("Done")
 
     junction_dir = Path(junction_dir)
     if not disable_junction_files:
         junction_dir.mkdir(parents=True, exist_ok=True)
 
+    writer = None
     for ngsfilepath in ngsfiles:
         entry = annotate_junctions(
             ngsfilepath,
@@ -302,8 +299,25 @@ def main(
             min_mapq=min_mapq,
             min_reads=min_reads,
             fuzzy_range=fuzzy_range,
+            consider_unannotated_references_novel=consider_unannotated_references_novel,
             junction_dir=junction_dir,
             disable_junction_files=disable_junction_files,
         )
+
+        if not writer:
+            fieldnames = [
+                "File",
+                "total_junctions",
+                "total_splice_events",
+                "known_junctions",
+                "partial_novel_junctions",
+                "complete_novel_junctions",
+                "known_spliced_reads",
+                "partial_novel_spliced_reads",
+                "complete_novel_spliced_reads",
+            ]
+
+            writer = csv.DictWriter(outfile, fieldnames=fieldnames, delimiter="\t")
+            writer.writeheader()
         writer.writerow(entry)
         outfile.flush()
diff --git a/ngsderive/commands/strandedness.py b/ngsderive/commands/strandedness.py
@@ -335,9 +335,7 @@ def main(
         fieldnames = ["ReadGroup"] + fieldnames
     fieldnames = ["File"] + fieldnames
 
-    writer = csv.DictWriter(outfile, fieldnames=fieldnames, delimiter="\t")
-    writer.writeheader()
-    outfile.flush()
+    writer = None
 
     for ngsfilepath in ngsfiles:
         tries_for_file = 0
@@ -369,6 +367,9 @@ def main(
             if entries_contains_inconclusive and tries_for_file < max_tries:
                 continue
 
+            if not writer:
+                writer = csv.DictWriter(outfile, fieldnames=fieldnames, delimiter="\t")
+                writer.writeheader()
             for entry in entries:
                 writer.writerow(entry)
                 outfile.flush()

diff --git a/poetry.lock b/poetry.lock