Merge branch 'master' of github.com:dib-lab/khmer into feature/pathlink

dib-lab · Jun 27, 2016 · 6e9650e · 6e9650e
2 parents 7125a33 + 8760d84
commit 6e9650e
Show file tree

Hide file tree

Showing 14 changed files with 537 additions and 450 deletions.
diff --git a/.gitignore b/.gitignore
@@ -51,3 +51,4 @@ compile_commands.json
 pylint_report.txt
 pep8_report.txt
 pep257_report.txt
+.cache/
diff --git a/ChangeLog b/ChangeLog
@@ -1,3 +1,16 @@
+2016-06-26  Titus Brown  <titus@idyll.org>
+
+   * khmer/_khmer.cc, lib/{hashtable.cc, hashtable.hh},
+   tests/test-data/simple-genome.fa, tests/test_nodegraph.py: added functions
+   'find_high_degree_nodes' and 'traverse_linear_path' to hashtables/graphs.
+   * lib/kmer_hash.cc: minor change to use object member _seq instead of
+   constructor argument seq in KmerIterator.
+   * lib/kmer_hash.hh: added const to Kmer::get_string_rep(...) signature.
+   * sandbox/extract-compact-dbg.py: new sandbox script to
+   extract compact De Bruijn graphs (with linear paths contracted).
+   * tests/test_sandbox_scripts.py: added tests for the extract-compact-dbg.py
+   script.
+
 2016-06-26  Titus Brown  <titus@idyll.org>
 
    * lib/khmer.hh,lib/{labelhash.cc,labelhash.hh}: removed label pointer-based
@@ -6,6 +19,30 @@
    * tests/test_labelhash.py: renamed to match new API; no changes in test
    logic.
 
+2016-06-17  Daniel Standage <daniel.standage@gmail.com>
+
+   * scripts/filter-abund-single.py: add -o/--outfile option.
+   * tests/test_script_output.py: move `_calc_md5` function for calculating
+   file MD5 hashes to test utils module (tests/khmer_tst_utils.py).
+   * tests/{test_filter_abund.py,test_scripts.py}: move filter_abund tests to
+   dedicated test script, add minimal test for new -o option.
+   * tests/test-data/paired-mixed-witherror.fa.pe: add data file in support of
+   new test.
+   * .gitignore: add .cache/ directory, which appears to be an artifact of the
+   py.test framework.
+
+2016-06-17  Daniel Standage <daniel.standage@gmail.com>
+
+  * scripts/filter-abund-single.py: add -o/--outfile option.
+  * tests/test_script_output.py: move `_calc_md5` function for calculating file
+  MD5 hashes to test utils module (tests/khmer_tst_utils.py).
+  * tests/{test_filter_abund.py,test_scripts.py}: move filter_abund tests to
+  dedicated test script, add minimal test for new -o option.
+  * tests/test-data/paired-mixed-witherror.fa.pe: add data file in support of
+  new test.
+  * .gitignore: add .cache/ directory, which appears to be an artifact of the
+  py.test framework.
+
 2016-05-25  Titus Brown  <titus@idyll.org>
 
    * scripts/trim-low-abund.py: switched to watermark-based reporting to

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
@@ -1027,10 +1027,6 @@ unsigned int Hashtable::traverse_linear_path(const Kmer seed_kmer,
 {
     unsigned int size = 0;
 
-    auto filter = [&] (Kmer& n) -> bool {
-        return true;
-    };
-
     Traverser traverser(this);
 
     // if this k-mer is in the Bloom filter, truncate search.
@@ -1050,7 +1046,7 @@ unsigned int Hashtable::traverse_linear_path(const Kmer seed_kmer,
         size += 1;
 
         KmerQueue node_q;
-        traverser.traverse(kmer, node_q, filter);
+        traverser.traverse(kmer, node_q);
 
         while (node_q.size()) {
             Kmer node = node_q.front();

diff --git a/lib/kmer_hash.cc b/lib/kmer_hash.cc
@@ -40,7 +40,6 @@ Contact: khmer-project@idyll.org
 #include <string.h>
 #include <algorithm>
 #include <string>
-#include <iostream>
 
 #include "MurmurHash3.h"
 #include "khmer.hh"

diff --git a/lib/kmer_hash.hh b/lib/kmer_hash.hh
@@ -132,6 +132,7 @@ HashIntoType _hash_murmur_forward(const std::string& kmer);
  */
 class Kmer
 {
+
 public:
 
     /// The forward hash
@@ -209,7 +210,7 @@ public:
  *
  * Contact: camille.scott.w@gmail.com
  *
-*/
+ */
 class KmerFactory
 {
 protected:

diff --git a/sandbox/extract-compact-dbg.py b/sandbox/extract-compact-dbg.py
@@ -8,7 +8,7 @@
 
 # graph settings
 DEFAULT_KSIZE=31
-NODEGRAPH_SIZE=8e8 # small, big is 2e8
+NODEGRAPH_SIZE=8e8
 
 # minhash settings
 MH_SIZE_DIVISOR=50
@@ -72,9 +72,8 @@ def main():
     parser.add_argument('-o', '--output', default=None)
     parser.add_argument('-k', '--ksize', default=DEFAULT_KSIZE, type=int)
     parser.add_argument('-x', '--tablesize', default=NODEGRAPH_SIZE,
-                            type=float)
+                        type=float)
     parser.add_argument('--force', action='store_true')
-    #parser.add_argument('--gml', action='store_true')
     args = parser.parse_args()
 
     assert args.ksize % 2, "ksize must be odd"
@@ -159,11 +158,9 @@ def main():
 
     # save to GML
     if args.output:
-        import graph_writer
-
         print('saving to', args.output)
         fp = open(args.output, 'w')
-        w = graph_writer.GmlWriter(fp, [], [])
+        w = GmlWriter(fp, [], [])
 
         for k, v in pathy.segments.items():
             w.add_vertex(k, v, [])
@@ -173,5 +170,61 @@ def main():
                 w.add_edge(k, edge, [])
 
 
+# Author of the below code: Dominik Moritz, originally for spacegraphcats.
+
+class GmlWriter:
+    """Write a graph to a file-like object in GML format.
+
+    The opening ``graph [`` header is written on construction.  Call
+    ``add_vertex`` / ``add_edge`` once per element, then ``done`` to write
+    the closing bracket.  The writer never closes ``file`` itself.
+    """
+
+    def __init__(self, file, vertex_attributes=None, edge_attributes=None,
+                 directed=False):
+        """Initialize graph writer and emit the GML header.
+
+        file -- object with a ``write(str)`` method.
+        vertex_attributes -- names for the extra per-vertex values, or None
+            to let the first ``add_vertex`` call that names them decide.
+        edge_attributes -- same, for edges.
+        directed -- write ``directed 1`` instead of ``directed 0``.
+        """
+        self.file = file
+
+        # None means "not configured yet"; see add_vertex / add_edge.
+        self.vertex_attributes = vertex_attributes
+        self.edge_attributes = edge_attributes
+
+        self._write('graph [\n   directed {}\n'.format(1 if directed else 0))
+
+    def _write(self, string):
+        self.file.write(string)
+
+    def _quote(self, value):
+        # Only strings are wrapped in double quotes; other values are
+        # returned unchanged and rendered by str.format.
+        if isinstance(value, str):
+            return '"{}"'.format(value)
+        return value
+
+    def _write_attributes(self, names, values):
+        # Names and values are paired positionally; zip() stops at the
+        # shorter of the two.  Unset names (None) produce no output rather
+        # than a TypeError.
+        for k, v in zip(names or (), values):
+            self._write('    {} {}\n'.format(k, self._quote(v)))
+
+    def add_vertex(self, id, size, attribute_values=(), vertex_attributes=None):
+        """Add a vertex to the output.
+
+        ``vertex_attributes`` is only used if no attribute names have been
+        set so far; once set, the names stay fixed for later calls.
+        """
+        if self.vertex_attributes is None:
+            self.vertex_attributes = vertex_attributes
+
+        self._write('  node [\n')
+        self._write('    id {}\n'.format(id))
+        self._write('    size {}\n'.format(size))
+        self._write_attributes(self.vertex_attributes, attribute_values)
+        self._write('  ]\n')
+
+    def add_edge(self, src, dest, attribute_values=(), edge_attributes=None):
+        """Add an edge to the output.
+
+        ``edge_attributes`` is only used if no attribute names have been
+        set so far; once set, the names stay fixed for later calls.
+        """
+        if self.edge_attributes is None:
+            self.edge_attributes = edge_attributes
+
+        self._write('  edge [\n')
+        self._write('    source {}\n'.format(src))
+        self._write('    target {}\n'.format(dest))
+        self._write_attributes(self.edge_attributes, attribute_values)
+        self._write('  ]\n')
+
+    def done(self):
+        """Call when done: writes the bracket closing the ``graph [`` block."""
+        self._write(']\n')
+
+
 if __name__ == '__main__':
     main()
diff --git a/sandbox/graph_writer.py b/sandbox/graph_writer.py
diff --git a/scripts/filter-abund-single.py b/scripts/filter-abund-single.py
@@ -1,7 +1,7 @@
 #! /usr/bin/env python
 # This file is part of khmer, https://github.com/dib-lab/khmer/, and is
 # Copyright (C) 2013-2015, Michigan State University.
-# Copyright (C) 2015, The Regents of the University of California.
+# Copyright (C) 2015-2016, The Regents of the University of California.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are
@@ -88,6 +88,10 @@ def get_parser():
     parser.add_argument('--savegraph', metavar="filename", default='',
                         help="If present, the name of the file to save the "
                         "k-mer countgraph to")
+    parser.add_argument('-o', '--outfile', metavar='optional_output_filename',
+                        default=None, help='Override default output filename '
+                        'and output trimmed sequences into a file with the '
+                        'given filename.')
     parser.add_argument('datafile', metavar='input_sequence_filename',
                         help="FAST[AQ] sequence file to trim")
     parser.add_argument('-f', '--force', default=False, action='store_true',
@@ -153,14 +157,17 @@ def process_fn(record):
 
     # the filtering loop
     print('filtering', args.datafile, file=sys.stderr)
-    outfile = os.path.basename(args.datafile) + '.abundfilt'
-    outfile = open(outfile, 'wb')
-    outfp = get_file_writer(outfile, args.gzip, args.bzip)
+    if args.outfile is None:
+        outfile = os.path.basename(args.datafile) + '.abundfilt'
+    else:
+        outfile = args.outfile
+    outfp = open(outfile, 'wb')
+    outfp = get_file_writer(outfp, args.gzip, args.bzip)
 
     tsp = ThreadedSequenceProcessor(process_fn)
     tsp.start(verbose_loader(args.datafile), outfp)
 
-    print('output in', outfile.name, file=sys.stderr)
+    print('output in', outfile, file=sys.stderr)
 
     if args.savegraph:
         print('Saving k-mer countgraph filename',

diff --git a/tests/khmer_tst_utils.py b/tests/khmer_tst_utils.py
@@ -44,6 +44,7 @@
 import traceback
 import subprocess
 from io import open  # pylint: disable=redefined-builtin
+from hashlib import md5
 
 import pytest
 
@@ -53,6 +54,12 @@
     from io import StringIO
 
 
+def _calc_md5(fp):
+    """Return the hex MD5 digest of whatever a single ``fp.read()`` returns.
+
+    The content is read in one call, so ``fp`` is left at end-of-file.
+    NOTE(review): on Python 3 hashlib needs bytes, so ``fp`` should be
+    opened in binary mode -- confirm against callers.
+    """
+    contents = fp.read()
+    return md5(contents).hexdigest()
+
+
 def get_test_data(filename):
     filepath = None
     try:

diff --git a/tests/test-data/paired-mixed-witherror.fa.pe b/tests/test-data/paired-mixed-witherror.fa.pe
@@ -0,0 +1,12 @@
+>895:1:37:17593:9954/1
+GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
+>895:1:37:17593:9954/2
+GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
+>895:1:37:17593:9954/1
+GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
+>895:1:37:17593:9954/2
+GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
+>895:1:37:17593:9954/1
+GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
+>895:1:37:17593:9954/2
+GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGGCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG