Skip to content

Commit

Permalink
Merge branch 'master' of github.com:dib-lab/khmer into feature/pathlink
Browse files Browse the repository at this point in the history
  • Loading branch information
ctb committed Jun 27, 2016
2 parents 7125a33 + 8760d84 commit 6e9650e
Show file tree
Hide file tree
Showing 14 changed files with 537 additions and 450 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,4 @@ compile_commands.json
pylint_report.txt
pep8_report.txt
pep257_report.txt
.cache/
37 changes: 37 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
2016-06-26 Titus Brown <titus@idyll.org>

* khmer/_khmer.cc, lib/{hashtable.cc, hashtable.hh},
tests/test-data/simple-genome.fa, tests/test_nodegraph.py: added functions
'find_high_degree_nodes' and 'traverse_linear_path' to hashtables/graphs.
* lib/kmer_hash.cc: minor change to use object member _seq instead of
constructor argument seq in KmerIterator.
* lib/kmer_hash.hh: added const to Kmer::get_string_rep(...) signature.
* sandbox/extract-compact-dbg.py: new sandbox script to
extract compact De Bruijn graphs (with linear paths contracted).
* tests/test_sandbox_scripts.py: added tests for the extract-compact-dbg.py
script.

2016-06-26 Titus Brown <titus@idyll.org>

* lib/khmer.hh,lib/{labelhash.cc,labelhash.hh}: removed label pointer-based
Expand All @@ -6,6 +19,30 @@
* tests/test_labelhash.py: renamed to match new API; no changes in test
logic.

2016-06-17 Daniel Standage <daniel.standage@gmail.com>

* scripts/filter-abund-single.py: add -o/--outfile option.
* tests/test_script_output.py: move `_calc_md5` function for calculating
file MD5 hashes to test utils module (tests/khmer_tst_utils.py).
* tests/{test_filter_abund.py,test_scripts.py}: move filter_abund tests to
dedicated test script, add minimal test for new -o option.
* tests/test-data/paired-mixed-witherror.fa.pe: add data file in support of
new test.
* .gitignore: add .cache/ directory, which appears to be an artifact of the
py.test framework.

2016-06-17 Daniel Standage <daniel.standage@gmail.com>

* scripts/filter-abund-single.py: add -o/--outfile option.
* tests/test_script_output.py: move `_calc_md5` function for calculating file
MD5 hashes to test utils module (tests/khmer_tst_utils.py).
* tests/{test_filter_abund.py,test_scripts.py}: move filter_abund tests to
dedicated test script, add minimal test for new -o option.
* tests/test-data/paired-mixed-witherror.fa.pe: add data file in support of
new test.
* .gitignore: add .cache/ directory, which appears to be an artifact of the
py.test framework.

2016-05-25 Titus Brown <titus@idyll.org>

* scripts/trim-low-abund.py: switched to watermark-based reporting to
Expand Down
6 changes: 1 addition & 5 deletions lib/hashtable.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1027,10 +1027,6 @@ unsigned int Hashtable::traverse_linear_path(const Kmer seed_kmer,
{
unsigned int size = 0;

auto filter = [&] (Kmer& n) -> bool {
return true;
};

Traverser traverser(this);

// if this k-mer is in the Bloom filter, truncate search.
Expand All @@ -1050,7 +1046,7 @@ unsigned int Hashtable::traverse_linear_path(const Kmer seed_kmer,
size += 1;

KmerQueue node_q;
traverser.traverse(kmer, node_q, filter);
traverser.traverse(kmer, node_q);

while (node_q.size()) {
Kmer node = node_q.front();
Expand Down
1 change: 0 additions & 1 deletion lib/kmer_hash.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ Contact: khmer-project@idyll.org
#include <string.h>
#include <algorithm>
#include <string>
#include <iostream>

#include "MurmurHash3.h"
#include "khmer.hh"
Expand Down
3 changes: 2 additions & 1 deletion lib/kmer_hash.hh
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ HashIntoType _hash_murmur_forward(const std::string& kmer);
*/
class Kmer
{

public:

/// The forward hash
Expand Down Expand Up @@ -209,7 +210,7 @@ public:
*
* Contact: camille.scott.w@gmail.com
*
*/
*/
class KmerFactory
{
protected:
Expand Down
65 changes: 59 additions & 6 deletions sandbox/extract-compact-dbg.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

# graph settings
DEFAULT_KSIZE=31
NODEGRAPH_SIZE=8e8 # small, big is 2e8
NODEGRAPH_SIZE=8e8

# minhash settings
MH_SIZE_DIVISOR=50
Expand Down Expand Up @@ -72,9 +72,8 @@ def main():
parser.add_argument('-o', '--output', default=None)
parser.add_argument('-k', '--ksize', default=DEFAULT_KSIZE, type=int)
parser.add_argument('-x', '--tablesize', default=NODEGRAPH_SIZE,
type=float)
type=float)
parser.add_argument('--force', action='store_true')
#parser.add_argument('--gml', action='store_true')
args = parser.parse_args()

assert args.ksize % 2, "ksize must be odd"
Expand Down Expand Up @@ -159,11 +158,9 @@ def main():

# save to GML
if args.output:
import graph_writer

print('saving to', args.output)
fp = open(args.output, 'w')
w = graph_writer.GmlWriter(fp, [], [])
w = GmlWriter(fp, [], [])

for k, v in pathy.segments.items():
w.add_vertex(k, v, [])
Expand All @@ -173,5 +170,61 @@ def main():
w.add_edge(k, edge, [])


# Author of the below code: Dominik Moritz, originally for spacegraphcats.

class GmlWriter:
    """Incrementally write a graph to a file-like object in GML format.

    The opening ``graph [`` header is written as soon as the writer is
    constructed.  Call :meth:`add_vertex` / :meth:`add_edge` once per graph
    element and finish with :meth:`done`, which writes the closing bracket.
    The writer only ever calls ``file.write``; it never flushes or closes the
    underlying file.
    """

    def __init__(self, file, vertex_attributes=None, edge_attributes=None, directed=False):
        """Initialize graph writer.

        file -- object with a ``write(str)`` method that receives the output.
        vertex_attributes -- optional sequence of attribute names written for
            every vertex; may instead be supplied on the first add_vertex call.
        edge_attributes -- optional sequence of attribute names written for
            every edge; may instead be supplied on the first add_edge call.
        directed -- when true the header declares ``directed 1``, else
            ``directed 0``.
        """
        self.file = file

        # The attribute-name lists are deliberately left unset when None so
        # that add_vertex()/add_edge() can still bind them later.
        if vertex_attributes is not None:
            self.vertex_attributes = vertex_attributes
        if edge_attributes is not None:
            self.edge_attributes = edge_attributes

        if directed:
            self._write('graph [\n directed 1\n')
        else:
            self._write('graph [\n directed 0\n')

    def _write(self, string):
        """Send ``string`` verbatim to the underlying file object."""
        self.file.write(string)

    def _quote(self, value):
        """Wrap str values in double quotes; return anything else unchanged."""
        if isinstance(value, str):
            return '"{}"'.format(value)
        return value

    def _write_attributes(self, names, values):
        """Write one ``name value`` line per (name, value) pair.

        Pairs are formed with zip(), so surplus names or values are ignored.
        """
        for k, v in zip(names, values):
            self._write(' {} {}\n'.format(k, self._quote(v)))

    def add_vertex(self, id, size, attribute_values=(), vertex_attributes=None):
        """Add a vertex to the output.

        id -- vertex identifier, written as ``id <id>``.
        size -- written as ``size <size>``.
        attribute_values -- values paired positionally with the vertex
            attribute names.
        vertex_attributes -- attribute names to adopt if none have been set
            yet (they then stay in effect for later calls); ignored once
            names are already set.
        """
        # Only bind real names: storing None here would make the zip() below
        # raise TypeError and would also block a later attempt to set them.
        if not hasattr(self, 'vertex_attributes') and vertex_attributes is not None:
            self.vertex_attributes = vertex_attributes

        self._write(' node [\n')
        self._write(' id {}\n'.format(id))
        self._write(' size {}\n'.format(size))
        self._write_attributes(getattr(self, 'vertex_attributes', ()),
                               attribute_values)
        self._write(' ]\n')

    def add_edge(self, src, dest, attribute_values=(), edge_attributes=None):
        """Add an edge to the output.

        src -- written as ``source <src>``.
        dest -- written as ``target <dest>``.
        attribute_values -- values paired positionally with the edge
            attribute names.
        edge_attributes -- attribute names to adopt if none have been set
            yet (they then stay in effect for later calls); ignored once
            names are already set.
        """
        # Same reasoning as add_vertex(): never store None as the name list.
        if not hasattr(self, 'edge_attributes') and edge_attributes is not None:
            self.edge_attributes = edge_attributes

        self._write(' edge [\n')
        self._write(' source {}\n'.format(src))
        self._write(' target {}\n'.format(dest))
        self._write_attributes(getattr(self, 'edge_attributes', ()),
                               attribute_values)
        self._write(' ]\n')

    def done(self):
        """Call when done: writes the bracket that closes the graph."""
        self._write(']\n')


if __name__ == '__main__':
main()
80 changes: 0 additions & 80 deletions sandbox/graph_writer.py

This file was deleted.

17 changes: 12 additions & 5 deletions scripts/filter-abund-single.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#! /usr/bin/env python
# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) 2013-2015, Michigan State University.
# Copyright (C) 2015, The Regents of the University of California.
# Copyright (C) 2015-2016, The Regents of the University of California.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
Expand Down Expand Up @@ -88,6 +88,10 @@ def get_parser():
parser.add_argument('--savegraph', metavar="filename", default='',
help="If present, the name of the file to save the "
"k-mer countgraph to")
parser.add_argument('-o', '--outfile', metavar='optional_output_filename',
default=None, help='Override default output filename '
'and output trimmed sequences into a file with the '
'given filename.')
parser.add_argument('datafile', metavar='input_sequence_filename',
help="FAST[AQ] sequence file to trim")
parser.add_argument('-f', '--force', default=False, action='store_true',
Expand Down Expand Up @@ -153,14 +157,17 @@ def process_fn(record):

# the filtering loop
print('filtering', args.datafile, file=sys.stderr)
outfile = os.path.basename(args.datafile) + '.abundfilt'
outfile = open(outfile, 'wb')
outfp = get_file_writer(outfile, args.gzip, args.bzip)
if args.outfile is None:
outfile = os.path.basename(args.datafile) + '.abundfilt'
else:
outfile = args.outfile
outfp = open(outfile, 'wb')
outfp = get_file_writer(outfp, args.gzip, args.bzip)

tsp = ThreadedSequenceProcessor(process_fn)
tsp.start(verbose_loader(args.datafile), outfp)

print('output in', outfile.name, file=sys.stderr)
print('output in', outfile, file=sys.stderr)

if args.savegraph:
print('Saving k-mer countgraph filename',
Expand Down
7 changes: 7 additions & 0 deletions tests/khmer_tst_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
import traceback
import subprocess
from io import open # pylint: disable=redefined-builtin
from hashlib import md5

import pytest

Expand All @@ -53,6 +54,12 @@
from io import StringIO


def _calc_md5(fp):
m = md5()
m.update(fp.read())
return m.hexdigest()


def get_test_data(filename):
filepath = None
try:
Expand Down
12 changes: 12 additions & 0 deletions tests/test-data/paired-mixed-witherror.fa.pe
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
>895:1:37:17593:9954/1
GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
>895:1:37:17593:9954/2
GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
>895:1:37:17593:9954/1
GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
>895:1:37:17593:9954/2
GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
>895:1:37:17593:9954/1
GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
>895:1:37:17593:9954/2
GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGGCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
Loading

0 comments on commit 6e9650e

Please sign in to comment.