
Merge pull request #1 from ged-lab/bleeding-edge
Bleeding edge
camillescott committed Aug 8, 2013
2 parents 07dcc9e + 15bbe80 commit 0faabbd
Showing 17 changed files with 1,518 additions and 128 deletions.
27 changes: 14 additions & 13 deletions Doxyfile
@@ -52,7 +52,7 @@ PROJECT_LOGO =
# If a relative path is entered, it will be relative to the location
# where doxygen was started. If left blank the current directory will be used.

-OUTPUT_DIRECTORY = /home/mcrusoe/khmer/bleeding-edge/doc/doxygen
+OUTPUT_DIRECTORY = doc/doxygen

# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
# 4096 sub-directories (in 2 levels) under the output directory of each output
@@ -665,9 +665,9 @@ WARN_LOGFILE =
# directories like "/usr/src/myproject". Separate the files or directories
# with spaces.

-INPUT = /home/mcrusoe/khmer/bleeding-edge \
-/home/mcrusoe/khmer/bleeding-edge/lib \
-/home/mcrusoe/khmer/bleeding-edge/python
+INPUT = . \
+lib \
+python

# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
@@ -732,15 +732,16 @@ RECURSIVE = YES
# Note that relative paths are relative to the directory from which doxygen is
# run.

-EXCLUDE = /home/mcrusoe/khmer/bleeding-edge/python/build \
-/home/mcrusoe/khmer/bleeding-edge/lib/zlib \
-/home/mcrusoe/khmer/bleeding-edge/lib/bzip2 \
-/home/mcrusoe/khmer/bleeding-edge/figuregen \
-/home/mcrusoe/khmer/bleeding-edge/sandbox \
-/home/mcrusoe/khmer/bleeding-edge/tests \
-/home/mcrusoe/khmer/bleeding-edge/novelty \
-/home/mcrusoe/khmer/bleeding-edge/doc \
-/home/mcrusoe/khmer/bleeding-edge/plots
+EXCLUDE = python/build \
+lib/zlib \
+lib/bzip2 \
+figuregen \
+sandbox \
+tests \
+novelty \
+doc \
+plots \
+.env

# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
# directories that are symbolic links (a Unix file system feature) are excluded
8 changes: 7 additions & 1 deletion doc/development.txt
@@ -13,12 +13,18 @@ For C++, let's use `Todd Hoff's coding standard
<http://www.possibility.com/Cpp/CppCodingStandard.html>`__, and
`astyle -A10 / "One True Brace Style"
<http://astyle.sourceforge.net/astyle.html>`__ indentation and
-bracing.
+bracing. Note: we need to find vi and emacs settings that work for this!

For Python, `PEP 8 <http://www.python.org/dev/peps/pep-0008/>`__ seems
like a good standard to default to, and there are tools to check it,
too.

+git and github strategies
+-------------------------

+Still in the works, but read `this
+<http://scottchacon.com/2011/08/31/github-flow.html>`__.

Testing
-------

13 changes: 13 additions & 0 deletions doc/index.txt
@@ -10,6 +10,11 @@ khmer -- k-mer counting & filtering FTW
:Contact: ctb@msu.edu
:License: BSD

+khmer is a library and suite of command line tools for working with
+DNA sequence. It is primarily aimed at short-read sequencing data
+such as that produced by the Illumina platform. khmer takes a k-mer-centric
+approach to sequence analysis, hence the name.

There are two mailing lists dedicated to khmer, an announcements-only list and
a discussion list. To search their archives and sign-up for them, please visit
the following URLs:
@@ -20,6 +25,14 @@ the following URLs:

The archives for the khmer list are available at: http://lists.idyll.org/pipermail/khmer/

+khmer development has largely been supported by AFRI Competitive Grant
+no. `2010-65205-20361
+<http://ged.msu.edu/downloads/2009-usda-vertex.pdf>`__ from the USDA
+NIFA, and is now funded by the National Human Genome Research
+Institute of the National Institutes of Health under Award Number
+`R01HG007513 <http://ged.msu.edu/downloads/2012-bigdata-nsf.pdf>`__,
+both to C. Titus Brown.

Contents:

.. toctree::
77 changes: 74 additions & 3 deletions doc/scripts.txt
@@ -1,6 +1,6 @@
-========================
-Ready-made khmer scripts
-========================
+==============================
+khmer's command-line interface
+==============================

For -x and -N parameters, see :doc:`choosing-hash-sizes`.

@@ -16,6 +16,12 @@ k-mer counting
Build a counting hash table using the sequences in <file1-N>, and save it
to <output.kh>.

+Note that with the '-b' option <output.kh> will be the exact size
+of your hashtables, and load-into-counting.py will be constant
+memory; in exchange, k-mer counts will stop at 255. The memory
+usage of this script with '-b' will be about 1.15x the product of
+the -x and -N numbers.

Example::

scripts/load-into-counting.py -k 20 -x 5e7 out.kh data/100k-filtered.fa
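
As a quick capacity check, the "1.15x the product of the -x and -N numbers"
estimate above can be computed directly; a minimal sketch (a hypothetical
helper, not shipped with khmer)::

    def estimated_memory_gb(x, n_tables, overhead=1.15):
        # Rough memory estimate for load-into-counting.py with -b:
        # ~1.15x the product of -x (slots per table) and -N (number
        # of tables), at one byte per slot.
        return overhead * x * n_tables / 1e9

    print(estimated_memory_gb(5e7, 4))  # -x 5e7, -N 4 -> ~0.23 GB
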
@@ -45,6 +51,33 @@ k-mer counting
are (1) k-mer abundance, (2) k-mer count, (3) cumulative count, (4) fraction
of total distinct k-mers.

+**abundance-dist-single.py**: calculate the k-mer abundance distribution for a single file

+Usage::

+scripts/abundance-dist-single.py [ options ] <datafile> <histout>

+Count the k-mer abundance distribution in <datafile>; output
+distribution to <histout>. (This is a single-step/in-memory
+version of abundance-dist.py; no counting hash file will be
+created unless --savehash is specified.)

+Note that with the '-b' option this script is constant memory; in
+exchange, k-mer counts will stop at 255. The memory usage of this
+script with '-b' will be about 1.15x the product of the -x and -N
+numbers.

+To count k-mers in multiple files, use load-into-counting.py and
+abundance-dist.py.

+Example::

+scripts/abundance-dist-single.py -k 20 -x 5e7 data/100k-filtered.fa out.hist

+The file 'out.hist' will contain the histogram of k-mer abundances. Columns
+are (1) k-mer abundance, (2) k-mer count, (3) cumulative count, (4) fraction
+of total distinct k-mers.
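
The four-column histogram emitted by both abundance-dist scripts is easy to
post-process; a minimal sketch (a hypothetical reader, not one of khmer's
shipped scripts)::

    def read_hist(path):
        # Parse a .hist file whose whitespace-separated columns are
        # (1) k-mer abundance, (2) k-mer count, (3) cumulative count,
        # (4) fraction of total distinct k-mers.
        rows = []
        with open(path) as fp:
            for line in fp:
                abundance, count, cumulative, fraction = line.split()
                rows.append((int(abundance), int(count),
                             int(cumulative), float(fraction)))
        return rows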

**filter-abund.py**: trim sequences at a min k-mer abundance.

Usage::
@@ -65,6 +98,26 @@
scripts/load-into-counting.py -k 20 -x 5e7 table.kh data/100k-filtered.fa
scripts/filter-abund.py -C 2 table.kh data/100k-filtered.fa

+**filter-abund-single.py**: trim sequences at a min k-mer abundance.

+Usage::

+filter-abund-single.py [ -C <cutoff> ... ] <file>

+Trim sequences at k-mer abundance < C. (This is an in-memory
+version of filter-abund.py that works for single files.)

+Trimmed sequences will be placed in <file>.abundfilt.

+This script is constant memory.

+To trim reads based on k-mer abundance across multiple files, use
+load-into-counting and filter-abund.

+Example::

+scripts/filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa

**count-median.py**: count median, average, and stddev of k-mer abundance in sequences.

Usage::
@@ -246,10 +299,28 @@ redundant sequences)

Paired end reads can be considered together if the ``-p`` flag is set.

+With the ``-s`` option, the counting hash will be saved to the
+specified file after all sequences have been processed. With ``-d``,
+the counting hash will be saved every d files for multifile runs;
+if ``-s`` is set, the specified name will be used, and if not,
+the name backup.ht will be used.

+The ``-f`` flag will force the program to continue upon encountering
+a formatting error in a sequence file; the counting hash up to that
+point will be dumped, and processing will continue on the next file.

Example::

scripts/normalize-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

Example::

scripts/normalize-by-median.py -p -k 17 tests/test-data/test-abund-read-paired.fa

+Example::

+scripts/normalize-by-median.py -k 17 -f tests/test-data/test-error-reads.fq tests/test-data/test-fastq-reads.fq

+Example::

+scripts/normalize-by-median.py -k 17 -d 2 -s test.ht tests/test-data/test-abund-read-2.fa tests/test-data/test-fastq-reads.fq
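
The ``-d``/``-s`` checkpointing described above amounts to logic along the
following lines (a simplified sketch, not the script's actual code;
``consume_fasta`` stands in for the real per-file diginorm pass)::

    def process_files(filenames, ht, d=0, savehash=None):
        for n, filename in enumerate(filenames, start=1):
            ht.consume_fasta(filename)  # stand-in for the real work
            if d and n % d == 0:
                # every d files, checkpoint the counting hash; -s names
                # the file, otherwise 'backup.ht' is used
                ht.save(savehash or 'backup.ht')
        if savehash:
            ht.save(savehash)
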
28 changes: 13 additions & 15 deletions lib/counting.cc
@@ -141,10 +141,9 @@ BoundedCounterType CountingHash::get_max_count(const std::string &s,
return max_count;
}

-HashIntoType * CountingHash::abundance_distribution(std::string filename,
-Hashbits * tracking,
-CallbackFn callback,
-void * callback_data) const
+HashIntoType *
+CountingHash::abundance_distribution(read_parsers::IParser * parser,
+Hashbits * tracking)
{
HashIntoType * dist = new HashIntoType[MAX_BIGCOUNT + 1];
HashIntoType i;
@@ -154,7 +153,7 @@ HashIntoType * CountingHash::abundance_distribution(std::string filename,
}

Read read;
-IParser* parser = IParser::get_parser(filename.c_str());

string name;
string seq;
unsigned long long read_num = 0;
@@ -186,20 +185,19 @@
}

read_num += 1;

-// run callback, if specified
-if (read_num % CALLBACK_PERIOD == 0 && callback) {
-try {
-callback("abundance_distribution", callback_data, read_num, 0);
-} catch (...) {
-throw;
-}
-}
}

return dist;
}


+HashIntoType * CountingHash::abundance_distribution(std::string filename,
+Hashbits * tracking)
+{
+IParser* parser = IParser::get_parser(filename.c_str());

+return abundance_distribution(parser, tracking);
+}

HashIntoType * CountingHash::fasta_count_kmers_by_position(const std::string &inputfile,
const unsigned int max_read_len,
BoundedCounterType limit_by_count,
6 changes: 3 additions & 3 deletions lib/counting.hh
@@ -191,10 +191,10 @@ namespace khmer {
BoundedCounterType &kadian,
unsigned int nk = 1);

+HashIntoType * abundance_distribution(read_parsers::IParser * parser,
+Hashbits * tracking);
HashIntoType * abundance_distribution(std::string filename,
-Hashbits * tracking,
-CallbackFn callback = NULL,
-void * callback_data = NULL) const;
+Hashbits * tracking);

HashIntoType * fasta_count_kmers_by_position(const std::string &inputfile,
const unsigned int max_read_len,
47 changes: 42 additions & 5 deletions python/_khmermodule.cc
@@ -1833,9 +1833,7 @@ static PyObject * hash_abundance_distribution(PyObject * self, PyObject * args)

char * filename = NULL;
PyObject * tracking_obj = NULL;
-PyObject * callback_obj = NULL;
-if (!PyArg_ParseTuple(args, "sO|O", &filename, &tracking_obj,
-&callback_obj)) {
+if (!PyArg_ParseTuple(args, "sO", &filename, &tracking_obj)) {
return NULL;
}

@@ -1846,8 +1844,46 @@


khmer::HashIntoType * dist;
-dist = counting->abundance_distribution(filename, hashbits,
-_report_fn, callback_obj);

+Py_BEGIN_ALLOW_THREADS
+dist = counting->abundance_distribution(filename, hashbits);
+Py_END_ALLOW_THREADS

+PyObject * x = PyList_New(MAX_BIGCOUNT + 1);
+for (int i = 0; i < MAX_BIGCOUNT + 1; i++) {
+PyList_SET_ITEM(x, i, PyInt_FromLong(dist[i]));
+}

+delete[] dist;

+return x;
+}

+static PyObject * hash_abundance_distribution_with_reads_parser(PyObject * self, PyObject * args)
+{
+khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self;
+khmer::CountingHash * counting = me->counting;

+PyObject * rparser_obj = NULL;
+PyObject * tracking_obj = NULL;
+if (!PyArg_ParseTuple(args, "OO", &rparser_obj, &tracking_obj)) {
+return NULL;
+}

+khmer:: read_parsers:: IParser * rparser =
+_PyObject_to_khmer_ReadParser(rparser_obj);

+assert(is_hashbits_obj(tracking_obj));

+khmer_KHashbitsObject * tracking_o = (khmer_KHashbitsObject *) tracking_obj;
+khmer::Hashbits * hashbits = tracking_o->hashbits;


+khmer::HashIntoType * dist;

+Py_BEGIN_ALLOW_THREADS
+dist = counting->abundance_distribution(rparser, hashbits);
+Py_END_ALLOW_THREADS

PyObject * x = PyList_New(MAX_BIGCOUNT + 1);
for (int i = 0; i < MAX_BIGCOUNT + 1; i++) {
@@ -2037,6 +2073,7 @@ static PyMethodDef khmer_counting_methods[] = {
{ "trim_on_abundance", count_trim_on_abundance, METH_VARARGS, "Trim on >= abundance" },
{ "trim_below_abundance", count_trim_below_abundance, METH_VARARGS, "Trim on >= abundance" },
{ "abundance_distribution", hash_abundance_distribution, METH_VARARGS, "" },
{ "abundance_distribution_with_reads_parser", hash_abundance_distribution_with_reads_parser, METH_VARARGS, "" },
{ "fasta_count_kmers_by_position", hash_fasta_count_kmers_by_position, METH_VARARGS, "" },
{ "fasta_dump_kmers_by_abundance", hash_fasta_dump_kmers_by_abundance, METH_VARARGS, "" },
{ "load", hash_load, METH_VARARGS, "" },
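
From Python, the new entry point would be exercised roughly as follows (a
hedged sketch: the constructor signatures and the ReadParser factory name are
assumptions about the khmer API of this era, not taken from this diff)::

    import khmer

    K = 20
    counting = khmer.new_counting_hash(K, 5e7, 4)  # k, table size, n tables
    tracking = khmer.new_hashbits(K, 5e7, 4)       # tracks distinct k-mers

    counting.consume_fasta('data/100k-filtered.fa')
    rparser = khmer.ReadParser('data/100k-filtered.fa')  # assumed factory
    dist = counting.abundance_distribution_with_reads_parser(rparser, tracking)

Because the C++ call is now wrapped in Py_BEGIN_ALLOW_THREADS /
Py_END_ALLOW_THREADS, other Python threads can run while the distribution is
computed.
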
8 changes: 5 additions & 3 deletions python/khmer/counting_args.py
@@ -6,10 +6,12 @@
DEFAULT_MIN_HASHSIZE = 1e6


-def build_construct_args():
+def build_construct_args(descr=None):

-    parser = argparse.ArgumentParser(description=
-                                     'Build & load a counting Bloom filter.')
+    if descr is None:
+        descr = 'Build & load a counting Bloom filter.'
+
+    parser = argparse.ArgumentParser(description=descr)

env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K)
env_n_hashes = os.environ.get('KHMER_N_HASHES', DEFAULT_N_HT)
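
Downstream scripts can now pass a script-specific description; a hypothetical
caller (assuming, as the existing callers suggest, that the function returns
the parser)::

    from khmer.counting_args import build_construct_args

    parser = build_construct_args('Count k-mer abundance in a single file.')
    parser.add_argument('datafile')
    args = parser.parse_args()
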
3 changes: 1 addition & 2 deletions sandbox/fastq-to-fasta.py
@@ -1,9 +1,8 @@
import sys
-sys.path.insert(0, '/u/t/dev/screed')
import screed
-from screed.fastq import fastq_iter

-for n, record in enumerate(fastq_iter(open(sys.argv[1]))):
+for n, record in enumerate(screed.open(sys.argv[1])):
if n % 10000 == 0:
print>>sys.stderr, '...', n

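
Invocation is unchanged; screed.open now auto-detects the input format. A
sketch (assuming the elided remainder of the script still writes FASTA to
stdout)::

    python sandbox/fastq-to-fasta.py reads.fq > reads.fa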
