Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 39 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# sequence files
*.gb
*.seq
*.gz

# autoenv
.env

# virtualenv
.venv

Expand All @@ -7,7 +15,6 @@ __pycache__/

# C extensions
*.so
*.c

# Distribution / packaging
.Python
Expand Down Expand Up @@ -56,3 +63,34 @@ coverage.xml
# Sphinx documentation
docs/_build/

### Intellij ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm

## Directory-based project format
.idea/
# if you remove the above rule, at least ignore user-specific stuff:
# .idea/workspace.xml
# .idea/tasks.xml
# and these sensitive or high-churn files:
# .idea/dataSources.ids
# .idea/dataSources.xml
# .idea/sqlDataSources.xml
# .idea/dynamic.xml

## File-based project format
*.ipr
*.iws
*.iml

## Additional for IntelliJ
out/

# generated by mpeltonen/sbt-idea plugin
.idea_modules/

# generated by JIRA plugin
atlassian-ide-plugin.xml

# generated by Crashlytics plugin (for Android Studio and Intellij)
com_crashlytics_export_strings.xml

12 changes: 10 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
language: python
python:
- "2.7"
- "pypy"
# command to install dependencies
install:
- "pip install -r requirements.txt"
- "python setup.py install"
- "wget https://raw.githubusercontent.com/biopython/biopython/master/Tests/GenBank/NC_005816.gb"
- "wget https://www.dropbox.com/s/n67qz6k262ghxuz/cow.seq"
- "make"
# command to run tests
script: "pep8 *.py*"
script:
- "python --version"
- "pep8 *.py*"
- "cat NC_005816.gb | python main.py -f genbank"
- "cat cow.seq | python bench.py"

70 changes: 70 additions & 0 deletions CodonCountStruct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from ctypes import Structure, c_longlong


class CodonCount(Structure):
    """ctypes structure holding one ``c_longlong`` counter per DNA codon.

    There are 64 fields, one for every three-letter combination of the
    bases T, C, A and G.  Field order matters to ctypes because it fixes
    the memory layout; presumably it has to mirror the C-side
    ``CodonCount`` struct (declared outside this file -- confirm there
    before reordering).
    """

    # Enumerate codons with the first base varying slowest and the third
    # base fastest, each position cycling through T, C, A, G:
    # TTT, TTC, TTA, TTG, TCT, ..., GGA, GGG.
    # A generator expression (rather than a list comprehension) keeps the
    # loop variables out of the class namespace on Python 2.
    _fields_ = list(
        (first + second + third, c_longlong)
        for first in 'TCAG'
        for second in 'TCAG'
        for third in 'TCAG'
    )
26 changes: 26 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Host OS name; used below to pick linker flags.
UNAME := $(shell uname)

prefix=${HOME}
exec_prefix=${prefix}
libdir=${exec_prefix}/lib
includedir=${prefix}/include

# Base name of the shared library to build (produces $(TARGET).so) and of
# the second translation unit that is compiled into it.
TARGET = counterc
TRIE_CODON = trie_codon

CC = gcc
CFLAGS = -shared -Wl,-soname,$(TARGET).so -Wl,--no-undefined -std=c99 -O3 -funroll-loops

# Darwin builds name the library with -install_name instead of -soname.
ifeq ($(UNAME), Darwin)
CFLAGS = -shared -Wl,-install_name,$(TARGET).so -O3 -funroll-loops
endif

# None of these names is a file that gets created, so mark them phony.
# `make counterc` keeps working as an alias for building the library.
.PHONY: all clean $(TARGET)

all: $(TARGET)

$(TARGET): $(TARGET).so

# The rule is keyed on the file it actually produces and lists both
# sources, so make rebuilds only when one of them changed.
$(TARGET).so: $(TARGET).c $(TRIE_CODON).c
	$(CC) $(CFLAGS) -o $@ -fPIC $(TARGET).c $(TRIE_CODON).c

clean:
	$(RM) $(TARGET).so
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,14 @@ Create a virtualenv and then run the following commands.

```bash
pip install -r requirements.txt
python setup.py install
make
```

C dependencies
-----

1. [C-Algorithms 1.2.0](http://c-algorithms.sourceforge.net)

Usage
-------

Expand Down
35 changes: 35 additions & 0 deletions bench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# !/usr/bin/python

import sys
import time

from ctypes import *
from CodonCountStruct import CodonCount

""" Utility """
def CURRENT_MILLI_TIME():
    """Return the current clock reading rounded to whole milliseconds.

    Only differences between two readings are meaningful.  ``time.clock``
    is used whenever the running interpreter still provides it, so existing
    behaviour is unchanged there; it was removed in Python 3.8, where
    ``time.perf_counter`` is used instead.

    Returns:
        int: the clock value in milliseconds.
    """
    clock = getattr(time, 'clock', None) or time.perf_counter
    return int(round(clock() * 1000))


def getdict(struct):
    """Return a plain dict mapping each declared field name to its value.

    ``struct`` must expose ``_fields_`` as an iterable of
    ``(name, type)`` pairs (as ctypes structures do); only the names are
    used, and each value is read with ``getattr``.
    """
    return {name: getattr(struct, name) for name, _ in struct._fields_}


def main(argv):
    """Count codons for every line on stdin and report the elapsed time.

    Loads ``./counterc.so``, declares ``countcodons`` as taking a C string
    and returning a ``CodonCount`` structure, then prints the counts of
    each input line as a dict followed by the total time in milliseconds.

    ``argv`` is accepted for the command-line entry point but is not used.
    """
    library = CDLL('./counterc.so')
    count_codons = library.countcodons
    count_codons.argtypes = (c_char_p,)
    count_codons.restype = CodonCount

    started_at = CURRENT_MILLI_TIME()

    for sequence in sys.stdin:
        # A single parenthesised argument prints the same on Python 2.
        print(getdict(count_codons(sequence)))

    elapsed = CURRENT_MILLI_TIME() - started_at
    print("Tokenize and Count in " + str(elapsed) + " ms")


# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    cli_args = sys.argv[1:]  # everything after the program name
    main(cli_args)
31 changes: 0 additions & 31 deletions count.pyx

This file was deleted.

104 changes: 104 additions & 0 deletions counterc.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#include "counterc.h"

CodonCount countcodons(char * sequence)
{
char codon[4];
long seq_length = strlen(sequence) - 2;
const char * valid_characters = "ACTG";
char * c = codon;

Trie * codoncount = trie_new(); // You create it, you destroy it

for (int i = 0; i < seq_length; i+=3)
{
int invalid = 0;

strncpy(codon, sequence+i, 3);
codon[3] = '\0'; // Ensure codon ends at 3 characters

// Break out if invalid character detected
while (*c) {
if (!strchr(valid_characters, *c)) {
invalid = 1;
break;
}
c++;
}

if (invalid == 1) { continue; }

trie_increment(codoncount, codon);
}

// Prepare struct to return
// Trie is not an iterable structure, thus the length
struct CodonCount counts;
counts.TTT = (long long) trie_lookup(codoncount, "TTT");
counts.TTC = (long long) trie_lookup(codoncount, "TTC");
counts.TTA = (long long) trie_lookup(codoncount, "TTA");
counts.TTG = (long long) trie_lookup(codoncount, "TTG");
counts.TCT = (long long) trie_lookup(codoncount, "TCT");
counts.TCC = (long long) trie_lookup(codoncount, "TCC");
counts.TCA = (long long) trie_lookup(codoncount, "TCA");
counts.TCG = (long long) trie_lookup(codoncount, "TCG");
counts.TAT = (long long) trie_lookup(codoncount, "TAT");
counts.TAC = (long long) trie_lookup(codoncount, "TAC");
counts.TAA = (long long) trie_lookup(codoncount, "TAA");
counts.TAG = (long long) trie_lookup(codoncount, "TAG");
counts.TGT = (long long) trie_lookup(codoncount, "TGT");
counts.TGC = (long long) trie_lookup(codoncount, "TGC");
counts.TGA = (long long) trie_lookup(codoncount, "TGA");
counts.TGG = (long long) trie_lookup(codoncount, "TGG");
counts.CTT = (long long) trie_lookup(codoncount, "CTT");
counts.CTC = (long long) trie_lookup(codoncount, "CTC");
counts.CTA = (long long) trie_lookup(codoncount, "CTA");
counts.CTG = (long long) trie_lookup(codoncount, "CTG");
counts.CCT = (long long) trie_lookup(codoncount, "CCT");
counts.CCC = (long long) trie_lookup(codoncount, "CCC");
counts.CCA = (long long) trie_lookup(codoncount, "CCA");
counts.CCG = (long long) trie_lookup(codoncount, "CCG");
counts.CAT = (long long) trie_lookup(codoncount, "CAT");
counts.CAC = (long long) trie_lookup(codoncount, "CAC");
counts.CAA = (long long) trie_lookup(codoncount, "CAA");
counts.CAG = (long long) trie_lookup(codoncount, "CAG");
counts.CGT = (long long) trie_lookup(codoncount, "CGT");
counts.CGC = (long long) trie_lookup(codoncount, "CGC");
counts.CGA = (long long) trie_lookup(codoncount, "CGA");
counts.CGG = (long long) trie_lookup(codoncount, "CGG");
counts.ATT = (long long) trie_lookup(codoncount, "ATT");
counts.ATC = (long long) trie_lookup(codoncount, "ATC");
counts.ATA = (long long) trie_lookup(codoncount, "ATA");
counts.ATG = (long long) trie_lookup(codoncount, "ATG");
counts.ACT = (long long) trie_lookup(codoncount, "ACT");
counts.ACC = (long long) trie_lookup(codoncount, "ACC");
counts.ACA = (long long) trie_lookup(codoncount, "ACA");
counts.ACG = (long long) trie_lookup(codoncount, "ACG");
counts.AAT = (long long) trie_lookup(codoncount, "AAT");
counts.AAC = (long long) trie_lookup(codoncount, "AAC");
counts.AAA = (long long) trie_lookup(codoncount, "AAA");
counts.AAG = (long long) trie_lookup(codoncount, "AAG");
counts.AGT = (long long) trie_lookup(codoncount, "AGT");
counts.AGC = (long long) trie_lookup(codoncount, "AGC");
counts.AGA = (long long) trie_lookup(codoncount, "AGA");
counts.AGG = (long long) trie_lookup(codoncount, "AGG");
counts.GTT = (long long) trie_lookup(codoncount, "GTT");
counts.GTC = (long long) trie_lookup(codoncount, "GTC");
counts.GTA = (long long) trie_lookup(codoncount, "GTA");
counts.GTG = (long long) trie_lookup(codoncount, "GTG");
counts.GCT = (long long) trie_lookup(codoncount, "GCT");
counts.GCC = (long long) trie_lookup(codoncount, "GCC");
counts.GCA = (long long) trie_lookup(codoncount, "GCA");
counts.GCG = (long long) trie_lookup(codoncount, "GCG");
counts.GAT = (long long) trie_lookup(codoncount, "GAT");
counts.GAC = (long long) trie_lookup(codoncount, "GAC");
counts.GAA = (long long) trie_lookup(codoncount, "GAA");
counts.GAG = (long long) trie_lookup(codoncount, "GAG");
counts.GGT = (long long) trie_lookup(codoncount, "GGT");
counts.GGC = (long long) trie_lookup(codoncount, "GGC");
counts.GGA = (long long) trie_lookup(codoncount, "GGA");
counts.GGG = (long long) trie_lookup(codoncount, "GGG");

trie_free(codoncount);

return counts;
}
Loading