Skip to content

Commit

Permalink
updating the docs, adding test files
Browse files Browse the repository at this point in the history
  • Loading branch information
CNuge committed Mar 10, 2020
1 parent 31fad59 commit 980f934
Show file tree
Hide file tree
Showing 9 changed files with 92 additions and 44 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ For increased control, alfie can also be deployed as a module from within Python

## Installation

Alfie is a python3 program that depends on the python packages: `numpy` (version >= 1.18.1), `tensorflow` (version>=2.0.0), and `scikit-learn` (version>=0.21.3). If you do not have these installed, it is recommended that you install python and the required packages via [anaconda](https://www.anaconda.com/distribution/).
Alfie is a python3 program that depends on the python packages: `numpy`, `tensorflow`, `scikit-learn`, and `pandas`. If you do not have these installed, it is recommended that you install python and the required packages via [anaconda](https://www.anaconda.com/distribution/).

To install alfie, download and unzip this repository. From the terminal, enter the downloaded repository and then run the following command:
```
Expand Down
Binary file modified alfie/__pycache__/kmerseq.cpython-37.pyc
Binary file not shown.
2 changes: 0 additions & 2 deletions alfie/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@ def decode_predictions(predictions,
Arguments
---------
predictions : list like object, a list of numeric encoded predictions.
tax_list : list, a list of strings indicating what the numeric predictions should
Expand All @@ -104,7 +103,6 @@ def decode_predictions(predictions,
Examples
---------
#load example data
>>> from alfie import example_fasta
#generate predictions for examples, using default kingdom model
Expand Down
16 changes: 14 additions & 2 deletions alfie/kmerseq.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,24 @@ def __init__(self, name, sequence, k = 4):

self.name = name
self.k = k
self.seq = sequence.upper()

up_seq = sequence.upper()
if self.__check_seq(up_seq) == True:
self.seq = up_seq

self.k_dict = self.__kmer_dict(k = self.k)
self.__count_kmers()

def __kmer_build(self, k = 4, dna_list = ['A', 'C', 'G', 'T']):
def __check_seq(self, seq):
    """Confirm that a sequence is made up only of permitted symbols.

    Arguments
    ---------
    seq : string, the sequence to validate. The comparison is case
        sensitive, so only upper case symbols are accepted.

    Returns
    ---------
    out : bool, True when every character of seq is one of
        A, C, G, T, N or the gap symbol "-".

    Raises
    ---------
    ValueError : if seq contains any character outside the permitted set.
    """
    valid_symbols = {"A", "C", "G", "T", "N", "-"}

    # compare the unique characters of the input against the whitelist
    if not set(seq) <= valid_symbols:
        raise ValueError("Unallowed characters in input sequence")

    return True

def __kmer_build(self, k = 4, dna_list = ["A", "C", "G", "T"]):
"""Recursive construction of all nucleotide kmer combinations."""

# all the nucleotides to be appended to new kmers
Expand Down
17 changes: 7 additions & 10 deletions alfie/test_kmerseq.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,15 @@
from kmerseq import KmerFeatures

class KmerTests(unittest.TestCase):
"""
unit tests for the io functions associated with the main alfie executable
"""
"""Unit tests for the KmerFeatures class."""
@classmethod
def setUpClass(self):
"""
initiate the test class instance with the
"""
"""Initiate the test class instance."""
self.test_kmers = KmerFeatures("test1",
"aaaaaattttttatatatgcgcgccccccgccgcgccgggc")

def test_file_type(self):
"""
test that the file type is properly identified
"""
def test_KmerFeatures(self):

self.assertEqual(self.test_kmers.name,
"test1")

Expand All @@ -34,6 +28,9 @@ def test_file_type(self):
self.assertEqual(self.test_kmers.kmer_freqs.shape,
(256,))

with self.assertRaises(ValueError):
self.assertEqual(KmerFeatures("test1", "NOTDNA"))


if __name__ == '__main__':
unittest.main()
Expand Down
24 changes: 7 additions & 17 deletions alfie/test_seqio.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,10 @@
from alfie import ex_fasta_file, ex_fastq_file

class SeqioTests(unittest.TestCase):
"""
unit tests for the io functions associated with the main alfie executable
"""
"""Unit tests for the seqio functions"""
@classmethod
def setUpClass(self):
"""
initiate the test class instance with the
"""
"""Initiate the test class instance."""
self._expected_kingdom_dict = {0: 'alfie_out/animalia_test.fasta',
1: 'alfie_out/bacteria_test.fasta',
2: 'alfie_out/fungi_test.fasta',
Expand All @@ -25,18 +21,14 @@ def setUpClass(self):

@classmethod
def tearDown(self):
"""
after unit tests, remove the temporary outputs
"""
"""After unit tests, remove the temporary outputs."""
try:
os.rmdir("alfie_out")
except OSError:
pass

def test_file_type(self):
"""
test that the file type is properly identified
"""
"""Test that the file type is properly identified."""
self.assertEqual(file_type("file_1.fa"),
"fasta")
self.assertEqual(file_type("file_1.fasta"),
Expand All @@ -56,19 +48,16 @@ def test_file_type(self):
with self.assertRaises(ValueError):
self.assertEqual(file_type("in.file_2.csv"))


def test_outfile_builder(self):
"""
test that the output file set is generated properly
"""
"""Test that the output file set is generated properly."""
self.assertEqual(outfile_dict("test.fasta"),
self._expected_kingdom_dict)

self.assertEqual(outfile_dict("in_data/test.fasta"),
self._expected_kingdom_dict)


def test_fasta_reader(self):
""" Test the fasta reader functions."""
self._fasta_read = read_fasta(self._fasta_infile)

self.assertEqual(len(self._fasta_read), 100)
Expand All @@ -88,6 +77,7 @@ def test_fasta_reader(self):
"AGTATTAATTCGTATGGAATTAGCA")

def test_fastq_reader(self):
""" Test the fastq reader functions."""
self._fastq_read = read_fastq(self._fastq_infile)

self.assertEqual(len(self._fastq_read), 100)
Expand Down
18 changes: 18 additions & 0 deletions alfie/test_training.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import unittest

import training

class TrainingTests(unittest.TestCase):
    """Unit tests for the training module.

    Only the shared fixture is implemented so far. Each test method is an
    explicit placeholder that reports itself as skipped, so the module can be
    imported and collected, and the missing coverage stays visible in the
    test report instead of silently passing.
    """

    @classmethod
    def setUpClass(cls):
        """Store the result of training.alfie_dnn_default() on the class."""
        # a classmethod receives the class itself, so name the argument cls
        cls.dnn_1mer = training.alfie_dnn_default()

    def test_split(self):
        """Placeholder, presumably for the train/test split (TODO confirm)."""
        self.skipTest("test_split is not implemented yet")

    def test_process_sequences(self):
        """Placeholder, presumably for sequence processing (TODO confirm)."""
        self.skipTest("test_process_sequences is not implemented yet")

    def test_shuffle_unison(self):
        """Placeholder, presumably for unison shuffling (TODO confirm)."""
        self.skipTest("test_shuffle_unison is not implemented yet")

    def test_nn_constriction(self):
        """Placeholder, presumably for network construction (TODO confirm).

        NOTE(review): the method name looks like a typo of "construction";
        it is kept unchanged here so existing test selectors still match.
        """
        self.skipTest("test_nn_constriction is not implemented yet")

54 changes: 43 additions & 11 deletions alfie/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,31 +15,50 @@
stratified_taxon_split : Conduct a stratified train/test split based on a user defined categorical column.
"""
import tensorflow as tf

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import StratifiedShuffleSplit

from alfie.kmerseq import KmerFeatures

from sklearn.model_selection import StratifiedShuffleSplit


def stratified_taxon_split(input_data, class_col, test_size = 0.3, silent = False):
"""
Conduct a stratified train/test split based on a user defined categorical column.
Arguments
---------
input_data : pandas.DataFrame, a dataframe to be split into a train and test set.
class_col : string, the column of the input data with the categories to stratify
between the train and test set.
test_size : double, the proportion of the input data to be included in the test split.
silent : bool, if True the split criteria are not echoed, default is False.
Returns
---------
out1, out2 : pandas.DataFrame, out1 is the training data frame, out2 is the test data frame.
Examples
---------
#initiate a simulated dataframe
>>> data = pd.DataFrame({"phylum" : ["Mollusca"]*10 + ["Arthropoda"] * 15,
>>> "data_col" : [np.random.randint(100) for x in range(25)]})
#split on the column phylum, which contains the classifications
>>> train, test = stratified_taxon_split(data, class_col = "phylum",
>>> test_size = .2, silent = True)
# 80% of data in train
>>> train.shape
# index order is randomized
>>> train.index
# 20% of data in test
>>> test.shape
"""
if silent == False:
print(f'Conducting train/test split, split evenly by: {class_col}')
Expand All @@ -48,20 +67,31 @@ def stratified_taxon_split(input_data, class_col, test_size = 0.3, silent = Fals
strat_index = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=1738)

for train_index, test_valid_index in strat_index.split(input_data, input_data[class_col]):
X_train, X_test = input_data.loc[train_index], input_data.loc[test_valid_index]
train, test = input_data.loc[train_index], input_data.loc[test_valid_index]


return X_train, X_test
return train, test


def sample_seq(seq, min_size = 200, max_size = 600, n = 1, seed = None):
"""
Take a full sequence and return a list of random subsamples.
Take a full sequence or list of sequences and return a list of random subsamples.
Samples will be of a random length subset of the input seq. The min and max size of
the random subset are defined by the min_size and max_size parameters.
Samples will be of a random length form within the defined sizes of the
Arguments
---------
seq : string or list, the sequence, or list of sequences, to randomly subsample.
min_size : int, the minimum size of the random subsample. Default is 200.
max_size : int, the maximum size of the random subsample. Default is 600.
n : int, the number of random samples to generate from each input sequence.
Default is 1 (no upsampling).
seed : int, a random seed for repeatable random sampling.
Returns
---------
Expand All @@ -79,6 +109,8 @@ def sample_seq(seq, min_size = 200, max_size = 600, n = 1, seed = None):
#set the max to seq length if its shorter
if max_size > len(seq):
max_size = len(seq)
if min_size > len(seq):
raise ValueError("Minimum sample size exceeds sequence length")
#get the set of random window sizes
win_sizes = [np.random.randint(min_size, max_size) for x in range(n)]
#for each window size, randomly subset the sequence by choosing a start point
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
numpy>=1.18.1
tensorflow>=2.0.0
scikit-learn>=0.21.3
scikit-learn>=0.21.3
pandas>=0.25.1

0 comments on commit 980f934

Please sign in to comment.