updating the docs, adding test files

CNuge · Mar 10, 2020 · 980f934 · 980f934
1 parent 31fad59
commit 980f934
Show file tree

Hide file tree

Showing 9 changed files with 92 additions and 44 deletions.
diff --git a/README.md b/README.md
@@ -12,7 +12,7 @@ For increased control, alfie can also be deployed as a module from within Python
 
 ## Installation
 
-Alfie is a python3 program that depends on the python packages: `numpy` (version >= 1.18.1), `tensorflow` (version>=2.0.0), and `scikit-learn` (version>=0.21.3). If you do not have these installed, it is recommended that you install python and the required packages via [anaconda](https://www.anaconda.com/distribution/).
+Alfie is a python3 program that depends on the python packages: `numpy`, `tensorflow`, `scikit-learn`, and `pandas`. If you do not have these installed, it is recommended that you install python and the required packages via [anaconda](https://www.anaconda.com/distribution/).
 
 To install alfie, download and unzip this repository. From the terminal, enter the downloaded repository and then run the following command:
 ```

diff --git a/alfie/__pycache__/kmerseq.cpython-37.pyc b/alfie/__pycache__/kmerseq.cpython-37.pyc
diff --git a/alfie/classify.py b/alfie/classify.py
@@ -92,7 +92,6 @@ def decode_predictions(predictions,
 	
 	Arguments
 	---------
-
 	predictions : list like object, a list of numeric encoded predictions.
 
 	tax_list : list, a list of strings indicating what the numeric predictions should
@@ -104,7 +103,6 @@ def decode_predictions(predictions,
 	
 	Examples
 	---------
-	
 	#load example data
 	>>> from alfie import example_fasta
 	#generate predictions for examples, using default kingdom model

diff --git a/alfie/kmerseq.py b/alfie/kmerseq.py
@@ -79,12 +79,24 @@ def __init__(self, name, sequence, k = 4):
 
 		self.name = name
 		self.k = k
-		self.seq = sequence.upper()
+
+		up_seq = sequence.upper()
+		if self.__check_seq(up_seq) == True:
+			self.seq = up_seq
 
 		self.k_dict = self.__kmer_dict(k = self.k)
 		self.__count_kmers()
 
-	def __kmer_build(self, k = 4, dna_list = ['A', 'C', 'G', 'T']):
+	def __check_seq(self, seq):
+		"""Check the input sequence for invalid characters."""
+		allowed = {"A", "C", "G", "T", "N", "-"}
+		in_set = set(seq)
+
+		if in_set.issubset(allowed) == False:
+			raise ValueError("Unallowed characters in input sequence")
+		return True		
+
+	def __kmer_build(self, k = 4, dna_list = ["A", "C", "G", "T"]):
 		"""Recursive construction of all nucleotide kmer combinations."""
 
 		# all the nucleotides to be appended to new kmers

diff --git a/alfie/test_kmerseq.py b/alfie/test_kmerseq.py
@@ -4,21 +4,15 @@
 from kmerseq import KmerFeatures
 
 class KmerTests(unittest.TestCase):
-	"""
-	unit tests for the io functions associated with the main alfie executable
-	"""
+	"""Unit tests for the KmerFeatures class."""
 	@classmethod
 	def setUpClass(self):
-		"""
-		initiate the test class instance with the 
-		"""
+		"""Initiate the test class instance."""
 		self.test_kmers = KmerFeatures("test1", 
 							"aaaaaattttttatatatgcgcgccccccgccgcgccgggc")
 
-	def test_file_type(self):
-		"""
-		test that the file type is properly identified
-		"""
+	def test_KmerFeatures(self):
+
 		self.assertEqual(self.test_kmers.name, 
 						"test1")
 
@@ -34,6 +28,9 @@ def test_file_type(self):
 		self.assertEqual(self.test_kmers.kmer_freqs.shape,
 						(256,))
 
+		with self.assertRaises(ValueError):
+			self.assertEqual(KmerFeatures("test1", "NOTDNA"))
+
 
 if __name__ == '__main__':
 	unittest.main()

diff --git a/alfie/test_seqio.py b/alfie/test_seqio.py
@@ -6,14 +6,10 @@
 from alfie import ex_fasta_file, ex_fastq_file
 
 class SeqioTests(unittest.TestCase):
-	"""
-	unit tests for the io functions associated with the main alfie executable
-	"""
+	"""Unit tests for the seqio functions"""
 	@classmethod
 	def setUpClass(self):
-		"""
-		initiate the test class instance with the 
-		"""
+		"""Initiate the test class instance."""
 		self._expected_kingdom_dict = {0: 'alfie_out/animalia_test.fasta',
 										 1: 'alfie_out/bacteria_test.fasta',
 										 2: 'alfie_out/fungi_test.fasta',
@@ -25,18 +21,14 @@ def setUpClass(self):
 
 	@classmethod
 	def tearDown(self):
-		"""
-		after unit tests, remove the temporary outputs
-		"""
+		"""After unit tests, remove the temporary outputs."""
 		try:
 			os.rmdir("alfie_out")
 		except OSError:
 			pass
 
 	def test_file_type(self):
-		"""
-		test that the file type is properly identified
-		"""
+		"""Test that the file type is properly identified."""
 		self.assertEqual(file_type("file_1.fa"), 
 						"fasta")
 		self.assertEqual(file_type("file_1.fasta"),
@@ -56,19 +48,16 @@ def test_file_type(self):
 		with self.assertRaises(ValueError):
 			self.assertEqual(file_type("in.file_2.csv"))
 
-
 	def test_outfile_builder(self):
-		"""
-		test that the output file set is generated properly
-		"""
+		"""Test that the output file set is generated properly."""
 		self.assertEqual(outfile_dict("test.fasta"), 
 						self._expected_kingdom_dict)
 
 		self.assertEqual(outfile_dict("in_data/test.fasta"), 
 				self._expected_kingdom_dict)
 
-
 	def test_fasta_reader(self):
+		""" Test the fasta reader functions."""
 		self._fasta_read = read_fasta(self._fasta_infile)
 
 		self.assertEqual(len(self._fasta_read), 100)
@@ -88,6 +77,7 @@ def test_fasta_reader(self):
 						"AGTATTAATTCGTATGGAATTAGCA")
 
 	def test_fastq_reader(self):
+		""" Test the fastq reader functions."""
 		self._fastq_read = read_fastq(self._fastq_infile)
 
 		self.assertEqual(len(self._fastq_read), 100)

diff --git a/alfie/test_training.py b/alfie/test_training.py
@@ -0,0 +1,18 @@
+import unittest
+
+import training
+
+class TrainingTests(unittest.TestCase):
+
+	@classmethod
+	def setUpClass(self):
+		self.dnn_1mer = training.alfie_dnn_default()
+
+	def test_split(self):
+
+	def test_process_sequences(self):
+
+	def test_shuffle_unison(self):
+
+	def test_nn_constriction(self):
+
diff --git a/alfie/training.py b/alfie/training.py
@@ -15,31 +15,50 @@
 
 stratified_taxon_split : Conduct a stratified train/test split based on a user defined categorical column.
 
-
 """
-import tensorflow as tf
 
 import numpy as np
+import pandas as pd
+import tensorflow as tf
+from sklearn.model_selection import StratifiedShuffleSplit
 
 from alfie.kmerseq import KmerFeatures
 
-from sklearn.model_selection import StratifiedShuffleSplit
-
 
 def stratified_taxon_split(input_data, class_col, test_size = 0.3, silent = False):
 	"""
 	Conduct a stratified train/test split based on a user defined categorical column.
 	
 	Arguments
 	---------
+	
+	input_data : pandas.DataFrame, a dataframe to be split into a train and test set.
+
+	class_col : string, the column of the input data with the categories to stratify 
+		between the train and test set.
+
+	test_size : double, the proportion of the input data to be included in the test split. 
+
+	silent : bool, if True, suppress the echo of the split criteria. Default is False.
 
 	Returns
 	---------
+	out1, out2 : pandas.DataFrame, out1 is the training data frame, out2 is the test data frame.
 
 	Examples
 	---------
-
-
+	#initiate a simulated dataframe
+	>>> data = pd.DataFrame({"phylum" : ["Mollusca"]*10 + ["Arthropoda"] * 15,
+	>>>						"data_col" : [np.random.randint(100) for x in range(25)]})
+	#split on the column phylum, which contains the classifications
+	>>> train, test = stratified_taxon_split(data, class_col = "phylum", 
+	>>>			test_size = .2, silent = True)
+	# 80% of data in train
+	>>> train.shape
+	# index order is randomized
+	>>> train.index
+	# 20% of data in test
+	>>> test.shape
 	"""
 	if silent == False:
 		print(f'Conducting train/test split, split evenly by: {class_col}')
@@ -48,20 +67,31 @@ def stratified_taxon_split(input_data, class_col, test_size = 0.3, silent = Fals
 	strat_index = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=1738)
 
 	for train_index, test_valid_index in strat_index.split(input_data, input_data[class_col]):
-		X_train, X_test = input_data.loc[train_index], input_data.loc[test_valid_index] 
+		train, test = input_data.loc[train_index], input_data.loc[test_valid_index] 
 
 
-	return X_train, X_test
+	return train, test
 
 
 def sample_seq(seq, min_size = 200, max_size = 600, n = 1, seed = None):
 	"""
-	Take a full sequence and return a list of random subsamples.
+	Take a full sequence or list of sequences and return a list of random subsamples.
+
+	Samples will be of a random length subset of the input seq. The min and max size of
+	the random subset are defined by the min_size and max_size parameters. 
 
-	Samples will be of a random length form within the defined sizes of the 
-	
 	Arguments
 	---------
+	seq : string or list, the sequence, or list of sequences, to randomly subsample.
+
+	min_size : int, the minimum size of the random subsample. Default is 200.
+
+	max_size : int, the maximum size of the random subsample. Default is 600.
+
+	n : int, the number of random samples to generate from each input sequence.
+		Default is 1 (no upsampling).
+
+	seed : int, a random seed for repeatable random sampling.
 
 	Returns
 	---------
@@ -79,6 +109,8 @@ def sample_seq(seq, min_size = 200, max_size = 600, n = 1, seed = None):
 	#set the max to seq length if its shorter
 	if max_size > len(seq):
 		max_size = len(seq)
+	if min_size > len(seq):
+		raise ValueError("Minimum sample size exceeds sequence length")
 	#get the set of random window sizes
 	win_sizes = [np.random.randint(min_size, max_size) for x in range(n)]
 	#for each window size, randomly subset the sequence by choosing a start point

diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,4 @@
 numpy>=1.18.1
 tensorflow>=2.0.0
-scikit-learn>=0.21.3
+scikit-learn>=0.21.3
+pandas>=0.25.1