Skip to content

Commit

Permalink
some more refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
FoxHarley committed Jun 20, 2023
1 parent 105e95d commit 0d74f01
Show file tree
Hide file tree
Showing 5 changed files with 140 additions and 1,653 deletions.
Binary file modified models/baseline/__pycache__/markov_model.cpython-39.pyc
Binary file not shown.
153 changes: 140 additions & 13 deletions results/11mer_full/11mer.ipynb → results/11mer/11mer.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 2,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -236,37 +236,164 @@
"[18134 rows x 1 columns]"
]
},
"execution_count": 4,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# load the full dataframe if it exists\n",
"file_path = 'full_df.pickle'\n",
"# load the train data if it exists\n",
"file_path = 'train_df.pickle'\n",
"if os.path.exists(file_path):\n",
" with open(file_path, 'rb') as f:\n",
" full_df = pickle.load(f)\n",
" train_df = pickle.load(f)\n",
"else:\n",
" # load the fasta file\n",
" # load the fasta file and select the train data\n",
" fasta_file = \"../../Homo_sapiens_3prime_UTR.fa\"\n",
" sequences = []\n",
" for s in SeqIO.parse(fasta_file, \"fasta\"):\n",
" sequences.append(str(s.seq).upper())\n",
" # make a dataframe\n",
" full_df = pd.DataFrame({'3-UTR':sequences})\n",
" # store it \n",
" # get the train fraction\n",
" val_fraction = 0.1\n",
" N_train = int(len(sequences)*(1-val_fraction))\n",
" train_data = sequences[:N_train]\n",
" # store it as a dataframe\n",
" train_df = pd.DataFrame({'3-UTR':train_data})\n",
" with open(file_path, 'wb') as f:\n",
" pickle.dump(full_df, f)\n",
"full_df"
" pickle.dump(train_df, f)\n",
"train_df"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>3-UTR</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ATCTTATATAACTGTGAGATTAATCTCAGATAATGACACAAAATAT...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>GGTTGCCGGGGGTAGGGGTGGGGCCACACAAATCTCCAGGAGCCAC...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>GGCAGCCCATCTGGGGGGCCTGTAGGGGCTGCCGGGCTGGTGGCCA...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>CCCACCTACCACCAGAGGCCTGCAGCCTCCCACATGCCTTAAGGGG...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>TGGCCGCGGTGAGGTGGGTTCTCAGGACCACCCTCGCCAAGCTCCA...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18129</th>\n",
" <td>AGCAAGCATTGAAAATAATAGTTATTGCATACCAATCCTTGTTTGC...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18130</th>\n",
" <td>AGCAAGCATTGAAAATAATAGTTATTGCATACCAATCCTTGTTTGC...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18131</th>\n",
" <td>GCCTACTTCATCTCAGGACCCGCCCAAGAGTGGCCGCGGCTTTGGG...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18132</th>\n",
" <td>TTGTCAGTCTGTCTGCTCAGGACACAAGAACTAAGGGGCAACAAAT...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18133</th>\n",
" <td>CTTTATAGTGGCACAAACGCTTCAGAGACACACAATTATAAGAGAC...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>18134 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" 3-UTR\n",
"0 ATCTTATATAACTGTGAGATTAATCTCAGATAATGACACAAAATAT...\n",
"1 GGTTGCCGGGGGTAGGGGTGGGGCCACACAAATCTCCAGGAGCCAC...\n",
"2 GGCAGCCCATCTGGGGGGCCTGTAGGGGCTGCCGGGCTGGTGGCCA...\n",
"3 CCCACCTACCACCAGAGGCCTGCAGCCTCCCACATGCCTTAAGGGG...\n",
"4 TGGCCGCGGTGAGGTGGGTTCTCAGGACCACCCTCGCCAAGCTCCA...\n",
"... ...\n",
"18129 AGCAAGCATTGAAAATAATAGTTATTGCATACCAATCCTTGTTTGC...\n",
"18130 AGCAAGCATTGAAAATAATAGTTATTGCATACCAATCCTTGTTTGC...\n",
"18131 GCCTACTTCATCTCAGGACCCGCCCAAGAGTGGCCGCGGCTTTGGG...\n",
"18132 TTGTCAGTCTGTCTGCTCAGGACACAAGAACTAAGGGGCAACAAAT...\n",
"18133 CTTTATAGTGGCACAAACGCTTCAGAGACACACAATTATAAGAGAC...\n",
"\n",
"[18134 rows x 1 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# load the test data if it exists\n",
"# NOTE: pickle.load can execute arbitrary code; only load cache files written by this notebook\n",
"file_path = 'test_df.pickle'\n",
"if os.path.exists(file_path):\n",
"    with open(file_path, 'rb') as f:\n",
"        # bind the cached frame to test_df (not train_df) so the training data is not overwritten\n",
"        test_df = pickle.load(f)\n",
"else:\n",
"    # load the fasta file and select the test data\n",
"    fasta_file = \"../../Homo_sapiens_3prime_UTR.fa\"\n",
"    sequences = []\n",
"    for s in SeqIO.parse(fasta_file, \"fasta\"):\n",
"        sequences.append(str(s.seq).upper())\n",
"    # the first N_train sequences are the train split; everything after them is held out as test data\n",
"    val_fraction = 0.1\n",
"    N_train = int(len(sequences)*(1-val_fraction))\n",
"    test_data = sequences[N_train:]\n",
"    # store it as a dataframe\n",
"    test_df = pd.DataFrame({'3-UTR':test_data})\n",
"    with open(file_path, 'wb') as f:\n",
"        pickle.dump(test_df, f)\n",
"test_df"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Training: Full Dataset"
"# Training: Full "
]
},
{
Expand Down Expand Up @@ -358,7 +485,7 @@
}
],
"source": [
"# get the frequency counts of all 11mers\n",
"# get the frequency counts of all motifs up to 11-mers\n",
"kmer_all = KmerCount(11,pseudocount=0.1)\n",
"kmer_all.compute_counts(full_df['3-UTR'])\n",
"kmer_all.kmer_counts_dict"
Expand Down
1 change: 0 additions & 1 deletion results/11mer_full/.gitignore

This file was deleted.

Loading

0 comments on commit 0d74f01

Please sign in to comment.