some more refactoring

Hugenotte585 · Jun 20, 2023 · 0d74f01 · 0d74f01
1 parent 105e95d
commit 0d74f01
Show file tree

Hide file tree

Showing 5 changed files with 140 additions and 1,653 deletions.
diff --git a/models/baseline/__pycache__/markov_model.cpython-39.pyc b/models/baseline/__pycache__/markov_model.cpython-39.pyc
diff --git a/results/11mer_full/11mer.ipynb → results/11mer/11mer.ipynb b/results/11mer_full/11mer.ipynb → results/11mer/11mer.ipynb
@@ -142,7 +142,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -236,37 +236,164 @@
        "[18134 rows x 1 columns]"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "# load the full dataframe if it exists\n",
-    "file_path = 'full_df.pickle'\n",
+    "# load the train data if it exists\n",
+    "file_path = 'train_df.pickle'\n",
     "if os.path.exists(file_path):\n",
     "    with open(file_path, 'rb') as f:\n",
-    "        full_df = pickle.load(f)\n",
+    "        train_df = pickle.load(f)\n",
     "else:\n",
-    "    # load the fasta file\n",
+    "    # load the fasta file and select the train data\n",
     "    fasta_file = \"../../Homo_sapiens_3prime_UTR.fa\"\n",
     "    sequences = []\n",
     "    for s in SeqIO.parse(fasta_file, \"fasta\"):\n",
     "        sequences.append(str(s.seq).upper())\n",
-    "    # make a dataframe\n",
-    "    full_df = pd.DataFrame({'3-UTR':sequences})\n",
-    "    # store it \n",
+    "    # get the train fraction\n",
+    "    val_fraction = 0.1\n",
+    "    N_train = int(len(sequences)*(1-val_fraction))\n",
+    "    train_data = sequences[:N_train]\n",
+    "    # store it as a dataframe\n",
+    "    train_df = pd.DataFrame({'3-UTR':train_data})\n",
     "    with open(file_path, 'wb') as f:\n",
-    "        pickle.dump(full_df, f)\n",
-    "full_df"
+    "        pickle.dump(train_df, f)\n",
+    "train_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>3-UTR</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>ATCTTATATAACTGTGAGATTAATCTCAGATAATGACACAAAATAT...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>GGTTGCCGGGGGTAGGGGTGGGGCCACACAAATCTCCAGGAGCCAC...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>GGCAGCCCATCTGGGGGGCCTGTAGGGGCTGCCGGGCTGGTGGCCA...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>CCCACCTACCACCAGAGGCCTGCAGCCTCCCACATGCCTTAAGGGG...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>TGGCCGCGGTGAGGTGGGTTCTCAGGACCACCCTCGCCAAGCTCCA...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18129</th>\n",
+       "      <td>AGCAAGCATTGAAAATAATAGTTATTGCATACCAATCCTTGTTTGC...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18130</th>\n",
+       "      <td>AGCAAGCATTGAAAATAATAGTTATTGCATACCAATCCTTGTTTGC...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18131</th>\n",
+       "      <td>GCCTACTTCATCTCAGGACCCGCCCAAGAGTGGCCGCGGCTTTGGG...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18132</th>\n",
+       "      <td>TTGTCAGTCTGTCTGCTCAGGACACAAGAACTAAGGGGCAACAAAT...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18133</th>\n",
+       "      <td>CTTTATAGTGGCACAAACGCTTCAGAGACACACAATTATAAGAGAC...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>18134 rows × 1 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                   3-UTR\n",
+       "0      ATCTTATATAACTGTGAGATTAATCTCAGATAATGACACAAAATAT...\n",
+       "1      GGTTGCCGGGGGTAGGGGTGGGGCCACACAAATCTCCAGGAGCCAC...\n",
+       "2      GGCAGCCCATCTGGGGGGCCTGTAGGGGCTGCCGGGCTGGTGGCCA...\n",
+       "3      CCCACCTACCACCAGAGGCCTGCAGCCTCCCACATGCCTTAAGGGG...\n",
+       "4      TGGCCGCGGTGAGGTGGGTTCTCAGGACCACCCTCGCCAAGCTCCA...\n",
+       "...                                                  ...\n",
+       "18129  AGCAAGCATTGAAAATAATAGTTATTGCATACCAATCCTTGTTTGC...\n",
+       "18130  AGCAAGCATTGAAAATAATAGTTATTGCATACCAATCCTTGTTTGC...\n",
+       "18131  GCCTACTTCATCTCAGGACCCGCCCAAGAGTGGCCGCGGCTTTGGG...\n",
+       "18132  TTGTCAGTCTGTCTGCTCAGGACACAAGAACTAAGGGGCAACAAAT...\n",
+       "18133  CTTTATAGTGGCACAAACGCTTCAGAGACACACAATTATAAGAGAC...\n",
+       "\n",
+       "[18134 rows x 1 columns]"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# load the cached test split if it exists, otherwise build it\n",
+    "file_path = 'test_df.pickle'\n",
+    "if os.path.exists(file_path):\n",
+    "    with open(file_path, 'rb') as f:\n",
+    "        test_df = pickle.load(f)\n",
+    "else:\n",
+    "    # load the fasta file and select the test data\n",
+    "    fasta_file = \"../../Homo_sapiens_3prime_UTR.fa\"\n",
+    "    sequences = []\n",
+    "    for s in SeqIO.parse(fasta_file, \"fasta\"):\n",
+    "        sequences.append(str(s.seq).upper())\n",
+    "    # the test split is everything after the first (1 - val_fraction) of the sequences\n",
+    "    val_fraction = 0.1\n",
+    "    N_train = int(len(sequences)*(1-val_fraction))\n",
+    "    test_data = sequences[N_train:]\n",
+    "    # store it as a dataframe\n",
+    "    test_df = pd.DataFrame({'3-UTR':test_data})\n",
+    "    with open(file_path, 'wb') as f:\n",
+    "        pickle.dump(test_df, f)\n",
+    "test_df"
    ]
   },
   {
    "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Training: Full Dataset"
+    "# Training: Full Train Set"
    ]
   },
   {
@@ -358,7 +485,7 @@
     }
    ],
    "source": [
-    "# get the frequency counts of all 11mers\n",
+    "# get the frequency counts of all motifs up to 11mers\n",
     "kmer_all = KmerCount(11,pseudocount=0.1)\n",
     "kmer_all.compute_counts(full_df['3-UTR'])\n",
     "kmer_all.kmer_counts_dict"

diff --git a/results/11mer_full/.gitignore b/results/11mer_full/.gitignore