Skip to content

Commit

Permalink
Added dinucl dist computation after meeting
Browse files Browse the repository at this point in the history
  • Loading branch information
annachernysheva179 committed Jun 11, 2023
1 parent 63dc48b commit 708ffe9
Show file tree
Hide file tree
Showing 3 changed files with 4,998 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"csv_file = \"..\\\\..\\\\scer_half_life.csv\"\n",
"df = pd.read_csv(csv_file)\n",
"#print(list(df.columns))\n",
"df = df[df[\"UTR3_seq\"].notnull()].reset_index(drop=True)\n",
"\n",
"data = list(df[\"UTR3_seq\"])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['AA', 'AC', 'AG', 'AT', 'AN', 'CA', 'CC', 'CG', 'CT', 'CN', 'GA', 'GC', 'GG', 'GT', 'GN', 'TA', 'TC', 'TG', 'TT', 'TN', 'NA', 'NC', 'NG', 'NT', 'NN']\n"
]
}
],
"source": [
"d = {}\n",
"nucleotides = \"ACGTN\"\n",
"pairs = []\n",
"for n1 in nucleotides:\n",
" for n2 in nucleotides:\n",
" pairs.append(n1+n2)\n",
"\n",
"print(pairs)\n",
"for seq in data:\n",
" for n in pairs:\n",
" tmp = d.get(n, 0)\n",
" count = 0\n",
" for i in range(len(seq)-1):\n",
" pair=seq[i:i+2]\n",
" if pair == n:\n",
" count += 1\n",
" d[n] = tmp + count"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'AA': 73552,\n",
" 'AC': 32728,\n",
" 'AG': 29334,\n",
" 'AT': 76731,\n",
" 'AN': 0,\n",
" 'CA': 34675,\n",
" 'CC': 16836,\n",
" 'CG': 14032,\n",
" 'CT': 33597,\n",
" 'CN': 0,\n",
" 'GA': 27438,\n",
" 'GC': 17562,\n",
" 'GG': 14076,\n",
" 'GT': 31294,\n",
" 'GN': 0,\n",
" 'TA': 75017,\n",
" 'TC': 34094,\n",
" 'TG': 33276,\n",
" 'TT': 87074,\n",
" 'TN': 0,\n",
" 'NA': 0,\n",
" 'NC': 0,\n",
" 'NG': 0,\n",
" 'NT': 0,\n",
" 'NN': 0}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'A': 212345, 'C': 99140, 'G': 90370, 'T': 229461, 'N': 0}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"count = {}\n",
"\n",
"for k in d.keys():\n",
" tmp = count.get(k[0],0)\n",
" count[k[0]] = tmp + d[k]\n",
"\n",
"count"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'AA': 0.34638071226070777,\n",
" 'AC': 0.15412753935812004,\n",
" 'AG': 0.1381441161553133,\n",
" 'AT': 0.3613516322258588,\n",
" 'AN': 1e-06,\n",
" 'CA': 0.3497589180956223,\n",
" 'CC': 0.1698214559209199,\n",
" 'CG': 0.14153822009279807,\n",
" 'CT': 0.33888540589065963,\n",
" 'CN': 1e-06,\n",
" 'GA': 0.3036194574526945,\n",
" 'GC': 0.1943354030098484,\n",
" 'GG': 0.15576065475268341,\n",
" 'GT': 0.3462884847847737,\n",
" 'GN': 1e-06,\n",
" 'TA': 0.32692801591991666,\n",
" 'TC': 0.14858398360069902,\n",
" 'TG': 0.1450191076522808,\n",
" 'TT': 0.3794728928271035,\n",
" 'TN': 1e-06,\n",
" 'NA': 1e-06,\n",
" 'NC': 1e-06,\n",
" 'NG': 1e-06,\n",
" 'NT': 1e-06,\n",
" 'NN': 1e-06}"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"{k: 0.000001 if count[k[0]] == 0 else (v / count[k[0]]) + 0.000001 for k, v in d.items()}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dist = torch.log(torch.tensor([[0.34638071226070777, 0.15412753935812004, 0.1381441161553133, 0.3613516322258588, 1e-06],\n",
" [0.3497589180956223, 0.1698214559209199, 0.14153822009279807, 0.33888540589065963, 1e-06],\n",
" [0.3036194574526945, 0.1943354030098484, 0.15576065475268341, 0.3462884847847737, 1e-06],\n",
" [0.32692801591991666, 0.14858398360069902, 0.1450191076522808, 0.3794728928271035, 1e-06]\n",
" [1e-06, 1e-06, 1e-06, 1e-06, 1e-06]]))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.11.3 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "767d51c1340bd893661ea55ea3124f6de3c7a262a8b4abca0554b478b1e2ff90"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 708ffe9

Please sign in to comment.