-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
282 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,282 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Blast and Alternatives\n", | ||
"Finding similar sequences based on sequence alignment algorithms\n", | ||
"\n", | ||
"K-mer indexing scheme (strategy)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Sequence similarity\n", | ||
"Useful as a metric between sequences of strings" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 41, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import operator\n", | ||
"import pandas as pd" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 42, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def read_submat(filename=\"blosum62.mat\"):\n", | ||
" df = pd.read_csv(filename, header=0, delimiter=\"\\t\")\n", | ||
" alphabet = list(df.columns)\n", | ||
" sm = {x+y:df.loc[i][j] for i,x in enumerate(alphabet) for j,y in enumerate(alphabet)}\n", | ||
" return df, alphabet, sm" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 43, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def max_mat(mat):\n", | ||
" \"\"\"finds the max cell in the matrix\"\"\"\n", | ||
" maxval = mat[0][0]\n", | ||
" maxrow = 0\n", | ||
" maxcol = 0\n", | ||
" for i in range(0,len(mat)):\n", | ||
" for j in range(0, len(mat[i])):\n", | ||
" if mat[i][j] > maxval:\n", | ||
" maxval = mat[i][j]\n", | ||
" maxrow = i\n", | ||
" maxcol = j\n", | ||
" return (maxrow, maxcol)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 44, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def recover_align_local (S, T, seq1, seq2):\n", | ||
" \"\"\"recover one of the optimal alignments\"\"\"\n", | ||
" res = [\"\", \"\"]\n", | ||
" \"\"\"determine the cell with max score\"\"\"\n", | ||
" i, j = max_mat(S)\n", | ||
" \"\"\"terminates when finds a cell with zero\"\"\"\n", | ||
" while T[i][j]>0:\n", | ||
" if T[i][j]==1:\n", | ||
" res[0] = seq1[i-1] + res[0]\n", | ||
" res[1] = seq2[j-1] + res[1]\n", | ||
" i -= 1\n", | ||
" j -= 1\n", | ||
" elif T[i][j] == 3:\n", | ||
" res[0] = \"-\" + res[0];\n", | ||
" res[1] = seq2[j-1] + res[1]\n", | ||
" j -= 1\n", | ||
" elif T[i][j] == 2:\n", | ||
" res[0] = seq1[i-1] + res[0]\n", | ||
" res[1] = \"-\" + res[1]\n", | ||
" i -= 1\n", | ||
" return res" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 45, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def score_col_alignment(s1, s2, sm, g):\n", | ||
" return g if s1==\"_\" or s2==\"_\" else sm[s1+s2]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 51, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def max3t(v1, v2, v3):\n", | ||
" \"\"\"Indicates which of the given integers is bigger: 1 2 or 3\"\"\"\n", | ||
" if v1 > v2:\n", | ||
" return 1 if v1 > v3 else 3\n", | ||
" else:\n", | ||
" return 2 if v2 > v3 else 3" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 52, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def smith_waterman(seq1, seq2, sm, g):\n", | ||
" \"\"\"Local alignment\"\"\"\n", | ||
" S = [[0]]\n", | ||
" T = [[0]]\n", | ||
" maxscore = 0\n", | ||
" # first row filled with zero\n", | ||
" for j in range(1, len(seq2)+1):\n", | ||
" S[0].append(0)\n", | ||
" T[0].append(0)\n", | ||
" # first column filled with zero\n", | ||
" for i in range(1, len(seq1)+1):\n", | ||
" S.append([0])\n", | ||
" T.append([0])\n", | ||
" for i in range(0, len(seq1)):\n", | ||
" for j in range(len(seq2)):\n", | ||
" s1 = S[i][j] + score_col_alignment(seq1[i], seq2[j], sm, g);\n", | ||
" s2 = S[i][j+1] + g\n", | ||
" s3 = S[i+1][j] + g\n", | ||
" b = max(s1, s2, s3)\n", | ||
" if b <= 0:\n", | ||
" S[i+1].append(0)\n", | ||
" T[i+1].append(0)\n", | ||
" else:\n", | ||
" S[i+1].append(b)\n", | ||
" T[i+1].append(max3t(s1, s2, s3))\n", | ||
" if b > maxscore:\n", | ||
" maxscore = b\n", | ||
" return (S, T, maxscore)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 71, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def test_local_alig():\n", | ||
" sm = read_submat_file('blosum62.mat')\n", | ||
" seq1 = \"PHSWG\"\n", | ||
" seq2 = \"HGWAG\"\n", | ||
" res = smith_waterman(seq1, seq2, sm, -8)\n", | ||
" S = res[0]\n", | ||
" T = res[1]\n", | ||
" print(\"Score of optimal alignment:\", res[2])\n", | ||
" #print_mat(S)\n", | ||
" #print_mat(T)\n", | ||
" print(S)\n", | ||
" print(T)\n", | ||
" alinL = recover_align_local(S, T, seq1, seq2)\n", | ||
" print(alinL[0])\n", | ||
" print(alinL[1]) " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 82, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Find the most similar sequence to a query sequence in a \"database\"\n", | ||
"# using local alignment\n", | ||
"# query sequence, database of sequences, substitution matrix, gap penalty\n", | ||
"def find_similar(query, list_of_seqs, sm, g):\n", | ||
" # smith_waterman returns (S, T, score)\n", | ||
" (S, T, score), seq = max([(smith_waterman(query, s, sm, g), s) for s in list_of_seqs], key=lambda x: x[0][2])\n", | ||
" print(S, T, score, seq)\n", | ||
"# return recover_align_needleman_wunsch(query, seq, s, t), score" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 83, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"[[0, 0, 0, 0, 0, 0, 0], [0, 4, 5, 6, 7, 8, 9], [0, 5, 13, 14, 15, 16, 17], [0, 6, 14, 18, 19, 20, 21], [0, 7, 15, 19, 20, 25, 26]] [[0, 0, 0, 0, 0, 0, 0], [0, 1, 3, 3, 3, 3, 3], [0, 2, 1, 3, 3, 3, 3], [0, 2, 2, 1, 3, 3, 3], [0, 2, 2, 2, 3, 1, 3]] 26 ACTTGC\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"df, alf, sm = read_submat()\n", | ||
"find_similar(\"ACTG\", [\"ACTTGC\", \"ATATAT\", \"CATAGA\"], sm, g=1)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Para o exame\n", | ||
"Pode ser pedido um `align_query(query, ls, ms, g)`. O exame é escrito, mas terá sempre componente prática. Preencher matriz de alinhamento e escrever e a outra metade escrever código. É possível consultar material das aulas em papel. " | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# BLAST\n", | ||
"Basic Local Alignment Search Tool - a lot of programs\n", | ||
"\n", | ||
"Find regions of sequences that are locally similar" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Hashmap with partial match, assumes that keys can translate amount of similarity, could be interesting to improve search times" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.2" | ||
}, | ||
"latex_envs": { | ||
"LaTeX_envs_menu_present": true, | ||
"autoclose": false, | ||
"autocomplete": true, | ||
"bibliofile": "biblio.bib", | ||
"cite_by": "apalike", | ||
"current_citInitial": 1, | ||
"eqLabelWithNumbers": true, | ||
"eqNumInitial": 1, | ||
"hotkeys": { | ||
"equation": "Ctrl-E", | ||
"itemize": "Ctrl-I" | ||
}, | ||
"labels_anchors": false, | ||
"latex_user_defs": false, | ||
"report_style_numbering": false, | ||
"user_envs_cfg": false | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |