From 89d01866f2cf24372f759547f4f10cbd3d04d8cc Mon Sep 17 00:00:00 2001 From: Sina Booeshaghi Date: Fri, 15 Dec 2023 15:01:43 -0500 Subject: [PATCH] intermediary save --- examples/parse_genbank.ipynb | 201 ++++++++++++++++++++++++++++++++--- 1 file changed, 188 insertions(+), 13 deletions(-) diff --git a/examples/parse_genbank.ipynb b/examples/parse_genbank.ipynb index 2174acf..58362e1 100644 --- a/examples/parse_genbank.ipynb +++ b/examples/parse_genbank.ipynb @@ -4,7 +4,7 @@ "metadata": { "colab": { "provenance": [], - "authorship_tag": "ABX9TyMhCTZ7NuSG/ASANgDzabR8", + "authorship_tag": "ABX9TyNMWNFnKajuCncoJoFxgHui", "include_colab_link": true }, "kernelspec": { @@ -33,10 +33,10 @@ ], "metadata": { "id": "RhUJkFixEzIe", - "outputId": "8b19d4f4-31b0-494c-d3b7-699afb77dd56", "colab": { "base_uri": "https://localhost:8080/" - } + }, + "outputId": "8b19d4f4-31b0-494c-d3b7-699afb77dd56" }, "execution_count": 1, "outputs": [ @@ -840,11 +840,11 @@ "gaps" ], "metadata": { - "id": "h9a02vSj86GS", - "outputId": "c676449d-566f-44db-be70-b18c2ce9cf30", "colab": { "base_uri": "https://localhost:8080/" - } + }, + "id": "h9a02vSj86GS", + "outputId": "c676449d-566f-44db-be70-b18c2ce9cf30" }, "execution_count": 33, "outputs": [ @@ -872,11 +872,12 @@ { "cell_type": "code", "source": [ - "def fill_gaps(regions, parent_start, parent_stop):\n", + "def fill_gaps(regions, parent_start = 0, parent_stop = 0):\n", " if len(regions) == 0:\n", - " return []\n", + " return regions\n", "\n", " prev = regions[0]\n", + " e = prev\n", " if prev[\"start\"] > parent_start:\n", " # add filler to start of regions\n", " regions = [{\n", @@ -888,28 +889,202 @@ " for idx, e in enumerate(regions[1:]):\n", " if prev[\"stop\"] < e[\"start\"]:\n", " # insert filler between\n", - " pass\n", + " regions[:regions.index(prev)] + {\n", + " \"id\": \"filler\",\n", + " \"start\": prev[\"stop\"],\n", + " \"stop\": e[\"start\"]\n", + " } + regions[:regions.index(e)]\n", + "\n", " prev = e\n", "\n", " if len(e[\"regions\"]) > 0:\n", - " fill_gaps(e[\"regions\"], e[\"start\"], e[\"stop\"])\n", + " e[\"regions\"] = fill_gaps(e[\"regions\"], e[\"start\"], e[\"stop\"])\n", "\n", " if e[\"stop\"] < parent_stop:\n", " # add to end of regions\n", - " pass\n", + " regions.append({\n", + " \"id\": \"filler\",\n", + " \"start\": e[\"stop\"],\n", + " \"stop\": parent_stop\n", + " })\n", " return regions" ], "metadata": { "id": "yA8e9-7n7dqe" }, - "execution_count": 35, + "execution_count": 43, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "fill_gaps(gaps)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yJJicbaX7wyf", + "outputId": "78cae183-cb4c-4cd3-938e-ac9bed776b73" + }, + "execution_count": 44, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[{'id': 'A',\n", + " 'start': 0,\n", + " 'stop': 100,\n", + " 'regions': [{'id': 'B',\n", + " 'start': 0,\n", + " 'stop': 25,\n", + " 'regions': [{'id': 'C', 'start': 10, 'stop': 15}]},\n", + " {'id': 'D', 'start': 50, 'stop': 75},\n", + " {'id': 'E', 'start': 75, 'stop': 85},\n", + " {'id': 'F', 'start': 90, 'stop': 100}]}]" + ] + }, + "metadata": {}, + "execution_count": 44 + } + ] + }, + { + "cell_type": "code", + "source": [ + "nogaps" + ], + "metadata": { + "id": "Lti5l9QHAY2A", + "outputId": "37401c8b-bba9-4df4-f087-f53a8fc47b8a", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 45, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[{'id': 'A',\n", + " 'start': 0,\n", + " 'stop': 100,\n", + " 'regions': [{'id': 'B',\n", + " 'start': 0,\n", + " 'stop': 25,\n", + " 'regions': [{'id': 'filler_BC', 'start': 0, 'stop': 10},\n", + " {'id': 'C', 'start': 10, 'stop': 15},\n", + " {'id': 'filler_CB', 'start': 15, 'stop': 25}]},\n", + " {'id': 'filler_BD', 'start': 25, 'stop': 50},\n", + " {'id': 'D', 'start': 50, 'stop': 75},\n", + " {'id': 'E', 'start': 75, 'stop': 85},\n", + " {'id': 'filler_EF', 'start': 85, 'stop': 90},\n", + " {'id': 'F', 'start': 90, 'stop': 100}]}]" + ] + }, + "metadata": {}, + "execution_count": 45 + } + ] + }, + { + "cell_type": "code", + "source": [ + "def fill_gaps(regions, parent_start=0, parent_stop=0):\n", + " if len(regions) == 0:\n", + " return []\n", + "\n", + " # Insert a filler at the start if necessary\n", + " if regions[0]['start'] > parent_start:\n", + " regions.insert(0, {\n", + " 'id': 'filler_start',\n", + " 'start': parent_start,\n", + " 'stop': regions[0]['start']\n", + " })\n", + "\n", + " new_regions = []\n", + " for i, region in enumerate(regions):\n", + " # Append the current region\n", + " new_regions.append(region)\n", + "\n", + " # Recursive call for nested regions\n", + " if 'regions' in region:\n", + " region['regions'] = fill_gaps(region['regions'], region['start'], region['stop'])\n", + "\n", + " # Check for gap and insert a filler\n", + " if i < len(regions) - 1 and region['stop'] < regions[i+1]['start']:\n", + " filler_id = f'filler_{region[\"id\"]}_{regions[i+1][\"id\"]}'\n", + " new_regions.append({\n", + " 'id': filler_id,\n", + " 'start': region['stop'],\n", + " 'stop': regions[i+1]['start']\n", + " })\n", + "\n", + " # Insert a filler at the end if necessary\n", + " if new_regions[-1]['stop'] < parent_stop:\n", + " new_regions.append({\n", + " 'id': 'filler_end',\n", + " 'start': new_regions[-1]['stop'],\n", + " 'stop': parent_stop\n", + " })\n", + "\n", + " return new_regions\n", + "\n", + "# Example usage\n", + "filled_regions = fill_gaps(gaps)\n" + ], + "metadata": { + "id": "7MzP1hzYDiUj" + }, + "execution_count": 47, "outputs": [] }, + { + "cell_type": "code", + "source": [ + "filled_regions" + ], + "metadata": { + "id": "94mXCoOGELC6", + "outputId": "afc3de76-ab33-4f03-d074-7a8b3b59b03d", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 48, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[{'id': 'A',\n", + " 'start': 0,\n", + " 'stop': 100,\n", + " 'regions': [{'id': 'B',\n", + " 'start': 0,\n", + " 'stop': 25,\n", + " 'regions': [{'id': 'filler_start', 'start': 0, 'stop': 10},\n", + " {'id': 'C', 'start': 10, 'stop': 15},\n", + " {'id': 'filler_end', 'start': 15, 'stop': 25}]},\n", + " {'id': 'filler_B_D', 'start': 25, 'stop': 50},\n", + " {'id': 'D', 'start': 50, 'stop': 75},\n", + " {'id': 'E', 'start': 75, 'stop': 85},\n", + " {'id': 'filler_E_F', 'start': 85, 'stop': 90},\n", + " {'id': 'F', 'start': 90, 'stop': 100}]}]" + ] + }, + "metadata": {}, + "execution_count": 48 + } + ] + }, { "cell_type": "code", "source": [], "metadata": { - "id": "yJJicbaX7wyf" + "id": "FG_eq8KPEOI8" }, "execution_count": null, "outputs": []