Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
.ipynb_checkpoints
.idea
.venv
150 changes: 74 additions & 76 deletions convert_html_jupyter_notebook_tutorial.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,138 +2,136 @@
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## How to Convert HTML to .ipynb\n",
"This is the code example used for the blog post [https://www.marsja.se/converting-html-to-a-jupyter-notebook/](https://www.marsja.se/converting-html-to-a-jupyter-notebook/) in which we learn how to convert code chunks from a webpage to a Jupyter notebook."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"execution_count": 12,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"import json\n",
"import urllib\n",
"from urllib import request\n",
"\n",
"from bs4 import BeautifulSoup\n",
"\n",
"url = 'https://www.marsja.se/python-manova-made-easy-using-statsmodels/'\n",
"\n",
"headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11'\\\n",
" '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',\n",
" 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',\n",
" 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',\n",
" 'Accept-Encoding': 'none',\n",
" 'Accept-Language': 'en-US,en;q=0.8',\n",
" 'Connection': 'keep-alive'}"
"headers = {\n",
" 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11''(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',\n",
" 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',\n",
" 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',\n",
" 'Accept-Encoding': 'none',\n",
" 'Accept-Language': 'en-US,en;q=0.8',\n",
" 'Connection': 'keep-alive'}"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"execution_count": 13,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"req = urllib.request.Request(url, headers=headers)\n",
"page = urllib.request.urlopen(req)\n",
"req = request.Request(url, headers=headers)\n",
"page = request.urlopen(req)\n",
"text = page.read()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"execution_count": 14,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"soup = BeautifulSoup(text, 'lxml')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"execution_count": 15,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"create_nb = {'nbformat': 4, 'nbformat_minor': 2, \n",
" 'cells': [], 'metadata': \n",
" {\"kernelspec\": \n",
" {\"display_name\": \"Python 3\", \n",
" \"language\": \"python\", \"name\": \"python3\"\n",
" }}}\n",
"create_nb = {'nbformat': 4, 'nbformat_minor': 2,\n",
" 'cells': [], 'metadata':\n",
" {\"kernelspec\":\n",
" {\"display_name\": \"Python 3\",\n",
" \"language\": \"python\", \"name\": \"python3\"\n",
" }}}\n",
"\n",
"\n",
"def get_data(bs, content_tag, content_class=None):\n",
" if content_class:\n",
" bs_all = soup.find_all(content_tag, attrs={'class': content_class})\n",
" else:\n",
" bs_all = soup.find_all(content_tag)\n",
"\n",
" for div in bs_all:\n",
"\n",
"def get_data(soup, content_class):\n",
" for div in soup.find_all('div', \n",
" attrs={'class': content_class}):\n",
" \n",
" code_chunks = div.find_all('code')\n",
" \n",
"\n",
" for chunk in code_chunks:\n",
" cell_text = ' '\n",
" cell = {}\n",
" cell['metadata'] = {}\n",
" cell['outputs'] = []\n",
" cell['source'] = [chunk.get_text()]\n",
" cell['execution_count'] = None\n",
" cell['cell_type'] = 'code'\n",
" cell = {'metadata': {},\n",
" 'outputs': [],\n",
" 'source': [chunk.get_text()],\n",
" 'execution_count': None,\n",
" 'cell_type': 'code'\n",
" }\n",
" create_nb['cells'].append(cell)\n",
"\n",
"get_data(soup, 'post-content')\n",
"\n",
"get_data(soup, 'article')\n",
"\n",
"with open('Python_MANOVA.ipynb', 'w') as jynotebook:\n",
" jynotebook.write(json.dumps(create_nb))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"execution_count": 16,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"{'nbformat': 4,\n",
" 'nbformat_minor': 2,\n",
" 'cells': [{'metadata': {},\n",
" 'outputs': [],\n",
" 'source': ['import pandas as pd\\nfrom statsmodels.multivariate.manova import MANOVA'],\n",
" 'execution_count': None,\n",
" 'cell_type': 'code'},\n",
" {'metadata': {},\n",
" 'outputs': [],\n",
" 'source': ['url = \\'https://vincentarelbundock.github.io/Rdatasets/csv/datasets/iris.csv\\'\\ndf = pd.read_csv(url, index_col=0)\\ndf.columns = df.columns.str.replace(\".\", \"_\")\\ndf.head()'],\n",
" 'execution_count': None,\n",
" 'cell_type': 'code'},\n",
" {'metadata': {},\n",
" 'outputs': [],\n",
" 'source': [\"maov = MANOVA.from_formula('Sepal_Length + Sepal_Width + \\\\\\n Petal_Length + Petal_Width ~ Species', data=df)\"],\n",
" 'execution_count': None,\n",
" 'cell_type': 'code'},\n",
" {'metadata': {},\n",
" 'outputs': [],\n",
" 'source': ['print(maov.mv_test())'],\n",
" 'execution_count': None,\n",
" 'cell_type': 'code'}],\n",
" 'metadata': {'kernelspec': {'display_name': 'Python 3',\n",
" 'language': 'python',\n",
" 'name': 'python3'}}}"
]
"text/plain": "{'nbformat': 4,\n 'nbformat_minor': 2,\n 'cells': [{'metadata': {},\n 'outputs': [],\n 'source': ['pip install statsmodels'],\n 'execution_count': None,\n 'cell_type': 'code'},\n {'metadata': {},\n 'outputs': [],\n 'source': ['import pandas as pd\\nfrom statsmodels.multivariate.manova import MANOVA'],\n 'execution_count': None,\n 'cell_type': 'code'},\n {'metadata': {},\n 'outputs': [],\n 'source': ['url = \\'https://vincentarelbundock.github.io/Rdatasets/csv/datasets/iris.csv\\'\\ndf = pd.read_csv(url, index_col=0)\\ndf.columns = df.columns.str.replace(\".\", \"_\")\\ndf.head()'],\n 'execution_count': None,\n 'cell_type': 'code'},\n {'metadata': {},\n 'outputs': [],\n 'source': [\"maov = MANOVA.from_formula('Sepal_Length + Sepal_Width + \\\\\\n Petal_Length + Petal_Width ~ Species', data=df)\"],\n 'execution_count': None,\n 'cell_type': 'code'},\n {'metadata': {},\n 'outputs': [],\n 'source': ['print(maov.mv_test())'],\n 'execution_count': None,\n 'cell_type': 'code'}],\n 'metadata': {'kernelspec': {'display_name': 'Python 3',\n 'language': 'python',\n 'name': 'python3'}}}"
},
"execution_count": 9,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"create_nb"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand All @@ -157,4 +155,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}