Skip to content

Commit

Permalink
Create pip package
Browse files Browse the repository at this point in the history
  • Loading branch information
jameswex committed Jul 22, 2019
1 parent 46fc827 commit b7df077
Show file tree
Hide file tree
Showing 18 changed files with 2,788 additions and 230 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,7 @@
**/__pycache__
**/.ipynb_checkpoints/
**/*.swp
**/dist/
**/build/
**/facets_overview.egg-info/

2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ Key aspects of the visualization are outlier detection and distribution comparis
Interesting values (such as a high proportion of missing data, or very different distributions of a feature across multiple datasets) are highlighted in red.
Features can be sorted by values of interest such as the number of missing values or the skew between the different datasets.

The Python code that generates the statistics for the visualization can be installed with `pip install facets-overview`.

Details about Overview usage can be found in its [README](./facets_overview/README.md).

## Facets Dive
Expand Down
334 changes: 124 additions & 210 deletions colab_facets.ipynb
Original file line number Diff line number Diff line change
@@ -1,218 +1,132 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
"name": "Facets Dive and Overview Colab Example",
"version": "0.3.2",
"provenance": []
},
"colab_type": "code",
"id": "blPpZw5R3Bb4"
},
"outputs": [],
"source": [
"# Load UCI census train and test data into dataframes.\n",
"import pandas as pd\n",
"features = [\"Age\", \"Workclass\", \"fnlwgt\", \"Education\", \"Education-Num\", \"Marital Status\",\n",
" \"Occupation\", \"Relationship\", \"Race\", \"Sex\", \"Capital Gain\", \"Capital Loss\",\n",
" \"Hours per week\", \"Country\", \"Target\"]\n",
"train_data = pd.read_csv(\n",
" \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
" names=features,\n",
" sep=r'\\s*,\\s*',\n",
" engine='python',\n",
" na_values=\"?\")\n",
"test_data = pd.read_csv(\n",
" \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n",
" names=features,\n",
" sep=r'\\s*,\\s*',\n",
" skiprows=[0],\n",
" engine='python',\n",
" na_values=\"?\")"
]
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"base_uri": "https://localhost:8080/",
"height": 617
},
"colab_type": "code",
"executionInfo": {
"elapsed": 4749,
"status": "ok",
"timestamp": 1532523415979,
"user": {
"displayName": "James Wexler",
"photoUrl": "//lh4.googleusercontent.com/-TJBPojJ2kd8/AAAAAAAAAAI/AAAAAAAAABE/YrSFlsiqR80/s50-c-k-no/photo.jpg",
"userId": "104529426628068202733"
},
"user_tz": 240
},
"id": "XtOzRy8Z3M36",
"outputId": "9efa442d-1e11-416e-d57f-e57b6e7e16e4"
},
"outputs": [],
"source": [
"\n",
"# Display the Dive visualization for the training data.\n",
"from IPython.core.display import display, HTML\n",
"\n",
"jsonstr = train_data.to_json(orient='records')\n",
"HTML_TEMPLATE = \"\"\"\n",
" <script src=\"https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js\"></script>\n",
" <link rel=\"import\" href=\"https://raw.githubusercontent.com/PAIR-code/facets/master/facets-dist/facets-jupyter.html\">\n",
" <facets-dive id=\"elem\" height=\"600\"></facets-dive>\n",
" <script>\n",
" var data = {jsonstr};\n",
" document.querySelector(\"#elem\").data = data;\n",
" </script>\"\"\"\n",
"html = HTML_TEMPLATE.format(jsonstr=jsonstr)\n",
"display(HTML(html))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"base_uri": "https://localhost:8080/",
"height": 125
},
"colab_type": "code",
"executionInfo": {
"elapsed": 3967,
"status": "ok",
"timestamp": 1532522957138,
"user": {
"displayName": "James Wexler",
"photoUrl": "//lh4.googleusercontent.com/-TJBPojJ2kd8/AAAAAAAAAAI/AAAAAAAAABE/YrSFlsiqR80/s50-c-k-no/photo.jpg",
"userId": "104529426628068202733"
},
"user_tz": 240
"cells": [
{
"cell_type": "code",
"metadata": {
"id": "M7JcESAhpKG-",
"colab_type": "code",
"colab": {}
},
"source": [
"#@title Install the facets_overview pip package.\n",
"!pip install facets-overview"
],
"execution_count": 0,
"outputs": []
},
"id": "B22HH9kyeyQd",
"outputId": "323a4d74-8d40-480b-ac9f-58ccf7a4f990"
},
"outputs": [],
"source": [
"# Clone the facets github repo to get access to the python feature stats generation code\n",
"!git clone https://github.com/pair-code/facets.git"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "blPpZw5R3Bb4",
"colab": {}
},
"source": [
"# Load UCI census train and test data into dataframes.\n",
"import pandas as pd\n",
"features = [\"Age\", \"Workclass\", \"fnlwgt\", \"Education\", \"Education-Num\", \"Marital Status\",\n",
" \"Occupation\", \"Relationship\", \"Race\", \"Sex\", \"Capital Gain\", \"Capital Loss\",\n",
" \"Hours per week\", \"Country\", \"Target\"]\n",
"train_data = pd.read_csv(\n",
" \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
" names=features,\n",
" sep=r'\\s*,\\s*',\n",
" engine='python',\n",
" na_values=\"?\")\n",
"test_data = pd.read_csv(\n",
" \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n",
" names=features,\n",
" sep=r'\\s*,\\s*',\n",
" skiprows=[0],\n",
" engine='python',\n",
" na_values=\"?\")"
],
"execution_count": 0,
"outputs": []
},
"colab_type": "code",
"id": "mjv5Kr1Mflq7"
},
"outputs": [],
"source": [
"# Add the path to the feature stats generation code.\n",
"import sys\n",
"sys.path.insert(0, '/content/facets/facets_overview/python/')\n",
"\n",
"# Create the feature stats for the datasets and stringify it.\n",
"import base64\n",
"from generic_feature_statistics_generator import GenericFeatureStatisticsGenerator\n",
"\n",
"gfsg = GenericFeatureStatisticsGenerator()\n",
"proto = gfsg.ProtoFromDataFrames([{'name': 'train', 'table': train_data},\n",
" {'name': 'test', 'table': test_data}])\n",
"protostr = base64.b64encode(proto.SerializeToString()).decode(\"utf-8\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"base_uri": "https://localhost:8080/",
"height": 1028
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "XtOzRy8Z3M36",
"colab": {}
},
"source": [
"\n",
"# Display the Dive visualization for the training data.\n",
"from IPython.core.display import display, HTML\n",
"\n",
"jsonstr = train_data.to_json(orient='records')\n",
"HTML_TEMPLATE = \"\"\"\n",
" <script src=\"https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js\"></script>\n",
" <link rel=\"import\" href=\"https://raw.githubusercontent.com/PAIR-code/facets/master/facets-dist/facets-jupyter.html\">\n",
" <facets-dive id=\"elem\" height=\"600\"></facets-dive>\n",
" <script>\n",
" var data = {jsonstr};\n",
" document.querySelector(\"#elem\").data = data;\n",
" </script>\"\"\"\n",
"html = HTML_TEMPLATE.format(jsonstr=jsonstr)\n",
"display(HTML(html))"
],
"execution_count": 0,
"outputs": []
},
"colab_type": "code",
"executionInfo": {
"elapsed": 369,
"status": "ok",
"timestamp": 1532523370507,
"user": {
"displayName": "James Wexler",
"photoUrl": "//lh4.googleusercontent.com/-TJBPojJ2kd8/AAAAAAAAAAI/AAAAAAAAABE/YrSFlsiqR80/s50-c-k-no/photo.jpg",
"userId": "104529426628068202733"
},
"user_tz": 240
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "mjv5Kr1Mflq7",
"colab": {}
},
"source": [
"# Create the feature stats for the datasets and stringify it.\n",
"import base64\n",
"from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator\n",
"\n",
"gfsg = GenericFeatureStatisticsGenerator()\n",
"proto = gfsg.ProtoFromDataFrames([{'name': 'train', 'table': train_data},\n",
" {'name': 'test', 'table': test_data}])\n",
"protostr = base64.b64encode(proto.SerializeToString()).decode(\"utf-8\")"
],
"execution_count": 0,
"outputs": []
},
"id": "b7zs2p2_goJa",
"outputId": "22e211df-972f-49b9-f271-75e0d4ba68ee"
},
"outputs": [],
"source": [
"# Display the facets overview visualization for this data\n",
"from IPython.core.display import display, HTML\n",
"\n",
"HTML_TEMPLATE = \"\"\"\n",
" <script src=\"https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js\"></script>\n",
" <link rel=\"import\" href=\"https://raw.githubusercontent.com/PAIR-code/facets/master/facets-dist/facets-jupyter.html\" >\n",
" <facets-overview id=\"elem\"></facets-overview>\n",
" <script>\n",
" document.querySelector(\"#elem\").protoInput = \"{protostr}\";\n",
" </script>\"\"\"\n",
"html = HTML_TEMPLATE.format(protostr=protostr)\n",
"display(HTML(html))"
]
}
],
"metadata": {
"colab": {
"default_view": {},
"name": "Facets Dive and Overview Colab Example",
"provenance": [],
"version": "0.3.2",
"views": {}
},
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.16"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "b7zs2p2_goJa",
"colab": {}
},
"source": [
"# Display the facets overview visualization for this data\n",
"from IPython.core.display import display, HTML\n",
"\n",
"HTML_TEMPLATE = \"\"\"\n",
" <script src=\"https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js\"></script>\n",
" <link rel=\"import\" href=\"https://raw.githubusercontent.com/PAIR-code/facets/master/facets-dist/facets-jupyter.html\" >\n",
" <facets-overview id=\"elem\"></facets-overview>\n",
" <script>\n",
" document.querySelector(\"#elem\").protoInput = \"{protostr}\";\n",
" </script>\"\"\"\n",
"html = HTML_TEMPLATE.format(protostr=protostr)\n",
"display(HTML(html))"
],
"execution_count": 0,
"outputs": []
}
]
}
18 changes: 5 additions & 13 deletions facets_overview/Overview_demo.ipynb
Original file line number Diff line number Diff line change
@@ -1,16 +1,5 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Add the facets overview python code to the python path\n",
"import sys\n",
"sys.path.append('./python')"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -43,8 +32,11 @@
"metadata": {},
"outputs": [],
"source": [
"# Calculate the feature statistics proto from the datasets and stringify it for use in facets overview\n",
"from generic_feature_statistics_generator import GenericFeatureStatisticsGenerator\n",
"# Calculate the feature statistics proto from the datasets and stringify it for use in facets overview.\n",
"\n",
"# This code assumes that the facets-overview package has been installed through pip,\n",
"# along with a tensorflow (or tensorflow-gpu) package.\n",
"from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator\n",
"import base64\n",
"\n",
"gfsg = GenericFeatureStatisticsGenerator()\n",
Expand Down
Loading

0 comments on commit b7df077

Please sign in to comment.