added M/Z bias sims

apoorvalal · apoorvalal · commit b1b38d47e813 · 2023-10-24T15:26:30.000-07:00
diff --git a/Chapter16.ipynb b/Chapter16.ipynb
@@ -0,0 +1,319 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Chapter 16: Difficulties of Unconfoundedness in Observational Studies for Causal Effects"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import scipy as sp\n",
+    "import statsmodels.api as sm\n",
+    "import statsmodels.formula.api as smf\n",
+    "# viz\n",
+    "import matplotlib\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "font = {'family' : 'IBM Plex Sans Condensed',\n",
+    "               'weight' : 'normal',\n",
+    "               'size'   : 10}\n",
+    "plt.rc('font', **font)\n",
+    "plt.rcParams['figure.figsize'] = (10, 10)\n",
+    "%matplotlib inline\n",
+    "\n",
+    "from utils import *\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>U1</th>\n",
+       "      <th>U2</th>\n",
+       "      <th>X</th>\n",
+       "      <th>Y</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0.868786</td>\n",
+       "      <td>3.271211</td>\n",
+       "      <td>2.870715</td>\n",
+       "      <td>2.776858</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0.102776</td>\n",
+       "      <td>-1.424613</td>\n",
+       "      <td>-0.343647</td>\n",
+       "      <td>-2.323463</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>-0.473300</td>\n",
+       "      <td>-0.808196</td>\n",
+       "      <td>-2.437951</td>\n",
+       "      <td>-0.117681</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>-0.524105</td>\n",
+       "      <td>-0.641949</td>\n",
+       "      <td>-0.149231</td>\n",
+       "      <td>-0.537228</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>-0.183823</td>\n",
+       "      <td>0.540470</td>\n",
+       "      <td>-0.029903</td>\n",
+       "      <td>1.374849</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         U1        U2         X         Y\n",
+       "0  0.868786  3.271211  2.870715  2.776858\n",
+       "1  0.102776 -1.424613 -0.343647 -2.323463\n",
+       "2 -0.473300 -0.808196 -2.437951 -0.117681\n",
+       "3 -0.524105 -0.641949 -0.149231 -0.537228\n",
+       "4 -0.183823  0.540470 -0.029903  1.374849"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "n = int(1e6)\n",
+    "df = simulate(\n",
+    "        U1 = lambda: np.random.normal(size = n),\n",
+    "        U2 = lambda: np.random.normal(size = n),\n",
+    "        X = lambda U1, U2: U1 + U2 + np.random.normal(size=n),\n",
+    "        Y = lambda U2: U2 + np.random.normal(size=n),\n",
+    "    )\n",
+    "\n",
+    "df.head()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## M-bias"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### continuous treatment\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(-0.0005873841517624718, -0.19992316827000112)"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\n",
+    "df['Z'] = df.U1 + np.random.normal(size=n)\n",
+    "\n",
+    "smf.ols(\"Y ~ Z\", df).fit().params[1], smf.ols(\"Y ~ Z + X\", df).fit().params[1]\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### binary treatment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(0.0010055221102859783, -0.4154833890606614)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['Z'] = df.Z >= 0\n",
+    "\n",
+    "smf.ols(\"Y ~ Z\", df).fit().params[1], smf.ols(\"Y ~ Z + X\", df).fit().params[1]\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Z-bias"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n = int(1e6)\n",
+    "df = simulate(\n",
+    "        U = lambda: np.random.normal(size = n),\n",
+    "        X = lambda: np.random.normal(size = n),\n",
+    "        Z = lambda X, U: X + U + np.random.normal(size=n),\n",
+    "        Y = lambda U: U + np.random.normal(size=n),\n",
+    "    )\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(0.33315108130802534, 0.4997989461297992)"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "smf.ols(\"Y ~ Z\", df).fit().params[1], smf.ols(\"Y ~ Z + X\", df).fit().params[1]\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
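+    "Adjusting for X makes the comparison more biased: Z has no causal effect on Y in this simulation (the true coefficient is 0), yet the coefficient on Z moves from about 0.33 without X to about 0.50 once X is included."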
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Stronger association between X and Z"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(0.16699612964603475, 0.4998217107196198)"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['Z'] = 2 * df.X + df.U + np.random.normal(size=n)\n",
+    "smf.ols(\"Y ~ Z\", df).fit().params[1], smf.ols(\"Y ~ Z + X\", df).fit().params[1]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(0.00990024072283937, 0.500804991941852)"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['Z'] = 10 * df.X + df.U + np.random.normal(size=n)\n",
+    "smf.ols(\"Y ~ Z\", df).fit().params[1], smf.ols(\"Y ~ Z + X\", df).fit().params[1]\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "metrics",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/utils.py b/utils.py
@@ -0,0 +1,9 @@
+import numpy as np
+import pandas as pd
+
+def simulate(**kwargs):
+  """Generate a simulated dataset one column at a time.
+
+  Each keyword names a column and maps to a callable that produces it. The
+  callable's parameter names select the previously generated columns it
+  receives as keyword arguments, so a column must be listed after every
+  column it depends on (keyword order is evaluation order).
+
+  Args:
+    **kwargs: column name -> callable returning that column's values.
+
+  Returns:
+    pandas.DataFrame built from the generated columns.
+
+  Raises:
+    KeyError: if a callable asks for a column that has not been generated
+      yet and the corresponding parameter has no default value.
+  """
+  import inspect  # function-scope import keeps the module's import block untouched
+
+  values = {}
+  for k, v in kwargs.items():
+    inputs = {}
+    # Read the declared signature instead of __code__.co_varnames: co_varnames
+    # also lists local variables (including inlined comprehension variables on
+    # Python 3.12+), which were then looked up as columns and raised KeyError.
+    for p in inspect.signature(v).parameters.values():
+      if p.kind not in (p.POSITIONAL_OR_KEYWORD, p.KEYWORD_ONLY):
+        continue  # *args / **kwargs / positional-only cannot be bound by column name
+      if p.name in values:
+        inputs[p.name] = values[p.name]
+      elif p.default is p.empty:
+        raise KeyError(f"column {k!r} needs {p.name!r}, which has not been generated yet")
+      # otherwise: no such column, so the parameter keeps its own default
+    values[k] = v(**inputs)
+  return pd.DataFrame(values)