Skip to content

Commit

Permalink
Update notes
Browse files Browse the repository at this point in the history
  • Loading branch information
frankbuckley committed Apr 16, 2024
1 parent 7bb7b98 commit 1c52057
Show file tree
Hide file tree
Showing 2 changed files with 145 additions and 198 deletions.
339 changes: 143 additions & 196 deletions projects/us_birth_certificates/notes.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -11,100 +11,102 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 18524899 entries, 0 to 18524898\n",
"Data columns (total 80 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 DOB_YY int32 \n",
" 1 DOB_MM category\n",
" 2 BFACIL category\n",
" 3 F_BFACIL category\n",
" 4 MAGE_IMPFLG category\n",
" 5 MAGE_REPFLG category\n",
" 6 MAGER category\n",
" 7 MAGER14 category\n",
" 8 MAGER9 category\n",
" 9 MBSTATE_REC category\n",
" 10 RESTATUS category\n",
" 11 MRACE31 category\n",
" 12 MRACE6 category\n",
" 13 MRACE15 category\n",
" 14 MRACEIMP category\n",
" 15 MHISPX category\n",
" 16 MHISP_R category\n",
" 17 F_MHISP category\n",
" 18 MRACEHISP category\n",
" 19 MAR_P category\n",
" 20 DMAR category\n",
" 21 MAR_IMP category\n",
" 22 F_MAR_P category\n",
" 23 MEDUC category\n",
" 24 F_MEDUC category\n",
" 25 FAGERPT_FLG category\n",
" 26 FAGECOMB category\n",
" 27 FAGEREC11 category\n",
" 28 FRACE31 category\n",
" 29 FRACE6 category\n",
" 30 FRACE15 category\n",
" 31 FHISPX category\n",
" 32 FHISP_R category\n",
" 33 F_FHISP category\n",
" 34 FRACEHISP category\n",
" 35 FEDUC category\n",
" 36 PRIORLIVE category\n",
" 37 PRIORDEAD category\n",
" 38 PRIORTERM category\n",
" 39 LBO_REC category\n",
" 40 TBO_REC category\n",
" 41 PRECARE category\n",
" 42 PAY category\n",
" 43 PAY_REC category\n",
" 44 F_PAY category\n",
" 45 F_PAY_REC category\n",
" 46 SEX category\n",
" 47 IMP_SEX category\n",
" 48 CA_ANEN category\n",
" 49 CA_MNSB category\n",
" 50 CA_CCHD category\n",
" 51 CA_CDH category\n",
" 52 OMPH category\n",
" 53 CA_GAST category\n",
" 54 F_CA_ANEN category\n",
" 55 F_CA_MENIN category\n",
" 56 F_CA_HEART category\n",
" 57 F_CA_HERNIA category\n",
" 58 F_CA_OMPHA category\n",
" 59 F_CA_GASTRO category\n",
" 60 CA_LIMB category\n",
" 61 CA_CLEFT category\n",
" 62 CA_CLPAL category\n",
" 63 CA_DOWN category\n",
" 64 CA_DISOR category\n",
" 65 CA_HYPO category\n",
" 66 F_CA_LIMB category\n",
" 67 F_CA_CLEFT category\n",
" 68 F_CA_CLPAL category\n",
" 69 F_CA_DOWN category\n",
" 70 F_CA_DISOR category\n",
" 71 F_CA_HYPO category\n",
" 72 NO_CONGEN category\n",
" 73 F_MPCB category\n",
" 74 PRECARE5 category\n",
" 75 PREVIS category\n",
" 76 PREVIS_REC category\n",
" 77 F_TPCV category\n",
" 78 WIC category\n",
" 79 F_WIC category\n",
"dtypes: category(79), int32(1)\n",
"memory usage: 1.4 GB\n"
"RangeIndex: 22389653 entries, 0 to 22389652\n",
"Data columns (total 82 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 DOB_YY int32 \n",
" 1 DOB_MM category\n",
" 2 BFACIL category\n",
" 3 F_BFACIL category\n",
" 4 MAGE_IMPFLG category\n",
" 5 MAGE_REPFLG category\n",
" 6 MAGER category\n",
" 7 MAGER14 category\n",
" 8 MAGER9 category\n",
" 9 MBSTATE_REC category\n",
" 10 RESTATUS category\n",
" 11 MRACE31 category\n",
" 12 MRACE6 category\n",
" 13 MRACE15 category\n",
" 14 MRACEIMP category\n",
" 15 MHISPX category\n",
" 16 MHISP_R category\n",
" 17 F_MHISP category\n",
" 18 MRACEHISP category\n",
" 19 MAR_P category\n",
" 20 DMAR category\n",
" 21 MAR_IMP category\n",
" 22 F_MAR_P category\n",
" 23 MEDUC category\n",
" 24 F_MEDUC category\n",
" 25 FAGERPT_FLG category\n",
" 26 FAGECOMB category\n",
" 27 FAGEREC11 category\n",
" 28 FRACE31 category\n",
" 29 FRACE6 category\n",
" 30 FRACE15 category\n",
" 31 FHISPX category\n",
" 32 FHISP_R category\n",
" 33 F_FHISP category\n",
" 34 FRACEHISP category\n",
" 35 FEDUC category\n",
" 36 PRIORLIVE category\n",
" 37 PRIORDEAD category\n",
" 38 PRIORTERM category\n",
" 39 LBO_REC category\n",
" 40 TBO_REC category\n",
" 41 PRECARE category\n",
" 42 PAY category\n",
" 43 PAY_REC category\n",
" 44 F_PAY category\n",
" 45 F_PAY_REC category\n",
" 46 SEX category\n",
" 47 IMP_SEX category\n",
" 48 CA_ANEN category\n",
" 49 CA_MNSB category\n",
" 50 CA_CCHD category\n",
" 51 CA_CDH category\n",
" 52 OMPH category\n",
" 53 CA_GAST category\n",
" 54 F_CA_ANEN category\n",
" 55 F_CA_MENIN category\n",
" 56 F_CA_HEART category\n",
" 57 F_CA_HERNIA category\n",
" 58 F_CA_OMPHA category\n",
" 59 F_CA_GASTRO category\n",
" 60 CA_LIMB category\n",
" 61 CA_CLEFT category\n",
" 62 CA_CLPAL category\n",
" 63 CA_DOWN category\n",
" 64 CA_DISOR category\n",
" 65 CA_HYPO category\n",
" 66 F_CA_LIMB category\n",
" 67 F_CA_CLEFT category\n",
" 68 F_CA_CLPAL category\n",
" 69 F_CA_DOWN category\n",
" 70 F_CA_DISOR category\n",
" 71 F_CA_HYPO category\n",
" 72 NO_CONGEN category\n",
" 73 F_MPCB category\n",
" 74 PRECARE5 category\n",
" 75 PREVIS category\n",
" 76 PREVIS_REC category\n",
" 77 F_TPCV category\n",
" 78 WIC category\n",
" 79 F_WIC category\n",
" 80 DS category\n",
" 81 DS_LB_CHANCE float64 \n",
"dtypes: category(80), float64(1), int32(1)\n",
"memory usage: 1.9 GB\n"
]
}
],
Expand All @@ -125,132 +127,77 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "\"['DS'] not in index\"",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[4], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m grouped \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mDS\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mWIC\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mgroupby(\n\u001b[1;32m 2\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWIC\u001b[39m\u001b[38;5;124m\"\u001b[39m, observed\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\u001b[38;5;241m.\u001b[39mvalue_counts()\n\u001b[1;32m 3\u001b[0m grouped\n\u001b[1;32m 4\u001b[0m grouped\u001b[38;5;241m.\u001b[39mto_clipboard()\n",
"File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/frame.py:4108\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4106\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[1;32m 4107\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[0;32m-> 4108\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_indexer_strict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcolumns\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 4110\u001b[0m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[1;32m 4111\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mbool\u001b[39m:\n",
"File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/indexes/base.py:6200\u001b[0m, in \u001b[0;36mIndex._get_indexer_strict\u001b[0;34m(self, key, axis_name)\u001b[0m\n\u001b[1;32m 6197\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 6198\u001b[0m keyarr, indexer, new_indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reindex_non_unique(keyarr)\n\u001b[0;32m-> 6200\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_raise_if_missing\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 6202\u001b[0m keyarr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtake(indexer)\n\u001b[1;32m 6203\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, Index):\n\u001b[1;32m 6204\u001b[0m \u001b[38;5;66;03m# GH 42790 - Preserve name from an Index\u001b[39;00m\n",
"File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/indexes/base.py:6252\u001b[0m, in \u001b[0;36mIndex._raise_if_missing\u001b[0;34m(self, key, indexer, axis_name)\u001b[0m\n\u001b[1;32m 6249\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNone of [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m] are in the [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00maxis_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 6251\u001b[0m not_found \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(ensure_index(key)[missing_mask\u001b[38;5;241m.\u001b[39mnonzero()[\u001b[38;5;241m0\u001b[39m]]\u001b[38;5;241m.\u001b[39munique())\n\u001b[0;32m-> 6252\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not in index\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"\u001b[0;31mKeyError\u001b[0m: \"['DS'] not in index\""
]
"data": {
"text/plain": [
"WIC DS\n",
"N N 14723144\n",
" U 20108\n",
" Y 7908\n",
"U N 247471\n",
" U 5365\n",
" Y 183\n",
"Y N 7370412\n",
" U 11253\n",
" Y 3809\n",
"Name: count, dtype: int64"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"grouped = df[[\"DS\", \"WIC\"]].groupby(\n",
"ds_wic = df[[\"DS\", \"WIC\"]].groupby(\n",
" \"WIC\", observed=True).value_counts()\n",
"grouped\n",
"grouped.to_clipboard()\n"
"ds_wic.to_clipboard()\n",
"ds_wic"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 18524899 entries, 0 to 18524898\n",
"Data columns (total 80 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 DOB_YY int32 \n",
" 1 DOB_MM int64 \n",
" 2 BFACIL int64 \n",
" 3 F_BFACIL int64 \n",
" 4 MAGE_IMPFLG float64 \n",
" 5 MAGE_REPFLG float64 \n",
" 6 MAGER int64 \n",
" 7 MAGER14 int64 \n",
" 8 MAGER9 int64 \n",
" 9 MBSTATE_REC int64 \n",
" 10 RESTATUS int64 \n",
" 11 MRACE31 int64 \n",
" 12 MRACE6 int64 \n",
" 13 MRACE15 int64 \n",
" 14 MRACEIMP float64 \n",
" 15 MHISPX Int64 \n",
" 16 MHISP_R int64 \n",
" 17 F_MHISP int64 \n",
" 18 MRACEHISP int64 \n",
" 19 MAR_P category\n",
" 20 DMAR float64 \n",
" 21 MAR_IMP float64 \n",
" 22 F_MAR_P int64 \n",
" 23 MEDUC int64 \n",
" 24 F_MEDUC int64 \n",
" 25 FAGERPT_FLG float64 \n",
" 26 FAGECOMB Int64 \n",
" 27 FAGEREC11 int64 \n",
" 28 FRACE31 int64 \n",
" 29 FRACE6 int64 \n",
" 30 FRACE15 int64 \n",
" 31 FHISPX int64 \n",
" 32 FHISP_R int64 \n",
" 33 F_FHISP int64 \n",
" 34 FRACEHISP int64 \n",
" 35 FEDUC int64 \n",
" 36 PRIORLIVE Int64 \n",
" 37 PRIORDEAD Int64 \n",
" 38 PRIORTERM Int64 \n",
" 39 LBO_REC int64 \n",
" 40 TBO_REC int64 \n",
" 41 PRECARE int64 \n",
" 42 PAY int64 \n",
" 43 PAY_REC int64 \n",
" 44 F_PAY int64 \n",
" 45 F_PAY_REC int64 \n",
" 46 SEX category\n",
" 47 IMP_SEX float64 \n",
" 48 CA_ANEN category\n",
" 49 CA_MNSB category\n",
" 50 CA_CCHD category\n",
" 51 CA_CDH category\n",
" 52 OMPH category\n",
" 53 CA_GAST category\n",
" 54 F_CA_ANEN int64 \n",
" 55 F_CA_MENIN int64 \n",
" 56 F_CA_HEART int64 \n",
" 57 F_CA_HERNIA int64 \n",
" 58 F_CA_OMPHA int64 \n",
" 59 F_CA_GASTRO int64 \n",
" 60 CA_LIMB category\n",
" 61 CA_CLEFT category\n",
" 62 CA_CLPAL category\n",
" 63 CA_DOWN category\n",
" 64 CA_DISOR category\n",
" 65 CA_HYPO category\n",
" 66 F_CA_LIMB int64 \n",
" 67 F_CA_CLEFT int64 \n",
" 68 F_CA_CLPAL int64 \n",
" 69 F_CA_DOWN int64 \n",
" 70 F_CA_DISOR int64 \n",
" 71 F_CA_HYPO int64 \n",
" 72 NO_CONGEN int64 \n",
" 73 F_MPCB int64 \n",
" 74 PRECARE5 int64 \n",
" 75 PREVIS Int64 \n",
" 76 PREVIS_REC int64 \n",
" 77 F_TPCV int64 \n",
" 78 WIC category\n",
" 79 F_WIC int64 \n",
"dtypes: Int64(6), category(15), float64(7), int32(1), int64(51)\n",
"memory usage: 9.3 GB\n"
]
"data": {
"text/plain": [
"DOB_YY DS\n",
"2017 N 3857773\n",
" U 4937\n",
" Y 2044\n",
"2018 N 3793319\n",
" U 6107\n",
" Y 2108\n",
"2019 N 3750084\n",
" U 5468\n",
" Y 2030\n",
"2020 N 3612707\n",
" U 5165\n",
" Y 1954\n",
"2021 N 3660477\n",
" U 7548\n",
" Y 1903\n",
"2022 N 3666667\n",
" U 7501\n",
" Y 1861\n",
"Name: count, dtype: int64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.info()"
"year_ds = df[[\"DOB_YY\", \"DS\"]].groupby(\n",
" \"DOB_YY\", observed=True).value_counts()\n",
"year_ds.to_clipboard()\n",
"year_ds"
]
}
],
Expand Down
4 changes: 2 additions & 2 deletions projects/us_birth_certificates/prepare/columns.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Column utilities."""

import pandas as pd
from utils import get_ds_lb_chance
from . import utils


def rename_columns(df: pd.DataFrame, inplace=True) -> pd.DataFrame | None:
Expand Down Expand Up @@ -192,7 +192,7 @@ def add_computed_columns(df: pd.DataFrame) -> pd.DataFrame:
df["DS"] = df["CA_DOWN"].apply(lambda x: ds_convert(str(x)))

df["DS_LB_CHANCE"] = df["MAGER"].apply(
lambda x: get_ds_lb_chance(float(x)))
lambda x: utils.get_ds_lb_chance(float(x)))

df = df.astype({
"DS": "category",
Expand Down

0 comments on commit 1c52057

Please sign in to comment.