PolicyEngine · nikhilwoodruff · Jan 29, 2024 · Jan 26, 2024 · Jan 26, 2024 · Jan 26, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,55 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.634.4] - 2024-01-29 04:29:46
+
+### Added
+
+- 2024 federal poverty guidelines.
+
+## [0.634.3] - 2024-01-29 01:13:40
+
+### Fixed
+
+- Removed duplicate NIIT addition from Louisiana federal tax deduction.
+
+## [0.634.2] - 2024-01-29 01:05:00
+
+### Fixed
+
+- Idaho permanent building fund tax calculation.
+
+## [0.634.1] - 2024-01-29 01:00:09
+
+### Fixed
+
+- Adjust the adds function in the hi_subtractions variable.
+
+## [0.634.0] - 2024-01-28 19:06:09
+
+### Added
+
+- Enable Georgia state income tax computation.
+
+## [0.633.4] - 2024-01-28 05:48:37
+
+### Fixed
+
+- Georgia itemized deduction calculation.
+
+## [0.633.3] - 2024-01-28 03:52:36
+
+### Fixed
+
+- CA TANF monthly applicant income disregards.
+
+## [0.633.2] - 2024-01-27 19:26:08
+
+### Fixed
+
+- Added remaining components to SPM net income.
+- Inflation-index SPM-related variables.
+
 ## [0.633.1] - 2024-01-26 16:34:55
 
 ### Fixed
@@ -6983,6 +7032,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 
 
+[0.634.4]: https://github.com/PolicyEngine/policyengine-us/compare/0.634.3...0.634.4
+[0.634.3]: https://github.com/PolicyEngine/policyengine-us/compare/0.634.2...0.634.3
+[0.634.2]: https://github.com/PolicyEngine/policyengine-us/compare/0.634.1...0.634.2
+[0.634.1]: https://github.com/PolicyEngine/policyengine-us/compare/0.634.0...0.634.1
+[0.634.0]: https://github.com/PolicyEngine/policyengine-us/compare/0.633.4...0.634.0
+[0.633.4]: https://github.com/PolicyEngine/policyengine-us/compare/0.633.3...0.633.4
+[0.633.3]: https://github.com/PolicyEngine/policyengine-us/compare/0.633.2...0.633.3
+[0.633.2]: https://github.com/PolicyEngine/policyengine-us/compare/0.633.1...0.633.2
 [0.633.1]: https://github.com/PolicyEngine/policyengine-us/compare/0.633.0...0.633.1
 [0.633.0]: https://github.com/PolicyEngine/policyengine-us/compare/0.632.0...0.633.0
 [0.632.0]: https://github.com/PolicyEngine/policyengine-us/compare/0.631.6...0.632.0

diff --git a/changelog.yaml b/changelog.yaml
@@ -5922,3 +5922,44 @@
     fixed:
     - LA General relief computation to include withholding rules.
   date: 2024-01-26 16:34:55
+- bump: patch
+  changes:
+    fixed:
+    - Added remaining components to SPM net income.
+    - Inflation-index SPM-related variables.
+  date: 2024-01-27 19:26:08
+- bump: patch
+  changes:
+    fixed:
+    - CA TANF monthly applicant income disregards.
+  date: 2024-01-28 03:52:36
+- bump: patch
+  changes:
+    fixed:
+    - Georgia itemized deduction calculation.
+  date: 2024-01-28 05:48:37
+- bump: minor
+  changes:
+    added:
+    - Enable Georgia state income tax computation.
+  date: 2024-01-28 19:06:09
+- bump: patch
+  changes:
+    fixed:
+    - Adjust the adds function in the hi_subtractions variable.
+  date: 2024-01-29 01:00:09
+- bump: patch
+  changes:
+    fixed:
+    - Idaho permanent building fund tax calculation.
+  date: 2024-01-29 01:05:00
+- bump: patch
+  changes:
+    fixed:
+    - Removed duplicate NIIT addition from Louisiana federal tax deduction.
+  date: 2024-01-29 01:13:40
+- bump: patch
+  changes:
+    added:
+    - 2024 federal poverty guidelines.
+  date: 2024-01-29 04:29:46
diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,4 @@
+- bump: minor
+  changes:
+    changed:
+    - Improved CPS previous year imputations.
diff --git a/policyengine_us/data/datasets/cps/cps.py b/policyengine_us/data/datasets/cps/cps.py
@@ -1,7 +1,10 @@
 import logging
 from policyengine_core.data import Dataset
+from policyengine_us.data.storage import STORAGE_FOLDER
 import h5py
 from policyengine_us.data.datasets.cps.raw_cps import (
+    RawCPS_2018,
+    RawCPS_2019,
     RawCPS_2020,
     RawCPS_2021,
     RawCPS_2022,
@@ -308,15 +311,16 @@ def add_spm_variables(cps: h5py.File, spm_unit: DataFrame) -> None:
         spm_unit_payroll_tax_reported="SPM_FICA",
         spm_unit_federal_tax_reported="SPM_FEDTAX",
         spm_unit_state_tax_reported="SPM_STTAX",
-        spm_unit_work_childcare_expenses="SPM_CAPWKCCXPNS",
+        spm_unit_capped_work_childcare_expenses="SPM_CAPWKCCXPNS",
         spm_unit_medical_expenses="SPM_MEDXPNS",
         spm_unit_spm_threshold="SPM_POVTHRESHOLD",
         spm_unit_net_income_reported="SPM_RESOURCES",
         childcare_expenses="SPM_CHILDCAREXPNS",
     )
 
     for openfisca_variable, asec_variable in SPM_RENAMES.items():
-        cps[openfisca_variable] = spm_unit[asec_variable]
+        if asec_variable in spm_unit.columns:
+            cps[openfisca_variable] = spm_unit[asec_variable]
 
     cps["reduced_price_school_meals_reported"] = (
         cps["free_school_meals_reported"][...] * 0
@@ -364,85 +368,57 @@ def add_previous_year_income(self, cps: h5py.File) -> None:
         cps_current_year_data.person.PERIDNUM
     )
 
-    PREDICTORS = [
-        "WSAL_VAL",
-        "SEMP_VAL",
-        "A_AGE",
-        "A_SEX",
-        "DIV_VAL",
-        "INT_VAL",
-        "SS_VAL",
-        "ANN_VAL",
-        "PNSN_VAL",
-        "UC_VAL",
-        "CAP_VAL",
-        "CSP_VAL",
-        "CHSP_VAL",
-        "PAW_VAL",
-        "SSI_VAL",
-        "WICYN",
-        "PHIP_VAL",
-        "MOOP",
-    ]
-
-    in_sample = cps_previous_year_data.person.PERIDNUM[
-        cps_previous_year_data.person.PERIDNUM.isin(
-            cps_current_year_data.person.PERIDNUM
-        )
-    ]
-    cps_prev_long_subset = cps_previous_year.loc[in_sample]
-    cps_cur_long_subset = cps_current_year.set_index(
-        cps_current_year.PERIDNUM
-    ).loc[in_sample]
-
-    data_prev = cps_prev_long_subset[PREDICTORS].rename(
-        columns={x: x + "_prev" for x in PREDICTORS}
+    previous_year_data = cps_previous_year[
+        ["WSAL_VAL", "SEMP_VAL", "I_ERNVAL", "I_SEVAL"]
+    ].rename(
+        {
+            "WSAL_VAL": "employment_income_last_year",
+            "SEMP_VAL": "self_employment_income_last_year",
+        },
+        axis=1,
     )
-    data_cur = cps_cur_long_subset[PREDICTORS].rename(
-        columns={x: x + "_cur" for x in PREDICTORS}
-    )
-    data = pd.concat([data_prev, data_cur], axis=1)
 
-    X = data[[column + "_cur" for column in PREDICTORS]]
-    y = data[["WSAL_VAL_prev", "SEMP_VAL_prev"]]
+    previous_year_data = previous_year_data[
+        (previous_year_data.I_ERNVAL == 0) & (previous_year_data.I_SEVAL == 0)
+    ]
 
-    income_last_year = Imputation()
-    income_last_year.train(X, y)
+    previous_year_data.drop(["I_ERNVAL", "I_SEVAL"], axis=1, inplace=True)
 
-    df = pd.DataFrame()
-    df["person_id"] = cps_current_year.index
-    cps_cur_record_in_sample = cps_current_year.index.isin(
-        cps_previous_year.index
-    )
-    df["in_sample"] = cps_cur_record_in_sample
-    df["employment_income_prev"] = np.ones(len(df)) * np.nan
-    df["employment_income_prev"][cps_cur_record_in_sample] = (
-        cps_previous_year.loc[
-            cps_current_year.index[cps_cur_record_in_sample]
-        ].WSAL_VAL.values
-    )
-    df["self_employment_income_prev"] = np.ones(len(df)) * np.nan
-    df["self_employment_income_prev"][cps_cur_record_in_sample] = (
-        cps_previous_year.loc[
-            cps_current_year.index[cps_cur_record_in_sample]
-        ].SEMP_VAL.values
+    joined_data = cps_current_year.join(previous_year_data)[
+        [
+            "employment_income_last_year",
+            "self_employment_income_last_year",
+            "I_ERNVAL",
+            "I_SEVAL",
+        ]
+    ]
+    joined_data["previous_year_income_available"] = (
+        ~joined_data.employment_income_last_year.isna()
+        & ~joined_data.self_employment_income_last_year.isna()
+        & (joined_data.I_ERNVAL == 0)
+        & (joined_data.I_SEVAL == 0)
     )
+    joined_data = joined_data.fillna(-1).drop(["I_ERNVAL", "I_SEVAL"], axis=1)
 
-    X = cps_current_year[PREDICTORS][~cps_cur_record_in_sample]
-    X = X.rename(columns={x: x + "_cur" for x in PREDICTORS})
-    Y_pred = income_last_year.predict(X)
-    df["employment_income_prev"][
-        ~cps_cur_record_in_sample
-    ] = Y_pred.WSAL_VAL_prev.values
-    df["self_employment_income_prev"][
-        ~cps_cur_record_in_sample
-    ] = Y_pred.SEMP_VAL_prev.values
-
-    cps["employment_income_last_year"] = df["employment_income_prev"].values
-    cps["self_employment_income_last_year"] = df[
-        "self_employment_income_prev"
+    # CPS already ordered by PERIDNUM, so the join wouldn't change the order.
+    cps["employment_income_last_year"] = joined_data[
+        "employment_income_last_year"
     ].values
-    cps["previous_year_income_imputed"] = df["in_sample"].values
+    cps["self_employment_income_last_year"] = joined_data[
+        "self_employment_income_last_year"
+    ].values
+    cps["previous_year_income_available"] = joined_data[
+        "previous_year_income_available"
+    ].values
+
+
+class CPS_2019(CPS):  # CPS dataset definition for the 2019 time period
+    name = "cps_2019"
+    label = "CPS 2019"
+    raw_cps = RawCPS_2019
+    previous_year_raw_cps = RawCPS_2018  # raw CPS for the preceding year
+    file_path = STORAGE_FOLDER / "cps_2019.h5"
+    time_period = 2019
 
 
 class CPS_2020(CPS):

diff --git a/policyengine_us/data/datasets/cps/enhanced_cps/enhanced_cps.py b/policyengine_us/data/datasets/cps/enhanced_cps/enhanced_cps.py
@@ -1,14 +1,14 @@
 from policyengine_us.data.storage import STORAGE_FOLDER
 from policyengine_core.data import Dataset
+import pandas as pd
 
 
-class EnhancedCPS_2023(Dataset):
-    name = "enhanced_cps_2023"
-    label = "Enhanced CPS (2023)"
-    file_path = STORAGE_FOLDER / "enhanced_cps.h5"
+class CalibratedPUFExtendedCPS(Dataset):
+    name = "calibrated_puf_extended_cps"
+    label = "Calibrated PUF-extended CPS (2023)"
+    file_path = STORAGE_FOLDER / "calibrated_puf_extended_cps.h5"
     data_format = Dataset.ARRAYS
     time_period = "2023"
-    url = "release://policyengine/policyengine-us/enhanced-cps-2023/enhanced_cps.h5"
 
     def generate(self):
         from .puf_extended_cps import PUFExtendedCPS_2023
@@ -28,3 +28,97 @@ def generate(self):
                 new_data[variable] = cps_data[variable][...]
 
         self.save_dataset(new_data)
+
+
+class EnhancedCPS_2023(Dataset):
+    """Calibrated PUF-extended CPS with imputed prior-year earnings."""
+
+    name = "enhanced_cps_2023"
+    label = "Enhanced CPS (2023)"
+    file_path = STORAGE_FOLDER / "enhanced_cps.h5"
+    data_format = Dataset.ARRAYS
+    time_period = "2023"
+    url = "release://policyengine/policyengine-us/enhanced-cps-2023/enhanced_cps.h5"
+
+    def generate(self):
+        """Build and save the dataset.
+
+        Copies every variable from CalibratedPUFExtendedCPS, then sets the
+        two ``*_last_year`` income variables from a model trained on the
+        CPS 2019 records flagged ``previous_year_income_available``.
+        """
+        from policyengine_us import Microsimulation
+        from policyengine_us.data.datasets.cps.cps import CPS_2019
+        from survey_enhance import Imputation
+
+        cps = CalibratedPUFExtendedCPS()
+        cps_data = cps.load()
+        new_data = {
+            variable: cps_data[variable][...] for variable in cps.variables
+        }
+
+        # First entry is the training filter; the rest are model features.
+        VARIABLES = [
+            "previous_year_income_available",
+            "employment_income",
+            "self_employment_income",
+            "age",
+            "is_male",
+            "spm_unit_state_fips",
+            "dividend_income",
+            "interest_income",
+            "social_security",
+            "capital_gains",
+            "is_disabled",
+            "is_blind",
+            "is_married",
+            "tax_unit_children",
+            "pension_income",
+        ]
+        PREDICTORS = VARIABLES[1:]
+        OUTPUTS = [
+            "employment_income_last_year",
+            "self_employment_income_last_year",
+        ]
+
+        # Train on CPS 2019 people whose prior-year income is available.
+        sim = Microsimulation(dataset=CPS_2019)
+        df = sim.calculate_dataframe(
+            VARIABLES + OUTPUTS, 2019, map_to="person"
+        )
+        df_train = df[df.previous_year_income_available]
+        income_last_year = Imputation()
+        income_last_year.train(df_train[PREDICTORS], df_train[OUTPUTS])
+
+        sim = Microsimulation(dataset=cps)
+        df = sim.calculate_dataframe(
+            VARIABLES + OUTPUTS, 2023, map_to="person"
+        )
+
+        # Aggregate targets come from the calibration.gov.irs.soi parameter
+        # node, read at 2022-01-01 (the year before this 2023 dataset).
+        parameters = sim.tax_benefit_system.parameters
+        projections = parameters("2022-01-01").calibration.gov.irs.soi
+
+        quantiles = income_last_year.solve_for_mean_quantiles(
+            [
+                projections.employment_income,
+                projections.self_employment_income,
+            ],
+            df[PREDICTORS],
+            sim.calculate("household_weight", 2023, map_to="person").values,
+            max_iterations=7,
+        )
+        print(f"Mean quantiles: {quantiles}")
+        # Predict from the same feature columns used for training; the
+        # previous_year_income_available flag is a filter, not a feature.
+        y_pred = income_last_year.predict(
+            df[PREDICTORS], mean_quantile=quantiles
+        )
+
+        df[OUTPUTS] = y_pred[OUTPUTS]
+
+        for output in OUTPUTS:
+            new_data[output] = df[output].values
+
+        self.save_dataset(new_data)