Add more flexibility in data for use with Tax-Calculator #114

Merged: 21 commits, Jun 7, 2024

Changes from all commits
2 changes: 1 addition & 1 deletion .github/workflows/build_and_test.yml
@@ -56,7 +56,7 @@ jobs:
shell: bash -l {0}
working-directory: ./
run: |
python -m pytest -m "not local" --cov=./ --cov-report=xml
python -m pytest -m "not local and not needs_puf and not needs_tmd" --cov=./ --cov-report=xml
- name: Upload coverage to Codecov
if: matrix.os == 'ubuntu-latest' && contains(github.repository, 'PSLmodels/OG-USA')
uses: codecov/codecov-action@v4
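
The tightened `-m` expression deselects tests tagged `needs_puf` or `needs_tmd`, so CI can run without the proprietary PUF or the TMD file. A minimal sketch of the convention this assumes; the marker registration and sample test below are illustrative, not part of this diff:

```python
# conftest.py -- hypothetical registration of the markers named in the workflow change
def pytest_configure(config):
    config.addinivalue_line("markers", "needs_puf: test requires the proprietary PUF file")
    config.addinivalue_line("markers", "needs_tmd: test requires the TMD file")


# test_get_micro_data.py -- hypothetical test that the `-m` expression now deselects
import pytest


@pytest.mark.needs_puf
def test_tax_functions_with_puf():
    ...  # runs only where puf.csv is available locally
```
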
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.1.9] - 2024-06-07 12:00:00

### Added

- Updates the `get_micro_data.py` and `calibration.py` modules to allow the user to use the CPS, PUF, or TMD files with Tax-Calculator, or to provide a custom data file with associated grow factors and weights.


## [0.1.8] - 2024-05-20 12:00:00

@@ -105,6 +111,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0



[0.1.9]: https://github.com/PSLmodels/OG-USA/compare/v0.1.8...v0.1.9
[0.1.8]: https://github.com/PSLmodels/OG-USA/compare/v0.1.7...v0.1.8
[0.1.7]: https://github.com/PSLmodels/OG-USA/compare/v0.1.6...v0.1.7
[0.1.6]: https://github.com/PSLmodels/OG-USA/compare/v0.1.5...v0.1.6
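
The 0.1.9 entry's custom-data option maps onto the new `Calibration` keyword arguments added in `ogusa/calibrate.py` below. A hedged usage sketch, assuming `Specifications` is importable from `ogcore.parameters`; all file paths are placeholders:

```python
import pandas as pd
from ogcore.parameters import Specifications  # assumed import path
from ogusa.calibrate import Calibration

# Placeholder inputs: a user-supplied microdata file plus the grow factors
# and weights that describe it.
custom_data = pd.read_csv("my_microdata.csv")

p = Specifications()
c = Calibration(
    p,
    estimate_tax_functions=True,
    data=custom_data,               # a DataFrame instead of "cps", "puf", or "tmd"
    gfactors="my_growfactors.csv",  # grow factors matching the custom data
    weights="my_weights.csv.gz",    # sampling weights matching the custom data
    records_start_year=2021,        # first year the microdata represent
)
```
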
50 changes: 43 additions & 7 deletions cs-config/cs_config/functions.py
@@ -14,18 +14,22 @@
import pickle
import json
import inspect
import pandas as pd
import paramtools
from distributed import Client
from taxcalc import Policy
from taxcalc import Policy, Records, GrowFactors
from collections import OrderedDict
from .helpers import retrieve_puf
from .helpers import retrieve_puf, retrieve_tmd
from cs2tc import convert_policy_adjustment

AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
PUF_S3_FILE_LOCATION = os.environ.get(
"PUF_S3_LOCATION", "s3://ospc-data-files/puf.20210720.csv.gz"
)
TMD_S3_FILE_LOCATION = os.environ.get(
"TMD_S3_LOCATION", "s3://ospc-data-files/puf.20210720.csv.gz"
)
CUR_DIR = os.path.dirname(os.path.realpath(__file__))

# Get Tax-Calculator default parameters
@@ -78,7 +82,7 @@ class MetaParams(paramtools.Parameters):


def get_version():
return "0.1.2"
return "0.1.9"


def get_inputs(meta_param_dict):
@@ -188,16 +192,46 @@ def run_model(meta_param_dict, adjustment):

meta_params = MetaParams()
meta_params.adjust(meta_param_dict)
# Get data chosen by user
if meta_params.data_source == "PUF":
data = retrieve_puf(
PUF_S3_FILE_LOCATION, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
)
weights = Records.PUF_WEIGHTS_FILENAME
records_start_year = Records.PUFCSV_YEAR
# set name of cached baseline file in case it is used below
cached_pickle = "TxFuncEst_baseline_PUF.pkl"
else:
if data is not None:
if not isinstance(data, pd.DataFrame):
raise TypeError("'data' must be a Pandas DataFrame.")
else:
# Access keys are not available. Default to the CPS.
print("Defaulting to the CPS")
meta_params.adjust({"data_source": "CPS"})
elif meta_params.data_source == "TMD":
data = retrieve_tmd(
TMD_S3_FILE_LOCATION, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
)
weights = Records.TMD_WEIGHTS_FILENAME
records_start_year = Records.TMDCSV_YEAR
if data is not None:
if not isinstance(data, pd.DataFrame):
raise TypeError("'data' must be a Pandas DataFrame.")
else:
# Access keys are not available. Default to the CPS.
print("Defaulting to the CPS")
meta_params.adjust({"data_source": "CPS"})
elif meta_params.data_source == "CPS":
data = "cps"
weights = Records.PUF_WEIGHTS_FILENAME
records_start_year = Records.CPSCSV_YEAR
# set name of cached baseline file in case it is used below
cached_pickle = "TxFuncEst_baseline_CPS.pkl"
else:
raise ValueError(
f"Data source '{meta_params.data_source}' is not supported."
)

# Get TC params adjustments
iit_mods = convert_policy_adjustment(
adjustment["Tax-Calculator Parameters"]
@@ -211,7 +245,7 @@ def run_model(meta_param_dict, adjustment):

# Dask parameters
num_workers = 2
memory_limit = "10GiB"
memory_per_worker = "10GiB"
client = Client(
n_workers=num_workers,
threads_per_worker=1,
@@ -222,8 +256,7 @@ def run_model(meta_param_dict, adjustment):
# num_workers_txf = 5
# num_workers_mod = 6

# whether to estimate tax functions from microdata
run_micro = True
# Read in whether user chose to solve for transition path
time_path = meta_param_dict["time_path"][0]["value"]

# filter out OG-USA params that will not change between baseline and
@@ -363,6 +396,9 @@ def run_model(meta_param_dict, adjustment):
iit_reform=iit_mods,
estimate_tax_functions=True,
data=data,
gfactors=GrowFactors.FILE_NAME,
weights=weights,
records_start_year=records_start_year,
client=client,
)
# update tax function parameters in Specifications Object
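
`run_model` now forwards `gfactors`, `weights`, and `records_start_year` to the calibration step along with the chosen `data`. Downstream these typically parameterize a `taxcalc.Records` object; a hedged sketch of that mapping at the Tax-Calculator level (the constructor call is my assumption about the downstream use, not shown in this diff, and the TMD file names are placeholders):

```python
from taxcalc import GrowFactors, Records

# CPS: packaged with Tax-Calculator, so no extra files are needed.
recs_cps = Records.cps_constructor()

# PUF/TMD-style construction: the data file, weights, start year, and grow
# factors are supplied explicitly.
recs_tmd = Records(
    data="tmd.csv",
    start_year=Records.TMDCSV_YEAR,
    weights=Records.TMD_WEIGHTS_FILENAME,
    gfactors=GrowFactors(),  # or GrowFactors("tmd_growfactors.csv") for custom factors
    adjust_ratios=None,      # no PUF-style adjustment ratios for a TMD-style file
)
```
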
40 changes: 40 additions & 0 deletions cs-config/cs_config/helpers.py
@@ -24,6 +24,9 @@
PUF_S3_FILE_LOCATION = os.environ.get(
"PUF_S3_LOCATION", "s3://ospc-data-files/puf.20210720.csv.gz"
)
TMD_S3_FILE_LOCATION = os.environ.get(
"TMD_S3_LOCATION", "s3://ospc-data-files/puf.20210720.csv.gz"
)
TC_LAST_YEAR = Policy.LAST_BUDGET_YEAR

POLICY_SCHEMA = {
@@ -120,3 +123,40 @@ def retrieve_puf(
f"s3_reader_installed={s3_reader_installed})"
)
return None


def retrieve_tmd(
tmd_s3_file_location=TMD_S3_FILE_LOCATION,
aws_access_key_id=AWS_ACCESS_KEY_ID,
aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
):
"""
Function for retrieving the TMD file from the S3 bucket
"""
s3_reader_installed = S3FileSystem is not None
has_credentials = (
aws_access_key_id is not None and aws_secret_access_key is not None
)
if tmd_s3_file_location and has_credentials and s3_reader_installed:
print("Reading tmd from S3 bucket.", tmd_s3_file_location)
fs = S3FileSystem(
key=aws_access_key_id,
secret=aws_secret_access_key,
)
with fs.open(tmd_s3_file_location) as f:
# Read the TMD file into a DataFrame.
tmd_df = pd.read_csv(f)
return tmd_df
elif Path("tmd.csv.gz").exists():
print("Reading tmd from tmd.csv.gz.")
return pd.read_csv("tmd.csv.gz", compression="gzip")
elif Path("tmd.csv").exists():
print("Reading tmd from tmd.csv.")
return pd.read_csv("tmd.csv")
else:
warnings.warn(
f"TMD file not available (tmd_location={tmd_s3_file_location}, "
f"has_credentials={has_credentials}, "
f"s3_reader_installed={s3_reader_installed})"
)
return None
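
`retrieve_tmd` mirrors `retrieve_puf`: it tries the S3 location when credentials and an S3 reader are available, falls back to a local `tmd.csv.gz` or `tmd.csv`, and returns `None` otherwise. A short usage sketch; the import path assumes the `cs_config` package layout, and the bucket and credentials are placeholders:

```python
from cs_config.helpers import retrieve_tmd  # assumed import path

tmd_df = retrieve_tmd(
    tmd_s3_file_location="s3://my-bucket/tmd.csv.gz",  # placeholder S3 object
    aws_access_key_id="AKIA...",                       # placeholder credentials
    aws_secret_access_key="***",
)
if tmd_df is None:
    # Nothing found on S3 or locally; fall back to another data source such as the CPS.
    data = "cps"
else:
    data = tmd_df
```
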
2 changes: 1 addition & 1 deletion docs/book/content/api/get_micro_data.rst
@@ -9,4 +9,4 @@ ogusa.get_micro_data
------------------------------------------

.. automodule:: ogusa.get_micro_data
:members: get_calculator, get_data, taxcalc_advance, cap_inc_mtr
:members: get_calculator, get_data, taxcalc_advance, cap_inc_mtr, update_policy, is_paramtools_format
28 changes: 25 additions & 3 deletions ogusa/calibrate.py
@@ -3,6 +3,7 @@
from ogusa import get_micro_data
import os
import numpy as np
from taxcalc import Records
from ogcore import txfunc, demographics
from ogcore.utils import safe_read_pickle, mkdirs
import pkg_resources
@@ -23,6 +24,9 @@ def __init__(
iit_reform={},
guid="",
data="cps",
gfactors=None,
weights=None,
records_start_year=Records.CPSCSV_YEAR,
client=None,
num_workers=1,
demographic_data_path=None,
@@ -33,7 +37,7 @@
parameter values for the OG-USA model.

Args:
p (OGUSA Parameters object): parameters object
p (OG-USA Parameters object): parameters object
estimate_tax_functions (bool): whether to estimate tax functions
estimate_beta (bool): whether to estimate beta
estimate_chi_n (bool): whether to estimate chi_n
@@ -42,7 +46,13 @@
iit_baseline (dict): baseline policy to use
iit_reform (dict): reform tax parameters
guid (str): id for tax function parameters
data (str): data source for microsimulation model
data (str or Pandas DataFrame): path or DataFrame with
data for Tax-Calculator model
gfactors (str or Pandas DataFrame): path or DataFrame with
growth factors for Tax-Calculator model
weights (str or Pandas DataFrame): path or DataFrame with
weights for Tax-Calculator model
records_start_year (int): year micro data begins
client (Dask client object): client
num_workers (int): number of workers for Dask client
output_path (str): path to save output to
Expand All @@ -69,6 +79,9 @@ def __init__(
iit_reform,
guid,
data,
gfactors,
weights,
records_start_year,
client,
num_workers,
run_micro=run_micro,
@@ -143,6 +156,9 @@ def get_tax_function_parameters(
iit_reform={},
guid="",
data="",
gfactors=None,
weights=None,
records_start_year=Records.CPSCSV_YEAR,
client=None,
num_workers=1,
run_micro=False,
@@ -157,7 +173,13 @@
iit_baseline (dict): baseline policy to use
iit_reform (dict): reform tax parameters
guid (string): id for tax function parameters
data (string): data source for microsimulation model
data (str or Pandas DataFrame): path or DataFrame with
data for Tax-Calculator model
gfactors (str or Pandas DataFrame): path or DataFrame with
growth factors for Tax-Calculator model
weights (str or Pandas DataFrame): path or DataFrame with
weights for Tax-Calculator model
records_start_year (int): year micro data begins
client (Dask client object): client
num_workers (int): number of workers for Dask client
run_micro (bool): whether to estimate parameters from
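
To complement the custom-DataFrame sketch after the CHANGELOG entry above, a hedged example of pointing `Calibration` at the TMD inputs through the new keywords; the file paths are placeholders, and exactly how `get_micro_data` resolves them is an assumption:

```python
from taxcalc import Records
from ogcore.parameters import Specifications  # assumed import path
from ogusa.calibrate import Calibration

p = Specifications()
c = Calibration(
    p,
    estimate_tax_functions=True,
    data="tmd.csv",                          # path to the TMD microdata file (placeholder)
    gfactors="tmd_growfactors.csv",          # grow factors accompanying the TMD file (placeholder)
    weights=Records.TMD_WEIGHTS_FILENAME,    # weights file name defined by Tax-Calculator
    records_start_year=Records.TMDCSV_YEAR,  # first year of the TMD data
)
```
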
3 changes: 0 additions & 3 deletions ogusa/constants.py
@@ -12,9 +12,6 @@
DEFAULT_START_YEAR = 2021
# Tax-Calculator start year
TC_LAST_YEAR = taxcalc.Policy.LAST_BUDGET_YEAR
# Years of the PUF and CPS files
PUF_START_YEAR = taxcalc.Records.PUFCSV_YEAR
CPS_START_YEAR = taxcalc.Records.CPSCSV_YEAR

VAR_LABELS = {
"Y": "GDP ($Y_t$)",
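
The removed `PUF_START_YEAR` and `CPS_START_YEAR` constants duplicated attributes that Tax-Calculator already exposes, which is how the rest of this PR references the data start years. A one-line equivalent for code that relied on them:

```python
from taxcalc import Records

PUF_START_YEAR = Records.PUFCSV_YEAR  # replaces ogusa.constants.PUF_START_YEAR
CPS_START_YEAR = Records.CPSCSV_YEAR  # replaces ogusa.constants.CPS_START_YEAR
TMD_START_YEAR = Records.TMDCSV_YEAR  # TMD analogue used elsewhere in this PR
```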