Add more flexibility in data for use with Tax-Calculator #114

Merged: 21 commits, Jun 7, 2024

Changes from all commits
2 changes: 1 addition & 1 deletion .github/workflows/build_and_test.yml
@@ -56,7 +56,7 @@ jobs:
shell: bash -l {0}
working-directory: ./
run: |
python -m pytest -m "not local" --cov=./ --cov-report=xml
python -m pytest -m "not local and not needs_puf and not needs_tmd" --cov=./ --cov-report=xml
- name: Upload coverage to Codecov
if: matrix.os == 'ubuntu-latest' && contains(github.repository, 'PSLmodels/OG-USA')
uses: codecov/codecov-action@v4
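
The tightened `-m` expression deselects tests tagged `needs_puf` or `needs_tmd`, so CI can run without the proprietary PUF or the TMD file. A minimal sketch of the convention this assumes; the marker registration and sample test below are illustrative, not part of this diff:

```python
# conftest.py -- hypothetical registration of the markers named in the workflow change
def pytest_configure(config):
    config.addinivalue_line("markers", "needs_puf: test requires the proprietary PUF file")
    config.addinivalue_line("markers", "needs_tmd: test requires the TMD file")


# test_get_micro_data.py -- hypothetical test that the `-m` expression now deselects
import pytest


@pytest.mark.needs_puf
def test_tax_functions_with_puf():
    ...  # runs only where puf.csv is available locally
```
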
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.1.9] - 2024-06-07 12:00:00

### Added

- Updates the `get_micro_data.py` and `calibration.py` modules to allow the user to use the CPS, PUF, or TMD files with Tax-Calculator, or to provide a custom data file with associated grow factors and weights.


## [0.1.8] - 2024-05-20 12:00:00

@@ -105,6 +111,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0



[0.1.9]: https://github.com/PSLmodels/OG-USA/compare/v0.1.8...v0.1.9
[0.1.8]: https://github.com/PSLmodels/OG-USA/compare/v0.1.7...v0.1.8
[0.1.7]: https://github.com/PSLmodels/OG-USA/compare/v0.1.6...v0.1.7
[0.1.6]: https://github.com/PSLmodels/OG-USA/compare/v0.1.5...v0.1.6
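
The 0.1.9 entry's custom-data option maps onto the new `Calibration` keyword arguments added in `ogusa/calibrate.py` below. A hedged usage sketch, assuming `Specifications` is importable from `ogcore.parameters`; all file paths are placeholders:

```python
import pandas as pd
from ogcore.parameters import Specifications  # assumed import path
from ogusa.calibrate import Calibration

# Placeholder inputs: a user-supplied microdata file plus the grow factors
# and weights that describe it.
custom_data = pd.read_csv("my_microdata.csv")

p = Specifications()
c = Calibration(
    p,
    estimate_tax_functions=True,
    data=custom_data,               # a DataFrame instead of "cps", "puf", or "tmd"
    gfactors="my_growfactors.csv",  # grow factors matching the custom data
    weights="my_weights.csv.gz",    # sampling weights matching the custom data
    records_start_year=2021,        # first year the microdata represent
)
```
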
50 changes: 43 additions & 7 deletions cs-config/cs_config/functions.py
@@ -14,18 +14,22 @@
import pickle
import json
import inspect
import pandas as pd
import paramtools
from distributed import Client
from taxcalc import Policy
from taxcalc import Policy, Records, GrowFactors
from collections import OrderedDict
from .helpers import retrieve_puf
from .helpers import retrieve_puf, retrieve_tmd
from cs2tc import convert_policy_adjustment

AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
PUF_S3_FILE_LOCATION = os.environ.get(
"PUF_S3_LOCATION", "s3://ospc-data-files/puf.20210720.csv.gz"
)
TMD_S3_FILE_LOCATION = os.environ.get(
"TMD_S3_LOCATION", "s3://ospc-data-files/puf.20210720.csv.gz"
)
CUR_DIR = os.path.dirname(os.path.realpath(__file__))

# Get Tax-Calculator default parameters
@@ -78,7 +82,7 @@ class MetaParams(paramtools.Parameters):


def get_version():
return "0.1.2"
return "0.1.9"


def get_inputs(meta_param_dict):
@@ -188,16 +192,46 @@ def run_model(meta_param_dict, adjustment):

meta_params = MetaParams()
meta_params.adjust(meta_param_dict)
# Get data chosen by user
if meta_params.data_source == "PUF":
data = retrieve_puf(
PUF_S3_FILE_LOCATION, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
)
weights = Records.PUF_WEIGHTS_FILENAME
records_start_year = Records.PUFCSV_YEAR
# set name of cached baseline file in case it is used below
cached_pickle = "TxFuncEst_baseline_PUF.pkl"
else:
if data is not None:
if not isinstance(data, pd.DataFrame):
raise TypeError("'data' must be a Pandas DataFrame.")
else:
# Access keys are not available. Default to the CPS.
print("Defaulting to the CPS")
meta_params.adjust({"data_source": "CPS"})
elif meta_params.data_source == "TMD":
data = retrieve_tmd(
TMD_S3_FILE_LOCATION, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
)
weights = Records.TMD_WEIGHTS_FILENAME
records_start_year = Records.TMDCSV_YEAR
if data is not None:
if not isinstance(data, pd.DataFrame):
raise TypeError("'data' must be a Pandas DataFrame.")
else:
# Access keys are not available. Default to the CPS.
print("Defaulting to the CPS")
meta_params.adjust({"data_source": "CPS"})
elif meta_params.data_source == "CPS":
data = "cps"
weights = Records.PUF_WEIGHTS_FILENAME
records_start_year = Records.CPSCSV_YEAR
# set name of cached baseline file in case it is used below
cached_pickle = "TxFuncEst_baseline_CPS.pkl"
else:
raise ValueError(
f"Data source '{meta_params.data_source}' is not supported."
)

# Get TC params adjustments
iit_mods = convert_policy_adjustment(
adjustment["Tax-Calculator Parameters"]
@@ -211,7 +245,7 @@ def run_model(meta_param_dict, adjustment):

# Dask parameters
num_workers = 2
memory_limit = "10GiB"
memory_per_worker = "10GiB"
client = Client(
n_workers=num_workers,
threads_per_worker=1,
@@ -222,8 +256,7 @@ def run_model(meta_param_dict, adjustment):
# num_workers_txf = 5
# num_workers_mod = 6

# whether to estimate tax functions from microdata
run_micro = True
# Read in whether user chose to solve for transition path
time_path = meta_param_dict["time_path"][0]["value"]

# filter out OG-USA params that will not change between baseline and
@@ -363,6 +396,9 @@ def run_model(meta_param_dict, adjustment):
iit_reform=iit_mods,
estimate_tax_functions=True,
data=data,
gfactors=GrowFactors.FILE_NAME,
weights=weights,
records_start_year=records_start_year,
client=client,
)
# update tax function parameters in Specifications Object
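
`run_model` now forwards `gfactors`, `weights`, and `records_start_year` to the calibration step along with the chosen `data`. Downstream these typically parameterize a `taxcalc.Records` object; a hedged sketch of that mapping at the Tax-Calculator level (the constructor call is my assumption about the downstream use, not shown in this diff, and the TMD file names are placeholders):

```python
from taxcalc import GrowFactors, Records

# CPS: packaged with Tax-Calculator, so no extra files are needed.
recs_cps = Records.cps_constructor()

# PUF/TMD-style construction: the data file, weights, start year, and grow
# factors are supplied explicitly.
recs_tmd = Records(
    data="tmd.csv",
    start_year=Records.TMDCSV_YEAR,
    weights=Records.TMD_WEIGHTS_FILENAME,
    gfactors=GrowFactors(),  # or GrowFactors("tmd_growfactors.csv") for custom factors
    adjust_ratios=None,      # no PUF-style adjustment ratios for a TMD-style file
)
```
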
40 changes: 40 additions & 0 deletions cs-config/cs_config/helpers.py
@@ -24,6 +24,9 @@
PUF_S3_FILE_LOCATION = os.environ.get(
"PUF_S3_LOCATION", "s3://ospc-data-files/puf.20210720.csv.gz"
)
TMD_S3_FILE_LOCATION = os.environ.get(
"TMD_S3_LOCATION", "s3://ospc-data-files/puf.20210720.csv.gz"
)
TC_LAST_YEAR = Policy.LAST_BUDGET_YEAR

POLICY_SCHEMA = {
@@ -120,3 +123,40 @@ def retrieve_puf(
f"s3_reader_installed={s3_reader_installed})"
)
return None


def retrieve_tmd(
tmd_s3_file_location=TMD_S3_FILE_LOCATION,
aws_access_key_id=AWS_ACCESS_KEY_ID,
aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
):
"""
Function for retrieving the TMD file from the S3 bucket
"""
s3_reader_installed = S3FileSystem is not None
has_credentials = (
aws_access_key_id is not None and aws_secret_access_key is not None
)
if tmd_s3_file_location and has_credentials and s3_reader_installed:
print("Reading tmd from S3 bucket.", tmd_s3_file_location)
fs = S3FileSystem(
key=aws_access_key_id,
secret=aws_secret_access_key,
)
with fs.open(tmd_s3_file_location) as f:
# Read the TMD file into a DataFrame.
tmd_df = pd.read_csv(f)
return tmd_df
elif Path("tmd.csv.gz").exists():
print("Reading tmd from tmd.csv.gz.")
return pd.read_csv("tmd.csv.gz", compression="gzip")
elif Path("tmd.csv").exists():
print("Reading tmd from tmd.csv.")
return pd.read_csv("tmd.csv")
else:
warnings.warn(
f"TMD file not available (tmd_location={tmd_s3_file_location}, "
f"has_credentials={has_credentials}, "
f"s3_reader_installed={s3_reader_installed})"
)
return None
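
`retrieve_tmd` mirrors `retrieve_puf`: it tries the S3 location when credentials and an S3 reader are available, falls back to a local `tmd.csv.gz` or `tmd.csv`, and returns `None` otherwise. A short usage sketch; the import path assumes the `cs_config` package layout, and the bucket and credentials are placeholders:

```python
from cs_config.helpers import retrieve_tmd  # assumed import path

tmd_df = retrieve_tmd(
    tmd_s3_file_location="s3://my-bucket/tmd.csv.gz",  # placeholder S3 object
    aws_access_key_id="AKIA...",                       # placeholder credentials
    aws_secret_access_key="***",
)
if tmd_df is None:
    # Nothing found on S3 or locally; fall back to another data source such as the CPS.
    data = "cps"
else:
    data = tmd_df
```
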
2 changes: 1 addition & 1 deletion docs/book/content/api/get_micro_data.rst
@@ -9,4 +9,4 @@ ogusa.get_micro_data
------------------------------------------

.. automodule:: ogusa.get_micro_data
:members: get_calculator, get_data, taxcalc_advance, cap_inc_mtr
:members: get_calculator, get_data, taxcalc_advance, cap_inc_mtr, update_policy, is_paramtools_format
28 changes: 25 additions & 3 deletions ogusa/calibrate.py
@@ -3,6 +3,7 @@
from ogusa import get_micro_data
import os
import numpy as np
from taxcalc import Records
from ogcore import txfunc, demographics
from ogcore.utils import safe_read_pickle, mkdirs
import pkg_resources
@@ -23,6 +24,9 @@ def __init__(
iit_reform={},
guid="",
data="cps",
gfactors=None,
weights=None,
records_start_year=Records.CPSCSV_YEAR,
client=None,
num_workers=1,
demographic_data_path=None,
@@ -33,7 +37,7 @@
parameter values for the OG-USA model.

Args:
p (OGUSA Parameters object): parameters object
p (OG-USA Parameters object): parameters object
estimate_tax_functions (bool): whether to estimate tax functions
estimate_beta (bool): whether to estimate beta
estimate_chi_n (bool): whether to estimate chi_n
@@ -42,7 +46,13 @@
iit_baseline (dict): baseline policy to use
iit_reform (dict): reform tax parameters
guid (str): id for tax function parameters
data (str): data source for microsimulation model
data (str or Pandas DataFrame): path or DataFrame with
data for Tax-Calculator model
gfactors (str or Pandas DataFrame): path or DataFrame with
growth factors for Tax-Calculator model
weights (str or Pandas DataFrame): path or DataFrame with
weights for Tax-Calculator model
records_start_year (int): year micro data begins
client (Dask client object): client
num_workers (int): number of workers for Dask client
output_path (str): path to save output to
Expand All @@ -69,6 +79,9 @@ def __init__(
iit_reform,
guid,
data,
gfactors,
weights,
records_start_year,
client,
num_workers,
run_micro=run_micro,
@@ -143,6 +156,9 @@ def get_tax_function_parameters(
iit_reform={},
guid="",
data="",
gfactors=None,
weights=None,
records_start_year=Records.CPSCSV_YEAR,
client=None,
num_workers=1,
run_micro=False,
@@ -157,7 +173,13 @@
iit_baseline (dict): baseline policy to use
iit_reform (dict): reform tax parameters
guid (string): id for tax function parameters
data (string): data source for microsimulation model
data (str or Pandas DataFrame): path or DataFrame with
data for Tax-Calculator model
gfactors (str or Pandas DataFrame): path or DataFrame with
growth factors for Tax-Calculator model
weights (str or Pandas DataFrame): path or DataFrame with
weights for Tax-Calculator model
records_start_year (int): year micro data begins
client (Dask client object): client
num_workers (int): number of workers for Dask client
run_micro (bool): whether to estimate parameters from
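
To complement the custom-DataFrame sketch after the CHANGELOG entry above, a hedged example of pointing `Calibration` at the TMD inputs through the new keywords; the file paths are placeholders, and exactly how `get_micro_data` resolves them is an assumption:

```python
from taxcalc import Records
from ogcore.parameters import Specifications  # assumed import path
from ogusa.calibrate import Calibration

p = Specifications()
c = Calibration(
    p,
    estimate_tax_functions=True,
    data="tmd.csv",                          # path to the TMD microdata file (placeholder)
    gfactors="tmd_growfactors.csv",          # grow factors accompanying the TMD file (placeholder)
    weights=Records.TMD_WEIGHTS_FILENAME,    # weights file name defined by Tax-Calculator
    records_start_year=Records.TMDCSV_YEAR,  # first year of the TMD data
)
```
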
3 changes: 0 additions & 3 deletions ogusa/constants.py
@@ -12,9 +12,6 @@
DEFAULT_START_YEAR = 2021
# Tax-Calculator start year
TC_LAST_YEAR = taxcalc.Policy.LAST_BUDGET_YEAR
# Years of the PUF and CPS files
PUF_START_YEAR = taxcalc.Records.PUFCSV_YEAR
CPS_START_YEAR = taxcalc.Records.CPSCSV_YEAR

VAR_LABELS = {
"Y": "GDP ($Y_t$)",
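
The removed `PUF_START_YEAR` and `CPS_START_YEAR` constants duplicated attributes that Tax-Calculator already exposes, which is how the rest of this PR references the data start years. A one-line equivalent for code that relied on them:

```python
from taxcalc import Records

PUF_START_YEAR = Records.PUFCSV_YEAR  # replaces ogusa.constants.PUF_START_YEAR
CPS_START_YEAR = Records.CPSCSV_YEAR  # replaces ogusa.constants.CPS_START_YEAR
TMD_START_YEAR = Records.TMDCSV_YEAR  # TMD analogue used elsewhere in this PR
```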