diff --git a/Examples/Pooling/PartialPoolingExample.ipynb b/Examples/Pooling/PartialPoolingExample.ipynb
new file mode 100644
index 0000000..a19a978
--- /dev/null
+++ b/Examples/Pooling/PartialPoolingExample.ipynb
@@ -0,0 +1,875 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from plotnine import *\n",
+ "\n",
+ "from vtreat.partial_pooling_estimator import pooled_effect_estimate, standard_effect_estimate\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "rng = np.random.default_rng(2024)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " location_id | \n",
+ " observation | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " loc_48 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " loc_135 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " loc_18 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " loc_42 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " loc_63 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 395 | \n",
+ " loc_190 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 396 | \n",
+ " loc_133 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 397 | \n",
+ " loc_39 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 398 | \n",
+ " loc_38 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 399 | \n",
+ " loc_38 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
400 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " location_id observation\n",
+ "0 loc_48 1\n",
+ "1 loc_135 1\n",
+ "2 loc_18 1\n",
+ "3 loc_42 1\n",
+ "4 loc_63 1\n",
+ ".. ... ...\n",
+ "395 loc_190 0\n",
+ "396 loc_133 1\n",
+ "397 loc_39 0\n",
+ "398 loc_38 0\n",
+ "399 loc_38 1\n",
+ "\n",
+ "[400 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "d = pd.DataFrame({\n",
+ " 'location_id': rng.choice([\n",
+ " f'loc_{i}' for i in range(200)],\n",
+ " size=400,\n",
+ " replace=True)\n",
+ "})\n",
+ "d['observation'] = rng.binomial(n=1, p=0.5, size=d.shape[0])\n",
+ "d"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " location_id | \n",
+ " mean | \n",
+ " var | \n",
+ " size | \n",
+ " estimate | \n",
+ " grand_mean | \n",
+ " impact | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " loc_0 | \n",
+ " 0.500000 | \n",
+ " 0.500000 | \n",
+ " 2 | \n",
+ " 0.500000 | \n",
+ " 0.5325 | \n",
+ " -0.032500 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " loc_1 | \n",
+ " 0.250000 | \n",
+ " 0.250000 | \n",
+ " 4 | \n",
+ " 0.250000 | \n",
+ " 0.5325 | \n",
+ " -0.282500 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " loc_10 | \n",
+ " 0.666667 | \n",
+ " 0.333333 | \n",
+ " 3 | \n",
+ " 0.666667 | \n",
+ " 0.5325 | \n",
+ " 0.134167 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " loc_101 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 3 | \n",
+ " 0.000000 | \n",
+ " 0.5325 | \n",
+ " -0.532500 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " loc_102 | \n",
+ " 1.000000 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 1.000000 | \n",
+ " 0.5325 | \n",
+ " 0.467500 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 172 | \n",
+ " loc_94 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 0.000000 | \n",
+ " 0.5325 | \n",
+ " -0.532500 | \n",
+ "
\n",
+ " \n",
+ " 173 | \n",
+ " loc_95 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 0.000000 | \n",
+ " 0.5325 | \n",
+ " -0.532500 | \n",
+ "
\n",
+ " \n",
+ " 174 | \n",
+ " loc_96 | \n",
+ " 1.000000 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 1.000000 | \n",
+ " 0.5325 | \n",
+ " 0.467500 | \n",
+ "
\n",
+ " \n",
+ " 175 | \n",
+ " loc_97 | \n",
+ " 0.625000 | \n",
+ " 0.267857 | \n",
+ " 8 | \n",
+ " 0.625000 | \n",
+ " 0.5325 | \n",
+ " 0.092500 | \n",
+ "
\n",
+ " \n",
+ " 176 | \n",
+ " loc_98 | \n",
+ " 0.500000 | \n",
+ " 0.500000 | \n",
+ " 2 | \n",
+ " 0.500000 | \n",
+ " 0.5325 | \n",
+ " -0.032500 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
177 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " location_id mean var size estimate grand_mean impact\n",
+ "0 loc_0 0.500000 0.500000 2 0.500000 0.5325 -0.032500\n",
+ "1 loc_1 0.250000 0.250000 4 0.250000 0.5325 -0.282500\n",
+ "2 loc_10 0.666667 0.333333 3 0.666667 0.5325 0.134167\n",
+ "3 loc_101 0.000000 0.000000 3 0.000000 0.5325 -0.532500\n",
+ "4 loc_102 1.000000 NaN 1 1.000000 0.5325 0.467500\n",
+ ".. ... ... ... ... ... ... ...\n",
+ "172 loc_94 0.000000 NaN 1 0.000000 0.5325 -0.532500\n",
+ "173 loc_95 0.000000 NaN 1 0.000000 0.5325 -0.532500\n",
+ "174 loc_96 1.000000 NaN 1 1.000000 0.5325 0.467500\n",
+ "175 loc_97 0.625000 0.267857 8 0.625000 0.5325 0.092500\n",
+ "176 loc_98 0.500000 0.500000 2 0.500000 0.5325 -0.032500\n",
+ "\n",
+ "[177 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "r = standard_effect_estimate(d)\n",
+ "\n",
+ "r"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " location_id | \n",
+ " mean | \n",
+ " var | \n",
+ " size | \n",
+ " estimate | \n",
+ " grand_mean | \n",
+ " impact | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 3 | \n",
+ " loc_101 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 3 | \n",
+ " 0.0 | \n",
+ " 0.5325 | \n",
+ " -0.5325 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " loc_102 | \n",
+ " 1.0 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 1.0 | \n",
+ " 0.5325 | \n",
+ " 0.4675 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " loc_105 | \n",
+ " 1.0 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 1.0 | \n",
+ " 0.5325 | \n",
+ " 0.4675 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " loc_11 | \n",
+ " 1.0 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 1.0 | \n",
+ " 0.5325 | \n",
+ " 0.4675 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " loc_111 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 2 | \n",
+ " 0.0 | \n",
+ " 0.5325 | \n",
+ " -0.5325 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 164 | \n",
+ " loc_85 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 0.5325 | \n",
+ " -0.5325 | \n",
+ "
\n",
+ " \n",
+ " 170 | \n",
+ " loc_92 | \n",
+ " 1.0 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 1.0 | \n",
+ " 0.5325 | \n",
+ " 0.4675 | \n",
+ "
\n",
+ " \n",
+ " 172 | \n",
+ " loc_94 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 0.5325 | \n",
+ " -0.5325 | \n",
+ "
\n",
+ " \n",
+ " 173 | \n",
+ " loc_95 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 0.5325 | \n",
+ " -0.5325 | \n",
+ "
\n",
+ " \n",
+ " 174 | \n",
+ " loc_96 | \n",
+ " 1.0 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 1.0 | \n",
+ " 0.5325 | \n",
+ " 0.4675 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
99 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " location_id mean var size estimate grand_mean impact\n",
+ "3 loc_101 0.0 0.0 3 0.0 0.5325 -0.5325\n",
+ "4 loc_102 1.0 NaN 1 1.0 0.5325 0.4675\n",
+ "6 loc_105 1.0 NaN 1 1.0 0.5325 0.4675\n",
+ "10 loc_11 1.0 NaN 1 1.0 0.5325 0.4675\n",
+ "12 loc_111 0.0 0.0 2 0.0 0.5325 -0.5325\n",
+ ".. ... ... ... ... ... ... ...\n",
+ "164 loc_85 0.0 NaN 1 0.0 0.5325 -0.5325\n",
+ "170 loc_92 1.0 NaN 1 1.0 0.5325 0.4675\n",
+ "172 loc_94 0.0 NaN 1 0.0 0.5325 -0.5325\n",
+ "173 loc_95 0.0 NaN 1 0.0 0.5325 -0.5325\n",
+ "174 loc_96 1.0 NaN 1 1.0 0.5325 0.4675\n",
+ "\n",
+ "[99 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "r.loc[\n",
+ " (r['impact'] >= np.max(r['impact']))\n",
+ " | (r['impact'] <= np.min(r['impact'])),\n",
+ " :]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": ""
+ },
+ "metadata": {
+ "image/png": {
+ "height": 480,
+ "width": 640
+ }
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "(\n",
+ " ggplot(\n",
+ " data=r,\n",
+ " mapping=aes(x='impact'),\n",
+ " )\n",
+ " + geom_density()\n",
+ " + geom_rug()\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " location_id | \n",
+ " mean | \n",
+ " var | \n",
+ " size | \n",
+ " estimate | \n",
+ " grand_mean | \n",
+ " impact | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " loc_0 | \n",
+ " 0.500000 | \n",
+ " 0.500000 | \n",
+ " 2 | \n",
+ " 0.519000 | \n",
+ " 0.5325 | \n",
+ " -0.013500 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " loc_1 | \n",
+ " 0.250000 | \n",
+ " 0.250000 | \n",
+ " 4 | \n",
+ " 0.333864 | \n",
+ " 0.5325 | \n",
+ " -0.198636 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " loc_10 | \n",
+ " 0.666667 | \n",
+ " 0.333333 | \n",
+ " 3 | \n",
+ " 0.611252 | \n",
+ " 0.5325 | \n",
+ " 0.078752 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " loc_101 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 3 | \n",
+ " 0.065500 | \n",
+ " 0.5325 | \n",
+ " -0.467000 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " loc_102 | \n",
+ " 1.000000 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 0.786364 | \n",
+ " 0.5325 | \n",
+ " 0.253864 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 172 | \n",
+ " loc_94 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 0.243339 | \n",
+ " 0.5325 | \n",
+ " -0.289161 | \n",
+ "
\n",
+ " \n",
+ " 173 | \n",
+ " loc_95 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 0.243339 | \n",
+ " 0.5325 | \n",
+ " -0.289161 | \n",
+ "
\n",
+ " \n",
+ " 174 | \n",
+ " loc_96 | \n",
+ " 1.000000 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 0.786364 | \n",
+ " 0.5325 | \n",
+ " 0.253864 | \n",
+ "
\n",
+ " \n",
+ " 175 | \n",
+ " loc_97 | \n",
+ " 0.625000 | \n",
+ " 0.267857 | \n",
+ " 8 | \n",
+ " 0.608035 | \n",
+ " 0.5325 | \n",
+ " 0.075535 | \n",
+ "
\n",
+ " \n",
+ " 176 | \n",
+ " loc_98 | \n",
+ " 0.500000 | \n",
+ " 0.500000 | \n",
+ " 2 | \n",
+ " 0.519000 | \n",
+ " 0.5325 | \n",
+ " -0.013500 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
177 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " location_id mean var size estimate grand_mean impact\n",
+ "0 loc_0 0.500000 0.500000 2 0.519000 0.5325 -0.013500\n",
+ "1 loc_1 0.250000 0.250000 4 0.333864 0.5325 -0.198636\n",
+ "2 loc_10 0.666667 0.333333 3 0.611252 0.5325 0.078752\n",
+ "3 loc_101 0.000000 0.000000 3 0.065500 0.5325 -0.467000\n",
+ "4 loc_102 1.000000 NaN 1 0.786364 0.5325 0.253864\n",
+ ".. ... ... ... ... ... ... ...\n",
+ "172 loc_94 0.000000 NaN 1 0.243339 0.5325 -0.289161\n",
+ "173 loc_95 0.000000 NaN 1 0.243339 0.5325 -0.289161\n",
+ "174 loc_96 1.000000 NaN 1 0.786364 0.5325 0.253864\n",
+ "175 loc_97 0.625000 0.267857 8 0.608035 0.5325 0.075535\n",
+ "176 loc_98 0.500000 0.500000 2 0.519000 0.5325 -0.013500\n",
+ "\n",
+ "[177 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "r2 = pooled_effect_estimate(d)\n",
+ "\n",
+ "r2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " location_id | \n",
+ " mean | \n",
+ " var | \n",
+ " size | \n",
+ " estimate | \n",
+ " grand_mean | \n",
+ " impact | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 41 | \n",
+ " loc_14 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 5 | \n",
+ " 0.975165 | \n",
+ " 0.5325 | \n",
+ " 0.442665 | \n",
+ "
\n",
+ " \n",
+ " 65 | \n",
+ " loc_165 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5 | \n",
+ " 0.028288 | \n",
+ " 0.5325 | \n",
+ " -0.504212 | \n",
+ "
\n",
+ " \n",
+ " 155 | \n",
+ " loc_74 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 5 | \n",
+ " 0.975165 | \n",
+ " 0.5325 | \n",
+ " 0.442665 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " location_id mean var size estimate grand_mean impact\n",
+ "41 loc_14 1.0 0.0 5 0.975165 0.5325 0.442665\n",
+ "65 loc_165 0.0 0.0 5 0.028288 0.5325 -0.504212\n",
+ "155 loc_74 1.0 0.0 5 0.975165 0.5325 0.442665"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "r2.loc[\n",
+ " (r2['impact'] >= np.max(r2['impact']))\n",
+ " | (r2['impact'] <= np.min(r2['impact'])),\n",
+ " :]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": ""
+ },
+ "metadata": {
+ "image/png": {
+ "height": 480,
+ "width": 640
+ }
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "(\n",
+ " ggplot(\n",
+ " data=r2,\n",
+ " mapping=aes(x='impact'),\n",
+ " )\n",
+ " + geom_density()\n",
+ " + geom_rug()\n",
+ ")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "vtreat_dev_env",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/coverage.txt b/coverage.txt
index fe938bd..e69de29 100644
--- a/coverage.txt
+++ b/coverage.txt
@@ -1,105 +0,0 @@
-============================= test session starts ==============================
-platform darwin -- Python 3.11.5, pytest-7.4.0, pluggy-1.0.0
-rootdir: /Users/johnmount/Documents/work/pyvtreat/pkg
-plugins: anyio-3.5.0, cov-4.1.0
-collected 45 items
-
-pkg/tests/test_KDD2009.py . [ 2%]
-pkg/tests/test_ai200_examples.py .. [ 6%]
-pkg/tests/test_classification.py ... [ 13%]
-pkg/tests/test_col_name_issues.py ... [ 20%]
-pkg/tests/test_cross_plan_issues.py .. [ 24%]
-pkg/tests/test_db_adapter.py ... [ 31%]
-pkg/tests/test_dup.py . [ 33%]
-pkg/tests/test_effect_scaler.py .. [ 37%]
-pkg/tests/test_grouped_stats.py . [ 40%]
-pkg/tests/test_id_column_check.py . [ 42%]
-pkg/tests/test_imputation_controls.py . [ 44%]
-pkg/tests/test_logistic_small.py . [ 46%]
-pkg/tests/test_max_levels.py . [ 48%]
-pkg/tests/test_multinomial.py . [ 51%]
-pkg/tests/test_nan_inf.py . [ 53%]
-pkg/tests/test_outcome_name_required.py . [ 55%]
-pkg/tests/test_pipeparams.py . [ 57%]
-pkg/tests/test_pooled_calc.py .. [ 62%]
-pkg/tests/test_r1_issue.py . [ 64%]
-pkg/tests/test_range.py . [ 66%]
-pkg/tests/test_regression.py . [ 68%]
-pkg/tests/test_result_restriction.py . [ 71%]
-pkg/tests/test_stats.py ..... [ 82%]
-pkg/tests/test_unexepcted_nan.py . [ 84%]
-pkg/tests/test_unsup_perf.py . [ 86%]
-pkg/tests/test_unsupervised.py . [ 88%]
-pkg/tests/test_user_coders.py . [ 91%]
-pkg/tests/test_util.py .. [ 95%]
-pkg/tests/test_xicor.py .. [100%]
-
-=============================== warnings summary ===============================
-../../../opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/lark/utils.py:116
- /Users/johnmount/opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/lark/utils.py:116: DeprecationWarning: module 'sre_parse' is deprecated
- import sre_parse
-
-../../../opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/lark/utils.py:117
- /Users/johnmount/opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/lark/utils.py:117: DeprecationWarning: module 'sre_constants' is deprecated
- import sre_constants
-
-../../../opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/google/rpc/__init__.py:18
- /Users/johnmount/opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/google/rpc/__init__.py:18: DeprecationWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html
- import pkg_resources
-
-../../../opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/pkg_resources/__init__.py:2871
-../../../opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/pkg_resources/__init__.py:2871
-../../../opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/pkg_resources/__init__.py:2871
-../../../opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/pkg_resources/__init__.py:2871
- /Users/johnmount/opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/pkg_resources/__init__.py:2871: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google')`.
- Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
- declare_namespace(pkg)
-
-../../../opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/pkg_resources/__init__.py:2871
-../../../opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/pkg_resources/__init__.py:2871
- /Users/johnmount/opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/pkg_resources/__init__.py:2871: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google.cloud')`.
- Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
- declare_namespace(pkg)
-
-../../../opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/pkg_resources/__init__.py:2350
-../../../opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/pkg_resources/__init__.py:2350
-../../../opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/pkg_resources/__init__.py:2350
- /Users/johnmount/opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/pkg_resources/__init__.py:2350: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google')`.
- Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
- declare_namespace(parent)
-
-../../../opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/pkg_resources/__init__.py:2871
- /Users/johnmount/opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/pkg_resources/__init__.py:2871: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google.logging')`.
- Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
- declare_namespace(pkg)
-
-../../../opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/pkg_resources/__init__.py:2871
- /Users/johnmount/opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/pkg_resources/__init__.py:2871: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('mpl_toolkits')`.
- Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
- declare_namespace(pkg)
-
-../../../opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/google/rpc/__init__.py:20
- /Users/johnmount/opt/anaconda3/envs/vtreat_dev_env/lib/python3.11/site-packages/google/rpc/__init__.py:20: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google.rpc')`.
- Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
- pkg_resources.declare_namespace(__name__)
-
--- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
-
----------- coverage: platform darwin, python 3.11.5-final-0 ----------
-Name Stmts Miss Cover
--------------------------------------------------------------
-pkg/vtreat/__init__.py 6 0 100%
-pkg/vtreat/cross_plan.py 50 1 98%
-pkg/vtreat/effect_scaler.py 59 4 93%
-pkg/vtreat/partial_pooling_estimator.py 34 0 100%
-pkg/vtreat/stats_utils.py 132 0 100%
-pkg/vtreat/test_util.py 84 18 79%
-pkg/vtreat/transform.py 14 0 100%
-pkg/vtreat/util.py 149 8 95%
-pkg/vtreat/vtreat_api.py 285 34 88%
-pkg/vtreat/vtreat_db_adapter.py 69 0 100%
-pkg/vtreat/vtreat_impl.py 711 61 91%
--------------------------------------------------------------
-TOTAL 1593 126 92%
-
-================= 45 passed, 15 warnings in 163.97s (0:02:43) ==================
diff --git a/pkg/build/lib/vtreat/__init__.py b/pkg/build/lib/vtreat/__init__.py
index 3403c9f..8b44a97 100644
--- a/pkg/build/lib/vtreat/__init__.py
+++ b/pkg/build/lib/vtreat/__init__.py
@@ -8,7 +8,14 @@
# noinspection PyUnresolvedReferences
import numpy
-from vtreat.vtreat_api import unsupervised_parameters, vtreat_parameters, BinomialOutcomeTreatment, MultinomialOutcomeTreatment, NumericOutcomeTreatment, UnsupervisedTreatment
+from vtreat.vtreat_api import (
+ unsupervised_parameters,
+ vtreat_parameters,
+ BinomialOutcomeTreatment,
+ MultinomialOutcomeTreatment,
+ NumericOutcomeTreatment,
+ UnsupervisedTreatment,
+)
__docformat__ = "restructuredtext"
diff --git a/pkg/build/lib/vtreat/da_adapter.py b/pkg/build/lib/vtreat/da_adapter.py
new file mode 100644
index 0000000..32c99c6
--- /dev/null
+++ b/pkg/build/lib/vtreat/da_adapter.py
@@ -0,0 +1,296 @@
+"""
+Convert the description of a vtreat variable treatment into a data algebra pipeline.
+"""
+
+from typing import Dict, Iterable, List, Optional, Tuple
+import numpy
+import pandas
+
+from vtreat.vtreat_impl import bad_sentinel, replace_bad_with_sentinel
+
+from data_algebra import pivot_specification, unpivot_specification
+from data_algebra.data_ops import (
+ data,
+ descr,
+ describe_table,
+
+ TableDescription,
+ ViewRepresentation,
+)
+
+
def def_multi_column_map(
    d: ViewRepresentation,
    *,
    mapping_table: ViewRepresentation,
    row_keys: Iterable[str],
    col_name_key: str = "column_name",
    col_value_key: str = "column_value",
    mapped_value_key: str = "mapped_value",
    cols_to_map: Iterable[str],
    coalesce_value=None,
    cols_to_map_back: Optional[Iterable[str]] = None,
) -> ViewRepresentation:
    """
    Map all columns in list cols_to_map through the mapping in mapping table (key by column name and value).
    d should be uniquely keyed by row_keys, and mapping table should be uniquely keyed by [col_name_key, col_value_key].

    With more than one column to map, the columns are unpivoted into
    (col_name_key, col_value_key) records, left-joined against mapping_table, and pivoted
    back from mapped_value_key. With exactly one column, the column is renamed to
    col_value_key, tagged with a constant col_name_key column, joined, and renamed back.

    :param d: view to re-map
    :param mapping_table: view to get mappings from
    :param row_keys: columns that uniquely identify rows in d
    :param col_name_key: column name specifying columns in mapping_table
    :param col_value_key: column name specifying pre-map values in mapping table
    :param mapped_value_key: column name specifying post-map values in mapping table
    :param cols_to_map: columns to re-map.
    :param coalesce_value: if not None, coalesce to this value
    :param cols_to_map_back: if not None new names for resulting columns
    :return: operations specifying how to re-map DataFrame
    """
    # --- argument validation (a bare str is rejected where a list of names is expected) ---
    assert not isinstance(row_keys, str)
    row_keys = list(row_keys)
    assert len(row_keys) > 0
    assert not isinstance(cols_to_map, str)
    cols_to_map = list(cols_to_map)
    assert len(cols_to_map) > 0
    assert isinstance(col_name_key, str)
    assert isinstance(col_value_key, str)
    assert isinstance(mapped_value_key, str)
    if cols_to_map_back is not None:
        assert not isinstance(cols_to_map_back, str)
        cols_to_map_back = list(cols_to_map_back)
        assert len(cols_to_map_back) == len(cols_to_map)
    # column names must be unique before, during, and after the mapping
    pre_col_names = row_keys + cols_to_map
    assert len(pre_col_names) == len(set(pre_col_names))
    mid_col_names = row_keys + [col_name_key, col_value_key, mapped_value_key]
    assert len(mid_col_names) == len(set(mid_col_names))
    if cols_to_map_back is None:
        post_col_names = row_keys + cols_to_map
    else:
        post_col_names = row_keys + cols_to_map_back
    assert len(post_col_names) == len(set(post_col_names))
    # --- record transforms are only needed when several columns are mapped at once ---
    record_map_to = None
    record_map_back = None
    if len(cols_to_map) > 1:
        record_map_to = unpivot_specification(
            row_keys=row_keys,
            col_name_key=col_name_key,
            col_value_key=col_value_key,
            value_cols=cols_to_map,
        )
        record_map_back = pivot_specification(
            row_keys=row_keys,
            col_name_key=col_name_key,
            col_value_key=mapped_value_key,
            value_cols=cols_to_map,
        )
    ops = d.select_columns(row_keys + cols_to_map)
    if record_map_to is not None:
        ops = ops.convert_records(record_map_to)
    else:
        # single column: build the same (col_name_key, col_value_key) layout by hand.
        # The key names come from the parameters (not fixed literals) so the join below
        # finds its columns whatever names the caller chose.
        ops = (
            ops
            .map_columns({cols_to_map[0]: col_value_key})
            .extend({col_name_key: repr(cols_to_map[0])})
        )
    ops = (
        ops
        .natural_join(
            b=mapping_table.select_columns(
                [col_name_key, col_value_key, mapped_value_key]
            ),
            jointype="left",
            on=[col_name_key, col_value_key],
        )
    )
    if coalesce_value is not None:
        # values with no entry in mapping_table come back missing from the left join
        ops = ops.extend(
            {mapped_value_key: f"{mapped_value_key}.coalesce({coalesce_value})"}
        )
    if record_map_back is not None:
        ops = ops.convert_records(record_map_back)
    else:
        # single column: discard the helper key columns and restore the original name
        ops = (
            ops
            .drop_columns([col_value_key, col_name_key])
            .map_columns({mapped_value_key: cols_to_map[0]})
        )
    if cols_to_map_back is not None:
        # could do this in the record mapping, but this seems easier to read
        ops = ops.rename_columns(
            {
                new_name: old_name
                for new_name, old_name in zip(cols_to_map_back, cols_to_map)
            }
        )
    return ops
+
+
+def _check_treatment_table(vtreat_descr: pandas.DataFrame):
+ """
+ Assert if expected invariants don't hold for vtreat_descr.
+
+ :param vtreat_descr: .description_matrix() description of a transform to check.
+ :return: no return, assert on failure
+ """
+
+ # belt and suspenders replace missing with sentinel
+ vtreat_descr = vtreat_descr.copy()
+ vtreat_descr["value"] = replace_bad_with_sentinel(vtreat_descr["value"])
+ # check our expected invariants
+ assert isinstance(vtreat_descr, pandas.DataFrame)
+ # numeric is a function of original variable only
+ check_fn_relnn = (
+ data(vtreat_descr=vtreat_descr)
+ .project({}, group_by=["orig_var", "orig_was_numeric"])
+ .extend({"one": 1})
+ .project({"count": "one.sum()"}, group_by=["orig_var"])
+ ).ex()
+ assert numpy.all(check_fn_relnn["count"] == 1)
+ # variable consumed is function of variable produced and treatment only
+ check_fn_reln2 = (
+ data(vtreat_descr=vtreat_descr)
+ .project({}, group_by=["treatment", "orig_var", "variable"])
+ .extend({"one": 1})
+ .project({"count": "one.sum()"}, group_by=["treatment", "variable"])
+ ).ex()
+ assert numpy.all(check_fn_reln2["count"] == 1)
+ # clean copies don't change variable names
+ cn_rows = vtreat_descr.loc[
+ vtreat_descr["treatment_class"] == "CleanNumericTransform", :
+ ].reset_index(inplace=False, drop=True)
+ assert numpy.all(cn_rows["variable"] == cn_rows["orig_var"])
+ # operations other than clean copy produce new variable names
+ ot_rows = vtreat_descr.loc[
+ vtreat_descr["treatment_class"] != "CleanNumericTransform", :
+ ].reset_index(inplace=False, drop=True)
+ assert len(set(ot_rows["variable"]).intersection(vtreat_descr["orig_var"])) == 0
+ # clean copy and re-mapping take disjoint inputs (one alters input as a prep-step, so they would interfere)
+ mp_rows = (
+ data(vtreat_descr=vtreat_descr)
+ .select_rows("treatment_class == 'MappedCodeTransform'")
+ .project({}, group_by=["orig_var", "variable"])
+ .order_rows(["orig_var", "variable"])
+ ).ex()
+ assert len(set(mp_rows["orig_var"]).intersection(cn_rows["orig_var"])) == 0
+
+
def as_data_algebra_pipeline(
    *,
    source: TableDescription,
    vtreat_descr: pandas.DataFrame,
    treatment_table_name: str,
    row_keys: Iterable[str],
) -> ViewRepresentation:
    """
    Convert the description of a vtreat transform (gotten via .description_matrix())
    into a data algebra pipeline.
    See: https://github.com/WinVector/data_algebra and https://github.com/WinVector/pyvtreat .
    Missing and nan are treated as synonyms for '_NA_'.
    Assembling the entire pipeline can be expensive. If one is willing to instantiate tables
    it can be better to sequence operations instead of composing them.
    Another way to use this methodology would be to port this code as a stored procedure
    in a target database of choice, meaning only the vtreat_descr table would be needed on such systems.

    Treatment groups and the columns within each group are processed in sorted order, so
    the same inputs always yield the same pipeline.

    :param source: input data.
    :param vtreat_descr: .description_matrix() description of transform.
                         Expected invariant: CleanNumericTransform doesn't change variable names,
                         all other operations produce new names.
    :param treatment_table_name: name to use for the vtreat_descr table.
    :param row_keys: list of columns uniquely keying rows
    :return: data algebra pipeline implementing specified vtreat treatment
    """

    assert isinstance(source, TableDescription)
    assert isinstance(vtreat_descr, pandas.DataFrame)
    assert isinstance(treatment_table_name, str)
    assert row_keys is not None
    assert not isinstance(row_keys, str)
    row_keys = list(row_keys)
    assert len(row_keys) > 0
    assert numpy.all([isinstance(v, str) for v in row_keys])

    _check_treatment_table(vtreat_descr)
    # belt and suspenders replace missing with sentinel
    vtreat_descr = vtreat_descr.copy()
    vtreat_descr["value"] = replace_bad_with_sentinel(vtreat_descr["value"])
    # start building up operator pipeline
    ops = source
    step_1_ops = dict()
    # add in is_bad indicators
    im_rows = vtreat_descr.loc[
        vtreat_descr["treatment_class"] == "IndicateMissingTransform", :
    ].reset_index(inplace=False, drop=True)
    for i in range(im_rows.shape[0]):
        if im_rows["orig_was_numeric"][i]:
            step_1_ops[
                im_rows["variable"][i]
            ] = f"{im_rows['orig_var'][i]}.is_bad().where(1.0, 0.0)"
        else:
            step_1_ops[
                im_rows["variable"][i]
            ] = f"({im_rows['orig_var'][i]}.coalesce('{bad_sentinel}') == '{bad_sentinel}').where(1.0, 0.0)"
    # add in general value indicators or dummies, all indicators are non-numeric (string)
    ic_rows = vtreat_descr.loc[
        vtreat_descr["treatment_class"] == "IndicatorCodeTransform", :
    ].reset_index(inplace=False, drop=True)
    for i in range(ic_rows.shape[0]):
        ov = ic_rows["orig_var"].values[i]
        vi = ic_rows["value"].values[i]
        step_1_ops[
            ic_rows["variable"][i]
        ] = f"({ov}.coalesce('{bad_sentinel}') == '{vi}').where(1.0, 0.0)"
    if len(step_1_ops) > 0:
        ops = ops.extend(step_1_ops)
    # mapped columns
    mapping_table = (
        describe_table(vtreat_descr, table_name=treatment_table_name)
        .select_rows('treatment_class == "MappedCodeTransform"')
        .select_columns(["orig_var", "value", "replacement", "treatment"])
    )
    mapping_rows = mapping_table.transform(vtreat_descr)
    if mapping_rows.shape[0] > 0:
        # sorted (rather than raw set order) so the sequence of joins, and therefore the
        # resulting column order, does not vary from run to run
        groups = sorted(set(mapping_rows["treatment"]))
        rows_by_treatment = mapping_rows.groupby("treatment")
        for group_name in groups:
            mg = rows_by_treatment.get_group(group_name)
            if mg.shape[0] > 0:
                cols_to_map = sorted(set(mg["orig_var"]))
                cols_to_map_back = [f"{c}_{group_name}" for c in cols_to_map]
                # each group is mapped starting from the raw source (with missing values
                # replaced by the sentinel) and then joined onto the main pipeline by row key
                ops_g = def_multi_column_map(
                    source.extend(
                        {v: f"{v}.coalesce('{bad_sentinel}')" for v in cols_to_map}
                    ),
                    mapping_table=mapping_table.select_rows(
                        f'treatment == "{group_name}"'
                    ),
                    row_keys=row_keys,
                    cols_to_map=cols_to_map,
                    cols_to_map_back=cols_to_map_back,
                    coalesce_value=0.0,
                    col_name_key="orig_var",
                    col_value_key="value",
                    mapped_value_key="replacement",
                )
                ops = ops.natural_join(
                    b=ops_g,
                    by=row_keys,
                    jointype="left",
                )
    # add in any clean numeric copies, inputs are numeric- so disjoint of categorical processing
    cn_rows = vtreat_descr.loc[
        vtreat_descr["treatment_class"] == "CleanNumericTransform", :
    ].reset_index(inplace=False, drop=True)
    if cn_rows.shape[0] > 0:
        step_3_exprs = dict()
        for i in range(cn_rows.shape[0]):
            step_3_exprs[
                cn_rows["variable"][i]
            ] = f"{cn_rows['orig_var'][i]}.coalesce({cn_rows['replacement'][i]})"
        ops = ops.extend(step_3_exprs)
    # remove any input variables that are not the same name as variables we produced
    # this prevents non-numerics from leaking forward
    to_del = list(set(vtreat_descr["orig_var"]) - set(vtreat_descr["variable"]))
    if len(to_del) > 0:
        to_del.sort()
        ops = ops.drop_columns(to_del)
    return ops
diff --git a/pkg/build/lib/vtreat/effect_scaler.py b/pkg/build/lib/vtreat/effect_scaler.py
index a9e45df..ec52fd8 100644
--- a/pkg/build/lib/vtreat/effect_scaler.py
+++ b/pkg/build/lib/vtreat/effect_scaler.py
@@ -1,4 +1,3 @@
-
import numpy as np
import pandas as pd
@@ -80,14 +79,20 @@ def transform_col(i: int, *, xi: np.ndarray) -> np.ndarray:
if isinstance(X, pd.DataFrame):
if self._colnames is not None:
assert list(X.columns) == self._colnames
- return pd.DataFrame({
- c: transform_col(i, xi=np.array(X.loc[:, c], float)) for i, c in zip(range(X.shape[1]), X.columns)
- })
+ return pd.DataFrame(
+ {
+ c: transform_col(i, xi=np.array(X.loc[:, c], float))
+ for i, c in zip(range(X.shape[1]), X.columns)
+ }
+ )
else:
- return pd.DataFrame({
- i: transform_col(i, xi=np.array(X[:, i], float)) for i in range(X.shape[1])
- })
-
+ return pd.DataFrame(
+ {
+ i: transform_col(i, xi=np.array(X[:, i], float))
+ for i in range(X.shape[1])
+ }
+ )
+
# noinspection PyPep8Naming
def predict(self, X) -> pd.DataFrame:
return self.transform(X)
diff --git a/pkg/build/lib/vtreat/partial_pooling_estimator.py b/pkg/build/lib/vtreat/partial_pooling_estimator.py
index b807cfd..61fb474 100644
--- a/pkg/build/lib/vtreat/partial_pooling_estimator.py
+++ b/pkg/build/lib/vtreat/partial_pooling_estimator.py
@@ -1,10 +1,7 @@
-
-
import numpy as np
import pandas as pd
-
# define the standard estimator
def standard_effect_estimate(observations: pd.DataFrame) -> pd.DataFrame:
"""
@@ -17,13 +14,15 @@ def standard_effect_estimate(observations: pd.DataFrame) -> pd.DataFrame:
assert isinstance(observations, pd.DataFrame)
means = (
observations.loc[:, ["location_id", "observation"]]
- .reset_index(drop=True, inplace=False)
- .groupby(["location_id"])
- .mean()
- .reset_index(drop=False, inplace=False)
+ .reset_index(drop=True, inplace=False)
+ .groupby(["location_id"])
+ .agg(['mean', 'var', 'size'])
+ .reset_index(drop=False, inplace=False)
)
+ cols = [' '.join(col).strip() for col in means.columns]
+ means.columns = [c.removeprefix('observation ') for c in cols]
means.sort_values(["location_id"], inplace=True, ignore_index=True)
- means.rename(columns={"observation": "estimate"}, inplace=True)
+ means['estimate'] = means['mean']
means["grand_mean"] = np.mean(observations["observation"])
means["impact"] = means["estimate"] - means["grand_mean"]
means.sort_values(["location_id"], inplace=True, ignore_index=True)
@@ -40,69 +39,38 @@ def pooled_effect_estimate(observations: pd.DataFrame) -> pd.DataFrame:
:return: pooled estimates
"""
assert isinstance(observations, pd.DataFrame)
- observations = observations.loc[:, ["location_id", "observation"]].reset_index(inplace=False, drop=True)
+ observations = observations.loc[:, ["location_id", "observation"]].reset_index(
+ inplace=False, drop=True
+ )
# get the standard estimates
estimated_centers = standard_effect_estimate(observations=observations)
if estimated_centers.shape[0] <= 1:
# no pooling possible
return estimated_centers
# get counts per group
- obs_count_frame = (
- pd.DataFrame({"location_id": observations["location_id"], "count": 1})
- .groupby(["location_id"])
- .sum()
- .reset_index(drop=False, inplace=False)
- .sort_values(["location_id"], inplace=False, ignore_index=True)
- )
- n_j = obs_count_frame["count"].values
- # get the observed variance for each item at for each location
- combined = (
- observations
- .merge(
- estimated_centers,
- on=["location_id"],
- how="left",
- )
- .merge(
- obs_count_frame,
- on=["location_id"],
- how="left",
- )
- )
- combined.sort_values(["location_id"], inplace=True, ignore_index=True)
- per_location_observation_var = (
- np.sum((combined["observation"] - combined["estimate"])**2)
- / (combined.shape[0] - len(set(combined["location_id"])))
- )
+ n_j = estimated_centers["size"]
+ per_location_observation_var = estimated_centers['var'].copy()
+ # inflate a bit
+ per_location_observation_var[pd.isnull(per_location_observation_var)] = 0
+ per_location_observation_var = (n_j * per_location_observation_var + np.var(observations['observation'])) / (n_j + 1)
# get the observed variance between locations
between_location_var = np.var(estimated_centers["estimate"], ddof=1)
# get v, the pooling coefficient
if between_location_var <= 0:
- v = 0
- elif per_location_observation_var <= 0:
- v = 1
+ v = 0 * per_location_observation_var
else:
# as between_location_var > 0 and per_location_observation_var > 0 here
# v will be in the range 0 to 1
v = 1 / (1 + per_location_observation_var / (n_j * between_location_var))
- # our estimate of the overall shared effect
- # note we are using the mixing proportions suggested by the variance reduction ideas
- # simpler weightings include:
- # combined["obs_weight"] = 1 # weights all observations equally
- # combined["obs_weight"] = 1 / combined["count"] # weights all locations equally
- # below, weights larger observations groups more, but with a diminishing return
- # this is an ad-hoc heuristic to try to reduce square error when the number of
- # observations per location has a lot of variation
- combined["obs_weight"] = 1
- if (between_location_var > 0) and (per_location_observation_var > 0):
- combined["obs_weight"] = 1 / (1 + per_location_observation_var / (combined["count"] * between_location_var))
# this quantity can be improved using knowledge of the variances
- grand_mean = np.sum(combined["observation"] * combined["obs_weight"]) / np.sum(combined["obs_weight"])
+ grand_mean = estimated_centers['grand_mean']
# build the pooled estimate
pooled_estimate = v * estimated_centers["estimate"] + (1 - v) * grand_mean
- return pd.DataFrame({
- "location_id": estimated_centers["location_id"],
- "estimate": pooled_estimate,
- "grand_mean": grand_mean,
- "impact": pooled_estimate - grand_mean,
- })
+ return pd.DataFrame(
+ {
+ "location_id": estimated_centers["location_id"],
+ "estimate": pooled_estimate,
+ "grand_mean": grand_mean,
+ "impact": pooled_estimate - grand_mean,
+ }
+ )
diff --git a/pkg/build/lib/vtreat/stats_utils.py b/pkg/build/lib/vtreat/stats_utils.py
index 1559b3f..839e164 100644
--- a/pkg/build/lib/vtreat/stats_utils.py
+++ b/pkg/build/lib/vtreat/stats_utils.py
@@ -182,7 +182,7 @@ def xicor(xvec, yvec, *, n_reps: int = 5) -> Tuple[float, float]:
PI[perm] = PI_inv # invert permutation, self assignment fails
ord = numpy.argsort(PI)
fr = fr_orig[ord]
- A1 = numpy.sum(numpy.abs(fr[0:(n - 1)] - fr[1:n])) / (2 * n)
+ A1 = numpy.sum(numpy.abs(fr[0 : (n - 1)] - fr[1:n])) / (2 * n)
xi = 1 - A1 / CU
xi_s[rep_i] = xi
return numpy.mean(xi_s), numpy.std(xi_s) / numpy.sqrt(n_reps)
@@ -207,21 +207,23 @@ def xicor_for_frame(d: pandas.DataFrame, y, *, n_reps=5):
assert len(y) == n
assert isinstance(n_reps, int)
assert n_reps > 0
- res = pandas.DataFrame({
- 'variable': d.columns,
- 'xicor': 0.0,
- 'xicor_se': 0.0,
- 'xicor_perm_mean': 0.0,
- 'xicor_perm_stddev': 0.0,
- 'xicor_perm_sum': 0.0,
- 'xicor_perm_sum_sq': 0.0,
- })
+ res = pandas.DataFrame(
+ {
+ "variable": d.columns,
+ "xicor": 0.0,
+ "xicor_se": 0.0,
+ "xicor_perm_mean": 0.0,
+ "xicor_perm_stddev": 0.0,
+ "xicor_perm_sum": 0.0,
+ "xicor_perm_sum_sq": 0.0,
+ }
+ )
# get the xicor estimates
for col_i in range(len(d.columns)):
xvec = d[d.columns[col_i]]
xi_est, xi_est_dev = xicor(xvec, y, n_reps=n_reps)
- res.loc[col_i, 'xicor'] = xi_est
- res.loc[col_i, 'xicor_se'] = xi_est_dev
+ res.loc[col_i, "xicor"] = xi_est
+ res.loc[col_i, "xicor_se"] = xi_est_dev
# score all x-columns with the same y-permutation
# estimate stddev with expanding squares to cut down storage
for rep_j in range(n_reps):
@@ -229,11 +231,17 @@ def xicor_for_frame(d: pandas.DataFrame, y, *, n_reps=5):
for col_i in range(len(d.columns)):
xvec = d[d.columns[col_i]]
xi_perm, _ = xicor(xvec, y_perm, n_reps=1)
- res.loc[col_i, 'xicor_perm_sum'] = res.loc[col_i, 'xicor_perm_sum'] + xi_perm
- res.loc[col_i, 'xicor_perm_sum_sq'] = res.loc[col_i, 'xicor_perm_sum_sq'] + xi_perm * xi_perm
- res['xicor_perm_mean'] = res['xicor_perm_sum'] / n_reps
- res['xicor_perm_stddev'] = numpy.sqrt((1 / (n_reps - 1)) * (
- res['xicor_perm_sum_sq'] - (1 / n_reps) * res['xicor_perm_sum']**2))
- del res['xicor_perm_sum']
- del res['xicor_perm_sum_sq']
+ res.loc[col_i, "xicor_perm_sum"] = (
+ res.loc[col_i, "xicor_perm_sum"] + xi_perm
+ )
+ res.loc[col_i, "xicor_perm_sum_sq"] = (
+ res.loc[col_i, "xicor_perm_sum_sq"] + xi_perm * xi_perm
+ )
+ res["xicor_perm_mean"] = res["xicor_perm_sum"] / n_reps
+ res["xicor_perm_stddev"] = numpy.sqrt(
+ (1 / (n_reps - 1))
+ * (res["xicor_perm_sum_sq"] - (1 / n_reps) * res["xicor_perm_sum"] ** 2)
+ )
+ del res["xicor_perm_sum"]
+ del res["xicor_perm_sum_sq"]
return res
diff --git a/pkg/build/lib/vtreat/test_util.py b/pkg/build/lib/vtreat/test_util.py
index 341a34e..41699b9 100644
--- a/pkg/build/lib/vtreat/test_util.py
+++ b/pkg/build/lib/vtreat/test_util.py
@@ -6,8 +6,6 @@
import pandas as pd
-
-
def equivalent_frames(
a: pd.DataFrame,
b: pd.DataFrame,
@@ -93,9 +91,7 @@ def equivalent_frames(
if np.any(ca_inf != cb_inf):
return False
if np.any(ca_inf):
- if np.any(
- np.sign(ca_n[ca_inf]) != np.sign(cb_n[cb_inf])
- ):
+ if np.any(np.sign(ca_n[ca_inf]) != np.sign(cb_n[cb_inf])):
return False
if np.any(np.logical_not(ca_inf)):
ca_f = ca_n[np.logical_not(ca_inf)]
diff --git a/pkg/build/lib/vtreat/util.py b/pkg/build/lib/vtreat/util.py
index cf7dd0e..8412961 100644
--- a/pkg/build/lib/vtreat/util.py
+++ b/pkg/build/lib/vtreat/util.py
@@ -38,7 +38,7 @@ def can_convert_v_to_numeric(x) -> bool:
not_bad = numpy.logical_not(pandas.isnull(x))
n_not_bad = numpy.sum(not_bad)
if n_not_bad < 1:
- return True # All nan/None, can convert to numeric
+ return True # All nan/None, can convert to numeric
try:
numpy.asarray(
x[not_bad] + 0, dtype=float
@@ -152,21 +152,24 @@ def pooled_impact_estimate(x, y):
n = len(x)
assert n > 0
assert n == len(y)
- obs = pandas.DataFrame({
- "location_id": x,
- "observation": safe_to_numeric_array(y),
- })
+ obs = pandas.DataFrame(
+ {
+ "location_id": x,
+ "observation": safe_to_numeric_array(y),
+ }
+ )
res = pooled_effect_estimate(obs)
res.rename(columns={"location_id": "x"}, inplace=True)
return res
def score_variables(
- cross_frame: pandas.DataFrame,
- *,
- variables: Optional[Iterable[str]],
- outcome,
- is_classification: bool = False) -> Optional[pandas.DataFrame]:
+ cross_frame: pandas.DataFrame,
+ *,
+ variables: Optional[Iterable[str]],
+ outcome,
+ is_classification: bool = False
+) -> Optional[pandas.DataFrame]:
"""
Score the linear relation of variables to outcome.
@@ -200,7 +203,7 @@ def f(v):
and (numpy.max(outcome) > numpy.min(outcome))
):
cor, sig = vtreat.stats_utils.our_corr_score(y_true=outcome, y_pred=col)
- r2 = cor ** 2
+ r2 = cor**2
if is_classification:
r2, sig = vtreat.stats_utils.our_pseudo_R2(y_true=outcome, y_pred=col)
sfi = pandas.DataFrame(
@@ -283,7 +286,7 @@ def clean_string(s: str) -> str:
}
s = s.strip()
s = re.sub(r"\s+", " ", s)
- for (k, v) in mp.items():
+ for k, v in mp.items():
s = s.replace(k, v)
return s
diff --git a/pkg/build/lib/vtreat/vtreat_api.py b/pkg/build/lib/vtreat/vtreat_api.py
index 914ac94..6d18056 100644
--- a/pkg/build/lib/vtreat/vtreat_api.py
+++ b/pkg/build/lib/vtreat/vtreat_api.py
@@ -108,11 +108,11 @@ def __init__(
):
"""
- :param var_list: list or tuple of column names, if empty all non outcome and copy columns are used
- :param outcome_name: name of column containing dependent variable
- :param cols_to_copy: list or tuple of column names
- :param params: vtreat.vtreat_parameters()
- :param imputation_map: map of column names to custom missing imputation values or functions
+ :param var_list: list or tuple of column names, if empty all non outcome and copy columns are used
+ :param outcome_name: name of column containing dependent variable
+ :param cols_to_copy: list or tuple of column names
+ :param params: vtreat.vtreat_parameters()
+ :param imputation_map: map of column names to custom missing imputation values or functions
"""
params = self.merge_params(params)
vtreat_impl.VariableTreatment.__init__(
@@ -304,12 +304,12 @@ def __init__(
):
"""
- :param var_list: list or tuple of column names, if empty all non outcome and copy columns are used
- :param outcome_name: name of column containing dependent variable
- :param outcome_target: value of outcome to consider "positive"
- :param cols_to_copy: list or tuple of column names
- :param params: vtreat.vtreat_parameters()
- :param imputation_map: map of column names to custom missing imputation values or functions
+ :param var_list: list or tuple of column names, if empty all non outcome and copy columns are used
+ :param outcome_name: name of column containing dependent variable
+ :param outcome_target: value of outcome to consider "positive"
+ :param cols_to_copy: list or tuple of column names
+ :param params: vtreat.vtreat_parameters()
+ :param imputation_map: map of column names to custom missing imputation values or functions
"""
params = self.merge_params(params)
vtreat_impl.VariableTreatment.__init__(
@@ -502,11 +502,11 @@ def __init__(
):
"""
- :param var_list: list or tuple of column names, if empty all non outcome and copy columns are used
- :param outcome_name: name of column containing dependent variable
- :param cols_to_copy: list or tuple of column names
- :param params: vtreat.vtreat_parameters()
- :param imputation_map: map of column names to custom missing imputation values or functions
+ :param var_list: list or tuple of column names, if empty all non outcome and copy columns are used
+ :param outcome_name: name of column containing dependent variable
+ :param cols_to_copy: list or tuple of column names
+ :param params: vtreat.vtreat_parameters()
+ :param imputation_map: map of column names to custom missing imputation values or functions
"""
params = self.merge_params(params)
diff --git a/pkg/build/lib/vtreat/vtreat_db_adapter.py b/pkg/build/lib/vtreat/vtreat_db_adapter.py
index d222ebc..c11effd 100644
--- a/pkg/build/lib/vtreat/vtreat_db_adapter.py
+++ b/pkg/build/lib/vtreat/vtreat_db_adapter.py
@@ -1,178 +1 @@
-"""
-Convert the description of a vtreat variable treatment into a data algebra pipeline.
-"""
-
-from typing import Dict, Iterable, List, Optional, Tuple
-import numpy
-import pandas
-
-from vtreat.vtreat_impl import bad_sentinel, replace_bad_with_sentinel
-
-from data_algebra.data_ops import data, descr, describe_table, TableDescription, ViewRepresentation
-from data_algebra.solutions import def_multi_column_map
-
-
-def _check_treatment_table(vtreat_descr: pandas.DataFrame):
- """
- Assert if expected invariants don't hold for vtreat_descr.
-
- :param vtreat_descr: .description_matrix() description of a transform to check.
- :return: no return, assert on failure
- """
-
- # belt and suspenders replace missing with sentinel
- vtreat_descr = vtreat_descr.copy()
- vtreat_descr["value"] = replace_bad_with_sentinel(vtreat_descr["value"])
- # check our expected invariants
- assert isinstance(vtreat_descr, pandas.DataFrame)
- # numeric is a function of original variable only
- check_fn_relnn = (
- data(vtreat_descr=vtreat_descr)
- .project({}, group_by=["orig_var", "orig_was_numeric"])
- .extend({"one": 1})
- .project({"count": "one.sum()"}, group_by=["orig_var"])
- ).ex()
- assert numpy.all(check_fn_relnn["count"] == 1)
- # variable consumed is function of variable produced and treatment only
- check_fn_reln2 = (
- data(vtreat_descr=vtreat_descr)
- .project({}, group_by=["treatment", "orig_var", "variable"])
- .extend({"one": 1})
- .project({"count": "one.sum()"}, group_by=["treatment", "variable"])
- ).ex()
- assert numpy.all(check_fn_reln2["count"] == 1)
- # clean copies don't change variable names
- cn_rows = vtreat_descr.loc[
- vtreat_descr["treatment_class"] == "CleanNumericTransform", :
- ].reset_index(inplace=False, drop=True)
- assert numpy.all(cn_rows["variable"] == cn_rows["orig_var"])
- # operations other than clean copy produce new variable names
- ot_rows = vtreat_descr.loc[
- vtreat_descr["treatment_class"] != "CleanNumericTransform", :
- ].reset_index(inplace=False, drop=True)
- assert len(set(ot_rows["variable"]).intersection(vtreat_descr["orig_var"])) == 0
- # clean copy and re-mapping take disjoint inputs (one alters input as a prep-step, so they would interfere)
- mp_rows = (
- data(vtreat_descr=vtreat_descr)
- .select_rows("treatment_class == 'MappedCodeTransform'")
- .project({}, group_by=["orig_var", "variable"])
- .order_rows(["orig_var", "variable"])
- ).ex()
- assert len(set(mp_rows["orig_var"]).intersection(cn_rows["orig_var"])) == 0
-
-
-def as_data_algebra_pipeline(
- *,
- source: TableDescription,
- vtreat_descr: pandas.DataFrame,
- treatment_table_name: str,
- row_keys: Iterable[str],
-) -> ViewRepresentation:
- """
- Convert the description of a vtreat transform (gotten via .description_matrix())
- into a data algebra pipeline.
- See: https://github.com/WinVector/data_algebra and https://github.com/WinVector/pyvtreat .
- Missing and nan are treated as synonyms for '_NA_'.
- Assembling the entire pipeline can be expensive. If one is willing to instantiate tables
- it can be better to sequence operations instead of composing them.
- Another way to use this methodology would be to port this code as a stored procedure
- in a target database of choice, meaning only the vtreat_descr table would be needed on such systems.
-
- :param source: input data.
- :param vtreat_descr: .description_matrix() description of transform.
- Expected invariant: CleanNumericTransform doesn't change variable names,
- all other operations produce new names.
- :param treatment_table_name: name to use for the vtreat_descr table.
- :param row_keys: list of columns uniquely keying rows
- :return: data algebra pipeline implementing specified vtreat treatment
- """
-
- assert isinstance(source, TableDescription)
- assert isinstance(vtreat_descr, pandas.DataFrame)
- assert isinstance(treatment_table_name, str)
- assert row_keys is not None
- assert not isinstance(row_keys, str)
- row_keys = list(row_keys)
- assert len(row_keys) > 0
- assert numpy.all([isinstance(v, str) for v in row_keys])
-
- _check_treatment_table(vtreat_descr)
- # belt and suspenders replace missing with sentinel
- vtreat_descr = vtreat_descr.copy()
- vtreat_descr["value"] = replace_bad_with_sentinel(vtreat_descr["value"])
- # start building up operator pipeline
- ops = source
- step_1_ops = dict()
- # add in is_bad indicators
- im_rows = vtreat_descr.loc[
- vtreat_descr["treatment_class"] == "IndicateMissingTransform", :
- ].reset_index(inplace=False, drop=True)
- for i in range(im_rows.shape[0]):
- if im_rows['orig_was_numeric'][i]:
- step_1_ops[
- im_rows["variable"][i]
- ] = f"{im_rows['orig_var'][i]}.is_bad().where(1.0, 0.0)"
- else:
- step_1_ops[
- im_rows["variable"][i]
- ] = f"({im_rows['orig_var'][i]}.coalesce('{bad_sentinel}') == '{bad_sentinel}').where(1.0, 0.0)"
- # add in general value indicators or dummies, all indicators are non-numeric (string)
- ic_rows = vtreat_descr.loc[
- vtreat_descr["treatment_class"] == "IndicatorCodeTransform", :
- ].reset_index(inplace=False, drop=True)
- for i in range(ic_rows.shape[0]):
- ov = ic_rows["orig_var"].values[i]
- vi = ic_rows["value"].values[i]
- step_1_ops[
- ic_rows["variable"][i]
- ] = f"({ov}.coalesce('{bad_sentinel}') == '{vi}').where(1.0, 0.0)"
- if len(step_1_ops) > 0:
- ops = ops.extend(step_1_ops)
- # mapped columns
- mapping_table = (
- describe_table(vtreat_descr, table_name=treatment_table_name)
- .select_rows('treatment_class == "MappedCodeTransform"')
- .select_columns(['orig_var', 'value', 'replacement', 'treatment']))
- mapping_rows = mapping_table.transform(vtreat_descr)
- if mapping_rows.shape[0] > 0:
- groups = list(set(mapping_rows['treatment']))
- mapping_rows = mapping_rows.groupby('treatment')
- for group_name in groups:
- mg = mapping_rows.get_group(group_name)
- if mg.shape[0] > 0:
- cols_to_map = list(set(mg['orig_var']))
- cols_to_map_back = [f'{c}_{group_name}' for c in cols_to_map]
- ops_g = def_multi_column_map(
- source.extend({v: f"{v}.coalesce('{bad_sentinel}')" for v in cols_to_map}),
- mapping_table=mapping_table.select_rows(f'treatment == "{group_name}"'),
- row_keys=row_keys,
- cols_to_map=cols_to_map,
- cols_to_map_back=cols_to_map_back,
- coalesce_value=0.0,
- col_name_key='orig_var',
- col_value_key='value',
- mapped_value_key='replacement',
- )
- ops = ops.natural_join(
- b=ops_g,
- by=row_keys,
- jointype='left',
- )
- # add in any clean numeric copies, inputs are numeric- so disjoint of categorical processing
- cn_rows = vtreat_descr.loc[
- vtreat_descr["treatment_class"] == "CleanNumericTransform", :
- ].reset_index(inplace=False, drop=True)
- if cn_rows.shape[0] > 0:
- step_3_exprs = dict()
- for i in range(cn_rows.shape[0]):
- step_3_exprs[
- cn_rows["variable"][i]
- ] = f"{cn_rows['orig_var'][i]}.coalesce({cn_rows['replacement'][i]})"
- ops = ops.extend(step_3_exprs)
- # remove any input variables that are not the same name as variables we produced
- # this prevents non-numerics from leaking forward
- to_del = list(set(vtreat_descr["orig_var"]) - set(vtreat_descr["variable"]))
- if len(to_del) > 0:
- to_del.sort()
- ops = ops.drop_columns(to_del)
- return ops
+from vtreat.da_adapter import as_data_algebra_pipeline
diff --git a/pkg/build/lib/vtreat/vtreat_impl.py b/pkg/build/lib/vtreat/vtreat_impl.py
index b9ee41e..64da862 100644
--- a/pkg/build/lib/vtreat/vtreat_impl.py
+++ b/pkg/build/lib/vtreat/vtreat_impl.py
@@ -146,13 +146,14 @@ class TreatmentPlan:
xforms: Tuple[VarTransform, ...]
def __init__(
- self,
- *,
- outcome_name: Optional[str] = None,
- cols_to_copy: Optional[Iterable[str]] = None,
- num_list: Optional[Iterable[str]] = None,
- cat_list: Optional[Iterable[str]] = None,
- xforms: Iterable[Optional[VarTransform]]):
+ self,
+ *,
+ outcome_name: Optional[str] = None,
+ cols_to_copy: Optional[Iterable[str]] = None,
+ num_list: Optional[Iterable[str]] = None,
+ cat_list: Optional[Iterable[str]] = None,
+ xforms: Iterable[Optional[VarTransform]],
+ ):
self.outcome_name = outcome_name
if cols_to_copy is None:
self.cols_to_copy = tuple()
@@ -296,10 +297,7 @@ def __init__(
class CleanNumericTransform(VarTransform):
"""Class for numeric column cleaner."""
- def __init__(self,
- *,
- incoming_column_name: str,
- replacement_value: float):
+ def __init__(self, *, incoming_column_name: str, replacement_value: float):
"""
:param incoming_column_name:
@@ -354,11 +352,13 @@ def description_matrix(self) -> pandas.DataFrame:
class IndicateMissingTransform(VarTransform):
"""Class for missing value indicator."""
- def __init__(self,
- *,
- incoming_column_name: str,
- incoming_column_is_numeric: bool,
- derived_column_name: str):
+ def __init__(
+ self,
+ *,
+ incoming_column_name: str,
+ incoming_column_is_numeric: bool,
+ derived_column_name: str,
+ ):
"""
:param incoming_column_name:
@@ -568,15 +568,15 @@ def fit_binomial_impact_code(
eps = 1.0e-3
if params["use_hierarchical_estimate"]:
cf = vtreat.util.pooled_impact_estimate(x, y)
- cf["_logit_code"] = (
- numpy.log((numpy.maximum(cf["estimate"], 0.0) + eps)
- / (numpy.maximum(cf["grand_mean"], 0.0) + eps))
+ cf["_logit_code"] = numpy.log(
+ (numpy.maximum(cf["estimate"], 0.0) + eps)
+ / (numpy.maximum(cf["grand_mean"], 0.0) + eps)
)
else:
cf = vtreat.util.grouped_by_x_statistics(x, y)
- cf["_logit_code"] = (
- numpy.log((numpy.maximum(cf["_group_mean"], 0.0) + eps)
- / (numpy.maximum(cf["_gm"], 0.0) + eps))
+ cf["_logit_code"] = numpy.log(
+ (numpy.maximum(cf["_group_mean"], 0.0) + eps)
+ / (numpy.maximum(cf["_gm"], 0.0) + eps)
)
if cf.shape[0] <= 1:
return None
@@ -715,7 +715,9 @@ def fit_indicator_code(
return None
return IndicatorCodeTransform(
incoming_column_name=incoming_column_name,
- derived_column_names=vtreat.util.build_level_codes(incoming_column_name, levels),
+ derived_column_names=vtreat.util.build_level_codes(
+ incoming_column_name, levels
+ ),
levels=levels,
sparse_indicators=sparse_indicators,
)
@@ -748,7 +750,7 @@ def fit_prevalence_code(incoming_column_name: str, x) -> Optional[VarTransform]:
incoming_column_name=incoming_column_name,
derived_column_name=newcol,
treatment="prevalence_code",
- code_book=sf
+ code_book=sf,
)
@@ -841,7 +843,7 @@ def fit_numeric_outcome_treatment(
IndicateMissingTransform(
incoming_column_name=vi,
incoming_column_is_numeric=vi in num_set,
- derived_column_name=vi + "_is_bad"
+ derived_column_name=vi + "_is_bad",
)
)
if "clean_copy" in params["coders"]:
@@ -934,7 +936,7 @@ def fit_binomial_outcome_treatment(
IndicateMissingTransform(
incoming_column_name=vi,
incoming_column_is_numeric=vi in num_set,
- derived_column_name=vi + "_is_bad"
+ derived_column_name=vi + "_is_bad",
)
)
if "clean_copy" in params["coders"]:
@@ -1020,7 +1022,7 @@ def fit_multinomial_outcome_treatment(
IndicateMissingTransform(
incoming_column_name=vi,
incoming_column_is_numeric=vi in num_set,
- derived_column_name=vi + "_is_bad"
+ derived_column_name=vi + "_is_bad",
)
)
if "clean_copy" in params["coders"]:
@@ -1106,7 +1108,7 @@ def fit_unsupervised_treatment(
IndicateMissingTransform(
incoming_column_name=vi,
incoming_column_is_numeric=vi in num_set,
- derived_column_name=vi + "_is_bad"
+ derived_column_name=vi + "_is_bad",
)
)
if "clean_copy" in params["coders"]:
@@ -1502,7 +1504,9 @@ def describe_ut(ut):
return score_frame
-class VariableTreatment(abc.ABC, sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
+class VariableTreatment(
+ abc.ABC, sklearn.base.BaseEstimator, sklearn.base.TransformerMixin
+):
"""
Class for variable treatments, implements much of the sklearn pipeline/transformer
API. https://sklearn-template.readthedocs.io/en/latest/user_guide.html#transformer
@@ -1687,7 +1691,7 @@ def set_params(self, **params):
:return: self (for method chaining)
"""
- for (k, v) in params.items():
+ for k, v in params.items():
if k in self.params_["tunable_params"]:
self.params_[k] = v
return self
diff --git a/pkg/dist/vtreat-1.3.1-py3-none-any.whl b/pkg/dist/vtreat-1.3.1-py3-none-any.whl
index 029cb97..70ae09c 100644
Binary files a/pkg/dist/vtreat-1.3.1-py3-none-any.whl and b/pkg/dist/vtreat-1.3.1-py3-none-any.whl differ
diff --git a/pkg/dist/vtreat-1.3.1.tar.gz b/pkg/dist/vtreat-1.3.1.tar.gz
index d990b74..76cb33a 100644
Binary files a/pkg/dist/vtreat-1.3.1.tar.gz and b/pkg/dist/vtreat-1.3.1.tar.gz differ
diff --git a/pkg/vtreat/partial_pooling_estimator.py b/pkg/vtreat/partial_pooling_estimator.py
index 61fb474..84dc51d 100644
--- a/pkg/vtreat/partial_pooling_estimator.py
+++ b/pkg/vtreat/partial_pooling_estimator.py
@@ -66,11 +66,6 @@ def pooled_effect_estimate(observations: pd.DataFrame) -> pd.DataFrame:
grand_mean = estimated_centers['grand_mean']
# build the pooled estimate
pooled_estimate = v * estimated_centers["estimate"] + (1 - v) * grand_mean
- return pd.DataFrame(
- {
- "location_id": estimated_centers["location_id"],
- "estimate": pooled_estimate,
- "grand_mean": grand_mean,
- "impact": pooled_estimate - grand_mean,
- }
- )
+ estimated_centers["estimate"] = pooled_estimate
+ estimated_centers['impact'] = pooled_estimate - grand_mean
+ return estimated_centers
diff --git a/vtreat_dev_env_package_list.txt b/vtreat_dev_env_package_list.txt
index 3284aed..f1c15e3 100644
--- a/vtreat_dev_env_package_list.txt
+++ b/vtreat_dev_env_package_list.txt
@@ -1,261 +1,270 @@
# This file may be used to create an environment using:
# $ conda create --name --file
# platform: osx-64
-abseil-cpp=20211102.0=he9d5cce_0
-aiofiles=22.1.0=py311hecd8cb5_0
-aiosqlite=0.18.0=py311hecd8cb5_0
-anyio=3.5.0=py311hecd8cb5_0
+_py-xgboost-mutex=2.0=cpu_0
+abseil-cpp=20230802.0=h61975a4_2
+anyio=4.2.0=py311hecd8cb5_0
appnope=0.1.2=py311hecd8cb5_1001
argon2-cffi=21.3.0=pyhd3eb1b0_0
argon2-cffi-bindings=21.2.0=py311h6c40b1e_0
-arrow-cpp=11.0.0=h89a8245_2
-astroid=2.14.2=py311hecd8cb5_0
+arrow-cpp=14.0.2=h3ade35f_1
+astroid=3.2.2=py311hecd8cb5_0
asttokens=2.0.5=pyhd3eb1b0_0
+async-lru=2.0.4=py311hecd8cb5_0
atomicwrites=1.4.0=py_0
attrs=23.1.0=py311hecd8cb5_0
-aws-c-common=0.6.8=h6c40b1e_1
-aws-c-event-stream=0.1.6=hcec6c5f_6
-aws-checksums=0.1.11=h6c40b1e_2
-aws-sdk-cpp=1.8.185=h1a8d504_1
+aws-c-auth=0.6.19=h6c40b1e_0
+aws-c-cal=0.5.20=h3333b6a_0
+aws-c-common=0.8.5=h6c40b1e_0
+aws-c-compression=0.2.16=h6c40b1e_0
+aws-c-event-stream=0.2.15=hcec6c5f_0
+aws-c-http=0.6.25=h6c40b1e_0
+aws-c-io=0.13.10=h6c40b1e_0
+aws-c-mqtt=0.7.13=h6c40b1e_0
+aws-c-s3=0.1.51=h6c40b1e_0
+aws-c-sdkutils=0.1.6=h6c40b1e_0
+aws-checksums=0.1.13=h6c40b1e_0
+aws-crt-cpp=0.18.16=hcec6c5f_0
+aws-sdk-cpp=1.10.55=h61975a4_0
babel=2.11.0=py311hecd8cb5_0
-backcall=0.2.0=pyhd3eb1b0_0
-beautifulsoup4=4.12.2=py311hecd8cb5_0
-black=23.3.0=py311hecd8cb5_0
-blas=1.0=mkl
+beautifulsoup4=4.12.3=py311hecd8cb5_0
+black=24.4.2=py311hecd8cb5_0
+blas=2.116=openblas
+blas-devel=3.9.0=16_osx64_openblas
bleach=4.1.0=pyhd3eb1b0_0
-boost-cpp=1.82.0=ha357a0b_1
-bottleneck=1.3.5=py311hb9e55a9_0
-brotli=1.0.9=hca72f7f_7
-brotli-bin=1.0.9=hca72f7f_7
-brotlipy=0.7.0=py311h6c40b1e_1002
-bzip2=1.0.8=h1de35cc_0
+boost-cpp=1.82.0=ha357a0b_2
+bottleneck=1.3.7=py311hb3a5e46_0
+brotli=1.0.9=h6c40b1e_8
+brotli-bin=1.0.9=h6c40b1e_8
+brotli-python=1.0.9=py311hcec6c5f_8
+bzip2=1.0.8=h6c40b1e_6
c-ares=1.19.1=h6c40b1e_0
-ca-certificates=2023.08.22=hecd8cb5_0
-cachetools=5.3.1=pypi_0
-certifi=2023.7.22=py311hecd8cb5_0
-cffi=1.15.1=py311h6c40b1e_3
+ca-certificates=2024.3.11=hecd8cb5_0
+cachetools=5.3.3=pypi_0
+certifi=2024.6.2=py311hecd8cb5_0
+cffi=1.16.0=py311h6c40b1e_1
charset-normalizer=2.0.4=pyhd3eb1b0_0
-click=8.0.4=py311hecd8cb5_0
+click=8.1.7=py311hecd8cb5_0
cmarkgfm=2022.10.27=py311h6c40b1e_0
-colorama=0.4.6=py311hecd8cb5_0
-comm=0.1.2=py311hecd8cb5_0
-contourpy=1.0.5=py311ha357a0b_0
+comm=0.2.1=py311hecd8cb5_0
+contourpy=1.2.0=py311ha357a0b_0
coverage=7.2.2=py311h6c40b1e_0
-cryptography=41.0.3=py311h30e54ef_0
+cryptography=42.0.5=py311h30e54ef_1
cycler=0.11.0=pyhd3eb1b0_0
-data-algebra=1.7.0=pypi_0
+data-algebra=1.7.1=pypi_0
+db-dtypes=1.2.0=pyhd8ed1ab_0
debugpy=1.6.7=py311hcec6c5f_0
decorator=5.1.1=pyhd3eb1b0_0
defusedxml=0.7.1=pyhd3eb1b0_0
-dill=0.3.7=py311hecd8cb5_0
+dill=0.3.8=py311hecd8cb5_0
docutils=0.18.1=py311hecd8cb5_3
-entrypoints=0.4=py311hecd8cb5_0
executing=0.8.3=pyhd3eb1b0_0
-fonttools=4.25.0=pyhd3eb1b0_0
+fonttools=4.51.0=py311h6c40b1e_0
freetype=2.12.1=hd8bbffd_0
-gflags=2.2.2=h0a44026_0
-giflib=5.2.1=h6c40b1e_3
-glog=0.5.0=h23ab428_0
-google-api-core=2.12.0=pypi_0
-google-auth=2.23.2=pypi_0
+gflags=2.2.2=hcec6c5f_1
+glog=0.5.0=hcec6c5f_1
+google-api-core=2.19.0=pypi_0
+google-auth=2.30.0=pypi_0
google-cloud=0.34.0=pypi_0
-google-cloud-bigquery=3.11.4=pypi_0
-google-cloud-core=2.3.3=pypi_0
+google-cloud-bigquery=3.24.0=pypi_0
+google-cloud-core=2.4.1=pypi_0
google-crc32c=1.5.0=pypi_0
-google-resumable-media=2.6.0=pypi_0
-googleapis-common-protos=1.60.0=pypi_0
-greenlet=2.0.1=py311hcec6c5f_0
-grpc-cpp=1.48.2=h4ed1731_1
-grpcio=1.58.0=pypi_0
-grpcio-status=1.58.0=pypi_0
+google-resumable-media=2.7.1=pypi_0
+googleapis-common-protos=1.63.1=pypi_0
+greenlet=3.0.1=py311hcec6c5f_0
+grpc-cpp=1.48.2=hbe2b35a_4
+grpcio=1.64.1=pypi_0
+grpcio-status=1.62.2=pypi_0
+gtest=1.14.0=ha357a0b_1
icu=73.1=hcec6c5f_0
-idna=3.4=py311hecd8cb5_0
-importlib-metadata=6.0.0=py311hecd8cb5_0
-importlib_metadata=6.0.0=hd3eb1b0_0
+idna=3.7=py311hecd8cb5_0
+importlib-metadata=7.0.1=py311hecd8cb5_0
+importlib_metadata=7.0.1=hd3eb1b0_0
iniconfig=1.1.1=pyhd3eb1b0_0
-ipykernel=6.25.0=py311h85bffb1_0
-ipython=8.15.0=py311hecd8cb5_0
-ipython_genutils=0.2.0=pyhd3eb1b0_1
+ipykernel=6.28.0=py311hecd8cb5_0
+ipython=8.20.0=py311hecd8cb5_0
isort=5.9.3=pyhd3eb1b0_0
jaraco.classes=3.2.1=pyhd3eb1b0_0
jedi=0.18.1=py311hecd8cb5_1
-jinja2=3.1.2=py311hecd8cb5_0
-joblib=1.2.0=py311hecd8cb5_0
+jinja2=3.1.4=py311hecd8cb5_0
+joblib=1.4.2=py311hecd8cb5_0
jpeg=9e=h6c40b1e_1
js2py=0.74=py311hecd8cb5_0
json5=0.9.6=pyhd3eb1b0_0
-jsonschema=4.17.3=py311hecd8cb5_0
-jupyter_client=7.4.9=py311hecd8cb5_0
-jupyter_core=5.3.0=py311hecd8cb5_0
-jupyter_events=0.6.3=py311hecd8cb5_0
-jupyter_server=1.23.4=py311hecd8cb5_0
-jupyter_server_fileid=0.9.0=py311hecd8cb5_0
-jupyter_server_ydoc=0.8.0=py311hecd8cb5_1
-jupyter_ydoc=0.2.4=py311hecd8cb5_0
-jupyterlab=3.6.3=py311hecd8cb5_0
+jsonschema=4.19.2=py311hecd8cb5_0
+jsonschema-specifications=2023.7.1=py311hecd8cb5_0
+jupyter-lsp=2.2.0=py311hecd8cb5_0
+jupyter_client=8.6.0=py311hecd8cb5_0
+jupyter_core=5.5.0=py311hecd8cb5_0
+jupyter_events=0.8.0=py311hecd8cb5_0
+jupyter_server=2.10.0=py311hecd8cb5_0
+jupyter_server_terminals=0.4.4=py311hecd8cb5_1
+jupyterlab=4.0.11=py311hecd8cb5_0
jupyterlab_pygments=0.1.2=py_0
-jupyterlab_server=2.22.0=py311hecd8cb5_0
-keyring=23.13.1=py311hecd8cb5_0
+jupyterlab_server=2.25.1=py311hecd8cb5_0
+keyring=24.3.1=py311hecd8cb5_0
kiwisolver=1.4.4=py311hcec6c5f_0
krb5=1.20.1=h428f121_1
lark=1.1.2=py311hecd8cb5_0
-lazy-object-proxy=1.6.0=py311h6c40b1e_0
lcms2=2.12=hf1fd2bf_0
lerc=3.0=he9d5cce_0
-libboost=1.82.0=h74d5ea2_1
-libbrotlicommon=1.0.9=hca72f7f_7
-libbrotlidec=1.0.9=hca72f7f_7
-libbrotlienc=1.0.9=hca72f7f_7
-libcurl=8.2.1=hf20ceda_0
+libblas=3.9.0=16_osx64_openblas
+libboost=1.82.0=hf53b9f2_2
+libbrotlicommon=1.0.9=h6c40b1e_8
+libbrotlidec=1.0.9=h6c40b1e_8
+libbrotlienc=1.0.9=h6c40b1e_8
+libcblas=3.9.0=16_osx64_openblas
+libcurl=8.7.1=hf20ceda_0
libcxx=14.0.6=h9765a3e_0
-libdeflate=1.17=hb664fd8_0
-libedit=3.1.20221030=h6c40b1e_0
+libdeflate=1.17=hb664fd8_1
+libedit=3.1.20230828=h6c40b1e_0
libev=4.33=h9ed2024_1
libevent=2.1.12=h04015c4_1
-libffi=3.4.4=hecd8cb5_0
+libffi=3.4.4=hecd8cb5_1
libgfortran=5.0.0=11_3_0_hecd8cb5_28
libgfortran5=11.3.0=h9dfd629_28
-libiconv=1.16=hca72f7f_2
-libnghttp2=1.52.0=h9beae6a_1
+libiconv=1.16=h6c40b1e_3
+liblapack=3.9.0=16_osx64_openblas
+liblapacke=3.9.0=16_osx64_openblas
+libnghttp2=1.57.0=h9beae6a_0
+libopenblas=0.3.21=openmp_h429af6e_3
libpng=1.6.39=h6c40b1e_0
-libpq=12.15=h04015c4_1
+libpq=12.17=h04015c4_0
libprotobuf=3.20.3=hfff2838_0
libsodium=1.0.18=h1de35cc_0
-libssh2=1.10.0=h04015c4_2
+libssh2=1.11.0=hf20ceda_0
libthrift=0.15.0=h70b4b81_2
libtiff=4.5.1=hcec6c5f_0
-libwebp=1.3.2=hf6ce154_0
libwebp-base=1.3.2=h6c40b1e_0
-libxml2=2.10.4=h1bd7e62_1
-libxslt=1.1.37=h6c40b1e_1
+libxgboost=2.0.3=hab2016f_0
llvm-openmp=14.0.6=h0dcd299_0
-lxml=4.9.3=py311h946e0e5_0
-lz4-c=1.9.4=hcec6c5f_0
+lz4-c=1.9.4=hcec6c5f_1
markdown-it-py=2.2.0=py311hecd8cb5_1
-markupsafe=2.1.1=py311h6c40b1e_0
-matplotlib=3.7.2=py311hecd8cb5_0
-matplotlib-base=3.7.2=py311h8251f7d_0
+markupsafe=2.1.3=py311h6c40b1e_0
+matplotlib=3.8.4=py311hecd8cb5_0
+matplotlib-base=3.8.4=py311h41a4f6b_0
matplotlib-inline=0.1.6=py311hecd8cb5_0
mccabe=0.7.0=pyhd3eb1b0_0
mdurl=0.1.0=py311hecd8cb5_0
-mistune=0.8.4=py311h6c40b1e_1000
-mkl=2023.1.0=h44ed08c_48681
-mkl-service=2.4.0=py311h6c40b1e_1
-mkl_fft=1.3.8=py311h6c40b1e_0
-mkl_random=1.2.4=py311ha357a0b_0
-more-itertools=8.12.0=pyhd3eb1b0_0
-munkres=1.1.4=py_0
+mistune=2.0.4=py311hecd8cb5_0
+mizani=0.11.4=pypi_0
+more-itertools=10.1.0=py311hecd8cb5_0
mypy_extensions=1.0.0=py311hecd8cb5_0
-nbclassic=0.5.5=py311hecd8cb5_0
-nbclient=0.5.13=py311hecd8cb5_0
-nbconvert=6.5.4=py311hecd8cb5_0
+nbclient=0.8.0=py311hecd8cb5_0
+nbconvert=7.10.0=py311hecd8cb5_0
nbformat=5.9.2=py311hecd8cb5_0
ncurses=6.4=hcec6c5f_0
-nest-asyncio=1.5.6=py311hecd8cb5_0
-notebook=6.5.4=py311hecd8cb5_1
-notebook-shim=0.2.2=py311hecd8cb5_0
-numexpr=2.8.4=py311h728a8a3_1
-numpy=1.26.0=py311h728a8a3_0
-numpy-base=1.26.0=py311h53bf9ac_0
-openssl=3.0.11=hca72f7f_2
+nest-asyncio=1.6.0=py311hecd8cb5_0
+notebook-shim=0.2.3=py311hecd8cb5_0
+numexpr=2.8.7=py311h91b6869_0
+numpy=1.26.4=py311h91b6869_0
+numpy-base=1.26.4=py311hb3ec012_0
+openblas=0.3.21=openmp_hbefa662_3
+openjpeg=2.4.0=h66ea3da_0
+openssl=3.0.13=hca72f7f_2
orc=1.7.4=h995b336_1
-packaging=23.1=py311hecd8cb5_0
-pandas=2.0.3=py311hdb55bb0_0
+overrides=7.4.0=py311hecd8cb5_0
+packaging=23.2=py311hecd8cb5_0
+pandas=2.2.1=py311hdb55bb0_0
pandocfilters=1.5.0=pyhd3eb1b0_0
parso=0.8.3=pyhd3eb1b0_0
pathspec=0.10.3=py311hecd8cb5_0
-pdoc=14.1.0=pypi_0
+patsy=0.5.6=pypi_0
+pdoc=14.5.0=pypi_0
pexpect=4.8.0=pyhd3eb1b0_3
-pickleshare=0.7.5=pyhd3eb1b0_1003
-pillow=9.4.0=py311hcec6c5f_1
-pip=23.2.1=py311hecd8cb5_0
-pkginfo=1.9.6=py311hecd8cb5_0
+pillow=10.3.0=py311h6c40b1e_0
+pip=24.0=py311hecd8cb5_0
+pkginfo=1.10.0=py311hecd8cb5_0
platformdirs=3.10.0=py311hecd8cb5_0
+plotnine=0.13.6=pypi_0
pluggy=1.0.0=py311hecd8cb5_1
-polars=0.19.5=pypi_0
+polars=0.20.31=pypi_0
prometheus_client=0.14.1=py311hecd8cb5_0
-prompt-toolkit=3.0.36=py311hecd8cb5_0
-proto-plus=1.22.3=pypi_0
-protobuf=4.24.3=pypi_0
+prompt-toolkit=3.0.43=py311hecd8cb5_0
+prompt_toolkit=3.0.43=hd3eb1b0_0
+proto-plus=1.23.0=pypi_0
+protobuf=4.25.3=pypi_0
psutil=5.9.0=py311h6c40b1e_0
-psycopg2=2.9.3=py311h6c40b1e_1
+psycopg2=2.9.9=py311h6c40b1e_0
ptyprocess=0.7.0=pyhd3eb1b0_2
pure_eval=0.2.2=pyhd3eb1b0_0
-py=1.11.0=pyhd3eb1b0_0
+py-xgboost=2.0.3=py311hecd8cb5_0
py4j=0.10.9.7=py311hecd8cb5_0
-pyarrow=11.0.0=py311hf41f4e6_1
-pyasn1=0.5.0=pypi_0
-pyasn1-modules=0.3.0=pypi_0
+pyarrow=14.0.2=py311h2a249a5_0
+pyasn1=0.6.0=pypi_0
+pyasn1-modules=0.4.0=pypi_0
+pybind11-abi=4=hd3eb1b0_1
pycparser=2.21=pyhd3eb1b0_0
pygments=2.15.1=py311hecd8cb5_1
pyjsparser=2.7.1=py311hecd8cb5_0
-pylint=2.16.2=py311hecd8cb5_0
+pylint=3.2.2=py311hecd8cb5_0
pymysql=1.0.2=py311hecd8cb5_1
-pyopenssl=23.2.0=py311hecd8cb5_0
pyparsing=3.0.9=py311hecd8cb5_0
-pyrsistent=0.18.0=py311h6c40b1e_0
pysocks=1.7.1=py311hecd8cb5_0
pyspark=3.4.1=py311hecd8cb5_0
-pytest=7.4.0=py311hecd8cb5_0
-pytest-cov=4.1.0=py311hecd8cb5_0
-python=3.11.5=hf27a42d_0
-python-dateutil=2.8.2=pyhd3eb1b0_0
+pytest=7.4.4=py311hecd8cb5_0
+pytest-cov=4.1.0=py311hecd8cb5_1
+python=3.11.9=hf27a42d_0
+python-dateutil=2.9.0post0=py311hecd8cb5_2
python-fastjsonschema=2.16.2=py311hecd8cb5_0
python-json-logger=2.0.7=py311hecd8cb5_0
python-tzdata=2023.3=pyhd3eb1b0_0
-pytz=2023.3.post1=py311hecd8cb5_0
-pyyaml=6.0=py311h6c40b1e_1
-pyzmq=23.2.0=py311hcec6c5f_0
+pytz=2024.1=py311hecd8cb5_0
+pyyaml=6.0.1=py311h6c40b1e_0
+pyzmq=25.1.2=py311hcec6c5f_0
re2=2022.04.01=he9d5cce_0
readline=8.2=hca72f7f_0
readme_renderer=40.0=py311hecd8cb5_0
-regex=2022.7.9=py311h6c40b1e_0
-requests=2.31.0=py311hecd8cb5_0
+referencing=0.30.2=py311hecd8cb5_0
+regex=2023.10.3=py311h6c40b1e_0
+requests=2.32.2=py311hecd8cb5_0
requests-toolbelt=1.0.0=py311hecd8cb5_0
rfc3339-validator=0.1.4=py311hecd8cb5_0
rfc3986=1.4.0=pyhd3eb1b0_0
rfc3986-validator=0.1.1=py311hecd8cb5_0
rich=13.3.5=py311hecd8cb5_0
+rpds-py=0.10.6=py311hf2ad997_0
rsa=4.9=pypi_0
-scikit-learn=1.2.2=py311hcec6c5f_1
-scipy=1.11.1=py311h224febf_0
+scikit-learn=1.4.2=py311hdb55bb0_1
+scipy=1.13.1=py311hedc7b93_0
seaborn=0.12.2=py311hecd8cb5_0
-send2trash=1.8.0=pyhd3eb1b0_1
-setuptools=68.0.0=py311hecd8cb5_0
+send2trash=1.8.2=py311hecd8cb5_0
+setuptools=69.5.1=py311hecd8cb5_0
six=1.16.0=pyhd3eb1b0_1
-snappy=1.1.9=he9d5cce_0
-sniffio=1.2.0=py311hecd8cb5_1
-soupsieve=2.4=py311hecd8cb5_0
-sqlalchemy=2.0.21=py311h6c40b1e_0
-sqlite=3.41.2=h6c40b1e_0
+snappy=1.1.10=hcec6c5f_1
+sniffio=1.3.0=py311hecd8cb5_0
+soupsieve=2.5=py311hecd8cb5_0
+sqlalchemy=2.0.25=py311h6c40b1e_0
+sqlite=3.45.3=h6c40b1e_0
stack_data=0.2.0=pyhd3eb1b0_0
-tbb=2021.8.0=ha357a0b_0
+statsmodels=0.14.2=pypi_0
terminado=0.17.1=py311hecd8cb5_0
threadpoolctl=2.2.0=pyh0d69192_0
tinycss2=1.2.1=py311hecd8cb5_0
-tk=8.6.12=h5d9f67b_0
+tk=8.6.14=h4d00af3_0
toml=0.10.2=pyhd3eb1b0_0
tomlkit=0.11.1=py311hecd8cb5_0
-tornado=6.3.2=py311h6c40b1e_0
-traitlets=5.7.1=py311hecd8cb5_0
+tornado=6.3.3=py311h6c40b1e_0
+traitlets=5.14.3=py311hecd8cb5_0
twine=4.0.2=py311hecd8cb5_0
-typing-extensions=4.7.1=py311hecd8cb5_0
-typing_extensions=4.7.1=py311hecd8cb5_0
-tzdata=2023c=h04d1e81_0
-tzlocal=2.1=py311hecd8cb5_1
-urllib3=1.26.16=py311hecd8cb5_0
-utf8proc=2.6.1=h9ed2024_0
+typing-extensions=4.11.0=py311hecd8cb5_0
+typing_extensions=4.11.0=py311hecd8cb5_0
+tzdata=2024a=h04d1e81_0
+tzlocal=5.2=py311hecd8cb5_0
+unicodedata2=15.1.0=py311h6c40b1e_0
+urllib3=2.2.1=py311hecd8cb5_0
+utf8proc=2.6.1=h6c40b1e_1
vtreat=1.3.1=dev_0
wcwidth=0.2.5=pyhd3eb1b0_0
webencodings=0.5.1=py311hecd8cb5_1
-websocket-client=0.58.0=py311hecd8cb5_4
-wheel=0.41.2=py311hecd8cb5_0
-wrapt=1.14.1=py311h6c40b1e_0
-xz=5.4.2=h6c40b1e_0
-y-py=0.5.9=py311h7242b5c_0
+websocket-client=1.8.0=py311hecd8cb5_0
+wheel=0.43.0=py311hecd8cb5_0
+wvpy=1.1.2=pypi_0
+wvu=0.3.9=pypi_0
+xgboost=2.0.3=py311hecd8cb5_0
+xz=5.4.6=h6c40b1e_1
yaml=0.2.5=haf1e3a3_0
-ypy-websocket=0.8.2=py311hecd8cb5_0
-zeromq=4.3.4=h23ab428_0
-zipp=3.11.0=py311hecd8cb5_0
-zlib=1.2.13=h4dc903c_0
-zstd=1.5.5=hc035e20_0
+zeromq=4.3.5=hcec6c5f_0
+zipp=3.17.0=py311hecd8cb5_0
+zlib=1.2.13=h4b97444_1
+zstd=1.5.5=hc035e20_2