Skip to content

Commit

Permalink
Refactor bootstrap_ccram and related tests to utilize GenericCheckerboardCopula; remove unused display_prediction_summary function; enhance example notebook with score calculations and variance display
Browse files Browse the repository at this point in the history
  • Loading branch information
DhyeyMavani2003 committed Jan 28, 2025
1 parent 1c322d7 commit 8c138c0
Show file tree
Hide file tree
Showing 5 changed files with 753 additions and 132 deletions.
2 changes: 0 additions & 2 deletions discopula/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from discopula.checkerboard.genstatsim import (
bootstrap_ccram,
bootstrap_predict_category_summary,
display_prediction_summary,
permutation_test_ccram,
)

Expand All @@ -17,6 +16,5 @@
"gen_case_form_to_contingency",
"bootstrap_ccram",
"bootstrap_predict_category_summary",
"display_prediction_summary",
"permutation_test_ccram",
]
68 changes: 28 additions & 40 deletions discopula/checkerboard/genstatsim.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def plot_distribution(self, title=None):
print(f"Warning: Could not create plot: {str(e)}")
return None

def bootstrap_ccram(contingency_table: np.ndarray,
def bootstrap_ccram(gen_copula: GenericCheckerboardCopula,
from_axes: Union[List[int], int],
to_axis: int,
scaled: bool = False,
Expand All @@ -67,8 +67,8 @@ def bootstrap_ccram(contingency_table: np.ndarray,
Parameters
----------
contingency_table : numpy.ndarray
Input contingency table
gen_copula : GenericCheckerboardCopula
Checkerboard copula object
from_axes : Union[List[int], int]
Source axis index or list of indices
to_axis : int
Expand Down Expand Up @@ -97,10 +97,10 @@ def bootstrap_ccram(contingency_table: np.ndarray,
metric_name = f"{'SCCRAM' if scaled else 'CCRAM'} ({from_axes_str})->{to_axis}"

# Calculate observed value
gen_copula = GenericCheckerboardCopula.from_contingency_table(contingency_table)
observed_ccram = gen_copula.calculate_CCRAM_vectorized(from_axes, to_axis, scaled)

# Convert to case form
contingency_table = gen_copula.contingency_table
cases = gen_contingency_to_case_form(contingency_table)

# Split variables
Expand Down Expand Up @@ -238,9 +238,11 @@ def prediction_stat(*args, axis=0):
)

def bootstrap_predict_category_summary(
contingency_table: np.ndarray,
gen_copula: GenericCheckerboardCopula,
from_axes: List[int],
from_axes_names: List[str],
to_axis: int,
to_axis_name: str = "Y",
n_resamples: int = 9999,
confidence_level: float = 0.95,
method: str = 'percentile',
Expand All @@ -250,12 +252,16 @@ def bootstrap_predict_category_summary(
Parameters
----------
contingency_table : numpy.ndarray
Contingency table
gen_copula : GenericCheckerboardCopula
Checkerboard copula object
from_axes : List[int]
Source axes indices
from_axes_names : List[str]
Source axes names
to_axis : int
Target axis index
to_axis_name : str, default='Y'
Target axis name
n_resamples : int, default=9999
Number of resamples
confidence_level : float, default=0.95
Expand All @@ -267,10 +273,11 @@ def bootstrap_predict_category_summary(
Returns
-------
Tuple[numpy.ndarray, List[int]]
Summary table of prediction proportions and source dimensions
summary_df : pd.DataFrame
DataFrame of prediction summary
"""
# Get dimensions for each source axis
contingency_table = gen_copula.contingency_table
source_dims = [contingency_table.shape[axis] for axis in from_axes]
target_dim = contingency_table.shape[to_axis]

Expand Down Expand Up @@ -307,54 +314,35 @@ def bootstrap_predict_category_summary(
for val, count in zip(unique_preds, counts):
summary[(int(val),) + source_indices] = (count / total) * 100

return summary, source_dims

def display_prediction_summary(
summary_matrix: np.ndarray,
source_dims: List[int],
from_axes_names: List[str],
to_axis_name: str = "Y"
) -> None:
"""Display multi-dimensional prediction summary.
Parameters
----------
summary_matrix : numpy.ndarray
Multi-dimensional array of prediction percentages
source_dims : List[int]
Dimensions of source axes
from_axes_names : List[str]
Names of source variables
to_axis_name : str
Name of target variable
"""
# Create multi-index for source categories
source_names = [
[f"{name}={i}" for i in range(dim)]
for name, dim in zip(from_axes_names, source_dims)
]

# Create target categories
target_categories = [f"{to_axis_name}={i}" for i in range(summary_matrix.shape[0])]
target_categories = [f"{to_axis_name}={i}" for i in range(summary.shape[0])]

# Reshape summary matrix for DataFrame
reshaped_summary = summary_matrix.reshape(summary_matrix.shape[0], -1)
reshaped_summary = summary.reshape(summary.shape[0], -1)

# Create multi-index columns
column_tuples = list(itertools.product(*source_names))
columns = pd.MultiIndex.from_tuples(column_tuples)

# Create DataFrame
df = pd.DataFrame(
summary_df = pd.DataFrame(
reshaped_summary,
index=target_categories,
columns=columns
)

print("\nPrediction Summary (% of bootstrap samples)")
print("-" * 50)
print(df.round(1).to_string(float_format=lambda x: f"{x:5.1f}%"))
print("-" * 50)
print("-" * 80)
print(summary_df.round(1).to_string(float_format=lambda x: f"{x:5.1f}%"))
print("-" * 80)

return summary_df

@dataclass
class CustomPermutationResult:
Expand Down Expand Up @@ -399,7 +387,7 @@ def plot_distribution(self, title=None):
print(f"Warning: Could not create plot: {str(e)}")
return None

def permutation_test_ccram(contingency_table: np.ndarray,
def permutation_test_ccram(gen_copula: GenericCheckerboardCopula,
from_axes: Union[List[int], int],
to_axis: int,
scaled: bool = False,
Expand All @@ -410,8 +398,8 @@ def permutation_test_ccram(contingency_table: np.ndarray,
Parameters
----------
contingency_table : numpy.ndarray
Input contingency table
gen_copula : GenericCheckerboardCopula
Checkerboard copula object
from_axes : Union[List[int], int]
Source axis index or list of indices
to_axis : int
Expand All @@ -435,7 +423,7 @@ def permutation_test_ccram(contingency_table: np.ndarray,

from_axes_str = ",".join(map(str, from_axes))
metric_name = f"{'SCCRAM' if scaled else 'CCRAM'} ({from_axes_str})->{to_axis}"

contingency_table = gen_copula.contingency_table
cases = gen_contingency_to_case_form(contingency_table)
source_data = [cases[:, axis] for axis in from_axes]
target_data = cases[:, to_axis]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -341,14 +341,14 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Shape of the inferred joint probability matrix P: (5, 3)\n",
"Shape of the inferred joint probability matrix P: (2, 3, 2, 6)\n",
"Probability matrix P:\n",
"[[[[0. 0. 0. 0. 0.01941748 0.01941748]\n",
" [0. 0. 0.00970874 0.00970874 0.02912621 0. ]]\n",
Expand Down Expand Up @@ -418,7 +418,7 @@
" [1,0,1,3],[1,0,1,3],[1,0,1,3]\n",
"])\n",
"rda_copula = GenericCheckerboardCopula.from_cases(cases=real_cases_data, shape=(2,3,2,6))\n",
"print(f\"Shape of the inferred joint probability matrix P: {copula.P.shape}\")\n",
"print(f\"Shape of the inferred joint probability matrix P: {rda_copula.P.shape}\")\n",
"print(f\"Probability matrix P:\\n{rda_copula.P}\\n\")\n",
"print(f\"Marginal pdfs:\\n{rda_copula.marginal_pdfs}\\n\")\n",
"print(f\"Marginal cdfs:\\n{rda_copula.marginal_cdfs}\")"
Expand All @@ -433,7 +433,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -462,7 +462,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -491,7 +491,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -541,6 +541,73 @@
"print(\"\\nPredictions from X1, X2, X3 to Y = X4:\")\n",
"print(rda_predictions_012_to_3_named)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Calculating Scores and their Variances"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Scores for axis 0:\n",
"[np.float64(0.1990291262135922), np.float64(0.6990291262135921)]\n",
"\n",
"Scores for axis 1:\n",
"[np.float64(0.13592233009708737), np.float64(0.5339805825242718), np.float64(0.8980582524271845)]\n",
"\n",
"Scores for axis 2:\n",
"[np.float64(0.3203883495145631), np.float64(0.820388349514563)]\n",
"\n",
"Scores for axis 3:\n",
"[np.float64(0.02427184466019417), np.float64(0.11650485436893201), np.float64(0.27184466019417475), np.float64(0.45631067961165045), np.float64(0.6893203883495145), np.float64(0.912621359223301)]\n",
"\n",
"Variance of scores for axis 0: 0.059901970025450064\n",
"\n",
"Variance of scores for axis 1: 0.06894448476151865\n",
"\n",
"Variance of scores for axis 2: 0.05754548025261568\n",
"\n",
"Variance of scores for axis 3: 0.07994082694030621\n"
]
}
],
"source": [
"# Calculate and display scores for both axes\n",
"rda_scores_axis0 = rda_copula.calculate_scores(axis=0)\n",
"rda_scores_axis1 = rda_copula.calculate_scores(axis=1)\n",
"rda_scores_axis2 = rda_copula.calculate_scores(axis=2)\n",
"rda_scores_axis3 = rda_copula.calculate_scores(axis=3)\n",
"\n",
"print(\"Scores for axis 0:\")\n",
"print(rda_scores_axis0)\n",
"print(\"\\nScores for axis 1:\")\n",
"print(rda_scores_axis1)\n",
"print(\"\\nScores for axis 2:\")\n",
"print(rda_scores_axis2)\n",
"print(\"\\nScores for axis 3:\")\n",
"print(rda_scores_axis3)\n",
"\n",
"# Calculate and display variance of scores\n",
"rda_variance_axis0 = rda_copula.calculate_variance_S(axis=0)\n",
"rda_variance_axis1 = rda_copula.calculate_variance_S(axis=1)\n",
"rda_variance_axis2 = rda_copula.calculate_variance_S(axis=2)\n",
"rda_variance_axis3 = rda_copula.calculate_variance_S(axis=3)\n",
"\n",
"print(\"\\nVariance of scores for axis 0:\", rda_variance_axis0)\n",
"print(\"\\nVariance of scores for axis 1:\", rda_variance_axis1)\n",
"print(\"\\nVariance of scores for axis 2:\", rda_variance_axis2)\n",
"print(\"\\nVariance of scores for axis 3:\", rda_variance_axis3)\n",
"# Expected 12 * (variance of scores for axis 3): 0.07994082694030621*12 = 0.95928992328"
]
}
],
"metadata": {
Expand Down
Loading

0 comments on commit 8c138c0

Please sign in to comment.