DOC Add a "Tutorials" section to the docs with a demo on TCGA-LUAD #3

Open
wants to merge 9 commits into main
10 changes: 10 additions & 0 deletions .gitignore
@@ -174,11 +174,21 @@ fedpydeseq2/substra_utils/credentials/*
tests/local-worker/*
tmp_substrafl*

# Experiments
/experiments/credentials/
/paper_experiments/gsea/gsea.sh

# Lock files
*.lock
!poetry.lock

tests/datasets_parent_dir.txt

data/raw/*

# MKDocs
/docs/build.out
docs/generated/gallery/
docs/examples/data/raw/tcga/
docs/examples/data/processed/
24 changes: 24 additions & 0 deletions docs/Snakefile
@@ -0,0 +1,24 @@
from snakemake.utils import min_version

##### set minimum snakemake version #####
min_version("7.30.0")

##### setup report #####
configfile: "config/config.yaml"
report: "report/workflow.rst"

##### load rules #####
wildcard_constraints:
dataset="[A-Za-z_]+",
output_path=".*"

include: "rules/common.smk"
include: "rules/download_data.smk"
include: "rules/parquet_check_data.smk"
include: "rules/move_data.smk"
include: "rules/check_csv_data.smk"

##### target rules #####
rule all:
input:
get_output
1 change: 1 addition & 0 deletions docs/examples/README.md
@@ -0,0 +1 @@
# Tutorials
1 change: 1 addition & 0 deletions docs/examples/assets/tcga/description.md
@@ -0,0 +1 @@
TCGA RNA data used for fedpydeseq2
63 changes: 63 additions & 0 deletions docs/examples/assets/tcga/opener.py
@@ -0,0 +1,63 @@
import pathlib

import anndata as ad
import pandas as pd
import substratools as tools
from pydeseq2.utils import load_example_data


class TCGAOpener(tools.Opener):
"""Opener class for TCGA RNA-seq datasets.

Creates an AnnData object from a path containing a counts_data.csv and a
metadata.csv.
"""

def fake_data(self, n_samples=None):
"""Create a fake AnnData object for testing purposes.

Parameters
----------
n_samples : int, optional
Number of samples to generate, capped at 100. If None, generate 100 samples.

Returns
-------
AnnData
An AnnData object with fake counts and metadata.
"""
N_SAMPLES = n_samples if n_samples and n_samples <= 100 else 100
fake_counts = load_example_data(modality="raw_counts").iloc[:N_SAMPLES]
fake_metadata = load_example_data(modality="metadata").iloc[:N_SAMPLES]
return ad.AnnData(X=fake_counts, obs=fake_metadata)

def get_data(self, folders):
"""Open the TCGA dataset.

Parameters
----------
folders : list
list of paths to the dataset folders, whose first element should contain a
counts_data.csv and a metadata.csv file.

Returns
-------
AnnData
An AnnData object containing the counts and metadata loaded for the FL pipeline.
"""
# get .csv files
data_path = pathlib.Path(folders[0]).resolve()
counts_data = pd.read_csv(data_path / "counts_data.csv", index_col=0)
metadata = pd.read_csv(data_path / "metadata.csv", index_col=0)
center_id = metadata["center_id"].iloc[0]
# The center id should have been removed from the counts data in an earlier
# step; if it is still present, raise an error.
if "center_id" in counts_data.columns:
raise ValueError("center_id column found in counts data")
metadata.drop(columns=["center_id"], inplace=True)
# Build an Anndata object
adata = ad.AnnData(X=counts_data, obs=metadata)
# Add the center id to be accessible within the local states
adata.uns["center_id"] = center_id
return adata
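
For a quick local smoke test of the opener above (a minimal sketch; instantiating the class directly like this is only for illustration, since Substra normally instantiates openers itself):

```python
# Hypothetical standalone check, run in the same module as TCGAOpener.
opener = TCGAOpener()
adata = opener.fake_data(n_samples=10)  # n_samples is capped at 100
print(adata)  # AnnData with 10 samples and the PyDESeq2 example genes
```
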
2 changes: 2 additions & 0 deletions docs/examples/config/config_luad.yaml
@@ -0,0 +1,2 @@
datasets:
- LUAD
Empty file.
Empty file.
Empty file.
177 changes: 177 additions & 0 deletions docs/examples/plot_demo.py
@@ -0,0 +1,177 @@
"""FedPyDESeq2 demo on the TCGA-LUAD dataset.

This example demonstrates how to run a FedPyDESeq2 experiment on the TCGA-LUAD dataset
from a single machine, using Substra's simulation mode.

We will show how to perform a simple differential expression analysis, comparing samples
with `"Advanced"` vs `"Non-advanced"` tumoral `stage`.
"""

# %%

from pathlib import Path

import pandas as pd
from fedpydeseq2_datasets.process_and_split_data import setup_tcga_dataset
from IPython.display import display

from fedpydeseq2.fedpydeseq2_pipeline import run_fedpydeseq2_experiment

# %%
# ## Dataset setup
#
# In a real federated setup, the data is distributed across multiple medical centers
# and must be registered with Substra beforehand. Each center would therefore have a
# folder containing two CSV files (one for the counts and one for the metadata), as
# well as an opener Python file and a Markdown description file (see
# [Substra's documentation](https://docs.substra.org/en/stable/documentation/concepts.html#assets)
# on how to register a datasample).
# We would then only need to pass the `dataset_datasamples_keys_path` argument.
#
# In this tutorial, however, we use FedPyDESeq2's simulation mode, which
# allows us to emulate a federated setup from a single machine.
#
# The simulation mode assumes the data to be organized in the following structure:
#
# ```
# processed_data_path/
# ├── centers_data/
# │ └── tcga/
# │ └── {experiment_id}/
# │ ├── center_0/
# │ │ ├── counts.csv
# │ │ └── metadata.csv
# │ ├── center_1/
# │ │ ├── counts.csv
# │ │ └── metadata.csv
# │ └── ...
# └── pooled_data/
# └── tcga/
# └── {experiment_id}/
# ├── counts.csv
# └── metadata.csv
# ```
#
# In this tutorial, we have already downloaded the data in the `data/raw` directory.
#
# The `setup_tcga_dataset` function from `fedpydeseq2_datasets` will automatically
# organize the data in the `data/processed` directory.
#
# It will split the TCGA-LUAD dataset into 7 centers according to the geographical
# origin of the samples, as described in the
# [FedPyDESeq2 paper](https://www.biorxiv.org/content/10.1101/2024.12.06.627138v1).
#
# See also the [`fedpydeseq2_datasets`](https://github.com/owkin/fedpydeseq2-datasets)
# repository for more details.


dataset_name = "TCGA-LUAD"
raw_data_path = Path("data/raw").resolve()
processed_data_path = Path("data/processed").resolve()
design_factors = "stage"


setup_tcga_dataset(
raw_data_path,
processed_data_path,
dataset_name=dataset_name,
small_samples=False,
small_genes=False,
only_two_centers=False,
design_factors=design_factors,
force=True,
)

experiment_id = "TCGA-LUAD-stage"
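
# %%
# As a quick sanity check (a sketch, not part of the original pipeline), we can
# inspect the per-center folders created by `setup_tcga_dataset`, following the
# directory layout described above.

centers_dir = processed_data_path / "centers_data" / "tcga" / experiment_id
for center_dir in sorted(p for p in centers_dir.iterdir() if p.is_dir()):
    print(center_dir.name, sorted(f.name for f in center_dir.iterdir()))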

# %%
# ## Running the experiment
#
# We can now run the experiment.
#
# [Substra](https://github.com/substra), the FL framework on which FedPyDESeq2 is
# built, supports a simulated mode that can be run locally on a single machine. We
# will use this mode here.
#
# Let's run our FedPyDESeq2 experiment. This may be done using the
# `run_fedpydeseq2_experiment` wrapper function, which takes the following parameters:
#
# * `n_centers=7`: Our data is distributed across 7 different medical centers
#
# * `backend="subprocess"` and `simulate=True`: We'll run the analysis locally on our
# machine to simulate a federated setup, rather than in a real distributed environment
#
# * `register_data=True`: We'll register our dataset with Substra before analysis.
# In a real federated setup, this would be set to `False` if the data had already
# been registered with Substra.
#
# * `asset_directory`: This directory should contain an `opener.py` file defining an
# Opener class, and a `description.md` file. Here, we copy them from
# [`fedpydeseq2_datasets/assets/tcga`](https://github.com/owkin/fedpydeseq2-datasets/tree/main/fedpydeseq2_datasets/assets/tcga)
#
# * `centers_root_directory`: Where the processed data for each center is stored
#
# * `compute_plan_name`: We'll call this analysis "Example-TCGA-LUAD-pipeline"
# in Substra
#
# * `dataset_name`: We're working with the TCGA-LUAD lung cancer dataset
#
# * `dataset_datasamples_keys_path`: Path to a YAML file containing the keys for our
# data samples. This is only used in the case of a real (unsimulated) federated setup.
#
# * `design_factors`: This should be a list of the design factors we wish to include in
# our analysis. Here, we're studying how "stage" (the cancer stage) affects gene
# expression
#
# * `ref_levels`: We're setting "Non-advanced" as our baseline cancer stage
#
# * `contrast`: This should be a list of three strings, of the form
# `["factor", "alternative_level", "baseline_level"]`. To compare gene expression
# between "Advanced" vs "Non-advanced" stages, we set
# `contrast=["stage", "Advanced", "Non-advanced"]`.
#
# * `refit_cooks=True`: After finding outliers using Cook's distance, we'll refit the
# model without them for more robust results

fl_results = run_fedpydeseq2_experiment(
n_centers=7,
backend="subprocess",
simulate=True,
register_data=True,
asset_directory=Path("assets/tcga").resolve(),
centers_root_directory=processed_data_path
/ "centers_data"
/ "tcga"
/ experiment_id,
compute_plan_name="Example-TCGA-LUAD-pipeline",
dataset_name="TCGA-LUAD",
dataset_datasamples_keys_path=Path(
f"credentials/{experiment_id}-datasamples-keys.yaml"
).resolve(),
design_factors="stage",
ref_levels={"stage": "Non-advanced"},
contrast=["stage", "Advanced", "Non-advanced"],
refit_cooks=True,
)

# %%
# ## Results
# The results are then stored in a `fl_results` dictionary, which does not contain any
# individual sample information.
fl_results.keys()

# %%
# We can then extract the results for our contrast of interest, and store them in a
# pandas DataFrame.

res_df = pd.DataFrame()
res_df["LFC"] = fl_results["LFC"]["stage_Advanced_vs_Non-advanced"]
res_df["pvalue"] = fl_results["p_values"]
res_df["padj"] = fl_results["padj"]

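# Keep only the genes retained by the pipeline: the `non_zero` entry is assumed to
# select genes that are not all-zero across samples.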
res_df = res_df.loc[fl_results["non_zero"], :]

# %%
display(res_df)

# %%
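# As an optional follow-up (a sketch, not part of the original analysis), we could
# keep the genes that are significant at an FDR of 0.05 and rank them by absolute
# log fold change.

significant = res_df[res_df["padj"] < 0.05].copy()
significant = significant.reindex(
    significant["LFC"].abs().sort_values(ascending=False).index
)
display(significant.head(10))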
30 changes: 30 additions & 0 deletions docs/gallery_conf.py
@@ -0,0 +1,30 @@
from pathlib import Path

import fedpydeseq2_datasets
from fedpydeseq2_datasets.download_data.download_data import download_data
from mkdocs_gallery.gen_gallery import DefaultResetArgv

config_path = Path("docs/examples/config/config_luad.yaml").resolve()
raw_data_path = Path("docs/examples/data/raw").resolve()
processed_data_path = Path("docs/examples/data/processed").resolve()

if (raw_data_path / "tcga" / "LUAD").exists() and any(
(raw_data_path / "tcga" / "LUAD").iterdir()
):
print(f"Data already exists in {raw_data_path}, skipping download")
else:
print(f"Downloading data to {raw_data_path}")
download_data(
config_path=config_path,
download_data_directory=Path(
fedpydeseq2_datasets.download_data.download_data.__file__
).parent.resolve(),
raw_data_output_path=raw_data_path,
snakemake_env_name="snakemake_env",
conda_activate_path=None,
)


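# Gallery configuration picked up by mkdocs-gallery through the `conf_script` option
# declared in mkdocs.yml (see the plugin section of this PR).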
conf = {
"reset_argv": DefaultResetArgv(),
}
Empty file added docs/generated/.gitkeep
Empty file.
7 changes: 7 additions & 0 deletions docs/snakemake_env.yaml
@@ -0,0 +1,7 @@
channels:
- conda-forge
- default
- bioconda
dependencies:
- snakemake=8.20.3
- mamba=1.5.10
6 changes: 6 additions & 0 deletions mkdocs.yml
@@ -63,6 +63,11 @@ extra_css:


plugins:
- gallery:
examples_dirs: docs/examples # path to example scripts
gallery_dirs: docs/generated/gallery # where to save generated gallery
conf_script: docs/gallery_conf.py

- search:
- git-revision-date-localized:
type: timeago
@@ -91,6 +96,7 @@ nav:
- Installation: usage/installation.md
- Contributing: usage/contributing.md
- References: usage/references.md
- generated/gallery
- API:
- Home: api/index.md
- Running an experiment: api/fedpydeseq2_pipeline.md