DOC Add a "Tutorials" section to the docs with a demo on TCGA-LUAD #3

Open
wants to merge 9 commits into main
10 changes: 10 additions & 0 deletions .gitignore
@@ -174,11 +174,21 @@ fedpydeseq2/substra_utils/credentials/*
tests/local-worker/*
tmp_substrafl*

# Experiments
/experiments/credentials/
/paper_experiments/gsea/gsea.sh

# Lock files
*.lock
!poetry.lock

tests/datasets_parent_dir.txt

data/raw/*

# MKDocs
/docs/build.out
docs/generated/gallery/
docs/examples/data/raw/tcga/
docs/examples/data/processed/
24 changes: 24 additions & 0 deletions docs/Snakefile
@@ -0,0 +1,24 @@
from snakemake.utils import min_version

##### set minimum snakemake version #####
min_version("7.30.0")

##### setup report #####
configfile: "config/config.yaml"
report: "report/workflow.rst"

##### load rules #####
wildcard_constraints:
dataset="[A-Za-z_]+",
output_path=".*"

include: "rules/common.smk"
include: "rules/download_data.smk"
include: "rules/parquet_check_data.smk"
include: "rules/move_data.smk"
include: "rules/check_csv_data.smk"

##### target rules #####
rule all:
input:
get_output
1 change: 1 addition & 0 deletions docs/examples/README.md
@@ -0,0 +1 @@
# Tutorials
1 change: 1 addition & 0 deletions docs/examples/assets/tcga/description.md
@@ -0,0 +1 @@
TCGA RNA data used for fedpydeseq2
63 changes: 63 additions & 0 deletions docs/examples/assets/tcga/opener.py
@@ -0,0 +1,63 @@
import pathlib

import anndata as ad
import pandas as pd
import substratools as tools
from pydeseq2.utils import load_example_data


class TCGAOpener(tools.Opener):
"""Opener class for TCGA RNA-seq datasets.

Creates an AnnData object from a path containing a counts_data.csv and a
metadata.csv.
"""

def fake_data(self, n_samples=None):
"""Create a fake AnnData object for testing purposes.

Parameters
----------
n_samples : int, optional
Number of samples to generate, capped at 100. If None, generate 100 samples.

Returns
-------
AnnData
An AnnData object with fake counts and metadata.
"""
N_SAMPLES = n_samples if n_samples and n_samples <= 100 else 100
fake_counts = load_example_data(modality="raw_counts").iloc[:N_SAMPLES]
fake_metadata = load_example_data(modality="metadata").iloc[:N_SAMPLES]
return ad.AnnData(X=fake_counts, obs=fake_metadata)

def get_data(self, folders):
"""Open the TCGA dataset.

Parameters
----------
folders : list
list of paths to the dataset folders, whose first element should contain a
counts_data.csv and a metadata.csv file.

Returns
-------
AnnData
An AnnData object containing the counts and metadata loaded for the FL pipeline.
"""
# get .csv files
data_path = pathlib.Path(folders[0]).resolve()
counts_data = pd.read_csv(data_path / "counts_data.csv", index_col=0)
metadata = pd.read_csv(data_path / "metadata.csv", index_col=0)
center_id = metadata["center_id"].iloc[0]
# The center id should have been removed from the counts data in an earlier
# step; if it is still present, raise an error.
if "center_id" in counts_data.columns:
raise ValueError("center_id column found in counts data")
metadata.drop(columns=["center_id"], inplace=True)
# Build an Anndata object
adata = ad.AnnData(X=counts_data, obs=metadata)
# Add the center id to be accessible within the local states
adata.uns["center_id"] = center_id
return adata
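
For a quick local smoke test of the opener above (a minimal sketch; instantiating the class directly like this is only for illustration, since Substra normally instantiates openers itself):

```python
# Hypothetical standalone check, run in the same module as TCGAOpener.
opener = TCGAOpener()
adata = opener.fake_data(n_samples=10)  # n_samples is capped at 100
print(adata)  # AnnData with 10 samples and the PyDESeq2 example genes
```
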
2 changes: 2 additions & 0 deletions docs/examples/config/config_luad.yaml
@@ -0,0 +1,2 @@
datasets:
- LUAD
Empty file.
Empty file.
Empty file.
177 changes: 177 additions & 0 deletions docs/examples/plot_demo.py
@@ -0,0 +1,177 @@
"""FedPyDESeq2 demo on the TCGA-LUAD dataset.

This example demonstrates how to run a FedPyDESeq2 experiment on the TCGA-LUAD dataset
from a single machine, using Substra's simulation mode.

We will show how to perform a simple differential expression analysis, comparing samples
with `"Advanced"` vs `"Non-advanced"` tumoral `stage`.
"""

# %%

from pathlib import Path

import pandas as pd
from fedpydeseq2_datasets.process_and_split_data import setup_tcga_dataset
from IPython.display import display

from fedpydeseq2.fedpydeseq2_pipeline import run_fedpydeseq2_experiment

# %%
# ## Dataset setup
#
# In a real federated setup, the data is distributed across multiple medical centers
# and must be registered with Substra beforehand. Each center would therefore have a
# folder containing two CSV files (one for the counts and one for the metadata), as
# well as an opener Python file and a Markdown description file (see
# [Substra's documentation](https://docs.substra.org/en/stable/documentation/concepts.html#assets)
# on how to register a datasample).
# We would then only need to pass the `dataset_datasamples_keys_path` argument.
#
# In this tutorial, however, we use FedPyDESeq2's simulation mode, which
# allows us to emulate a federated setup from a single machine.
#
# The simulation mode assumes the data to be organized in the following structure:
#
# ```
# processed_data_path/
# ├── centers_data/
# │ └── tcga/
# │ └── {experiment_id}/
# │ ├── center_0/
# │ │ ├── counts.csv
# │ │ └── metadata.csv
# │ ├── center_1/
# │ │ ├── counts.csv
# │ │ └── metadata.csv
# │ └── ...
# └── pooled_data/
# └── tcga/
# └── {experiment_id}/
# ├── counts.csv
# └── metadata.csv
# ```
#
# In this tutorial, we have already downloaded the data in the `data/raw` directory.
#
# The `setup_tcga_dataset` function from `fedpydeseq2_datasets` will automatically
# organize the data in the `data/processed` directory.
#
# It will split the TCGA-LUAD dataset into 7 centers according to the geographical
# origin of the samples, as described in the
# [FedPyDESeq2 paper](https://www.biorxiv.org/content/10.1101/2024.12.06.627138v1).
#
# See also the [`fedpydeseq2_datasets`](https://github.com/owkin/fedpydeseq2-datasets)
# repository for more details.


dataset_name = "TCGA-LUAD"
raw_data_path = Path("data/raw").resolve()
processed_data_path = Path("data/processed").resolve()
design_factors = "stage"


setup_tcga_dataset(
raw_data_path,
processed_data_path,
dataset_name=dataset_name,
small_samples=False,
small_genes=False,
only_two_centers=False,
design_factors=design_factors,
force=True,
)

experiment_id = "TCGA-LUAD-stage"
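
# %%
# As a quick sanity check (a sketch, not part of the original pipeline), we can
# inspect the per-center folders created by `setup_tcga_dataset`, following the
# directory layout described above.

centers_dir = processed_data_path / "centers_data" / "tcga" / experiment_id
for center_dir in sorted(p for p in centers_dir.iterdir() if p.is_dir()):
    print(center_dir.name, sorted(f.name for f in center_dir.iterdir()))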

# %%
# ## Running the experiment
#
# We can now run the experiment.
#
# [Substra](https://github.com/substra), the FL framework on which FedPyDESeq2 is
# built, supports a simulated mode that can be run locally on a single machine. We
# will use this mode here.
#
# Let's run our FedPyDESeq2 experiment. This may be done using the
# `run_fedpydeseq2_experiment` wrapper function, which takes the following parameters:
#
# * `n_centers=7`: Our data is distributed across 7 different medical centers
#
# * `backend="subprocess"` and `simulate=True`: We'll run the analysis locally on our
# machine to simulate a federated setup, rather than in a real distributed environment
#
# * `register_data=True`: We'll register our dataset with Substra before analysis.
# In a real federated setup, this would be set to `False` if the data had already
# been registered with Substra.
#
# * `asset_directory`: This directory should contain an `opener.py` file defining an
# Opener class, and a `description.md` file. Here, we copy them from
# [`fedpydeseq2_datasets/assets/tcga`](https://github.com/owkin/fedpydeseq2-datasets/tree/main/fedpydeseq2_datasets/assets/tcga)
#
# * `centers_root_directory`: Where the processed data for each center is stored
#
# * `compute_plan_name`: We'll call this analysis "Example-TCGA-LUAD-pipeline"
# in Substra
#
# * `dataset_name`: We're working with the TCGA-LUAD lung cancer dataset
#
# * `dataset_datasamples_keys_path`: Path to a YAML file containing the keys for our
# data samples. This is only used in the case of a real (unsimulated) federated setup.
#
# * `design_factors`: This should be a list of the design factors we wish to include in
# our analysis. Here, we're studying how "stage" (the cancer stage) affects gene
# expression
#
# * `ref_levels`: We're setting "Non-advanced" as our baseline cancer stage
#
# * `contrast`: This should be a list of three strings, of the form
# `["factor", "alternative_level", "baseline_level"]`. To compare gene expression
# between "Advanced" vs "Non-advanced" stages, we set
# `contrast=["stage", "Advanced", "Non-advanced"]`.
#
# * `refit_cooks=True`: After finding outliers using Cook's distance, we'll refit the
# model without them for more robust results

fl_results = run_fedpydeseq2_experiment(
n_centers=7,
backend="subprocess",
simulate=True,
register_data=True,
asset_directory=Path("assets/tcga").resolve(),
centers_root_directory=processed_data_path
/ "centers_data"
/ "tcga"
/ experiment_id,
compute_plan_name="Example-TCGA-LUAD-pipeline",
dataset_name="TCGA-LUAD",
dataset_datasamples_keys_path=Path(
f"credentials/{experiment_id}-datasamples-keys.yaml"
).resolve(),
design_factors="stage",
ref_levels={"stage": "Non-advanced"},
contrast=["stage", "Advanced", "Non-advanced"],
refit_cooks=True,
)

# %%
# ## Results
# The results are then stored in a `fl_results` dictionary, which does not contain any
# individual sample information.
fl_results.keys()

# %%
# We can then extract the results for our contrast of interest, and store them in a
# pandas DataFrame.

res_df = pd.DataFrame()
res_df["LFC"] = fl_results["LFC"]["stage_Advanced_vs_Non-advanced"]
res_df["pvalue"] = fl_results["p_values"]
res_df["padj"] = fl_results["padj"]

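# Keep only the genes retained by the pipeline: the `non_zero` entry is assumed to
# select genes that are not all-zero across samples.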
res_df = res_df.loc[fl_results["non_zero"], :]

# %%
display(res_df)

# %%
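# As an optional follow-up (a sketch, not part of the original analysis), we could
# keep the genes that are significant at an FDR of 0.05 and rank them by absolute
# log fold change.

significant = res_df[res_df["padj"] < 0.05].copy()
significant = significant.reindex(
    significant["LFC"].abs().sort_values(ascending=False).index
)
display(significant.head(10))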
30 changes: 30 additions & 0 deletions docs/gallery_conf.py
@@ -0,0 +1,30 @@
from pathlib import Path

import fedpydeseq2_datasets
from fedpydeseq2_datasets.download_data.download_data import download_data
from mkdocs_gallery.gen_gallery import DefaultResetArgv

config_path = Path("docs/examples/config/config_luad.yaml").resolve()
raw_data_path = Path("docs/examples/data/raw").resolve()
processed_data_path = Path("docs/examples/data/processed").resolve()

if (raw_data_path / "tcga" / "LUAD").exists() and any(
(raw_data_path / "tcga" / "LUAD").iterdir()
):
print(f"Data already exists in {raw_data_path}, skipping download")
else:
print(f"Downloading data to {raw_data_path}")
download_data(
config_path=config_path,
download_data_directory=Path(
fedpydeseq2_datasets.download_data.download_data.__file__
).parent.resolve(),
raw_data_output_path=raw_data_path,
snakemake_env_name="snakemake_env",
conda_activate_path=None,
)


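# Gallery configuration picked up by mkdocs-gallery through the `conf_script` option
# declared in mkdocs.yml (see the plugin section of this PR).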
conf = {
"reset_argv": DefaultResetArgv(),
}
Empty file added docs/generated/.gitkeep
Empty file.
7 changes: 7 additions & 0 deletions docs/snakemake_env.yaml
@@ -0,0 +1,7 @@
channels:
- conda-forge
- default
- bioconda
dependencies:
- snakemake=8.20.3
- mamba=1.5.10
6 changes: 6 additions & 0 deletions mkdocs.yml
@@ -63,6 +63,11 @@ extra_css:


plugins:
- gallery:
examples_dirs: docs/examples # path to example scripts
gallery_dirs: docs/generated/gallery # where to save generated gallery
conf_script: docs/gallery_conf.py

- search:
- git-revision-date-localized:
type: timeago
@@ -91,6 +96,7 @@ nav:
- Installation: usage/installation.md
- Contributing: usage/contributing.md
- References: usage/references.md
- generated/gallery
- API:
- Home: api/index.md
- Running an experiment: api/fedpydeseq2_pipeline.md