From 0a16e864df5563e249f151adb475404ef4852d2c Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Sat, 13 Nov 2021 09:38:16 -0500 Subject: [PATCH 01/16] Drop LDA. --- docs/api.rst | 2 - examples/03_annotation/03_lda.py | 43 ----- examples/03_annotation/03_plot_lda.py | 74 +++++++++ nimare/annotate/__init__.py | 5 +- nimare/annotate/lda.py | 219 -------------------------- nimare/extract/__init__.py | 2 - nimare/extract/extract.py | 49 ------ nimare/tests/test_utils.py | 25 --- nimare/utils.py | 26 --- 9 files changed, 75 insertions(+), 370 deletions(-) delete mode 100644 examples/03_annotation/03_lda.py create mode 100644 examples/03_annotation/03_plot_lda.py delete mode 100644 nimare/annotate/lda.py diff --git a/docs/api.rst b/docs/api.rst index 4bfbfe257..ae43c8df2 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -112,7 +112,6 @@ API annotate.cogat annotate.gclda - annotate.lda annotate.text annotate.utils @@ -216,7 +215,6 @@ For more information about fetching data from the internet, see :ref:`fetching t extract.fetch_neuroquery extract.fetch_neurosynth extract.download_nidm_pain - extract.download_mallet extract.download_cognitive_atlas extract.download_abstracts extract.download_peaks2maps_model diff --git a/examples/03_annotation/03_lda.py b/examples/03_annotation/03_lda.py deleted file mode 100644 index 5b4ec94be..000000000 --- a/examples/03_annotation/03_lda.py +++ /dev/null @@ -1,43 +0,0 @@ -# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- -# ex: set sts=4 ts=4 sw=4 et: -""" - -.. _annotations_lda: - -================== -LDA topic modeling -================== - -This example trains a latent Dirichlet allocation model with MALLET using abstracts from -Neurosynth. -""" -import os - -import nimare -from nimare import annotate -from nimare.tests.utils import get_test_data_path - -############################################################################### -# Load dataset with abstracts -# --------------------------- -dset = nimare.dataset.Dataset(os.path.join(get_test_data_path(), "neurosynth_laird_studies.json")) - -############################################################################### -# Download MALLET -# --------------- -# MALLET is a Java toolbox for natural language processing. -# While LDA is implemented in some Python libraries, like scikit-learn, -# MALLET appears to do a better job at LDA than other tools. -# LDAModel will download MALLET automatically, but it's included here for clarity. -mallet_dir = nimare.extract.download_mallet() - -############################################################################### -# Run model -# --------- -# This may take some time, so we won't run it in the gallery. -model = annotate.lda.LDAModel(dset.texts, text_column="abstract", n_iters=5) -model.fit() -model.save("lda_model.pkl.gz") - -# Let's remove the model now that you know how to generate it. -os.remove("lda_model.pkl.gz") diff --git a/examples/03_annotation/03_plot_lda.py b/examples/03_annotation/03_plot_lda.py new file mode 100644 index 000000000..e28139bf1 --- /dev/null +++ b/examples/03_annotation/03_plot_lda.py @@ -0,0 +1,74 @@ +# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- +# ex: set sts=4 ts=4 sw=4 et: +""" + +.. _annotations_lda: + +================== +LDA topic modeling +================== + +This example trains a latent Dirichlet allocation model with scikit-learn +using abstracts from Neurosynth. +""" +import os + +import pandas as pd +from sklearn.decomposition import LatentDirichletAllocation + +import nimare +from nimare import annotate +from nimare.tests.utils import get_test_data_path + +############################################################################### +# Load dataset with abstracts +# --------------------------- +dset = nimare.dataset.Dataset( + os.path.join(get_test_data_path(), "neurosynth_laird_studies.json") +) + +############################################################################### +# Extract term counts from the abstracts +# -------------------------------------- +counts_df = annotate.text.generate_counts( + dset.texts, + text_column="abstract", + tfidf=False, + max_df=len(dset.ids) - 2, + min_df=2, +) +vocabulary = counts_df.columns.tolist() +count_values = counts_df.values +study_ids = counts_df.index.tolist() +N_TOPICS = 5 +topic_names = [f"Topic {str(i+1).zfill(3)}" for i in range(N_TOPICS)] + +############################################################################### +# Run model +# --------- +# This may take some time, so we won't run it in the gallery. +model = LatentDirichletAllocation( + n_components=N_TOPICS, + max_iter=1000, + learning_method="online", +) +doc_topic_weights = model.fit_transform(count_values) +doc_topic_weights_df = pd.DataFrame( + index=study_ids, + columns=topic_names, + data=doc_topic_weights, +) +topic_word_weights = model.components_ +topic_word_weights_df = pd.DataFrame( + index=topic_names, + columns=vocabulary, + data=topic_word_weights, +) + +############################################################################### +# View results +# ------------ +doc_topic_weights_df.head() + +############################################################################### +topic_word_weights_df.head() diff --git a/nimare/annotate/__init__.py b/nimare/annotate/__init__.py index 3fe49a8de..f4f5272e5 100644 --- a/nimare/annotate/__init__.py +++ b/nimare/annotate/__init__.py @@ -1,9 +1,8 @@ """Automated annotation tools.""" -from . import cogat, gclda, lda, text, utils +from . import cogat, gclda, text, utils from .cogat import CogAtLemmatizer, expand_counts, extract_cogat from .gclda import GCLDAModel -from .lda import LDAModel from .text import generate_counts __all__ = [ @@ -11,11 +10,9 @@ "expand_counts", "extract_cogat", "GCLDAModel", - "LDAModel", "generate_counts", "cogat", "gclda", - "lda", "text", "utils", ] diff --git a/nimare/annotate/lda.py b/nimare/annotate/lda.py deleted file mode 100644 index c9075fa36..000000000 --- a/nimare/annotate/lda.py +++ /dev/null @@ -1,219 +0,0 @@ -"""Topic modeling with latent Dirichlet allocation via MALLET.""" -import logging -import os -import shutil - -import numpy as np -import pandas as pd - -from .. import references -from ..base import NiMAREBase -from ..due import due -from ..extract import download_mallet, utils -from ..utils import _run_shell_command - -LGR = logging.getLogger(__name__) - - -@due.dcite(references.LDA, description="Introduces LDA.") -@due.dcite(references.MALLET, description="Citation for MALLET toolbox") -@due.dcite( - references.LDAMODEL, - description="First use of LDA for automated annotation of neuroimaging literature.", -) -class LDAModel(NiMAREBase): - """Perform topic modeling using Latent Dirichlet Allocation (LDA). - - Build an LDA [1]_ topic model with the Java toolbox MALLET [2]_, as - performed in [3]_. - - Parameters - ---------- - text_df : :obj:`pandas.DataFrame` - A pandas DataFrame with two columns ('id' and text_column) containing - article text. - text_column : :obj:`str`, optional - Name of column in text_df that contains text. Default is 'abstract'. - n_topics : :obj:`int`, optional - Number of topics to generate. Default=50. - n_iters : :obj:`int`, optional - Number of iterations to run in training topic model. Default=1000. - alpha : :obj:`float` or 'auto', optional - The Dirichlet prior on the per-document topic distributions. - Default: auto, which calculates 50 / n_topics, based on Poldrack et al. - (2012). - beta : :obj:`float`, optional - The Dirichlet prior on the per-topic word distribution. Default: 0.001, - based on Poldrack et al. (2012). - - Attributes - ---------- - commands_ : :obj:`list` of :obj:`str` - List of MALLET commands called to fit model. - - References - ---------- - .. [1] Blei, David M., Andrew Y. Ng, and Michael I. Jordan. "Latent - dirichlet allocation." Journal of machine Learning research 3.Jan - (2003): 993-1022. - .. [2] McCallum, Andrew Kachites. "Mallet: A machine learning for language - toolkit." (2002). - .. [3] Poldrack, Russell A., et al. "Discovering relations between mind, - brain, and mental disorders using topic mapping." PLoS computational - biology 8.10 (2012): e1002707. - https://doi.org/10.1371/journal.pcbi.1002707 - - See Also - -------- - nimare.extract.download_mallet : This function will be called automatically to download MALLET. - """ - - def __init__( - self, text_df, text_column="abstract", n_topics=50, n_iters=1000, alpha="auto", beta=0.001 - ): - mallet_dir = download_mallet() - mallet_bin = os.path.join(mallet_dir, "bin/mallet") - - model_dir = utils._get_dataset_dir("mallet_model") - text_dir = os.path.join(model_dir, "texts") - - if not os.path.isdir(model_dir): - os.mkdir(model_dir) - - if alpha == "auto": - alpha = 50.0 / n_topics - elif not isinstance(alpha, float): - raise ValueError('Argument alpha must be float or "auto"') - - self.params = {"n_topics": n_topics, "n_iters": n_iters, "alpha": alpha, "beta": beta} - self.model_dir = model_dir - - # Check for presence of text files and convert if necessary - if not os.path.isdir(text_dir): - LGR.info("Texts folder not found. Creating text files...") - os.mkdir(text_dir) - - # Remove rows with empty text cells - orig_ids = text_df["id"].tolist() - text_df = text_df.dropna(subset=[text_column]) - keep_ids = text_df["id"].tolist() - - if len(keep_ids) != len(orig_ids): - LGR.info(f"Retaining {len(keep_ids)}/{len(orig_ids)} studies") - - for id_ in text_df["id"].values: - text = text_df.loc[text_df["id"] == id_, text_column].values[0] - with open(os.path.join(text_dir, str(id_) + ".txt"), "w") as fo: - fo.write(text) - - # Run MALLET topic modeling - LGR.info("Compiling MALLET commands...") - import_str = ( - f"{mallet_bin} import-dir " - f"--input {text_dir} " - f"--output {model_dir}/topic-input.mallet " - "--keep-sequence " - "--remove-stopwords" - ) - - train_str = ( - f"{mallet_bin} train-topics " - f"--input {model_dir}/topic-input.mallet " - f"--num-topics {self.params['n_topics']} " - f"--output-doc-topics {model_dir}/doc_topics.txt " - f"--topic-word-weights-file {model_dir}/topic_word_weights.txt " - f"--num-iterations {self.params['n_iters']} " - f"--output-model {model_dir}/saved_model.mallet " - "--random-seed 1 " - f"--alpha {self.params['alpha']} " - f"--beta {self.params['beta']}" - ) - self.commands_ = [import_str, train_str] - - def fit(self): - """ - Fit LDA model to corpus. - - Attributes - ---------- - p_topic_g_doc_ : :obj:`numpy.ndarray` - Probability of each topic given a document - p_word_g_topic_ : :obj:`numpy.ndarray` - Probability of each word given a topic - """ - LGR.info("Generating topics...") - _run_shell_command(self.commands_[0]) - _run_shell_command(self.commands_[1]) - - # Read in and convert doc_topics and topic_keys. - topic_names = [f"topic_{i:03d}" for i in range(self.params["n_topics"])] - - # doc_topics: Topic weights for each paper. - # The conversion here is pretty ugly at the moment. - # First row should be dropped. First column is row number and can be used - # as the index. - # Second column is 'file: /full/path/to/id.txt' <-- Parse to get id. - # After that, odd columns are topic numbers and even columns are the - # weights for the topics in the preceding column. These columns are sorted - # on an individual id basis by the weights. - n_cols = (2 * self.params["n_topics"]) + 1 - dt_df = pd.read_csv( - os.path.join(self.model_dir, "doc_topics.txt"), - delimiter="\t", - skiprows=1, - header=None, - index_col=0, - ) - dt_df = dt_df[dt_df.columns[:n_cols]] - - # Get ids from filenames - dt_df[1] = dt_df[1].apply(self._clean_str) - - # Put weights (even cols) and topics (odd cols) into separate dfs. - weights_df = dt_df[dt_df.columns[2::2]] - weights_df.index = dt_df[1] - weights_df.columns = range(self.params["n_topics"]) - - topics_df = dt_df[dt_df.columns[1:-1:2]] - topics_df.index = dt_df[1] - topics_df.columns = range(self.params["n_topics"]) - - # Sort columns in weights_df separately for each row using topics_df. - sorters_df = topics_df.apply(self._get_sort, axis=1) - weights = weights_df.values - sorters = np.vstack(sorters_df.values) - # there has to be a better way to do this. - for i in range(sorters.shape[0]): - weights[i, :] = weights[i, sorters[i, :]] - - # Define topic names (e.g., topic_000) - p_topic_g_doc_df = pd.DataFrame(columns=topic_names, data=weights, index=dt_df[1]) - p_topic_g_doc_df.index.name = "id" - self.p_topic_g_doc_ = p_topic_g_doc_df.values - self.p_topic_g_doc_df_ = p_topic_g_doc_df - - # Topic word weights - p_word_g_topic_df = pd.read_csv( - os.path.join(self.model_dir, "topic_word_weights.txt"), - dtype=str, - keep_default_na=False, - na_values=[], - sep="\t", - header=None, - names=["topic", "word", "weight"], - ) - p_word_g_topic_df["weight"] = p_word_g_topic_df["weight"].astype(float) - p_word_g_topic_df["topic"] = p_word_g_topic_df["topic"].astype(int) - p_word_g_topic_df = p_word_g_topic_df.pivot(index="topic", columns="word", values="weight") - p_word_g_topic_df = p_word_g_topic_df.div(p_word_g_topic_df.sum(axis=1), axis=0) - self.p_word_g_topic_ = p_word_g_topic_df.values - self.p_word_g_topic_df_ = p_word_g_topic_df - - # Remove all temporary files (text files, model, and outputs). - shutil.rmtree(self.model_dir) - - def _clean_str(self, string): - return os.path.basename(os.path.splitext(string)[0]) - - def _get_sort(self, lst): - return [i[0] for i in sorted(enumerate(lst), key=lambda x: x[1])] diff --git a/nimare/extract/__init__.py b/nimare/extract/__init__.py index 709854be8..71afe7526 100644 --- a/nimare/extract/__init__.py +++ b/nimare/extract/__init__.py @@ -3,7 +3,6 @@ from .extract import ( download_abstracts, download_cognitive_atlas, - download_mallet, download_nidm_pain, download_peaks2maps_model, fetch_neuroquery, @@ -12,7 +11,6 @@ __all__ = [ "download_nidm_pain", - "download_mallet", "download_cognitive_atlas", "download_abstracts", "download_peaks2maps_model", diff --git a/nimare/extract/extract.py b/nimare/extract/extract.py index c1d426b68..82bf2564f 100644 --- a/nimare/extract/extract.py +++ b/nimare/extract/extract.py @@ -305,55 +305,6 @@ def download_nidm_pain(data_dir=None, overwrite=False): return data_dir -def download_mallet(data_dir=None, overwrite=False): - """Download the MALLET toolbox for LDA topic modeling. - - .. versionadded:: 0.0.2 - - Parameters - ---------- - data_dir : :obj:`pathlib.Path` or :obj:`str`, optional - Path where data should be downloaded. By default, files are downloaded in home directory. - overwrite : :obj:`bool`, optional - Whether to overwrite existing files or not. Default is False. - - Returns - ------- - data_dir : :obj:`str` - Updated data directory pointing to MALLET files. - """ - url = "http://mallet.cs.umass.edu/dist/mallet-2.0.7.tar.gz" - - temp_dataset_name = "mallet__temp" - temp_data_dir = _get_dataset_dir(temp_dataset_name, data_dir=data_dir) - - dataset_name = "mallet" - data_dir = temp_data_dir.replace(temp_dataset_name, dataset_name) - - desc_file = op.join(data_dir, "description.txt") - if op.isfile(desc_file) and overwrite is False: - shutil.rmtree(temp_data_dir) - return data_dir - - mallet_file = op.join(temp_data_dir, op.basename(url)) - _download_zipped_file(url, mallet_file) - - with tarfile.open(mallet_file) as tf: - tf.extractall(path=temp_data_dir) - - os.rename(op.join(temp_data_dir, "mallet-2.0.7"), data_dir) - - os.remove(mallet_file) - shutil.rmtree(temp_data_dir) - - with open(desc_file, "w") as fo: - fo.write("The MALLET toolbox for latent Dirichlet allocation.") - - LGR.debug(f"Dataset moved to {data_dir}") - - return data_dir - - def download_cognitive_atlas(data_dir=None, overwrite=False): """Download Cognitive Atlas ontology and extract IDs and relationships. diff --git a/nimare/tests/test_utils.py b/nimare/tests/test_utils.py index a5820cf5c..55679965f 100644 --- a/nimare/tests/test_utils.py +++ b/nimare/tests/test_utils.py @@ -2,7 +2,6 @@ import logging import os import os.path as op -import time import nibabel as nib import numpy as np @@ -165,27 +164,3 @@ def test_mm2vox(): img = utils.get_template(space="mni152_2mm", mask=None) aff = img.affine assert np.array_equal(utils.mm2vox(test, aff), true) - - -def test_run_shell_command(caplog): - """Test _run_shell_command.""" - with caplog.at_level(logging.INFO): - utils._run_shell_command("echo 'output'") - assert "output" in caplog.text - - # Check that the exception is registered as such - with pytest.raises(Exception) as execinfo: - utils._run_shell_command("echo 'Error!' 1>&2;exit 64") - assert "Error!" in str(execinfo.value) - - # Check that the function actually waits until the command completes - dur = 3 - start = time.time() - with caplog.at_level(logging.INFO): - utils._run_shell_command(f"echo 'hi';sleep {dur}s;echo 'bye'") - end = time.time() - - assert "hi" in caplog.text - assert "bye" in caplog.text - duration = end - start - assert duration >= dur diff --git a/nimare/utils.py b/nimare/utils.py index f18519abb..aeec594db 100755 --- a/nimare/utils.py +++ b/nimare/utils.py @@ -5,7 +5,6 @@ import os import os.path as op import re -import subprocess from functools import wraps from tempfile import mkstemp @@ -932,28 +931,3 @@ def _boolean_unmask(data_array, bool_array): unmasked_data[bool_array] = data_array unmasked_data = unmasked_data.T return unmasked_data - - -def _run_shell_command(command, env=None): - """Run a given command with certain environment variables set.""" - merged_env = os.environ - if env: - merged_env.update(env) - - process = subprocess.Popen( - command, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True, - env=merged_env, - ) - while True: - line = process.stdout.readline() - line = str(line, "utf-8")[:-1] - LGR.info(line) - if line == "" and process.poll() is not None: - break - - if process.returncode != 0: - stderr_line = str(process.stderr.read(), "utf-8")[:-1] - raise Exception(f"Non zero return code: {process.returncode}\n{command}\n\n{stderr_line}") From cbc1f2de57025cb8f8c377fa85ba6c43c3ae07d7 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Sat, 13 Nov 2021 11:44:14 -0500 Subject: [PATCH 02/16] Delete 03_lda.py --- examples/03_annotation/03_lda.py | 45 -------------------------------- 1 file changed, 45 deletions(-) delete mode 100644 examples/03_annotation/03_lda.py diff --git a/examples/03_annotation/03_lda.py b/examples/03_annotation/03_lda.py deleted file mode 100644 index ed03792eb..000000000 --- a/examples/03_annotation/03_lda.py +++ /dev/null @@ -1,45 +0,0 @@ -# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- -# ex: set sts=4 ts=4 sw=4 et: -""" - -.. _annotations_lda: - -================== -LDA topic modeling -================== - -This example trains a latent Dirichlet allocation model with MALLET using abstracts from -Neurosynth. -""" -import os - -from nimare import annotate, extract -from nimare.dataset import Dataset -from nimare.utils import get_resource_path - -############################################################################### -# Load dataset with abstracts -# --------------------------- -dset = Dataset( - os.path.join(get_resource_path(), "neurosynth_laird_studies.json") -) - -############################################################################### -# Download MALLET -# --------------- -# MALLET is a Java toolbox for natural language processing. -# While LDA is implemented in some Python libraries, like scikit-learn, -# MALLET appears to do a better job at LDA than other tools. -# LDAModel will download MALLET automatically, but it's included here for clarity. -mallet_dir = extract.download_mallet() - -############################################################################### -# Run model -# --------- -# This may take some time, so we won't run it in the gallery. -model = annotate.lda.LDAModel(dset.texts, text_column="abstract", n_iters=5) -model.fit() -model.save("lda_model.pkl.gz") - -# Let's remove the model now that you know how to generate it. -os.remove("lda_model.pkl.gz") From 926f5ef960dc7a53f19c396d967ca840e500f1ac Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Sat, 13 Nov 2021 12:01:35 -0500 Subject: [PATCH 03/16] Use resources instead of test data. --- examples/03_annotation/03_plot_lda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/03_annotation/03_plot_lda.py b/examples/03_annotation/03_plot_lda.py index e28139bf1..292988bac 100644 --- a/examples/03_annotation/03_plot_lda.py +++ b/examples/03_annotation/03_plot_lda.py @@ -18,13 +18,13 @@ import nimare from nimare import annotate -from nimare.tests.utils import get_test_data_path +from nimare.utils import get_resource_path ############################################################################### # Load dataset with abstracts # --------------------------- dset = nimare.dataset.Dataset( - os.path.join(get_test_data_path(), "neurosynth_laird_studies.json") + os.path.join(get_resource_path(), "neurosynth_laird_studies.json") ) ############################################################################### From fb8429acb401e1868b3210842d2952f6fbebe5af Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Wed, 8 Dec 2021 10:15:01 -0500 Subject: [PATCH 04/16] Bundle sklearn model in new class. --- docs/api.rst | 1 + nimare/annotate/__init__.py | 5 +- nimare/annotate/lda.py | 93 +++++++++++++++++++++++++++++++++++++ nimare/base.py | 24 ++++++++++ 4 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 nimare/annotate/lda.py diff --git a/docs/api.rst b/docs/api.rst index ae43c8df2..410f58a46 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -112,6 +112,7 @@ API annotate.cogat annotate.gclda + annotate.lda annotate.text annotate.utils diff --git a/nimare/annotate/__init__.py b/nimare/annotate/__init__.py index f4f5272e5..3fe49a8de 100644 --- a/nimare/annotate/__init__.py +++ b/nimare/annotate/__init__.py @@ -1,8 +1,9 @@ """Automated annotation tools.""" -from . import cogat, gclda, text, utils +from . import cogat, gclda, lda, text, utils from .cogat import CogAtLemmatizer, expand_counts, extract_cogat from .gclda import GCLDAModel +from .lda import LDAModel from .text import generate_counts __all__ = [ @@ -10,9 +11,11 @@ "expand_counts", "extract_cogat", "GCLDAModel", + "LDAModel", "generate_counts", "cogat", "gclda", + "lda", "text", "utils", ] diff --git a/nimare/annotate/lda.py b/nimare/annotate/lda.py new file mode 100644 index 000000000..870250bd8 --- /dev/null +++ b/nimare/annotate/lda.py @@ -0,0 +1,93 @@ +"""Topic modeling with latent Dirichlet allocation.""" +import pandas as pd +from sklearn.decomposition import LatentDirichletAllocation + +from nimare.annotate.text import generate_counts +from nimare.base import Annotator + + +class LDAModel(Annotator): + """Generate a latent Dirichlet allocation (LDA) topic model. + + This class is a light wrapper around scikit-learn tools for tokenization and LDA. + + Parameters + ---------- + n_topics + max_iter + text_column + + Attributes + ---------- + model : :obj:`sklearn.decomposition.LatentDirichletAllocation` + + See also + -------- + :class:`sklearn.feature_extraction.text.CountVectorizer` + :class:`sklearn.decomposition.LatentDirichletAllocation` + """ + def __init__(self, n_topics, max_iter, text_column="abstract"): + self.n_topics = n_topics + self.max_iter = max_iter + self.text_column = text_column + self.model = LatentDirichletAllocation( + n_components=n_topics, + max_iter=max_iter, + learning_method="online", + ) + + def transform(self, dset): + """Fit the LDA topic model to text from a Dataset. + + Parameters + ---------- + dset + + Returns + ------- + dset + + Attributes + ---------- + distributions_ : :obj:`dict` + A dictionary containing additional distributions produced by the model, including: + + - p_topic_g_word: numpy ndarray of shape (n_topics, n_tokens) containing the + topic-term weights for the model. + - p_topic_g_word_df: pandas DataFrame of shape (n_topics, n_tokens) containing + the topic-term weights for the model. + """ + counts_df = generate_counts( + dset.texts, + text_column=self.text_column, + tfidf=False, + max_df=len(dset.ids) - 2, + min_df=2, + ) + vocabulary = counts_df.columns.tolist() + count_values = counts_df.values + study_ids = counts_df.index.tolist() + topic_names = [f"Topic {str(i+1).zfill(3)}" for i in range(self.n_topics)] + + doc_topic_weights = self.model.fit_transform(count_values) + doc_topic_weights_df = pd.DataFrame( + index=study_ids, + columns=topic_names, + data=doc_topic_weights, + ) + topic_word_weights = self.model.components_ + topic_word_weights_df = pd.DataFrame( + index=topic_names, + columns=vocabulary, + data=topic_word_weights, + ) + self.distributions_ = { + "p_topic_g_word": topic_word_weights, + "p_topic_g_word_df": topic_word_weights_df, + } + + annotations = dset.annotations.copy() + new_annotations = pd.merge(annotations, doc_topic_weights_df) + new_dset = dset.copy() + new_dset.annotations = new_annotations + return new_dset diff --git a/nimare/base.py b/nimare/base.py index 58dd3758c..53c40a0f7 100644 --- a/nimare/base.py +++ b/nimare/base.py @@ -463,6 +463,30 @@ def transform(self, dataset): ) +class Annotator(NiMAREBase): + """Base class for annotators in :mod:`nimare.annotate`. + + Annotators operate like Transformers in that they ingest Datasets and output modified Datasets. + One difference between Annotators and Transformers is that Annotators retain extra information + in a ``distributions_`` attribute. + + .. versionadded:: 0.0.11 + + """ + + def __init__(self): + pass + + @abstractmethod + def transform(self, dataset): + """Add stuff to transformer.""" + # Using attribute check instead of type check to allow fake Datasets for testing. + if not hasattr(dataset, "slice"): + raise ValueError( + f"Argument 'dataset' must be a valid Dataset object, not a {type(dataset)}" + ) + + class Decoder(NiMAREBase): """Base class for decoders in :mod:`nimare.decode`. From 4954886582e43b2ea893da6b562f49cc87432d75 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Wed, 8 Dec 2021 10:46:58 -0500 Subject: [PATCH 05/16] More updates. --- examples/03_annotation/03_plot_lda.py | 50 +++++---------------------- nimare/annotate/lda.py | 11 +++++- 2 files changed, 18 insertions(+), 43 deletions(-) diff --git a/examples/03_annotation/03_plot_lda.py b/examples/03_annotation/03_plot_lda.py index 292988bac..f971f5f6c 100644 --- a/examples/03_annotation/03_plot_lda.py +++ b/examples/03_annotation/03_plot_lda.py @@ -13,62 +13,28 @@ """ import os -import pandas as pd -from sklearn.decomposition import LatentDirichletAllocation - -import nimare -from nimare import annotate +from nimare import annotate, dataset from nimare.utils import get_resource_path ############################################################################### # Load dataset with abstracts # --------------------------- -dset = nimare.dataset.Dataset( - os.path.join(get_resource_path(), "neurosynth_laird_studies.json") -) +dset = dataset.Dataset(os.path.join(get_resource_path(), "neurosynth_laird_studies.json")) ############################################################################### -# Extract term counts from the abstracts -# -------------------------------------- -counts_df = annotate.text.generate_counts( - dset.texts, - text_column="abstract", - tfidf=False, - max_df=len(dset.ids) - 2, - min_df=2, -) -vocabulary = counts_df.columns.tolist() -count_values = counts_df.values -study_ids = counts_df.index.tolist() -N_TOPICS = 5 -topic_names = [f"Topic {str(i+1).zfill(3)}" for i in range(N_TOPICS)] +# Initialize LDA model +# -------------------- +model = annotate.lda.LDAModel(n_topics=5, max_iter=1000, text_column="abstract") ############################################################################### # Run model # --------- -# This may take some time, so we won't run it in the gallery. -model = LatentDirichletAllocation( - n_components=N_TOPICS, - max_iter=1000, - learning_method="online", -) -doc_topic_weights = model.fit_transform(count_values) -doc_topic_weights_df = pd.DataFrame( - index=study_ids, - columns=topic_names, - data=doc_topic_weights, -) -topic_word_weights = model.components_ -topic_word_weights_df = pd.DataFrame( - index=topic_names, - columns=vocabulary, - data=topic_word_weights, -) +new_dset = model.transform(dset) ############################################################################### # View results # ------------ -doc_topic_weights_df.head() +new_dset.annotations.head() ############################################################################### -topic_word_weights_df.head() +model.distributions_["p_topic_g_word_df"].head() diff --git a/nimare/annotate/lda.py b/nimare/annotate/lda.py index 870250bd8..30cdb5039 100644 --- a/nimare/annotate/lda.py +++ b/nimare/annotate/lda.py @@ -2,10 +2,17 @@ import pandas as pd from sklearn.decomposition import LatentDirichletAllocation +from nimare import references from nimare.annotate.text import generate_counts from nimare.base import Annotator +from nimare.due import due +@due.dcite(references.LDA, description="Introduces LDA.") +@due.dcite( + references.LDAMODEL, + description="First use of LDA for automated annotation of neuroimaging literature.", +) class LDAModel(Annotator): """Generate a latent Dirichlet allocation (LDA) topic model. @@ -21,11 +28,12 @@ class LDAModel(Annotator): ---------- model : :obj:`sklearn.decomposition.LatentDirichletAllocation` - See also + See Also -------- :class:`sklearn.feature_extraction.text.CountVectorizer` :class:`sklearn.decomposition.LatentDirichletAllocation` """ + def __init__(self, n_topics, max_iter, text_column="abstract"): self.n_topics = n_topics self.max_iter = max_iter @@ -67,6 +75,7 @@ def transform(self, dset): vocabulary = counts_df.columns.tolist() count_values = counts_df.values study_ids = counts_df.index.tolist() + # LDA50__1_word1_word2_word3 topic_names = [f"Topic {str(i+1).zfill(3)}" for i in range(self.n_topics)] doc_topic_weights = self.model.fit_transform(count_values) From bdac98cf6c8aaa0d96c9581d67d2fe3ad25fa042 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Wed, 8 Dec 2021 11:01:57 -0500 Subject: [PATCH 06/16] Fix. --- nimare/annotate/lda.py | 4 +++- nimare/base.py | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/nimare/annotate/lda.py b/nimare/annotate/lda.py index 30cdb5039..16bc5b43a 100644 --- a/nimare/annotate/lda.py +++ b/nimare/annotate/lda.py @@ -96,7 +96,9 @@ def transform(self, dset): } annotations = dset.annotations.copy() - new_annotations = pd.merge(annotations, doc_topic_weights_df) + new_annotations = pd.merge( + annotations, doc_topic_weights_df, left_on="id", right_index=True + ) new_dset = dset.copy() new_dset.annotations = new_annotations return new_dset diff --git a/nimare/base.py b/nimare/base.py index 53c40a0f7..904d28978 100644 --- a/nimare/base.py +++ b/nimare/base.py @@ -470,6 +470,9 @@ class Annotator(NiMAREBase): One difference between Annotators and Transformers is that Annotators retain extra information in a ``distributions_`` attribute. + Depending on the Annotator, they should accept either a text column or a set of annotations to + use. + .. versionadded:: 0.0.11 """ From 0e80ab03852b9df68e8d49184a45f17ba1d245c2 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Wed, 8 Dec 2021 11:26:15 -0500 Subject: [PATCH 07/16] Add test. --- nimare/annotate/lda.py | 2 +- nimare/tests/test_annotate_lda.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 nimare/tests/test_annotate_lda.py diff --git a/nimare/annotate/lda.py b/nimare/annotate/lda.py index 16bc5b43a..0e0b26a76 100644 --- a/nimare/annotate/lda.py +++ b/nimare/annotate/lda.py @@ -76,7 +76,7 @@ def transform(self, dset): count_values = counts_df.values study_ids = counts_df.index.tolist() # LDA50__1_word1_word2_word3 - topic_names = [f"Topic {str(i+1).zfill(3)}" for i in range(self.n_topics)] + topic_names = [f"LDA{self.n_topics}__{i}" for i in range(self.n_topics)] doc_topic_weights = self.model.fit_transform(count_values) doc_topic_weights_df = pd.DataFrame( diff --git a/nimare/tests/test_annotate_lda.py b/nimare/tests/test_annotate_lda.py new file mode 100644 index 000000000..9a9687731 --- /dev/null +++ b/nimare/tests/test_annotate_lda.py @@ -0,0 +1,15 @@ +"""Test nimare.annotate.lda (LDA).""" +from nimare import annotate + + +def test_lda(testdata_laird): + """A smoke test for LDA.""" + N_TOPICS = 5 + model = annotate.lda.LDAModel( + n_topics=N_TOPICS, + max_iter=100, + text_column="abstract", + ) + new_dset = model.transform(testdata_laird) + topic_columns = [c for c in new_dset.annotations.columns if c.startswith("LDA")] + assert len(topic_columns) == N_TOPICS From 01b5873b7d5b3de5233b5749da8a8c835ef8f9ea Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Wed, 8 Dec 2021 12:10:23 -0500 Subject: [PATCH 08/16] Update 03_plot_lda.py --- examples/03_annotation/03_plot_lda.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/examples/03_annotation/03_plot_lda.py b/examples/03_annotation/03_plot_lda.py index f971f5f6c..44657157c 100644 --- a/examples/03_annotation/03_plot_lda.py +++ b/examples/03_annotation/03_plot_lda.py @@ -13,6 +13,8 @@ """ import os +import pandas as pd + from nimare import annotate, dataset from nimare.utils import get_resource_path @@ -38,3 +40,17 @@ ############################################################################### model.distributions_["p_topic_g_word_df"].head() + +############################################################################### +n_top_terms = 10 +top_term_df = model.distributions_["p_topic_g_word_df"].T +temp_df = top_term_df.copy() +top_term_df = pd.DataFrame(columns=top_term_df.columns, index=range(n_top_terms)) +top_term_df.index.name = "Token" +for col in top_term_df.columns: + top_tokens = temp_df.sort_values(by=col, ascending=False).index.tolist()[:n_top_terms] + top_term_df.loc[:, col] = top_tokens + +top_term_df = top_term_df[top_term_df.columns[:n_top_terms]] + +top_term_df.head() From 2b404c38c8b438a7b67d2811eddd2ecbbab74e18 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Wed, 8 Dec 2021 12:29:06 -0500 Subject: [PATCH 09/16] Improve things. --- examples/03_annotation/03_plot_lda.py | 10 +++++----- nimare/annotate/lda.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/03_annotation/03_plot_lda.py b/examples/03_annotation/03_plot_lda.py index 44657157c..62ee5ee20 100644 --- a/examples/03_annotation/03_plot_lda.py +++ b/examples/03_annotation/03_plot_lda.py @@ -36,10 +36,12 @@ ############################################################################### # View results # ------------ -new_dset.annotations.head() +# Given that the new annotations DataFrame is very wide (many terms), +# but also very short (5 studies), we will transpose it before presenting it. +new_dset.annotations.T.head(10) ############################################################################### -model.distributions_["p_topic_g_word_df"].head() +model.distributions_["p_topic_g_word_df"].T.head(10) ############################################################################### n_top_terms = 10 @@ -51,6 +53,4 @@ top_tokens = temp_df.sort_values(by=col, ascending=False).index.tolist()[:n_top_terms] top_term_df.loc[:, col] = top_tokens -top_term_df = top_term_df[top_term_df.columns[:n_top_terms]] - -top_term_df.head() +top_term_df diff --git a/nimare/annotate/lda.py b/nimare/annotate/lda.py index 0e0b26a76..ef5890b7c 100644 --- a/nimare/annotate/lda.py +++ b/nimare/annotate/lda.py @@ -76,7 +76,7 @@ def transform(self, dset): count_values = counts_df.values study_ids = counts_df.index.tolist() # LDA50__1_word1_word2_word3 - topic_names = [f"LDA{self.n_topics}__{i}" for i in range(self.n_topics)] + topic_names = [f"LDA{self.n_topics}__{i + 1}" for i in range(self.n_topics)] doc_topic_weights = self.model.fit_transform(count_values) doc_topic_weights_df = pd.DataFrame( From 18d931beead8b5dd122ef1f68b4fbe40ec11c811 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Wed, 8 Dec 2021 14:13:57 -0500 Subject: [PATCH 10/16] Link to CBMA documentation. --- docs/api.rst | 4 +++- docs/cbma.rst | 6 +++--- docs/decoding.rst | 2 -- examples/02_meta-analyses/README.txt | 3 +++ examples/04_decoding/README.txt | 2 +- 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 410f58a46..9d5420fbb 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -24,6 +24,8 @@ API :mod:`nimare.meta`: Meta-analytic algorithms -------------------------------------------------- +For more information about the components of coordinate-based meta-analysis in NiMARE, see :doc:`cbma`. + .. automodule:: nimare.meta :no-members: :no-inherited-members: @@ -121,7 +123,7 @@ API :mod:`nimare.decode`: Functional characterization analysis ----------------------------------------------------------- -For more information about functional characterization analysis, see :ref:`decoding methods`. +For more information about functional characterization analysis, see :doc:`decoding`. .. automodule:: nimare.decode :no-members: diff --git a/docs/cbma.rst b/docs/cbma.rst index 8d36933a9..1d9ca0817 100644 --- a/docs/cbma.rst +++ b/docs/cbma.rst @@ -23,7 +23,7 @@ The kernel used to create the modeled activation map varies across approaches, b the :class:`ALEKernel`, which convolves coordinates with a 3D Gaussian distribution, and the :class:`MKDAKernel`, which creates a binary sphere around each coordinate. -.. warning:: +.. important:: While the modeled activation map is an estimate of the original statistical map, that doesn't mean that modeled activation maps can actually be used as statistical maps. We still need meta-analytic algorithms that are designed for coordinates, rather than images. @@ -57,7 +57,7 @@ and then comparing the summary statistics from the real Dataset to these "null" This method may take a long time, and is only slightly more accurate than the approximate method, as long as there are enough iterations. -In general, we would recommend using the approximate method. +**In general, we recommend using the approximate method.** Multiple comparisons correction ------------------------------- @@ -79,7 +79,7 @@ These methods can be broadly separated into two groups: generic methods and Esti Generic methods rely on tools like ``statsmodels`` to correct the results as an array, without accounting for any of the idiosyncrasies of neuroimaging data (e.g., autocorrelation). One example of a generic method is the "bonferroni" method for the FWECorrector. -**We do not recommend using these methods.** +**We do not recommend using the generic methods.** Estimator-specific methods are approaches that are implemented within the Estimator as class methods that are then called by the Corrector. diff --git a/docs/decoding.rst b/docs/decoding.rst index b865f8351..5fd529b6b 100644 --- a/docs/decoding.rst +++ b/docs/decoding.rst @@ -1,7 +1,5 @@ .. include:: links.rst -.. _decoding methods: - Meta-analytic functional decoding ================================= diff --git a/examples/02_meta-analyses/README.txt b/examples/02_meta-analyses/README.txt index 36e715c8d..ff31d0dd8 100644 --- a/examples/02_meta-analyses/README.txt +++ b/examples/02_meta-analyses/README.txt @@ -5,3 +5,6 @@ Performing meta-analyses NiMARE implements a number of coordinate- and image-based meta-analysis algorithms in its :mod:`nimare.meta` module. In the examples below, we exhibit a range of meta-analyses that can be done with coordinates and/or images in NiMARE. + +For more information about the components that go into coordinate-based meta-analyses in NiMARE, see :doc:`../cbma`, +as well as :doc:`../outputs`. diff --git a/examples/04_decoding/README.txt b/examples/04_decoding/README.txt index d3b628c47..f014cc948 100644 --- a/examples/04_decoding/README.txt +++ b/examples/04_decoding/README.txt @@ -5,4 +5,4 @@ Functional characterization analysis Functional characterization analysis refers to methods which use meta-analytic databases to characterize, or "decode", brain regions or statistical maps in terms of tasks and/or mental processes. -For more information about functional characterization analysis, see :ref:`decoding methods`. +For more information about functional characterization analysis, see :doc:`../decoding`. From de052b168d5d2ac0e84ac1c78069d98855829ce2 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Wed, 8 Dec 2021 14:15:35 -0500 Subject: [PATCH 11/16] Update 03_plot_lda.py --- examples/03_annotation/03_plot_lda.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/03_annotation/03_plot_lda.py b/examples/03_annotation/03_plot_lda.py index 62ee5ee20..02eef76e5 100644 --- a/examples/03_annotation/03_plot_lda.py +++ b/examples/03_annotation/03_plot_lda.py @@ -36,11 +36,11 @@ ############################################################################### # View results # ------------ -# Given that the new annotations DataFrame is very wide (many terms), -# but also very short (5 studies), we will transpose it before presenting it. -new_dset.annotations.T.head(10) +# This DataFrame is very large, so we will only show a slice of it. +new_dset.annotations[new_dset.annotations.columns[:10]].head(10) ############################################################################### +# Given that this DataFrame is very wide (many terms), we will transpose it before presenting it. model.distributions_["p_topic_g_word_df"].T.head(10) ############################################################################### From 657db36565f1a547e887a9ef47d145259b3ab04a Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Wed, 8 Dec 2021 14:43:51 -0500 Subject: [PATCH 12/16] Update api.rst --- docs/api.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/api.rst b/docs/api.rst index 9d5420fbb..06a5fb195 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -330,4 +330,5 @@ For more information about fetching data from the internet, see :ref:`fetching t base.Estimator base.MetaEstimator base.Transformer + base.Annotator base.Decoder From c1d726e267f73d0a81d7f1a87156e720a1f5ba5c Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Wed, 8 Dec 2021 16:08:03 -0500 Subject: [PATCH 13/16] More cleanup. --- nimare/annotate/lda.py | 2 +- nimare/base.py | 33 ++++++++++++++++++++++++++------- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/nimare/annotate/lda.py b/nimare/annotate/lda.py index ef5890b7c..840a1dbf2 100644 --- a/nimare/annotate/lda.py +++ b/nimare/annotate/lda.py @@ -44,7 +44,7 @@ def __init__(self, n_topics, max_iter, text_column="abstract"): learning_method="online", ) - def transform(self, dset): + def _transform(self, dset): """Fit the LDA topic model to text from a Dataset. Parameters diff --git a/nimare/base.py b/nimare/base.py index 904d28978..fb996d0dd 100644 --- a/nimare/base.py +++ b/nimare/base.py @@ -470,25 +470,44 @@ class Annotator(NiMAREBase): One difference between Annotators and Transformers is that Annotators retain extra information in a ``distributions_`` attribute. - Depending on the Annotator, they should accept either a text column or a set of annotations to - use. + Depending on the Annotator, they may accept either a text column or a set of labels to use. .. versionadded:: 0.0.11 """ - def __init__(self): - pass - - @abstractmethod def transform(self, dataset): - """Add stuff to transformer.""" + """Annotate a dataset. + + Parameters + ---------- + dataset : :obj:`nimare.dataset.Dataset` + Dataset to annotate. + + Returns + ------- + dataset : :obj:`nimare.dataset.Dataset` + Updated Dataset with new annotations added to the ``Dataset.annotations`` attribute. + + Notes + ----- + The `transform` method is a light wrapper that runs input validation and preprocessing + before fitting the actual model. Annotators' individual "transforming" methods are + implemented as `_transform`, although users should call `transform`. + """ # Using attribute check instead of type check to allow fake Datasets for testing. if not hasattr(dataset, "slice"): raise ValueError( f"Argument 'dataset' must be a valid Dataset object, not a {type(dataset)}" ) + dataset = self._transform(dataset) + return dataset + + @abstractmethod + def _transform(self, dataset): + pass + class Decoder(NiMAREBase): """Base class for decoders in :mod:`nimare.decode`. From 075a5f0cd073602fc5b5f20177a8523687eb1fdc Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Thu, 16 Dec 2021 14:09:48 -0500 Subject: [PATCH 14/16] Remove Annotator class. The Annotator and Annotation classes will be developed in #618. --- nimare/annotate/lda.py | 77 ++++++++++++++++++++++--------- nimare/base.py | 46 ------------------ nimare/tests/test_annotate_lda.py | 13 +++++- 3 files changed, 68 insertions(+), 68 deletions(-) diff --git a/nimare/annotate/lda.py b/nimare/annotate/lda.py index 840a1dbf2..4ad9e46a1 100644 --- a/nimare/annotate/lda.py +++ b/nimare/annotate/lda.py @@ -4,7 +4,7 @@ from nimare import references from nimare.annotate.text import generate_counts -from nimare.base import Annotator +from nimare.base import NiMAREBase from nimare.due import due @@ -13,57 +13,94 @@ references.LDAMODEL, description="First use of LDA for automated annotation of neuroimaging literature.", ) -class LDAModel(Annotator): +class LDAModel(NiMAREBase): """Generate a latent Dirichlet allocation (LDA) topic model. This class is a light wrapper around scikit-learn tools for tokenization and LDA. Parameters ---------- - n_topics - max_iter - text_column + n_topics : :obj:`int` + Number of topics for topic model. This corresponds to the model's ``n_components`` + parameter. Must be an integer >= 1. + max_iter : :obj:`int`, optional + Maximum number of iterations to use during model fitting. Default = 1000. + alpha : :obj:`float` or None, optional + The ``alpha`` value for the model. This corresponds to the model's ``doc_topic_prior`` + parameter. Default is None, which evaluates to ``1 / n_topics``, as was used in [2]_. + beta : :obj:`float` or None, optional + The ``beta`` value for the model. This corresponds to the model's ``topic_word_prior`` + parameter. If None, it evaluates to ``1 / n_topics``. + Default is 0.001, which was used in [2]_. + text_column : :obj:`str`, optional + The source of text to use for the model. This should correspond to an existing column + in the :py:attr:`~nimare.dataset.Dataset.texts` attribute. Default is "abstract". Attributes ---------- - model : :obj:`sklearn.decomposition.LatentDirichletAllocation` + model : :obj:`~sklearn.decomposition.LatentDirichletAllocation` + + Notes + ----- + Latent Dirichlet allocation was first developed in [1]_, and was first applied to neuroimaging + articles in [2]_. + + References + ---------- + .. [1] Blei, David M., Andrew Y. Ng, and Michael I. Jordan. "Latent + dirichlet allocation." Journal of machine Learning research 3.Jan + (2003): 993-1022. + .. [2] Poldrack, Russell A., et al. "Discovering relations between mind, + brain, and mental disorders using topic mapping." PLoS computational + biology 8.10 (2012): e1002707. + https://doi.org/10.1371/journal.pcbi.1002707 See Also -------- - :class:`sklearn.feature_extraction.text.CountVectorizer` - :class:`sklearn.decomposition.LatentDirichletAllocation` + :class:`~sklearn.feature_extraction.text.CountVectorizer`: Used to build a vocabulary of terms + and their associated counts from texts in the ``self.text_column`` of the Dataset's + ``texts`` attribute. + :class:`~sklearn.decomposition.LatentDirichletAllocation`: Used to train the LDA model. """ - def __init__(self, n_topics, max_iter, text_column="abstract"): + def __init__(self, n_topics, max_iter=1000, alpha=None, beta=0.001, text_column="abstract"): self.n_topics = n_topics self.max_iter = max_iter + self.alpha = alpha + self.beta = beta self.text_column = text_column + self.model = LatentDirichletAllocation( n_components=n_topics, max_iter=max_iter, learning_method="online", + doc_topic_prior=alpha, + topic_word_prior=beta, ) - def _transform(self, dset): + def fit(self, dset): """Fit the LDA topic model to text from a Dataset. Parameters ---------- - dset + dset : :obj:`~nimare.dataset.Dataset` + A Dataset with, at minimum, text available in the ``self.text_column`` column of its + :py:attr:`~nimare.dataset.Dataset.texts` attribute. Returns ------- - dset + dset : :obj:`~nimare.dataset.Dataset` + A new Dataset with an updated :py:attr:`~nimare.dataset.Dataset.annotations` attribute. Attributes ---------- distributions_ : :obj:`dict` A dictionary containing additional distributions produced by the model, including: - - p_topic_g_word: numpy ndarray of shape (n_topics, n_tokens) containing the - topic-term weights for the model. - - p_topic_g_word_df: pandas DataFrame of shape (n_topics, n_tokens) containing - the topic-term weights for the model. + - ``p_topic_g_word``: :obj:`numpy.ndarray` of shape (n_topics, n_tokens) + containing the topic-term weights for the model. + - ``p_topic_g_word_df``: :obj:`pandas.DataFrame` of shape (n_topics, n_tokens) + containing the topic-term weights for the model. """ counts_df = generate_counts( dset.texts, @@ -75,7 +112,7 @@ def _transform(self, dset): vocabulary = counts_df.columns.tolist() count_values = counts_df.values study_ids = counts_df.index.tolist() - # LDA50__1_word1_word2_word3 + # TODO: LDA50__1_word1_word2_word3 topic_names = [f"LDA{self.n_topics}__{i + 1}" for i in range(self.n_topics)] doc_topic_weights = self.model.fit_transform(count_values) @@ -96,9 +133,7 @@ def _transform(self, dset): } annotations = dset.annotations.copy() - new_annotations = pd.merge( - annotations, doc_topic_weights_df, left_on="id", right_index=True - ) + annotations = pd.merge(annotations, doc_topic_weights_df, left_on="id", right_index=True) new_dset = dset.copy() - new_dset.annotations = new_annotations + new_dset.annotations = annotations return new_dset diff --git a/nimare/base.py b/nimare/base.py index 53464defd..e92618cee 100644 --- a/nimare/base.py +++ b/nimare/base.py @@ -463,52 +463,6 @@ def transform(self, dataset): ) -class Annotator(NiMAREBase): - """Base class for annotators in :mod:`nimare.annotate`. - - Annotators operate like Transformers in that they ingest Datasets and output modified Datasets. - One difference between Annotators and Transformers is that Annotators retain extra information - in a ``distributions_`` attribute. - - Depending on the Annotator, they may accept either a text column or a set of labels to use. - - .. versionadded:: 0.0.11 - - """ - - def transform(self, dataset): - """Annotate a dataset. - - Parameters - ---------- - dataset : :obj:`nimare.dataset.Dataset` - Dataset to annotate. - - Returns - ------- - dataset : :obj:`nimare.dataset.Dataset` - Updated Dataset with new annotations added to the ``Dataset.annotations`` attribute. - - Notes - ----- - The `transform` method is a light wrapper that runs input validation and preprocessing - before fitting the actual model. Annotators' individual "transforming" methods are - implemented as `_transform`, although users should call `transform`. - """ - # Using attribute check instead of type check to allow fake Datasets for testing. - if not hasattr(dataset, "slice"): - raise ValueError( - f"Argument 'dataset' must be a valid Dataset object, not a {type(dataset)}" - ) - - dataset = self._transform(dataset) - return dataset - - @abstractmethod - def _transform(self, dataset): - pass - - class Decoder(NiMAREBase): """Base class for decoders in :mod:`~nimare.decode`. diff --git a/nimare/tests/test_annotate_lda.py b/nimare/tests/test_annotate_lda.py index 9a9687731..550c3e7fc 100644 --- a/nimare/tests/test_annotate_lda.py +++ b/nimare/tests/test_annotate_lda.py @@ -1,4 +1,7 @@ """Test nimare.annotate.lda (LDA).""" +import numpy as np +import pandas as pd + from nimare import annotate @@ -10,6 +13,14 @@ def test_lda(testdata_laird): max_iter=100, text_column="abstract", ) - new_dset = model.transform(testdata_laird) + new_dset = model.fit(testdata_laird) topic_columns = [c for c in new_dset.annotations.columns if c.startswith("LDA")] assert len(topic_columns) == N_TOPICS + + assert hasattr(model, "distributions_") + assert "p_topic_g_word" in model.distributions_.keys() + assert isinstance(model.distributions_["p_topic_g_word"], np.ndarray) + assert model.distributions_["p_topic_g_word"].shape[0] == N_TOPICS + assert "p_topic_g_word_df" in model.distributions_.keys() + assert isinstance(model.distributions_["p_topic_g_word_df"], pd.DataFrame) + assert model.distributions_["p_topic_g_word_df"].shape[0] == N_TOPICS From c8dce8e971ffd4fa957a226b419fb242ccb159fa Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Thu, 16 Dec 2021 14:26:44 -0500 Subject: [PATCH 15/16] Update 03_plot_lda.py --- examples/03_annotation/03_plot_lda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/03_annotation/03_plot_lda.py b/examples/03_annotation/03_plot_lda.py index 02eef76e5..6e38738ac 100644 --- a/examples/03_annotation/03_plot_lda.py +++ b/examples/03_annotation/03_plot_lda.py @@ -31,7 +31,7 @@ ############################################################################### # Run model # --------- -new_dset = model.transform(dset) +new_dset = model.fit(dset) ############################################################################### # View results From 1435bc6a55b4127cb4083861e71bae1feacd1f16 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Thu, 6 Jan 2022 15:38:03 -0500 Subject: [PATCH 16/16] Remove undefined base class. --- docs/api.rst | 1 - examples/03_annotation/03_plot_lda.py | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 06a5fb195..9d5420fbb 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -330,5 +330,4 @@ For more information about fetching data from the internet, see :ref:`fetching t base.Estimator base.MetaEstimator base.Transformer - base.Annotator base.Decoder diff --git a/examples/03_annotation/03_plot_lda.py b/examples/03_annotation/03_plot_lda.py index 6e38738ac..8dc5d2a6c 100644 --- a/examples/03_annotation/03_plot_lda.py +++ b/examples/03_annotation/03_plot_lda.py @@ -15,13 +15,14 @@ import pandas as pd -from nimare import annotate, dataset +from nimare import annotate +from nimare.dataset import Dataset from nimare.utils import get_resource_path ############################################################################### # Load dataset with abstracts # --------------------------- -dset = dataset.Dataset(os.path.join(get_resource_path(), "neurosynth_laird_studies.json")) +dset = Dataset(os.path.join(get_resource_path(), "neurosynth_laird_studies.json")) ############################################################################### # Initialize LDA model