Merge pull request #34 from wasade/placements

Placements
biocore · Jan 24, 2022 · 7d1b24c · 7d1b24c
2 parents 8fcbc45 + 9a4089f
commit 7d1b24c
Show file tree

Hide file tree

Showing 17 changed files with 1,374 additions and 14 deletions.
diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@ Improved Octo Waddle
 --------------------
 
 An implementation of the balanced parentheses tree structure as described by
-Cordova and Navarro (http://www.dcc.uchile.cl/~gnavarro/ps/tcs16.2.pdf).
+[Cordova and Navarro](http://www.dcc.uchile.cl/~gnavarro/ps/tcs16.2.pdf).
 
 Install notes
 -------------
@@ -12,6 +12,43 @@ problem of requiring numpy and cython for setup.py to execute. The package is
 named iow in pypi as "bp" was taken at time of registration.
 
 ```
-conda install numpy cython
-pip install iow
+$ conda create --name bp python=3.8
+$ conda activate bp
+$ conda install numpy cython
+$ pip install iow
 ```
+
+Developer notes
+---------------
+
+If pulling the source, please note that we use a submodule and Github does not
+by default bring it down. After a clone, please run:
+
+```
+$ git submodule update --init --recursive
+```
+
+Fragment insertion
+------------------
+
+BP supports the [jplace format](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0031009). Fragments can be inserted using either fully-resolved or multifurcation mode to resolve multiple placements to the same edge. In fully resolved, the edge placed against is broken N times where N is the number of fragments on the edge. In multifurcation, a new node is constructed as the average of the distal length for the N fragments, and a separate multifurcation node is added which encompasses the placed fragments.
+
+Insertions can be handled by the command line following install:
+
+```
+$ bp placement --help
+Usage: bp placement [OPTIONS]
+
+Options:
+  --placements PATH               jplace formatted data  [required]
+  --output PATH                   Where to write the resulting newick
+                                  [required]
+
+  --method [fully-resolved|multifurcating]
+                                  Whether to fully resolve or multifurcate
+                                  [required]
+
+  --help                          Show this message and exit.
+```
+
+Note that the multifurcating support relies on GPL code derived from the Genesis project. That code and LICENSE can be found under `bp/GPL`.
diff --git a/bp/GPL/Genesis_LICENSE.txt b/bp/GPL/Genesis_LICENSE.txt
diff --git a/bp/GPL/__init__.py b/bp/GPL/__init__.py
@@ -0,0 +1,3 @@
+from ._insert import insert_multifurcating
+
+__all__ = ['insert_multifurcating']
diff --git a/bp/GPL/_insert.pyx b/bp/GPL/_insert.pyx
@@ -0,0 +1,127 @@
+# encoding: utf-8
+# cython: profile=False, boundscheck=False, wraparound=False
+
+from .._bp cimport BP
+from bp._insert import _insert_setup, TreeNodeNoValidate
+from .. import to_skbio_treenode
+import pandas as pd
+import json
+import skbio
+cimport cython
+
+
+# this method described here was derived from Genesis
+# https://github.com/lczech/genesis
+# The license for Genesis is included herein under
+# Genesis_LICENSE.txt
+def insert_multifurcating(object placements, BP bptree):
+    """Update the backbone, with multifurcation for edges with multiple queries
+
+    Parameters
+    ----------
+    placements : pd.DataFrame
+        jplace data represented as a DataFrame
+    bptree : bp.BP
+        An instance of a BP tree, this is expected to contain edge numbers
+        and correspond to the backbone for the jplace data
+
+    Note
+    ----
+    This method was derived directly from the Genesis codebase, and is
+    therefore GPL.
+
+    Returns
+    -------
+    skbio.TreeNode
+        A tree with the fragments placed
+    """
+    # TODO: profile, type and re-profile
+    placements, sktree, node_lookup = \
+        _insert_setup(placements, bptree, 'multifurcating')
+
+    # it is much faster to bulk allocate than construct on the fly, so let's
+    # do that
+    new_parents = [TreeNodeNoValidate()
+                   for _ in range(len(placements['edge_num'].unique()))]
+    parent_idx = 0
+
+    new_bridges = [TreeNodeNoValidate()
+                   for _ in range(placements['edge_num'].duplicated().sum())]
+    bridge_idx = 0
+
+    for edge, edge_grp in placements.groupby('edge_num'):
+        # update topology
+        existing_node = node_lookup[edge]
+        current_parent = existing_node.parent
+        current_parent.remove(existing_node)
+
+        # get a pre-allocated node
+        new_parent = new_parents[parent_idx]
+        parent_idx += 1  # make sure we update our offset
+
+        # gather detail on our minimal placement
+        min_frag = edge_grp.iloc[0]
+        min_pendant = min_frag['pendant_length']
+        min_distal = min_frag['distal_length']
+        new_node = min_frag['node']
+
+        if len(edge_grp) > 1:
+            # if we have multiple placements, we construct a node that contains
+            # all of the placements, and place this under a "bridge" such that
+            # it is a sister to the existing node.
+
+            # derived from
+            # https://github.com/lczech/genesis/blob/98c064d8e3e2efaa97da33c9263f6fef3724f0a5/lib/genesis/placement/function/tree.cpp#L295
+
+            # update topology
+            bridge = new_bridges[bridge_idx]
+            bridge_idx += 1
+            bridge.append(existing_node)
+            bridge.append(new_parent)
+            current_parent.append(bridge)
+            new_parent.append(new_node)
+
+            # Gappa uses the average distal for joining the node encompassing
+            # placements and existing back to the tree. As we are subtracting
+            # against the existing node, it is possible to introduce a negative
+            # branch length, so we enforce a minimum of 0.0
+            avg_prox_len = edge_grp['distal_length'].mean()
+            bridge.length = max(0.0, existing_node.length - avg_prox_len)
+
+            # update edges. the first node we place has a length of zero as
+            # its parent accounts for its pendant
+            existing_node.length = avg_prox_len
+            new_parent.length = min_pendant
+            new_node.length = 0.0
+
+            for i in range(1, len(edge_grp)):
+                # gather placement detail
+                frag_row = edge_grp.iloc[i]
+                frag_node = frag_row['node']
+
+                # update topology
+                new_parent.append(frag_node)
+
+                # update the branch length. Note that "frag_node" has its
+                # length first set to its pendant during the preallocation
+                # step. we subtract the parent pendant to avoid counting
+                # extra edge length. This should never be < 0 as the data
+                # are presorted by pendant length, so the fragment evaluated
+                # here is assured to have a > length than the pendant used
+                # with the parent.
+                frag_node.length = frag_node.length - new_parent.length
+        else:
+            # if we only have a single placement, we place the fragment as a
+            # sister to the existing node.
+
+            # update topology
+            current_parent.append(new_parent)
+            new_parent.append(existing_node)
+            new_parent.append(new_node)
+
+            # update branch lengths
+            new_node.length = min_pendant
+            new_parent.length = existing_node.length - min_distal
+            existing_node.length = min_distal
+
+    return sktree
diff --git a/bp/GPL/tests/test_insert.py b/bp/GPL/tests/test_insert.py
@@ -0,0 +1,34 @@
+import unittest
+import pkg_resources
+from bp import parse_jplace
+from bp.GPL import insert_multifurcating
+import skbio
+
+
+class InsertTests(unittest.TestCase):
+    package = 'bp.tests'
+
+    def setUp(self):
+        self.jplacedata_multiple = \
+            open(self.get_data_path('300/placement_mul.jplace')).read()
+        self.final_multiple_multifurcating = \
+            skbio.TreeNode.read(self.get_data_path('300/placement_mul.newick'))
+
+    def get_data_path(self, filename):
+        # adapted from qiime2.plugin.testing.TestPluginBase
+        return pkg_resources.resource_filename(self.package,
+                                               '/data/%s' % filename)
+
+    def test_insert_multifurcating(self):
+        exp = self.final_multiple_multifurcating
+        placements, backbone = parse_jplace(self.jplacedata_multiple)
+        obs = insert_multifurcating(placements, backbone)
+        self.assertEqual({n.name for n in obs.tips()},
+                         {n.name for n in exp.tips()})
+        self.assertEqual(obs.compare_rfd(exp), 0)
+        self.assertAlmostEqual(obs.compare_tip_distances(exp), 0)
+
+
+if __name__ == '__main__':
+    unittest.main()
+
diff --git a/bp/__init__.py b/bp/__init__.py
@@ -6,8 +6,11 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 from ._bp import BP
-from ._io import parse_newick, write_newick
+from ._io import parse_newick, write_newick, parse_jplace
 from ._conv import to_skbio_treenode, from_skbio_treenode, to_skbio_treearray
+from ._insert import insert_fully_resolved
+
 
 __all__ = ['BP', 'parse_newick', 'to_skbio_treenode', 'from_skbio_treenode',
-           'to_skbio_treearray', 'write_newick']
+           'to_skbio_treearray', 'write_newick', 'parse_jplace',
+           'insert_fully_resolved']
diff --git a/bp/_cli.py b/bp/_cli.py
@@ -0,0 +1,33 @@
+import click
+from bp import parse_jplace, insert_fully_resolved
+from bp.GPL import insert_multifurcating
+
+
+@click.group()
+def cli():
+    pass
+
+
+@cli.command()
+@click.option('--placements', type=click.Path(exists=True),
+              required=True, help='jplace formatted data')
+@click.option('--output', type=click.Path(exists=False),
+              required=True, help='Where to write the resulting newick')
+@click.option('--method',
+              type=click.Choice(['fully-resolved', 'multifurcating']),
+              required=True, help='Whether to fully resolve or multifurcate')
+def placement(placements, output, method):
+    if method == 'fully-resolved':
+        f = insert_fully_resolved
+    elif method == 'multifurcating':
+        f = insert_multifurcating
+    else:
+        raise ValueError("Unknown method: %s" % method)
+
+    placement_df, tree = parse_jplace(open(placements).read())
+    sktree = f(placement_df, tree)
+    sktree.write(output)
+
+
+if __name__ == '__main__':
+    cli()
diff --git a/bp/_insert.pyx b/bp/_insert.pyx
@@ -0,0 +1,115 @@
+# encoding: utf-8
+# cython: profile=False, boundscheck=False, wraparound=False
+
+from ._bp cimport BP
+from . import to_skbio_treenode
+import pandas as pd
+import json
+import skbio
+cimport cython
+
+
+# see the comment in _insert_setup. Avoid the use of invalidate_caches as it
+# is very expensive for tree mutation operations
+class TreeNodeNoValidate(skbio.TreeNode):
+    def invalidate_caches(self):
+        pass
+
+
+# our noop used when monkey patching invalidate_caches
+def noop(*arg, **kwargs):
+    pass
+
+
+# pandas apply functions for preallocation of objects in bulk
+def _preallocate_fragment(r):
+    return TreeNodeNoValidate(name=r['fragment'], length=r['pendant_length'])
+
+
+def _preallocate_empty(r):
+    return TreeNodeNoValidate()
+
+
+def _insert_setup(placements, bptree, insert_type):
+    # insertion setup addresses:
+    # * treenode caching
+    # * placement ordering
+    # * preallocation of objects where "easy"
+
+    sktree = to_skbio_treenode(bptree)
+    node_lookup = {n.edge_num: n for n in sktree.traverse(include_self=True)}
+
+    # mutation operations against TreeNode is expensive as every append or
+    # remove triggers a call to invalidate caches, which requires a traversal
+    # to find the root (and do other stuff). so let's monkey patch the method
+    # to force a noop
+    for node in sktree.traverse(include_self=True):
+        node.invalidate_caches = noop
+
+    if insert_type == 'multifurcating':
+        placements = placements.sort_values(['edge_num', 'pendant_length'])
+    elif insert_type == 'fully_resolved':
+        placements = placements.sort_values(['edge_num', 'distal_length'],
+                                            ascending=[True, False])
+    else:
+        raise ValueError()
+
+    placements['node'] = placements.apply(_preallocate_fragment, axis=1)
+
+    if insert_type == 'fully_resolved':
+        placements['parent'] = placements.apply(_preallocate_empty, axis=1)
+
+    return placements, sktree, node_lookup
+
+
+# pd.DataFrame is not a resolved type so we cannot use it here for cython
+def insert_fully_resolved(object placements, BP bptree):
+    """Update the backbone, fully resolving edges with multiple queries
+
+    Parameters
+    ----------
+    placements : pd.DataFrame
+        jplace data represented as a DataFrame
+    bptree : bp.BP
+        An instance of a BP tree, this is expected to contain edge numbers
+        and correspond to the backbone for the jplace data
+
+    Returns
+    -------
+    skbio.TreeNode
+        A tree with the fragments placed
+    """
+    # TODO: profile, type and re-profile
+    placements, sktree, node_lookup = \
+        _insert_setup(placements, bptree, 'fully_resolved')
+
+    for edge, edge_grp in placements.groupby('edge_num'):
+        existing_node = node_lookup[edge]
+        current_parent = existing_node.parent
+
+        # break the edge
+        current_parent.remove(existing_node)
+        existing_node.parent = None
+        existing_length = existing_node.length
+
+        for _, fragment in edge_grp.iterrows():
+            distal_length = fragment['distal_length']
+            fragment_node = fragment['node']
+            fragment_parent = fragment['parent']
+
+            # update branch lengths
+            fragment_parent.length = existing_length - distal_length
+            existing_length = distal_length
+
+            # attach the nodes
+            fragment_parent.append(fragment_node)
+            current_parent.append(fragment_parent)
+
+            # update
+            current_parent = fragment_parent
+
+        existing_node.length = existing_length
+        current_parent.append(existing_node)
+        existing_node.length = distal_length
+
+    return sktree
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from ._insert import insert_multifurcating

		__all__ = ['insert_multifurcating']