Skip to content

Commit

Permalink
Merge pull request #34 from wasade/placements
Browse files Browse the repository at this point in the history
Placements
  • Loading branch information
wasade authored Jan 24, 2022
2 parents 8fcbc45 + 9a4089f commit 7d1b24c
Show file tree
Hide file tree
Showing 17 changed files with 1,374 additions and 14 deletions.
43 changes: 40 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ Improved Octo Waddle
--------------------

An implementation of the balanced parentheses tree structure as described by
Cordova and Navarro (http://www.dcc.uchile.cl/~gnavarro/ps/tcs16.2.pdf).
[Cordova and Navarro](http://www.dcc.uchile.cl/~gnavarro/ps/tcs16.2.pdf).

Install notes
-------------
Expand All @@ -12,6 +12,43 @@ problem of requiring numpy and cython for setup.py to execute. The package is
named iow in pypi as "bp" was taken at time of registration.

```
conda install numpy cython
pip install iow
$ conda create --name bp python=3.8
$ conda activate bp
$ conda install numpy cython
$ pip install iow
```

Developer notes
---------------

If pulling the source, please note that we use a submodule and Github does not
by default bring it down. After a clone, please run:

```
$ git submodule update --init --recursive
```

Fragment insertion
------------------

BP supports the [jplace format](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0031009). Fragments can be inserted using either fully-resolved or multifurcation mode to resolve multiple placements to the same edge. In fully resolved, the edge placed against is broken N times where N is the number of fragments on the edge. In multifurcation, a new node is constructed as the average of the distal length for the N fragments, and a separate multifurcation node is added which encompasses the placed fragments.

Insertions can be handled by the command line following install:

```
$ bp placement --help
Usage: bp placement [OPTIONS]
Options:
--placements PATH jplace formatted data [required]
--output PATH Where to write the resulting newick
[required]
--method [fully-resolved|multifurcating]
Whether to fully resolve or multifurcate
[required]
--help Show this message and exit.
```

Note that the multifurcating support relies on GPL code derived from the Genesis project. That code and LICENSE can be found under `bp/GPL`.
674 changes: 674 additions & 0 deletions bp/GPL/Genesis_LICENSE.txt

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions bp/GPL/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from ._insert import insert_multifurcating

__all__ = ['insert_multifurcating']
127 changes: 127 additions & 0 deletions bp/GPL/_insert.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# encoding: utf-8
# cython: profile=False, boundscheck=False, wraparound=False

from .._bp cimport BP
from bp._insert import _insert_setup, TreeNodeNoValidate
from .. import to_skbio_treenode
import pandas as pd
import json
import skbio
cimport cython


# this method described here was derived from Genesis
# https://github.com/lczech/genesis
# The license for Genesis is included herein under
# Genesis_LICENSE.txt
def insert_multifurcating(object placements, BP bptree):
"""Update the backbone, with multifurcation for edges with multiple queries
Parameters
----------
placements : pd.DataFrame
jplace data represented as a DataFrame
bptree : bp.BP
An instance of a BP tree, this is expected to contain edge numbers
and correspond to the backbone for the jplace data
Note
----
This method was derived directly from the Genesis codebase, and is
therefore GPL.
Returns
-------
skbio.TreeNode
A tree with the fragments placed
"""
# TODO: profile, type and re-profile
placements, sktree, node_lookup = \
_insert_setup(placements, bptree, 'multifurcating')

# it is much faster to bulk allocate than construct on the fly, so let's
# do that
new_parents = [TreeNodeNoValidate()
for _ in range(len(placements['edge_num'].unique()))]
parent_idx = 0

new_bridges = [TreeNodeNoValidate()
for _ in range(placements['edge_num'].duplicated().sum())]
bridge_idx = 0

for edge, edge_grp in placements.groupby('edge_num'):
# update topology
existing_node = node_lookup[edge]
current_parent = existing_node.parent
current_parent.remove(existing_node)

# get a pre-allocated node
new_parent = new_parents[parent_idx]
parent_idx += 1 # make sure we update our offset

# gather detail on our minimal placement
min_frag = edge_grp.iloc[0]
min_pendant = min_frag['pendant_length']
min_distal = min_frag['distal_length']
new_node = min_frag['node']

if len(edge_grp) > 1:
# if we have multiple placements, we construct a node that contains
# all of the placements, and place this under a "bridge" such that
# it is a sister to the existing node.

# derived from
# https://github.com/lczech/genesis/blob/98c064d8e3e2efaa97da33c9263f6fef3724f0a5/lib/genesis/placement/function/tree.cpp#L295

# update topology
bridge = new_bridges[bridge_idx]
bridge_idx += 1
bridge.append(existing_node)
bridge.append(new_parent)
current_parent.append(bridge)
new_parent.append(new_node)

# Gappa uses the average distal for joining the node encompassing
# placements and existing back to the tree. As we are subtracting
# against the existing node, it is possible to introduce a negative
# branch length, so we enforce a minimum of 0.0
avg_prox_len = edge_grp['distal_length'].mean()
bridge.length = max(0.0, existing_node.length - avg_prox_len)

# update edges. the first node we place has a length of zero as
# its parent accounts for its pendant
existing_node.length = avg_prox_len
new_parent.length = min_pendant
new_node.length = 0.0

for i in range(1, len(edge_grp)):
# gather placement detail
frag_row = edge_grp.iloc[i]
frag_node = frag_row['node']

# update topology
new_parent.append(frag_node)

# update the branch length. Note that "frag_node" has its
# length first set to its pendant during the preallocation
# step. we subtract the parent pendant to avoid counting
# extra edge length. This should never be < 0 as the data
# are presorted by pendant length, so the fragment evaluated
# here is assured to have a > length than the pendant used
# with the parent.
frag_node.length = frag_node.length - new_parent.length
else:
# if we only have a single placement, we place the fragment as a
# sister to the existing node.

# update topology
current_parent.append(new_parent)
new_parent.append(existing_node)
new_parent.append(new_node)

# update branch lengths
new_node.length = min_pendant
new_parent.length = existing_node.length - min_distal
existing_node.length = min_distal

return sktree
34 changes: 34 additions & 0 deletions bp/GPL/tests/test_insert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import unittest
import pkg_resources
from bp import parse_jplace
from bp.GPL import insert_multifurcating
import skbio


class InsertTests(unittest.TestCase):
package = 'bp.tests'

def setUp(self):
self.jplacedata_multiple = \
open(self.get_data_path('300/placement_mul.jplace')).read()
self.final_multiple_multifurcating = \
skbio.TreeNode.read(self.get_data_path('300/placement_mul.newick'))

def get_data_path(self, filename):
# adapted from qiime2.plugin.testing.TestPluginBase
return pkg_resources.resource_filename(self.package,
'/data/%s' % filename)

def test_insert_multifurcating(self):
exp = self.final_multiple_multifurcating
placements, backbone = parse_jplace(self.jplacedata_multiple)
obs = insert_multifurcating(placements, backbone)
self.assertEqual({n.name for n in obs.tips()},
{n.name for n in exp.tips()})
self.assertEqual(obs.compare_rfd(exp), 0)
self.assertAlmostEqual(obs.compare_tip_distances(exp), 0)


if __name__ == '__main__':
unittest.main()

7 changes: 5 additions & 2 deletions bp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
from ._bp import BP
from ._io import parse_newick, write_newick
from ._io import parse_newick, write_newick, parse_jplace
from ._conv import to_skbio_treenode, from_skbio_treenode, to_skbio_treearray
from ._insert import insert_fully_resolved


__all__ = ['BP', 'parse_newick', 'to_skbio_treenode', 'from_skbio_treenode',
'to_skbio_treearray', 'write_newick']
'to_skbio_treearray', 'write_newick', 'parse_jplace',
'insert_fully_resolved']
33 changes: 33 additions & 0 deletions bp/_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import click
from bp import parse_jplace, insert_fully_resolved
from bp.GPL import insert_multifurcating


@click.group()
def cli():
pass


@cli.command()
@click.option('--placements', type=click.Path(exists=True),
required=True, help='jplace formatted data')
@click.option('--output', type=click.Path(exists=False),
required=True, help='Where to write the resulting newick')
@click.option('--method',
type=click.Choice(['fully-resolved', 'multifurcating']),
required=True, help='Whether to fully resolve or multifurcate')
def placement(placements, output, method):
if method == 'fully-resolved':
f = insert_fully_resolved
elif method == 'multifurcating':
f = insert_multifurcating
else:
raise ValueError("Unknown method: %s" % method)

placement_df, tree = parse_jplace(open(placements).read())
sktree = f(placement_df, tree)
sktree.write(output)


if __name__ == '__main__':
cli()
115 changes: 115 additions & 0 deletions bp/_insert.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# encoding: utf-8
# cython: profile=False, boundscheck=False, wraparound=False

from ._bp cimport BP
from . import to_skbio_treenode
import pandas as pd
import json
import skbio
cimport cython


# see the comment in _insert_setup. Avoid the use of invalidate_caches as it
# is very expensive for tree mutation operations
class TreeNodeNoValidate(skbio.TreeNode):
def invalidate_caches(self):
pass


# our noop used when monkey patching invalidate_caches
def noop(*arg, **kwargs):
pass


# pandas apply functions for preallocation of objects in bulk
def _preallocate_fragment(r):
return TreeNodeNoValidate(name=r['fragment'], length=r['pendant_length'])


def _preallocate_empty(r):
return TreeNodeNoValidate()


def _insert_setup(placements, bptree, insert_type):
# insertion setup addresses:
# * treenode caching
# * placement ordering
# * preallocation of objects where "easy"

sktree = to_skbio_treenode(bptree)
node_lookup = {n.edge_num: n for n in sktree.traverse(include_self=True)}

# mutation operations against TreeNode is expensive as every append or
# remove triggers a call to invalidate caches, which requires a traversal
# to find the root (and do other stuff). so let's monkey patch the method
# to force a noop
for node in sktree.traverse(include_self=True):
node.invalidate_caches = noop

if insert_type == 'multifurcating':
placements = placements.sort_values(['edge_num', 'pendant_length'])
elif insert_type == 'fully_resolved':
placements = placements.sort_values(['edge_num', 'distal_length'],
ascending=[True, False])
else:
raise ValueError()

placements['node'] = placements.apply(_preallocate_fragment, axis=1)

if insert_type == 'fully_resolved':
placements['parent'] = placements.apply(_preallocate_empty, axis=1)

return placements, sktree, node_lookup


# pd.DataFrame is not a resolved type so we cannot use it here for cython
def insert_fully_resolved(object placements, BP bptree):
"""Update the backbone, fully resolving edges with multiple queries
Parameters
----------
placements : pd.DataFrame
jplace data represented as a DataFrame
bptree : bp.BP
An instance of a BP tree, this is expected to contain edge numbers
and correspond to the backbone for the jplace data
Returns
-------
skbio.TreeNode
A tree with the fragments placed
"""
# TODO: profile, type and re-profile
placements, sktree, node_lookup = \
_insert_setup(placements, bptree, 'fully_resolved')

for edge, edge_grp in placements.groupby('edge_num'):
existing_node = node_lookup[edge]
current_parent = existing_node.parent

# break the edge
current_parent.remove(existing_node)
existing_node.parent = None
existing_length = existing_node.length

for _, fragment in edge_grp.iterrows():
distal_length = fragment['distal_length']
fragment_node = fragment['node']
fragment_parent = fragment['parent']

# update branch lengths
fragment_parent.length = existing_length - distal_length
existing_length = distal_length

# attach the nodes
fragment_parent.append(fragment_node)
current_parent.append(fragment_parent)

# update
current_parent = fragment_parent

existing_node.length = existing_length
current_parent.append(existing_node)
existing_node.length = distal_length

return sktree
Loading

0 comments on commit 7d1b24c

Please sign in to comment.