-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #34 from wasade/placements
Placements
- Loading branch information
Showing
17 changed files
with
1,374 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from ._insert import insert_multifurcating | ||
|
||
__all__ = ['insert_multifurcating'] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
# encoding: utf-8 | ||
# cython: profile=False, boundscheck=False, wraparound=False | ||
|
||
from .._bp cimport BP | ||
from bp._insert import _insert_setup, TreeNodeNoValidate | ||
from .. import to_skbio_treenode | ||
import pandas as pd | ||
import json | ||
import skbio | ||
cimport cython | ||
|
||
|
||
# this method described here was derived from Genesis | ||
# https://github.com/lczech/genesis | ||
# The license for Genesis is included herein under | ||
# Genesis_LICENSE.txt | ||
def insert_multifurcating(object placements, BP bptree): | ||
"""Update the backbone, with multifurcation for edges with multiple queries | ||
Parameters | ||
---------- | ||
placements : pd.DataFrame | ||
jplace data represented as a DataFrame | ||
bptree : bp.BP | ||
An instance of a BP tree, this is expected to contain edge numbers | ||
and correspond to the backbone for the jplace data | ||
Note | ||
---- | ||
This method was derived directly from the Genesis codebase, and is | ||
therefore GPL. | ||
Returns | ||
------- | ||
skbio.TreeNode | ||
A tree with the fragments placed | ||
""" | ||
# TODO: profile, type and re-profile | ||
placements, sktree, node_lookup = \ | ||
_insert_setup(placements, bptree, 'multifurcating') | ||
|
||
# it is much faster to bulk allocate than construct on the fly, so let's | ||
# do that | ||
new_parents = [TreeNodeNoValidate() | ||
for _ in range(len(placements['edge_num'].unique()))] | ||
parent_idx = 0 | ||
|
||
new_bridges = [TreeNodeNoValidate() | ||
for _ in range(placements['edge_num'].duplicated().sum())] | ||
bridge_idx = 0 | ||
|
||
for edge, edge_grp in placements.groupby('edge_num'): | ||
# update topology | ||
existing_node = node_lookup[edge] | ||
current_parent = existing_node.parent | ||
current_parent.remove(existing_node) | ||
|
||
# get a pre-allocated node | ||
new_parent = new_parents[parent_idx] | ||
parent_idx += 1 # make sure we update our offset | ||
|
||
# gather detail on our minimal placement | ||
min_frag = edge_grp.iloc[0] | ||
min_pendant = min_frag['pendant_length'] | ||
min_distal = min_frag['distal_length'] | ||
new_node = min_frag['node'] | ||
|
||
if len(edge_grp) > 1: | ||
# if we have multiple placements, we construct a node that contains | ||
# all of the placements, and place this under a "bridge" such that | ||
# it is a sister to the existing node. | ||
|
||
# derived from | ||
# https://github.com/lczech/genesis/blob/98c064d8e3e2efaa97da33c9263f6fef3724f0a5/lib/genesis/placement/function/tree.cpp#L295 | ||
|
||
# update topology | ||
bridge = new_bridges[bridge_idx] | ||
bridge_idx += 1 | ||
bridge.append(existing_node) | ||
bridge.append(new_parent) | ||
current_parent.append(bridge) | ||
new_parent.append(new_node) | ||
|
||
# Gappa uses the average distal for joining the node encompassing | ||
# placements and existing back to the tree. As we are subtracting | ||
# against the existing node, it is possible to introduce a negative | ||
# branch length, so we enforce a minimum of 0.0 | ||
avg_prox_len = edge_grp['distal_length'].mean() | ||
bridge.length = max(0.0, existing_node.length - avg_prox_len) | ||
|
||
# update edges. the first node we place has a length of zero as | ||
# its parent accounts for its pendant | ||
existing_node.length = avg_prox_len | ||
new_parent.length = min_pendant | ||
new_node.length = 0.0 | ||
|
||
for i in range(1, len(edge_grp)): | ||
# gather placement detail | ||
frag_row = edge_grp.iloc[i] | ||
frag_node = frag_row['node'] | ||
|
||
# update topology | ||
new_parent.append(frag_node) | ||
|
||
# update the branch length. Note that "frag_node" has its | ||
# length first set to its pendant during the preallocation | ||
# step. we subtract the parent pendant to avoid counting | ||
# extra edge length. This should never be < 0 as the data | ||
# are presorted by pendant length, so the fragment evaluated | ||
# here is assured to have a > length than the pendant used | ||
# with the parent. | ||
frag_node.length = frag_node.length - new_parent.length | ||
else: | ||
# if we only have a single placement, we place the fragment as a | ||
# sister to the existing node. | ||
|
||
# update topology | ||
current_parent.append(new_parent) | ||
new_parent.append(existing_node) | ||
new_parent.append(new_node) | ||
|
||
# update branch lengths | ||
new_node.length = min_pendant | ||
new_parent.length = existing_node.length - min_distal | ||
existing_node.length = min_distal | ||
|
||
return sktree |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
import unittest | ||
import pkg_resources | ||
from bp import parse_jplace | ||
from bp.GPL import insert_multifurcating | ||
import skbio | ||
|
||
|
||
class InsertTests(unittest.TestCase): | ||
package = 'bp.tests' | ||
|
||
def setUp(self): | ||
self.jplacedata_multiple = \ | ||
open(self.get_data_path('300/placement_mul.jplace')).read() | ||
self.final_multiple_multifurcating = \ | ||
skbio.TreeNode.read(self.get_data_path('300/placement_mul.newick')) | ||
|
||
def get_data_path(self, filename): | ||
# adapted from qiime2.plugin.testing.TestPluginBase | ||
return pkg_resources.resource_filename(self.package, | ||
'/data/%s' % filename) | ||
|
||
def test_insert_multifurcating(self): | ||
exp = self.final_multiple_multifurcating | ||
placements, backbone = parse_jplace(self.jplacedata_multiple) | ||
obs = insert_multifurcating(placements, backbone) | ||
self.assertEqual({n.name for n in obs.tips()}, | ||
{n.name for n in exp.tips()}) | ||
self.assertEqual(obs.compare_rfd(exp), 0) | ||
self.assertAlmostEqual(obs.compare_tip_distances(exp), 0) | ||
|
||
|
||
if __name__ == '__main__': | ||
unittest.main() | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import click | ||
from bp import parse_jplace, insert_fully_resolved | ||
from bp.GPL import insert_multifurcating | ||
|
||
|
||
@click.group() | ||
def cli(): | ||
pass | ||
|
||
|
||
@cli.command() | ||
@click.option('--placements', type=click.Path(exists=True), | ||
required=True, help='jplace formatted data') | ||
@click.option('--output', type=click.Path(exists=False), | ||
required=True, help='Where to write the resulting newick') | ||
@click.option('--method', | ||
type=click.Choice(['fully-resolved', 'multifurcating']), | ||
required=True, help='Whether to fully resolve or multifurcate') | ||
def placement(placements, output, method): | ||
if method == 'fully-resolved': | ||
f = insert_fully_resolved | ||
elif method == 'multifurcating': | ||
f = insert_multifurcating | ||
else: | ||
raise ValueError("Unknown method: %s" % method) | ||
|
||
placement_df, tree = parse_jplace(open(placements).read()) | ||
sktree = f(placement_df, tree) | ||
sktree.write(output) | ||
|
||
|
||
if __name__ == '__main__': | ||
cli() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
# encoding: utf-8 | ||
# cython: profile=False, boundscheck=False, wraparound=False | ||
|
||
from ._bp cimport BP | ||
from . import to_skbio_treenode | ||
import pandas as pd | ||
import json | ||
import skbio | ||
cimport cython | ||
|
||
|
||
# see the comment in _insert_setup. Avoid the use of invalidate_caches as it | ||
# is very expensive for tree mutation operations | ||
class TreeNodeNoValidate(skbio.TreeNode): | ||
def invalidate_caches(self): | ||
pass | ||
|
||
|
||
# our noop used when monkey patching invalidate_caches | ||
def noop(*arg, **kwargs): | ||
pass | ||
|
||
|
||
# pandas apply functions for preallocation of objects in bulk | ||
def _preallocate_fragment(r): | ||
return TreeNodeNoValidate(name=r['fragment'], length=r['pendant_length']) | ||
|
||
|
||
def _preallocate_empty(r): | ||
return TreeNodeNoValidate() | ||
|
||
|
||
def _insert_setup(placements, bptree, insert_type): | ||
# insertion setup addresses: | ||
# * treenode caching | ||
# * placement ordering | ||
# * preallocation of objects where "easy" | ||
|
||
sktree = to_skbio_treenode(bptree) | ||
node_lookup = {n.edge_num: n for n in sktree.traverse(include_self=True)} | ||
|
||
# mutation operations against TreeNode is expensive as every append or | ||
# remove triggers a call to invalidate caches, which requires a traversal | ||
# to find the root (and do other stuff). so let's monkey patch the method | ||
# to force a noop | ||
for node in sktree.traverse(include_self=True): | ||
node.invalidate_caches = noop | ||
|
||
if insert_type == 'multifurcating': | ||
placements = placements.sort_values(['edge_num', 'pendant_length']) | ||
elif insert_type == 'fully_resolved': | ||
placements = placements.sort_values(['edge_num', 'distal_length'], | ||
ascending=[True, False]) | ||
else: | ||
raise ValueError() | ||
|
||
placements['node'] = placements.apply(_preallocate_fragment, axis=1) | ||
|
||
if insert_type == 'fully_resolved': | ||
placements['parent'] = placements.apply(_preallocate_empty, axis=1) | ||
|
||
return placements, sktree, node_lookup | ||
|
||
|
||
# pd.DataFrame is not a resolved type so we cannot use it here for cython | ||
def insert_fully_resolved(object placements, BP bptree): | ||
"""Update the backbone, fully resolving edges with multiple queries | ||
Parameters | ||
---------- | ||
placements : pd.DataFrame | ||
jplace data represented as a DataFrame | ||
bptree : bp.BP | ||
An instance of a BP tree, this is expected to contain edge numbers | ||
and correspond to the backbone for the jplace data | ||
Returns | ||
------- | ||
skbio.TreeNode | ||
A tree with the fragments placed | ||
""" | ||
# TODO: profile, type and re-profile | ||
placements, sktree, node_lookup = \ | ||
_insert_setup(placements, bptree, 'fully_resolved') | ||
|
||
for edge, edge_grp in placements.groupby('edge_num'): | ||
existing_node = node_lookup[edge] | ||
current_parent = existing_node.parent | ||
|
||
# break the edge | ||
current_parent.remove(existing_node) | ||
existing_node.parent = None | ||
existing_length = existing_node.length | ||
|
||
for _, fragment in edge_grp.iterrows(): | ||
distal_length = fragment['distal_length'] | ||
fragment_node = fragment['node'] | ||
fragment_parent = fragment['parent'] | ||
|
||
# update branch lengths | ||
fragment_parent.length = existing_length - distal_length | ||
existing_length = distal_length | ||
|
||
# attach the nodes | ||
fragment_parent.append(fragment_node) | ||
current_parent.append(fragment_parent) | ||
|
||
# update | ||
current_parent = fragment_parent | ||
|
||
existing_node.length = existing_length | ||
current_parent.append(existing_node) | ||
existing_node.length = distal_length | ||
|
||
return sktree |
Oops, something went wrong.