Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add h5unifrac_all method #153

Merged
merged 4 commits into from
Apr 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,13 @@ jobs:
python -c "import h5py,skbio; dm = skbio.DistanceMatrix.read('ci/test.dm'); f_u=h5py.File('ci/test2.dm.h5','r'); dm_u=skbio.stats.distance.DistanceMatrix(f_u['matrix'][:,:],f_u['order'][:]); t=abs(dm_u.data-dm.data).max(); print(t); assert t < 0.1"
python -c "import h5py; f_u=h5py.File('ci/test2.dm.h5','r'); print(f_u.keys()); assert len(f_u['stat_methods'][:]) == 1"
python -c "import h5py; f_u=h5py.File('ci/test3.dm.h5','r'); print(f_u.keys()); assert len(f_u['pcoa_eigvals'][:]) == 2"
# repeat using unifrac's h5 interfaces
python -c "import unifrac; dm_u=unifrac.h5unifrac('ci/test.dm.h5'); dm_l=unifrac.h5unifrac_all('ci/test.dm.h5')"
python -c "import unifrac,skbio; dm = skbio.DistanceMatrix.read('ci/test.dm'); dm_u=unifrac.h5unifrac('ci/test.dm.h5'); t=abs(dm_u.data-dm.data).max(); print(t); assert t < 0.1"
python -c "import unifrac,skbio; dm = skbio.DistanceMatrix.read('ci/test.dm'); dm_u=unifrac.h5unifrac_all('ci/test.dm.h5')[0]; t=abs(dm_u.data-dm.data).max(); print(t); assert t < 0.1"
python -c "import unifrac,skbio; dm = skbio.DistanceMatrix.read('ci/test.dm'); dm_u=unifrac.h5unifrac('ci/test2.dm.h5'); t=abs(dm_u.data-dm.data).max(); print(t); assert t < 0.1"
python -c "import unifrac; st_l=unifrac.h5permanova_dict('ci/test2.dm.h5'); assert len(st_l) == 1"
python -c "import unifrac; pc=unifrac.h5pcoa('ci/test3.dm.h5'); print(pc); assert len(pc.eigvals) == 2"
if [[ "$(uname -s)" == "Linux" ]];
then
MD5=md5sum
Expand Down
15 changes: 8 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,13 +140,14 @@ The library can be accessed directly from within Python. If operating in this mo
>>> import unifrac
>>> dir(unifrac)
['__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__',
'__package__', '__path__', '__spec__', '__version__', '_api', '_meta', '_methods', 'set_random_seed', 'faith_pd',
'generalized', 'generalized_fp32', 'generalized_fp32_to_file', 'generalized_fp64', 'generalized_fp64_to_file', 'generalized_to_file',
'h5pcoa', 'h5pcoa_all', 'h5permanova', 'h5permanova_dict', 'h5unifrac', 'meta', 'pkg_resources', 'ssu', 'ssu_fast', 'ssu_inmem', 'ssu_to_file', 'ssu_to_file_v2',
'unweighted', 'unweighted_fp32', 'unweighted_fp32_to_file', 'unweighted_fp64', 'unweighted_fp64_to_file', 'unweighted_to_file',
'weighted_normalized', 'weighted_normalized_fp32', 'weighted_normalized_fp32_to_file', 'weighted_normalized_fp64', 'weighted_normalized_fp64_to_file',
'weighted_normalized_to_file', 'weighted_unnormalized', 'weighted_unnormalized_fp32', 'weighted_unnormalized_fp32_to_file', 'weighted_unnormalized_fp64',
'weighted_unnormalized_fp64_to_file', 'weighted_unnormalized_to_file']
'__package__', '__path__', '__spec__', '__version__', '_api', '_meta', '_methods',
'faith_pd', 'generalized', 'generalized_fp32', 'generalized_fp32_to_file', 'generalized_fp64', 'generalized_fp64_to_file', 'generalized_to_file',
'h5pcoa', 'h5pcoa_all', 'h5permanova', 'h5permanova_dict', 'h5unifrac', 'h5unifrac_all', 'meta', 'pkg_resources',
'set_random_seed', 'ssu', 'ssu_fast', 'ssu_inmem', 'ssu_to_file', 'ssu_to_file_v2', 'unweighted', 'unweighted_fp32',
'unweighted_fp32_to_file', 'unweighted_fp64', 'unweighted_fp64_to_file', 'unweighted_to_file', 'weighted_normalized',
'weighted_normalized_fp32', 'weighted_normalized_fp32_to_file', 'weighted_normalized_fp64', 'weighted_normalized_fp64_to_file',
'weighted_normalized_to_file', 'weighted_unnormalized', 'weighted_unnormalized_fp32', 'weighted_unnormalized_fp32_to_file',
'weighted_unnormalized_fp64', 'weighted_unnormalized_fp64_to_file', 'weighted_unnormalized_to_file']
>>> print(unifrac.unweighted.__doc__)
Compute Unweighted UniFrac

Expand Down
4 changes: 2 additions & 2 deletions unifrac/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
weighted_unnormalized_fp32_to_file,
generalized_fp32_to_file,
meta,
h5unifrac,
h5unifrac, h5unifrac_all,
h5pcoa, h5pcoa_all,
h5permanova, h5permanova_dict)
from unifrac._api import ssu, ssu_fast, faith_pd, set_random_seed
Expand All @@ -57,7 +57,7 @@
'weighted_normalized_fp32_to_file',
'weighted_unnormalized_fp32_to_file',
'generalized_fp32_to_file',
'h5unifrac', 'h5pcoa', 'h5pcoa_all',
'h5unifrac', 'h5unifrac_all', 'h5pcoa', 'h5pcoa_all',
'h5permanova', 'h5permanova_dict',
'ssu', 'ssu_fast', 'faith_pd',
'ssu_to_file', 'ssu_to_file_v2',
Expand Down
72 changes: 67 additions & 5 deletions unifrac/_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -2508,7 +2508,7 @@ def generalized_fp32_to_file(table: str,


def h5unifrac(h5file: str) -> skbio.DistanceMatrix:
"""Read UniFrac from a hdf5 file
"""Read UniFrac distance matrix from a hdf5 file

Parameters
----------
Expand Down Expand Up @@ -2538,13 +2538,68 @@ def h5unifrac(h5file: str) -> skbio.DistanceMatrix:
"""

with h5py.File(h5file, "r") as f_u:
dm = skbio.DistanceMatrix(
if 'matrix:0' in f_u.keys():
# multi format
dm = skbio.DistanceMatrix(
f_u['matrix:0'][:, :],
[c.decode('ascii') for c in f_u['order'][:]])
else:
# single format
dm = skbio.DistanceMatrix(
f_u['matrix'][:, :],
[c.decode('ascii') for c in f_u['order'][:]])

return dm


def h5unifrac_all(h5file: str) -> skbio.DistanceMatrix:
"""Read all UniFrac distance matrices from a hdf5 file

Parameters
----------
h5file : str
A filepath to a hdf5 file.

Returns
-------
tuple(skbio.DistanceMatrix)
The distance matrices.

Raises
------
OSError
If the hdf5 file is not found
KeyError
If the hdf5 does not have the necessary fields

References
----------
.. [1] Lozupone, C. & Knight, R. UniFrac: a new phylogenetic method for
comparing microbial communities. Appl. Environ. Microbiol. 71, 8228-8235
(2005).
.. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a
powerful beta diversity measure for comparing communities based on
phylogeny. BMC Bioinformatics 12:118 (2011).
"""

with h5py.File(h5file, "r") as f_u:
order = [c.decode('ascii') for c in f_u['order'][:]]
if 'matrix' in f_u.keys():
# single format
dms = [skbio.DistanceMatrix(
f_u['matrix'][:, :], order)]
else:
# multi format
dms = []
i = 0
while 'matrix:%i' % i in f_u.keys():
dms.append(skbio.DistanceMatrix(
f_u['matrix:%i' % i][:, :], order))
i = i + 1

return dms


def _build_pcoa(f_u, long_method_name, order_index,
eigval_key, samples_key, prop_key):
axis_labels = ["PC%d" % i for i in
Expand Down Expand Up @@ -2597,9 +2652,16 @@ def h5pcoa(h5file: str) -> skbio.OrdinationResults:
order_index = [c.decode('ascii')
for c in f_u['order'][:]]

pc = _build_pcoa(f_u, long_method_name, order_index,
'pcoa_eigvals', 'pcoa_samples',
'pcoa_proportion_explained')
if 'pcoa_eigvals:0' in f_u.keys():
# multi interface
pc = _build_pcoa(f_u, long_method_name, order_index,
'pcoa_eigvals:0', 'pcoa_samples:0',
'pcoa_proportion_explained:0')
else:
# single interface
pc = _build_pcoa(f_u, long_method_name, order_index,
'pcoa_eigvals', 'pcoa_samples',
'pcoa_proportion_explained')

return pc

Expand Down