Merge master changes into biocore#337 code!

This works now! Well, kinda. The tests are still broken, and the JS code still doesn't uncompress the sample metadata. But you can at least generate QZVs now!
fedarko · Oct 16, 2020 · 6211505 · 6211505
1 parent 590cb07
commit 6211505
Show file tree

Hide file tree

Showing 2 changed files with 55 additions and 60 deletions.
diff --git a/empress/compression_utils.py b/empress/compression_utils.py
@@ -165,23 +165,37 @@ def compress_recurring_md_vals(str_metadata_df, output_type="list"):
         this DataFrame must be stored as strings (even numbers should be stored
         as strings), so that compressed values can be interpreted correctly.
 
+    output_type: str
+        Should be either "list" or "dict".
+
+            -If it's "list", then the returned compressed_md will be a
+             two-dimensional list of length len(str_metadata_df.index).
+             Each position i within this outer list holds an "inner list"
+             of length len(str_metadata_df.columns). The c-th value of the
+             i-th inner list contains either:
+              -An integer, in which case the value referred to is located at
+               this integer's position in recurring_vals (using 0-indexing)
+              -A string value
+             In either case, the value referred to is the metadata value in
+             column c for the sample with index i.
+
+            -If it's "dict", then the returned compressed_md will be a dict
+             representation of the input metadata. There will be one key in the
+             dict for every value in str_metadata_df.index. Each key maps to a
+             list of length len(str_metadata_df.columns), where recurring
+             values have been replaced as specified above for "list".
+
+            -If it's not "list" or "dict", this will just raise an error.
+
     Returns
     -------
     (recurring_vals, compressed_md)
         recurring_vals: list
             List of "recurring values" that are used at least twice in the
             metadata, sorted in descending order of frequency in the metadata
             (with ties broken arbitrarily).
-        compressed_md: list
-            Two-dimensional list. The "outer list" is of length
-            len(str_metadata_df.index). Each position i within this outer
-            list holds an "inner list" of length len(str_metadata_df.columns).
-            The c-th value of the i-th inner list contains either:
-             -An integer, in which case the value referred to is located at
-              this integer's position in recurring_vals (using 0-indexing)
-             -A string value
-            In either case, the value referred to is the metadata value in
-            column c for the sample with index i.
+        compressed_md: list or dict
+            See the output_type description for details on the exact format.
     """
     num_cols = len(str_metadata_df.columns)
     orig_idx = list(str_metadata_df.index)
@@ -337,7 +351,7 @@ def compress_sample_metadata(s_ids_to_indices, metadata):
     return sm_cols, recurring_vals, compressed_sm
 
 
-def compress_feature_metadata(tip_metadata, int_metadata, name2treepos):
+def compress_feature_metadata(tip_metadata, int_metadata):
     """Converts tip/internal node metadata DataFrames to a space-saving format.
 
     Note that the columns of tip_metadata and int_metadata should be identical,
@@ -355,43 +369,38 @@ def compress_feature_metadata(tip_metadata, int_metadata, name2treepos):
     int_metadata: pd.DataFrame or None
         Metadata for internal nodes. If not None, the index should describe
         node names, and the columns should describe feature metadata fields.
-    name2treepos: dict
-        Maps node names to a list of corresponding postorder position(s) in the
-        tree. (Tip names should only map to one position, while internal node
-        names can map to an arbitrarily large number of positions.)
 
     Returns
     -------
     (fm_cols, recurring_vals, compressed_tm, compressed_im)
         fm_cols: list
             List of the feature metadata column names, all converted to
-            strings. If both input DFs are None, this will be {}.
+            strings. If both input DFs are None, this will be [].
         recurring_vals: list
             List of "recurring values" that are used at least twice in either
             the tip or internal node metadata, sorted in descending order of
-            frequency in the metadata (with ties broken arbitrarily). (Note
+            frequency in the metadata (with ties broken arbitrarily). NOTE
             that these are computed by looking at both metadata DFs
             at once: so if a given value is used once in the tip metadata and
             once in the internal node metadata then it'll still get included in
-            this list.
-        compressed_tm: list
-            Two-dimensional list representation of the tip metadata. Along with
-            recurring values being replaced with their position in
-            recurring_vals, the DF's indices (which were previously tip node
-            names in the tree) will be replaced with each tip's postorder
-            position (based on name2treepos). If tip_metadata was empty, or if
-            both input DFs were None, this will be {}.
-        compressed_im: list
-            Two-dimensional list representation of the internal node metadata.
-            Recurring values are replaced as described above for compressed_tm,
-            and the DF's indices will similarly be replaced with each internal
-            node's postorder position based on name2treepos. Rows where the
-            index (a.k.a. internal node name) maps to multiple postorder
-            positions in name2treepos will be duplicated, such that each tree
-            position with internal node metadata will have its own row. (This
-            could be made more efficient, but it simplifies things a lot.)
+            this list. ALSO NOTE that we don't keep track of how many times a
+            duplicate internal node name occurs -- so if a given value only
+            occurs once in a given internal node name's feature metadata, but
+            1,000 internal nodes share that name, we still won't consider that
+            value recurring (at least for now).
+        compressed_tm: dict
+            Maps node names in tip_metadata to a list of feature metadata
+            values, in the same order as in fm_cols and converted to strings.
+            Each recurring value (i.e. a value present in two or more locations
+            throughout the tip / internal node metadata) will be replaced with
+            an integer pointing to the index of this value in recurring_vals.
             If tip_metadata was empty, or if both input DFs were None, this
             will be {}.
+        compressed_im: dict
+            Maps node names in int_metadata to a list of feature metadata
+            values, in the same order as in fm_cols and converted to strings.
+            Recurring values are replaced as for compressed_tm. If int_metadata
+            was empty, or if both input DFs were None, this will be {}.
 
     Raises
     ------
@@ -447,24 +456,13 @@ def compress_feature_metadata(tip_metadata, int_metadata, name2treepos):
     )
 
     # Split up the compressed feature metadata back into tip and internal node
-    # metadata. Also, while we're going through the metadata, replace node
-    # names with the corresponding indices in name2treepos, and duplicate
-    # internal node metadata.
+    # metadata.
     compressed_tm = {}
     compressed_im = {}
     for name in compressed_fm:
-        try:
-            treepositions = name2treepos[name]
-        except KeyError:
-            raise KeyError(
-                "Node name {} is not present in name2treepos.".format(name)
-            )
         if name in tip_metadata.index:
-            if len(treepositions) > 1:
-                raise ValueError("Tip node name is shared by multiple nodes.")
-            compressed_tm[treepositions[0]] = compressed_fm[name]
+            compressed_tm[name] = compressed_fm[name]
         else:
-            for pos in treepositions:
-                compressed_im[pos] = compressed_fm[name]
+            compressed_im[name] = compressed_fm[name]
 
     return fm_cols, recurring_vals, compressed_tm, compressed_im
diff --git a/empress/core.py b/empress/core.py
@@ -280,7 +280,7 @@ def _to_dict(self):
             A dictionary describing the plots contained in the ordination
             object and the sample + feature metadata.
         """
-        s_ids = f_ids = cmp_table = sm_cols = compressed_sm = None
+        s_ids = f_ids = cmp_table = sm_cols = recurring_sm_vals = cmp_sm = None
         sid2idxs = fid2idxs = {}
         if self.is_community_plot:
             # The fid2idxs dict we get from compress_table() is temporary --
@@ -293,19 +293,16 @@ def _to_dict(self):
             s_ids, f_ids, sid2idxs, fid2idxs_t, cmp_table = compress_table(
                 self.table
             )
-            sm_cols, recurring_sm_vals, compressed_sm = compress_sample_metadata(
+            sm_cols, recurring_sm_vals, cmp_sm = compress_sample_metadata(
                 sid2idxs, self.samples
             )
-        fm_cols, recurring_fm_vals, compressed_tm_tmp, compressed_im_tmp = \
+        fm_cols, recurring_fm_vals, cmp_tm_tmp, cmp_im_tmp = \
             compress_feature_metadata(self.tip_md, self.int_md)
 
-        # Maps node names to postorder position(s) in the tree. Used for
-        # feature metadata compression.
-        name2treepos = defaultdict(list)
         # Use nodes' postorder positions as their "IDs" for the BIOM table and
         # feature metadata
-        compressed_tm = {}
-        compressed_im = {}
+        cmp_tm = {}
+        cmp_im = {}
         # bptree indices start at one, hence we pad the arrays
         names = [-1]
         lengths = [-1]
@@ -320,14 +317,14 @@ def _to_dict(self):
                 fid2idxs[i] = fid2idxs_t[name]
                 f_ids[fid2idxs[i]] = i
 
-            if name in compressed_tm_tmp:
-                compressed_tm[i] = compressed_tm_tmp[name]
+            if name in cmp_tm_tmp:
+                cmp_tm[i] = cmp_tm_tmp[name]
 
             # Note: for internal metadata, node names may not be unique. Thus,
             # we duplicate the internal node metadata for each node in the
             # tree that shares the same name.
-            if name in compressed_im_tmp:
-                compressed_im[i] = compressed_im_tmp[name]
+            if name in cmp_im_tmp:
+                cmp_im[i] = cmp_im_tmp[name]
 
         data_to_render = {
             'base_url': self.base_url,