ENH: Add Audacity extended label track format, fix #213

- Add src/crowsetta/formats/bbox/audbbox.py - TST: Add tests/data_for_tests/aud-bbox/ with files - TST: Add audbbox fixtures in tests - TST: Add tests/test_formats/test_bbox/test_audbbox.py - Add src/crowsetta/data/audbbox/ with example data
vocalpy · Mar 1, 2023 · 0c3c0b6 · 0c3c0b6
1 parent 38c7a68
commit 0c3c0b6
Show file tree

Hide file tree

Showing 10 changed files with 765 additions and 0 deletions.
diff --git a/src/crowsetta/data/audbbox/__init__.py b/src/crowsetta/data/audbbox/__init__.py
diff --git a/src/crowsetta/data/audbbox/citation.txt b/src/crowsetta/data/audbbox/citation.txt
@@ -0,0 +1,3 @@
+scikit-maad developers. (2018-present).
+Example data file.
+Adapted under BSD License: https://github.com/scikit-maad/scikit-maad/blob/production/LICENSE
diff --git a/src/crowsetta/data/audbbox/spinetail.txt b/src/crowsetta/data/audbbox/spinetail.txt
@@ -0,0 +1,36 @@
+0.101385	0.367520	SP
+\	6441.064453	12296.577148
+0.506924	3.041545	CRER
+\	2593.156006	8866.920898
+1.203945	1.482753	SP
+\	6608.365234	11543.726562
+2.724718	3.218969	SP
+\	4851.710938	12631.178711
+5.221319	7.869998	CRER
+\	2509.505615	8699.620117
+6.260514	6.666053	SP
+\	5939.163574	12798.478516
+7.946037	8.288210	SP
+\	4600.760254	13049.430664
+8.896519	9.302059	SP
+\	5827.929351	12513.294481
+9.973733	10.353927	SP
+\	4851.710938	13969.581055
+11.329756	13.750319	CRER
+\	2091.254639	9117.871094
+11.773314	12.077469	SP
+\	6106.464355	12296.577148
+12.660432	12.939240	SP
+\	5604.562500	12296.577148
+15.435842	15.879400	SP
+\	4015.209229	13216.730469
+16.170882	16.563748	SP
+\	4098.859375	12463.878906
+16.437017	18.730849	CRER
+\	2676.806152	8699.620117
+17.159384	17.514231	SP
+\	5688.213379	12714.829102
+18.198578	18.502733	SP
+\	5353.612305	12463.878906
+19.073023	19.465889	SP
+\	4349.810059	12296.577148
diff --git a/src/crowsetta/formats/bbox/__init__.py b/src/crowsetta/formats/bbox/__init__.py
@@ -1,5 +1,7 @@
+from .audbbox import AudBBox
 from .raven import Raven
 
+# Names exported by ``from crowsetta.formats.bbox import *``.
 __all__ = [
+    "AudBBox",
     "Raven",
 ]
diff --git a/src/crowsetta/formats/bbox/audbbox.py b/src/crowsetta/formats/bbox/audbbox.py
@@ -0,0 +1,245 @@
+"""Module for Audacity label tracks
+in extended format, exported to .txt files
+https://manual.audacityteam.org/man/importing_and_exporting_labels.html#Extended_format_with_frequency_ranges
+"""
+import pathlib
+from typing import ClassVar, List, Optional
+
+import attr
+import pandas as pd
+import pandera
+from pandera.typing import Series
+
+import crowsetta
+from crowsetta.typing import PathLike
+
+
+def txt_to_records(aud_txt_path: PathLike) -> List[dict]:
+    """Load a txt file in Audacity extended label track format
+    into records for a ``pandas.DataFrame``.
+
+    Returns a ``list`` of ``dict`` that can be made into a
+    ``DataFrame`` by calling ``pandas.DataFrame.from_records``.
+
+    Parameters
+    ----------
+    aud_txt_path : str, pathlib.Path
+        Path to a .txt file in Audacity extended label track format.
+        Each annotation spans two lines of tab-separated fields:
+        begin time, end time and label on the first line;
+        a leading field that is ignored here, then the low and
+        high frequency, on the second line.
+
+    Returns
+    -------
+    records : list
+        Of ``dict``, each ``dict`` a row
+        in the ``DataFrame``, with keys
+        ``begin_time_s``, ``end_time_s``, ``label``,
+        ``low_freq_hz`` and ``high_freq_hz``.
+
+    Raises
+    ------
+    ValueError
+        If the file does not contain an even number of non-blank lines,
+        or if a time or frequency field cannot be converted to ``float``.
+    IndexError
+        If a line has fewer tab-separated fields than expected.
+
+    Notes
+    -----
+    We work with Audacity txt files this way, instead of
+    loading with ``pandas.read_csv`` then munging, so that we can
+    be sure that we can round-trip data without corrupting it.
+    """
+    with pathlib.Path(aud_txt_path).open('r') as fp:
+        # blank lines (e.g. extra newlines at the end of the file) carry no data
+        rows = [line.split('\t') for line in fp.read().splitlines() if line.strip()]
+
+    # every annotation needs both of its lines; fail with a message that says so
+    if len(rows) % 2 != 0:
+        raise ValueError(
+            f"Expected two lines per annotation in '{aud_txt_path}', "
+            f"but found an odd number of non-blank lines: {len(rows)}"
+        )
+
+    records = []
+    # even-indexed rows hold times + label, odd-indexed rows hold the frequency range
+    for row1, row2 in zip(rows[0::2], rows[1::2]):
+        record = {
+            "begin_time_s": float(row1[0]),
+            "end_time_s": float(row1[1]),
+            "label": str(row1[2]),
+            # row2[0] is skipped: only the two frequency fields are read
+            "low_freq_hz": float(row2[1]),
+            "high_freq_hz": float(row2[2]),
+        }
+        records.append(record)
+    return records
+
+
+def df_to_lines(df: pd.DataFrame) -> list[str]:
+    """Serialize a pandas DataFrame of annotations to the lines
+    of a txt file in Audacity extended label track format.
+
+    This function is (roughly) the inverse of
+    ``crowsetta.formats.bbox.audbbox.txt_to_records``.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        One row per annotation, with the columns declared by
+        ``AudBBoxSchema``. It is validated against that schema
+        before any line is produced.
+
+    Returns
+    -------
+    lines : list
+        Of ``str``, two per row of ``df``, each ending in a newline,
+        so that they can be saved to a text file
+        by calling ``writelines``.
+
+    Notes
+    -----
+    We work with Audacity txt files this way, instead of
+    munging and then calling ``pandas.DataFrame.to_csv``,
+    so that we can be sure that we can round-trip data
+    without corrupting it.
+    """
+    validated = AudBBoxSchema.validate(df)
+
+    lines = []
+    for begin, end, label, low, high in zip(
+        validated["begin_time_s"],
+        validated["end_time_s"],
+        validated["label"],
+        validated["low_freq_hz"],
+        validated["high_freq_hz"],
+    ):
+        # first line of the pair: begin time, end time, label
+        lines.append(f"{float(begin)}\t{float(end)}\t{label}\n")
+        # second line of the pair: a backslash, then the frequency range
+        lines.append(f"\\\t{float(low)}\t{float(high)}\n")
+
+    return lines
+
+
+class AudBBoxSchema(pandera.SchemaModel):
+    """A ``pandera.SchemaModel`` that validates ``pandas`` dataframes
+    loaded from Audacity label tracks
+    in extended format, exported to .txt files
+    https://manual.audacityteam.org/man/importing_and_exporting_labels.html#Extended_format_with_frequency_ranges
+
+    Each row of a valid dataframe is one annotation.
+    Every column is declared with ``coerce=True``, so values are
+    converted to the declared dtype during validation.
+    """
+
+    # start and end of the annotated region (seconds, going by the ``_s`` suffix)
+    begin_time_s: Series[float] = pandera.Field(coerce=True)
+    end_time_s: Series[float] = pandera.Field(coerce=True)
+    # text label attached to the annotated region
+    label: Series[pd.StringDtype] = pandera.Field(coerce=True)
+    # frequency range of the annotated region (Hz, going by the ``_hz`` suffix)
+    low_freq_hz: Series[float] = pandera.Field(coerce=True)
+    high_freq_hz: Series[float] = pandera.Field(coerce=True)
+
+    class Config:
+        # columns must appear in the order they are declared above
+        ordered = True
+        # only the columns declared above are allowed
+        strict = True
+
+
+@crowsetta.interface.BBoxLike.register
+@attr.define
+class AudBBox:
+    """Class that represents Audacity label tracks
+    in extended format, exported to .txt files
+    https://manual.audacityteam.org/man/importing_and_exporting_labels.html#Extended_format_with_frequency_ranges
+
+    Attributes
+    ----------
+    name: str
+        Shorthand name for annotation format: 'aud-bbox'.
+    ext: str
+        Extension of files in annotation format: '.txt'
+    df : pandas.DataFrame
+        with annotations loaded into it
+    annot_path : str, pathlib.Path
+        Path to Audacity .txt file from which annotations were loaded.
+        Converted to a ``pathlib.Path``.
+    audio_path : str, pathlib.Path
+        Path to audio file that the Audacity .txt file annotates.
+        Optional, defaults to None. Converted to a ``pathlib.Path``
+        when it is not None.
+    """
+    # Column names of ``df``, in the order in which they appear
+    COLUMNS_MAP: ClassVar[dict] = {
+        0: "begin_time_s",
+        1: "end_time_s",
+        2: "label",
+        3: "low_freq_hz",
+        4: "high_freq_hz",
+    }
+
+    name: ClassVar[str] = 'aud-bbox'
+    ext: ClassVar[str] = '.txt'
+
+    df: pd.DataFrame
+    # converted like ``audio_path``, so that a ``str`` is accepted as documented
+    annot_path: pathlib.Path = attr.field(converter=pathlib.Path)
+    audio_path: Optional[pathlib.Path] = attr.field(default=None,
+                                                    converter=attr.converters.optional(pathlib.Path))
+
+    @classmethod
+    def from_file(cls,
+                  annot_path: PathLike,
+                  audio_path: Optional[PathLike] = None) -> 'Self':  # noqa: F821
+        """Load annotations from an Audacity label track
+        in extended format, exported to a .txt file.
+
+        Parameters
+        ----------
+        annot_path : str, pathlib.Path
+            Path to a .txt file with an Audacity label track in extended format.
+            Must have extension '.txt'.
+        audio_path : str, pathlib.Path
+            Path to audio file that the Audacity .txt file annotates.
+            Optional, defaults to None.
+
+        Returns
+        -------
+        audbbox : AudBBox
+            With the validated annotations in its ``df`` attribute.
+            A file that contains no labels gives an empty ``df``.
+
+        Examples
+        --------
+        >>> example = crowsetta.data.get('aud-bbox')
+        >>> audbbox = crowsetta.formats.bbox.AudBBox.from_file(example.annot_path)
+        """
+        annot_path = pathlib.Path(annot_path)
+        crowsetta.validation.validate_ext(annot_path, extension=cls.ext)
+        records = crowsetta.formats.bbox.audbbox.txt_to_records(annot_path)
+        # Name the columns explicitly: with no records at all, ``from_records``
+        # would otherwise build a dataframe without the columns the schema expects.
+        df = pd.DataFrame.from_records(records, columns=list(cls.COLUMNS_MAP.values()))
+        df = crowsetta.formats.bbox.audbbox.AudBBoxSchema.validate(df)
+
+        return cls(
+            df=df,
+            annot_path=annot_path,
+            audio_path=audio_path,
+        )
+
+    def to_bbox(self) -> List[crowsetta.BBox]:
+        """Convert this Audacity extended label track annotation to a ``list`` of ``crowsetta.BBox``.
+
+        Returns
+        -------
+        bboxes : list
+            of ``crowsetta.BBox``, one per row of ``self.df``, in row order
+
+        Examples
+        --------
+        >>> example = crowsetta.data.get('aud-bbox')
+        >>> audbbox = crowsetta.formats.bbox.AudBBox.from_file(example.annot_path)
+        >>> bboxes = audbbox.to_bbox()
+        """
+        return [
+            crowsetta.BBox(onset=begin_time,
+                           offset=end_time,
+                           low_freq=low_freq,
+                           high_freq=high_freq,
+                           label=label)
+            for begin_time, end_time, label, low_freq, high_freq in zip(
+                self.df.begin_time_s.values,
+                self.df.end_time_s.values,
+                self.df.label.values,
+                self.df.low_freq_hz.values,
+                self.df.high_freq_hz.values,
+            )
+        ]
+
+    def to_annot(self) -> crowsetta.Annotation:
+        """Convert this Audacity bbox annotation to a ``crowsetta.Annotation``.
+
+        Returns
+        -------
+        annot : crowsetta.Annotation
+            With ``annot_path`` and ``notated_path`` taken from this instance's
+            ``annot_path`` and ``audio_path``, and the bounding boxes
+            returned by ``to_bbox``.
+
+        Examples
+        --------
+        >>> example = crowsetta.data.get('aud-bbox')
+        >>> audacitybbox = crowsetta.formats.bbox.AudBBox.from_file(example.annot_path)
+        >>> annot = audacitybbox.to_annot()
+        """
+        bboxes = self.to_bbox()
+        return crowsetta.Annotation(annot_path=self.annot_path,
+                                    notated_path=self.audio_path,
+                                    bboxes=bboxes)
+
+    def to_file(self,
+                annot_path: PathLike) -> None:
+        """Make a .txt file from this annotation
+        in extended label track format that can be read by Audacity.
+
+        Parameters
+        ----------
+        annot_path : str, pathlib.Path
+             Path including filename where file should be saved.
+             Must have extension '.txt'
+        """
+        crowsetta.validation.validate_ext(annot_path, extension=self.ext)
+        # ``df_to_lines`` validates ``self.df`` before converting it to lines
+        lines = df_to_lines(self.df)
+        with pathlib.Path(annot_path).open('w') as fp:
+            fp.writelines(lines)