Source code for vocalpy.segment.ava

"""Find segments in audio, using algorithm from ``ava`` package."""

from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING

import numpy as np
import numpy.typing as npt
from scipy.ndimage import gaussian_filter
from scipy.signal import stft

from ..params import Params
from ..segments import Segments

if TYPE_CHECKING:
    from .. import Sound


EPSILON = 1e-9



[docs]
@dataclass
class AvaParams(Params):
    """Data class that represents parameters
    for :func:`vocalpy.segment.ava`.

    Constants in this module are instances of this class
    that represent parameters used in papers.

    Attributes
    ----------
    nperseg : int
        Number of samples per segment for Short-Time Fourier Transform.
        Default is 1024.
    noverlap : int
        Number of samples to overlap per segment
        for Short-Time Fourier Transform.
        Default is 512.
    min_freq : int
        Minimum frequency. Spectrogram is "cropped"
        below this frequency (instead of, e.g.,
        bandpass filtering). Default is 30e3.
    max_freq : int
        Maximum frequency. Spectrogram is "cropped"
        above this frequency (instead of, e.g.,
        bandpass filtering). Default is 110e3.
    spect_min_val : float, optional
        Expected minimum value of spectrogram
        after transforming to the log of the
        magnitude. Used for a min-max scaling:
        :math:`(s - s_{min} / (s_{max} - s_{min})`
        where ``spect_min_val`` is :math:`s_{min}`.
        Default is None, in which case
        the minimum value of the spectrogram is used.
    spect_max_val : float, optional
        Expected maximum value of spectrogram
        after transforming to the log of the
        magnitude. Used for a min-max scaling:
        :math:`(s - s_{min} / (s_{max} - s_{min})`
        where ``spect_min_val`` is :math:`s_{min}`.
        Default is None, in which case
        the maximum value of the spectrogram is used.
    thresh_max : float
        Threshold used to find local maxima.
    thresh_min : float
        Threshold used to find local minima,
        in relation to local maxima.
        Used to find onsets and offsets of segments.
    thresh_lowest : float
        Lowest threshold used to find onsets and offsets
        of segments.
    min_dur : float
        Minimum duration of a segment, in seconds.
    max_dur : float
        Maximum duration of a segment, in seconds.
    min_isi_dur : float, optional
        Minimum duration of inter-segment intervals, in seconds.
        If specified, any inter-segment intervals shorter than this value
        will be removed, and the adjacent segments merged.
        Default is None.
    use_softmax_amp : bool
        If True, compute summed spectral power from spectrogram
        with a softmax operation on each column.
        Default is True.
    temperature : float
        Temperature for softmax. Only used if ``use_softmax_amp`` is True.
    smoothing_timescale : float
        Timescale to use when smoothing summed spectral power
        with a gaussian filter.
        The window size will be ``dt - smoothing_timescale / samplerate``,
        where ``dt`` is the size of a time bin in the spectrogram.
    scale : bool
        If True, scale the ``sound.data``.
        Default is True.
        This is needed to replicate the behavior of ``ava``,
        which assumes the audio data is loaded as 16-bit integers.
        Since the default for :class:`vocalpy.Sound` is to load sounds
        with a numpy dtype of float64, this function defaults to
        multiplying the ``sound.data`` by 2**15,
        and then casting to the int16 dtype.
        This replicates the behavior of the ``ava`` function,
        given data with dtype float64.
        If you have loaded a sound with a dtype of int16,
        then set this to False.
    scale_val :
        Value to multiply the ``sound.data`` by, to scale the data.
        Default is 2**15.
        Only used if ``scale`` is ``True``.
        This is needed to replicate the behavior of ``ava``,
        which assumes the audio data is loaded as 16-bit integers.
    scale_dtype : numpy.dtype
        Numpy Dtype to cast ``sound.data`` to, after scaling.
        Default is ``np.int16``.
        Only used if ``scale`` is ``True``.
        This is needed to replicate the behavior of ``ava``,
        which assumes the audio data is loaded as 16-bit integers.

    Examples
    --------
    >>> jourjineetal2023paths = voc.example('jourjine-et-al-2023')
    >>> wav_path = jourjine2023paths[0]
    >>> sound = voc.Sound.read(wav_path)
    >>> onsets, offsets = voc.segment.ava(sound, **voc.segment.ava.JOURJINEETAL2023)
    """

    nperseg: int = 1024
    noverlap: int = 512
    min_freq: float = 20e3
    max_freq: float = 125e3
    spect_min_val: float = 0.8
    spect_max_val: float = 6.0
    thresh_lowest: float = 0.3
    thresh_min: float = 0.3
    thresh_max: float = 0.35
    min_dur: float = 0.015
    max_dur: float = 1.0
    min_isi_dur: float | None = None
    use_softmax_amp: bool = False
    temperature: float = 0.01
    smoothing_timescale: float = 0.00025
    scale: bool = True
    scale_val: int | float = 2**15
    scale_dtype: npt.DTypeLike = np.int16



# from https://github.com/nickjourjine/peromyscus-pup-vocal-evolution/blob/main/scripts/Segmenting%20and%20UMAP.ipynb
JOURJINEETAL2023 = AvaParams(
    nperseg=1024,
    noverlap=512,
    min_freq=20e3,
    max_freq=125e3,
    spect_min_val=0.8,
    spect_max_val=6.0,
    thresh_lowest=0.3,
    thresh_min=0.3,
    thresh_max=0.35,
    min_dur=0.015,
    max_dur=1.0,
    min_isi_dur=0.004,
    use_softmax_amp=False,
    temperature=0.01,
    smoothing_timescale=0.00025,
)


# from https://github.com/ralphpeterson/gerbil-vocal-dialects/blob/main/figure1-audio-segmenting-example.ipynb
PETERSONETAL2023 = AvaParams(
    nperseg=512,
    noverlap=256,
    min_freq=500.0,
    max_freq=62.5e3,
    spect_min_val=-8.0,
    spect_max_val=-7.25,
    thresh_lowest=2,
    thresh_min=5,
    thresh_max=2,
    min_dur=0.03,
    max_dur=0.3,
    use_softmax_amp=False,
    temperature=0.01,
    smoothing_timescale=0.007,
)



[docs]
def ava(
    sound: Sound,
    nperseg: int = 1024,
    noverlap: int = 512,
    min_freq: int = 30e3,
    max_freq: int = 110e3,
    spect_min_val: float | None = None,
    spect_max_val: float | None = None,
    thresh_lowest: float = 0.1,
    thresh_min: float = 0.2,
    thresh_max: float = 0.3,
    min_dur: float = 0.03,
    max_dur: float = 0.2,
    min_isi_dur: float | None = None,
    use_softmax_amp: bool = True,
    temperature: float = 0.5,
    smoothing_timescale: float = 0.007,
    scale: bool = True,
    scale_val: int | float = 2**15,
    scale_dtype: npt.DTypeLike = np.int16,
) -> Segments:
    """Find segments in audio, using algorithm
    from ``ava`` package.

    Segments audio by generating a spectrogram from it,
    summing power across frequencies, and then thresholding
    this summed spectral power as if it were an amplitude trace.

    The spectral power is segmented with three thresholds,
    ``thresh_lowest``, ``thresh_min``, and ``thresh_max``, where
    ``thresh_lowest <= thresh_min <= thresh_max``.
    The segmenting algorithm works as follows:
    first detect all local maxima that exceed ``thresh_max``.
    Then for each local maximum, find onsets and offsets.
    An offset is detected wherever a local maxima
    is followed by a subsequent local minimum
    in the summed spectral power less than ``thresh_min``,
    or when the power is less than ``thresh_lowest``.
    Onsets are located in the same way,
    by looking for a preceding local minimum
    less than ``thresh_min``, or any value less than ``thresh_lowest``.

    Parameters
    ----------
    sound : vocalpy.Sound
        Sound loaded from an audio file.
    nperseg : int
        Number of samples per segment for Short-Time Fourier Transform.
        Default is 1024.
    noverlap : int
        Number of samples to overlap per segment
        for Short-Time Fourier Transform.
        Default is 512.
    min_freq : int
        Minimum frequency. Spectrogram is "cropped"
        below this frequency (instead of, e.g.,
        bandpass filtering). Default is 30e3.
    max_freq : int
        Maximum frequency. Spectrogram is "cropped"
        above this frequency (instead of, e.g.,
        bandpass filtering). Default is 110e3.
    spect_min_val : float, optional
        Expected minimum value of spectrogram
        after transforming to the log of the
        magnitude. Used for a min-max scaling:
        :math:`(s - s_{min} / (s_{max} - s_{min})`
        where ``spect_min_val`` is :math:`s_{min}`.
        Default is None, in which case
        the minimum value of the spectrogram is used.
    spect_max_val : float, optional
        Expected maximum value of spectrogram
        after transforming to the log of the
        magnitude. Used for a min-max scaling:
        :math:`(s - s_{min} / (s_{max} - s_{min})`
        where ``spect_min_val`` is :math:`s_{min}`.
        Default is None, in which case
        the maximum value of the spectrogram is used.
    thresh_max : float
        Threshold used to find local maxima.
    thresh_min : float
        Threshold used to find local minima,
        in relation to local maxima.
        Used to find onsets and offsets of segments.
    thresh_lowest : float
        Lowest threshold used to find onsets and offsets
        of segments.
    min_dur : float
        Minimum duration of a segment, in seconds.
    max_dur : float
        Maximum duration of a segment, in seconds.
    min_isi_dur : float, optional
        Minimum duration of inter-segment intervals, in seconds.
        If specified, any inter-segment intervals shorter than this value
        will be removed, and the adjacent segments merged.
        Default is None.
    use_softmax_amp : bool
        If True, compute summed spectral power from spectrogram
        with a softmax operation on each column.
        Default is True.
    temperature : float
        Temperature for softmax. Only used if ``use_softmax_amp`` is True.
    smoothing_timescale : float
        Timescale to use when smoothing summed spectral power
        with a gaussian filter.
        The window size will be ``dt - smoothing_timescale / samplerate``,
        where ``dt`` is the size of a time bin in the spectrogram.
    scale : bool
        If True, scale the ``sound.data``.
        Default is True.
        This is needed to replicate the behavior of ``ava``,
        which assumes the audio data is loaded as 16-bit integers.
        Since the default for :class:`vocalpy.Sound` is to load sounds
        with a numpy dtype of float64, this function defaults to
        multiplying the ``sound.data`` by 2**15,
        and then casting to the int16 dtype.
        This replicates the behavior of the ``ava`` function,
        given data with dtype float64.
        If you have loaded a sound with a dtype of int16,
        then set this to False.
    scale_val :
        Value to multiply the ``sound.data`` by, to scale the data.
        Default is 2**15.
        Only used if ``scale`` is ``True``.
        This is needed to replicate the behavior of ``ava``,
        which assumes the audio data is loaded as 16-bit integers.
    scale_dtype : numpy.dtype
        Numpy Dtype to cast ``sound.data`` to, after scaling.
        Default is ``np.int16``.
        Only used if ``scale`` is ``True``.
        This is needed to replicate the behavior of ``ava``,
        which assumes the audio data is loaded as 16-bit integers.

    Returns
    -------
    segments : vocalpy.Segments
        Instance of :class:`vocalpy.Segments` representing
        the segments found.

    Examples
    --------
    >>> jourjineetal2023paths = voc.example('jourjine-et-al-2023')
    >>> wav_path = jourjineetal2023paths[0]
    >>> sound = voc.Sound.read(wav_path)
    >>> params = {**voc.segment.ava.JOURJINEETAL2023}
    >>> del params['min_isi_dur']
    >>> segments = voc.segment.ava(sound, **params)
    >>> spect = voc.spectrogram(sound)
    >>> rows = 3; cols = 4
    >>> import matplotlib.pyplot as plt
    >>> fig, ax_arr = plt.subplots(rows, cols)
    >>> start_inds, stop_inds = segments.start_inds, segments.stop_inds
    >>> ax_to_use = ax_arr.ravel()[:start_inds.shape[0]]
    >>> for start_ind, stop_ind, ax in zip(start_inds, stop_inds, ax_to_use):
    ...     data = sound.data[:, start_ind:stop_ind]
    ...     newsound = voc.Sound(data=data, samplerate=sound.samplerate)
    ...     spect = voc.spectrogram(newsound)
    ...     ax.pcolormesh(spect.times, spect.frequencies, np.squeeze(spect.data))
    >>> for ax in ax_arr.ravel()[:start_inds.shape[0]]:
    ...     ax.set_axis_off()
    >>> for ax in ax_arr.ravel()[start_inds.shape[0]:]:
    ...     ax.remove()

    Notes
    -----
    Code is adapted from [2]_.
    Default parameters are taken from example script here:
    https://github.com/pearsonlab/autoencoded-vocal-analysis/blob/master/examples/mouse_sylls_mwe.py
    Note that example script suggests tuning these parameters using functionality built into it,
    that we do not replicate here.

    Versions of this algorithm were also used to segment
    rodent vocalizations in [4]_ (see code in [5]_)
    and [6]_ (see code in [7]_).

    References
    ----------
    .. [1] Goffinet, J., Brudner, S., Mooney, R., & Pearson, J. (2021).
      Low-dimensional learned feature spaces quantify individual and group differences in vocal repertoires.
      eLife, 10:e67855. https://doi.org/10.7554/eLife.67855

    .. [2] https://github.com/pearsonlab/autoencoded-vocal-analysis

    .. [3] Goffinet, J., Brudner, S., Mooney, R., & Pearson, J. (2021).
      Data from: Low-dimensional learned feature spaces quantify individual
      and group differences in vocal repertoires. Duke Research Data Repository.
      https://doi.org/10.7924/r4gq6zn8w

    .. [4] Nicholas Jourjine, Maya L. Woolfolk, Juan I. Sanguinetti-Scheck, John E. Sabatini,
      Sade McFadden, Anna K. Lindholm, Hopi E. Hoekstra,
      Two pup vocalization types are genetically and functionally separable in deer mice,
      Current Biology, 2023 https://doi.org/10.1016/j.cub.2023.02.045

    .. [5] https://github.com/nickjourjine/peromyscus-pup-vocal-evolution/blob/main/src/segmentation.py

    .. [6] Peterson, Ralph Emilio, Aman Choudhri, Catalin MItelut, Aramis Tanelus, Athena Capo-Battaglia,
      Alex H. Williams, David M. Schneider, and Dan H. Sanes.
      "Unsupervised discovery of family specific vocal usage in the Mongolian gerbil."
      bioRxiv (2023): 2023-03.

    .. [7] https://github.com/ralphpeterson/gerbil-vocal-dialects/blob/main/vocalization_segmenting.py
    """
    if sound.data.shape[0] > 1:
        raise ValueError(
            f"The ``sound`` has {sound.data.shape[0]} channels, but segmentation is not implemented "
            "for sounds with multiple channels. This is because there can be a different number of segments "
            "per channel, which cannot be represented as a rectangular array. To segment each channel, "
            "first split the channels into separate ``vocalpy.Sound`` instances, then pass each to this function."
            "For example,\n"
            ">>> sound_channels = [sound_ for sound_ in sound]  # split with a list comprehension\n"
            ">>> channel_segments = [vocalpy.segment.meansquared(sound_) for sound_ in sound_channels]\n"
        )

    data = np.squeeze(
        sound.data, axis=0
    )  # get rid of channels dim so we operate on scalars in main loop

    if scale:
        data = (data * scale_val).astype(scale_dtype)

    # ---- compute spectrogram
    # TODO: return Spectrogram for each Segment when we return Segments
    f, t, spect = stft(
        data, sound.samplerate, nperseg=nperseg, noverlap=noverlap
    )
    i1 = np.searchsorted(f, min_freq)
    i2 = np.searchsorted(f, max_freq)
    f, spect = f[i1:i2], spect[i1:i2]
    spect = np.log(np.abs(spect) + EPSILON)
    spect -= spect_min_val
    spect /= spect_max_val - spect_min_val
    spect = np.clip(spect, 0.0, 1.0)

    # we determine `dt` here in case we need it for `amps`
    # we also use it below to remove segments shorter than the minimum allowed value
    dt = t[1] - t[0]

    # ---- calculate amplitude and smooth.
    if use_softmax_amp:
        temp = np.exp(spect / temperature)
        temp /= np.sum(temperature, axis=0) + EPSILON
        amps = np.sum(np.multiply(spect, temp), axis=0)
    else:
        amps = np.sum(spect, axis=0)
    amps = gaussian_filter(amps, smoothing_timescale / dt)

    # Find local maxima greater than thresh_max.
    local_maxima = []
    for i in range(1, len(amps) - 1, 1):
        if amps[i] > thresh_max and amps[i] == np.max(
            amps[i - 1 : i + 2]  # noqa: E203
        ):
            local_maxima.append(i)

    # Then search to the left and right for onsets and offsets.
    onsets, offsets = [], []
    for local_max in local_maxima:
        # skip this local_max if we are in a region we already declared a segment.
        # Note the next line in the original implementation had an off-by-one error,
        # that is fixed here to avoid occasionally duplicating the first segment. See:
        # https://github.com/pearsonlab/autoencoded-vocal-analysis/issues/12
        if len(offsets) > 0 and local_max < offsets[-1]:
            continue

        # first find onset
        i = local_max - 1
        while (
            i > 0
        ):  # could we do ``while i > min(0, i - max_syl_length)`` to speed up?
            # this if-else can be a single `if` with an `or`
            # and then I think we can remove the `if len(onsets)` blocks
            if amps[i] < thresh_lowest:
                onsets.append(i)
                break
            elif amps[i] < thresh_min and amps[i] == np.min(
                amps[i - 1 : i + 2]  # noqa: E203
            ):
                onsets.append(i)
                break
            i -= 1

        # if we found multiple onsets because of if/else, then only keep one
        if len(onsets) != len(offsets) + 1:
            onsets = onsets[: len(offsets)]
            continue

        # then find offset
        i = local_max + 1
        while i < len(
            amps
        ):  # could we do ``while i > min(amps, i + max_syl_length)`` to speed up?
            # this if-else can be a single `if` with an `or`
            # and then I think we can remove the `if len(onsets)` blocks
            if amps[i] < thresh_lowest:
                offsets.append(i)
                break
            elif amps[i] < thresh_min and amps[i] == np.min(
                amps[i - 1 : i + 2]  # noqa: E203
            ):
                offsets.append(i)
                break
            i += 1

        # if we found multiple offsets because of if/else, then only keep one
        # shouldn't this change offsets though?
        if len(onsets) != len(offsets):
            onsets = onsets[: len(offsets)]
            continue

    # Throw away segments that are too long or too short.
    min_dur_samples = int(np.floor(min_dur / dt))
    max_dur_samples = int(np.ceil(max_dur / dt))
    new_onsets, new_offsets = [], []
    for i in range(len(offsets)):
        t1, t2 = onsets[i], offsets[i]
        if t2 - t1 + 1 <= max_dur_samples and t2 - t1 + 1 >= min_dur_samples:
            new_onsets.append(t1 * dt)
            new_offsets.append(t2 * dt)
    onsets = np.array(new_onsets)
    offsets = np.array(new_offsets)

    if onsets.size == 0 and offsets.size == 0:
        # can't throw any intervals away (next code block)
        # if there's not any intervals, so, return empty Segments
        return Segments(
            np.array([]).astype(int),
            np.array([]).astype(int),
            sound.samplerate,
        )

    # Throw away inter-segment intervals that are too short, as is done in Jourjine et al., 2023
    # Note we do this **after** throwing away segments that are too long or too short.
    # We do this to replicate what was done in Jourjine et al., 2023, where they call `ava.get_onsets_offsets`
    # and then remove inter-syllable intervals less than a specified duration.
    # This means there is the possibility of throwing away some short segments that we might have merged,
    # if we'd removed inter-segment intervals *first*, which is what `vocalpy.segment.meansquared` does.
    if min_isi_dur is not None:
        isi_durs = onsets[1:] - offsets[:-1]
        keep_these = isi_durs > min_isi_dur
        # we don't keep seconds anymore since we're returning samples
        onsets = np.concatenate(
            (onsets[0, np.newaxis], onsets[1:][keep_these])
        )
        offsets = np.concatenate(
            (offsets[:-1][keep_these], offsets[-1, np.newaxis])
        )

    onsets_sample = (onsets * sound.samplerate).astype(int)
    offsets_sample = (offsets * sound.samplerate).astype(int)
    lengths = offsets_sample - onsets_sample
    # Handle edge case where we decide last time bin is offset,
    # and the time of the last time bin in `t` (as computed from `dt`) is greater than duration of sound.
    # Fixes https://github.com/vocalpy/vocalpy/issues/167
    if onsets_sample[-1] + lengths[-1] > sound.samples:
        # set length to be "until the end of the sound"
        lengths[-1] = sound.samples - onsets_sample[-1]
    return Segments(
        start_inds=onsets_sample, lengths=lengths, samplerate=sound.samplerate
    )