Source code for vocalpy.segment.meansquared

from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING, Iterable

import numpy as np
import numpy.typing as npt

from .. import signal
from ..params import Params
from ..segments import Segments

if TYPE_CHECKING:
    from .. import Sound



[docs]
@dataclass
class MeanSquaredParams(Params):
    """Data class that represents parameters
    for :func:`vocalpy.segment.meansquared`.

    The attributes and their defaults mirror the keyword parameters
    of that function, one-to-one.

    Attributes
    ----------
    threshold : int
        Value above which mean squared signal is considered part of a segment.
        Default is 5000.
    min_dur : float
        Minimum duration of a segment, in seconds.
        Default is 0.02, i.e. 20 ms.
    min_silent_dur : float
        Minimum duration of silent gap between segments, in seconds.
        Default is 0.002, i.e. 2 ms.
    freq_cutoffs : Iterable
        Cutoff frequencies for bandpass filter.
        List or tuple with two elements, default is ``(500, 10000)``.
    smooth_win : int
        Size of smoothing window in milliseconds. Default is 2.
    scale : bool
        If True, scale the ``sound.data``.
        Default is True.
        This is needed to replicate the behavior of ``evsonganaly``,
        which assumes the audio data is loaded as 16-bit integers.
        Since the default for :class:`vocalpy.Sound` is to load sounds
        with a numpy dtype of float64, the segmenting function defaults to
        multiplying the ``sound.data`` by 2**15,
        and then casting to the int16 dtype.
        This replicates the behavior of the ``evsonganaly`` function,
        given data with dtype float64.
        If you have loaded a sound with a dtype of int16,
        then set this to False.
    scale_val : int or float
        Value to multiply the ``sound.data`` by, to scale the data.
        Default is 2**15.
        Only used if ``scale`` is ``True``.
        This is needed to replicate the behavior of ``evsonganaly``,
        which assumes the audio data is loaded as 16-bit integers.
    scale_dtype : numpy.dtype
        Numpy Dtype to cast ``sound.data`` to, after scaling.
        Default is ``np.int16``.
        Only used if ``scale`` is ``True``.
        This is needed to replicate the behavior of ``evsonganaly``,
        which assumes the audio data is loaded as 16-bit integers.
    """

    threshold: int = 5000
    min_dur: float = 0.02
    min_silent_dur: float = 0.002
    freq_cutoffs: Iterable = (500, 10000)
    smooth_win: int = 2
    scale: bool = True
    scale_val: int | float = 2**15
    scale_dtype: npt.DTypeLike = np.int16




[docs]
def meansquared(
    sound: Sound,
    threshold: int = 5000,
    min_dur: float = 0.02,
    min_silent_dur: float = 0.002,
    freq_cutoffs: Iterable = (500, 10000),
    smooth_win: int = 2,
    scale: bool = True,
    scale_val: int | float = 2**15,
    scale_dtype: npt.DTypeLike = np.int16,
) -> Segments | None:
    """Segment audio by thresholding the mean squared signal.

    Converts audio to the mean squared of the signal
    (using :func:`vocalpy.signal.audio.meansquared`).
    Then finds all continuous periods
    in the mean squared signal above ``threshold``.
    These periods are considered candidate segments.
    Candidates are then post-processed in two steps, in this order:

    1. Any two neighboring candidates separated by a silent gap
       that is *not* longer than ``min_silent_dur`` are merged
       into a single segment.
    2. Any remaining segment whose duration is *not* longer than
       ``min_dur`` is removed.

    The segments remaining after this post-processing are returned
    as a :class:`vocalpy.Segments` instance, with start indices and
    lengths given in units of samples.

    Note that :func:`vocalpy.signal.audio.meansquared`
    first filters the audio, with
    :func:`vocalpy.signal.audio.bandpass_filtfilt`,
    using ``freq_cutoffs``, and then computes
    a running average of the squared signal
    by convolving with a window of size ``smooth_win``
    milliseconds.

    Parameters
    ----------
    sound: vocalpy.Sound
        An audio signal. Must have a single channel.
    threshold : int
        Value above which mean squared signal is considered part of a segment.
        Default is 5000.
    min_dur : float
        Minimum duration of a segment, in seconds.
        Default is 0.02, i.e. 20 ms.
    min_silent_dur : float
        Minimum duration of silent gap between segments, in seconds.
        Default is 0.002, i.e. 2 ms.
    freq_cutoffs : Iterable
        Cutoff frequencies for bandpass filter.
        List or tuple with two elements, default is ``(500, 10000)``.
    smooth_win : int
        Size of smoothing window in milliseconds. Default is 2.
    scale : bool
        If True, scale the ``sound.data``.
        Default is True.
        This is needed to replicate the behavior of ``evsonganaly``,
        which assumes the audio data is loaded as 16-bit integers.
        Since the default for :class:`vocalpy.Sound` is to load sounds
        with a numpy dtype of float64, this function defaults to
        multiplying the ``sound.data`` by 2**15,
        and then casting to the int16 dtype.
        This replicates the behavior of the ``evsonganaly`` function,
        given data with dtype float64.
        If you have loaded a sound with a dtype of int16,
        then set this to False.
    scale_val : int or float
        Value to multiply the ``sound.data`` by, to scale the data.
        Default is 2**15.
        Only used if ``scale`` is ``True``.
        This is needed to replicate the behavior of ``evsonganaly``,
        which assumes the audio data is loaded as 16-bit integers.
    scale_dtype : numpy.dtype
        Numpy Dtype to cast ``sound.data`` to, after scaling.
        Default is ``np.int16``.
        Only used if ``scale`` is ``True``.
        This is needed to replicate the behavior of ``evsonganaly``,
        which assumes the audio data is loaded as 16-bit integers.

    Returns
    -------
    segments : vocalpy.Segments or None
        Instance of :class:`vocalpy.Segments` representing
        the segments found.
        ``None`` is returned when the mean squared signal never
        crosses ``threshold``, i.e., there are no candidate segments.
        If candidates are found but all of them are removed by
        the ``min_dur`` filter, a :class:`vocalpy.Segments`
        built from empty arrays is returned instead.

    Raises
    ------
    ValueError
        If ``sound`` has more than one channel.

    Examples
    --------
    >>> sounds = voc.examples('bfsongrepo', return_type='sound')
    >>> segments = voc.segment.meansquared(sounds[0])
    >>> print(segments)

    """
    n_channels = sound.data.shape[0]
    if n_channels > 1:
        raise ValueError(
            f"The ``sound`` has {n_channels} channels, but segmentation is not implemented "
            "for sounds with multiple channels. This is because there can be a different number of segments "
            "per channel, which cannot be represented as a rectangular array. To segment each channel, "
            "first split the channels into separate ``vocalpy.Sound`` instances, then pass each to this function. "
            "For example,\n"
            ">>> sound_channels = [sound_ for sound_ in sound]  # split with a list comprehension\n"
            ">>> channel_segments = [vocalpy.segment.meansquared(sound_) for sound_ in sound_channels]\n"
        )

    if scale:
        # Build a new Sound from the scaled data rather than scaling in place,
        # so the caller's ``sound`` is never mutated. A Sound (not a bare array)
        # is required because that is what ``signal.audio.meansquared`` takes.
        # Imported here since at module level ``Sound`` is only imported
        # under ``TYPE_CHECKING``.
        from .. import Sound

        sound_to_square = Sound(
            data=(sound.data * scale_val).astype(scale_dtype),
            samplerate=sound.samplerate,
        )
    else:
        sound_to_square = sound

    ms_signal = signal.audio.meansquared(
        sound_to_square, freq_cutoffs, smooth_win
    )

    # we get rid of the channel dimension *after* calling ``signal.audio.meansquared``
    # because that function *does* work on multi-channel data
    ms_signal = np.squeeze(ms_signal, axis=0)
    above_th = ms_signal > threshold

    # Convolving with [1, -1] (default "full" mode) gives, at index k,
    # above_th[k] - above_th[k - 1] with implicit zeros past both ends, so:
    # +1 wherever above_th changes from 0 to 1 (an onset),
    # -1 wherever above_th changes from 1 to 0 (an offset).
    # The implicit zero-padding means a signal that starts or ends
    # above threshold still gets a matching onset / offset.
    transitions = np.convolve([1, -1], above_th)

    # always get in units of sample first, then convert to s
    onsets_sample = np.flatnonzero(transitions > 0)
    offsets_sample = np.flatnonzero(transitions < 0)

    if onsets_sample.size == 0 or offsets_sample.size == 0:
        # Nothing crossed the threshold in this sound.
        # NOTE: ``None`` (not an empty ``Segments``) is kept here
        # for backward compatibility with existing callers.
        return None

    onsets_s = onsets_sample / sound.samplerate
    offsets_s = offsets_sample / sound.samplerate

    # Merge segments separated by silent gaps that are not longer than
    # min_silent_dur. Gap ``i`` lies between offset ``i`` and onset ``i + 1``;
    # dropping that offset/onset pair fuses the two neighboring segments.
    # The first onset and the last offset are always kept.
    gap_is_long_enough = (onsets_s[1:] - offsets_s[:-1]) > min_silent_dur
    keep_onset = np.concatenate(([True], gap_is_long_enough))
    keep_offset = np.concatenate((gap_is_long_enough, [True]))
    # the same masks are applied to both units, so they stay aligned
    onsets_s, onsets_sample = onsets_s[keep_onset], onsets_sample[keep_onset]
    offsets_s, offsets_sample = offsets_s[keep_offset], offsets_sample[keep_offset]

    # eliminate syllables with duration not longer than min_dur
    dur_is_long_enough = (offsets_s - onsets_s) > min_dur
    # we no longer need onsets/offsets in seconds, so we just update onsets_sample/offsets_sample
    onsets_sample = onsets_sample[dur_is_long_enough]
    offsets_sample = offsets_sample[dur_is_long_enough]

    return Segments(
        start_inds=onsets_sample,
        lengths=offsets_sample - onsets_sample,
        samplerate=sound.samplerate,
    )