Source code for vocalpy.segment.meansquared

from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING, Iterable

import numpy as np
import numpy.typing as npt

from .. import signal
from ..params import Params
from ..segments import Segments

if TYPE_CHECKING:
    from .. import Sound


@dataclass
class MeanSquaredParams(Params):
    """Data class that represents parameters
    for :func:`vocalpy.segment.meansquared`.

    The attributes mirror the keyword arguments of that function,
    with the same names and the same default values.

    Attributes
    ----------
    threshold : int
        Value above which mean squared signal is considered part of a segment.
        Default is 5000.
    min_dur : float
        Minimum duration of a segment, in seconds.
        Default is 0.02, i.e. 20 ms.
    min_silent_dur : float
        Minimum duration of silent gap between segments, in seconds.
        Default is 0.002, i.e. 2 ms.
    freq_cutoffs : Iterable
        Cutoff frequencies for bandpass filter.
        List or tuple with two elements, default is ``(500, 10000)``.
    smooth_win : int
        Size of smoothing window in milliseconds. Default is 2.
    scale : bool
        If True, scale the ``sound.data``. Default is True.
        This is needed to replicate the behavior of ``evsonganaly``,
        which assumes the audio data is loaded as 16-bit integers.
        Since the default for :class:`vocalpy.Sound` is to load sounds
        with a numpy dtype of float64, the segmenting function defaults to
        multiplying the ``sound.data`` by 2**15,
        and then casting to the int16 dtype.
        This replicates the behavior of the ``evsonganaly`` function,
        given data with dtype float64.
        If you have loaded a sound with a dtype of int16,
        then set this to False.
    scale_val : int or float
        Value to multiply the ``sound.data`` by, to scale the data.
        Default is 2**15. Only used if ``scale`` is ``True``.
        This is needed to replicate the behavior of ``evsonganaly``,
        which assumes the audio data is loaded as 16-bit integers.
    scale_dtype : numpy.dtype
        Numpy Dtype to cast ``sound.data`` to, after scaling.
        Default is ``np.int16``. Only used if ``scale`` is ``True``.
        This is needed to replicate the behavior of ``evsonganaly``,
        which assumes the audio data is loaded as 16-bit integers.
    """

    threshold: int = 5000
    min_dur: float = 0.02
    min_silent_dur: float = 0.002
    freq_cutoffs: Iterable = (500, 10000)
    smooth_win: int = 2
    scale: bool = True
    scale_val: int | float = 2**15
    scale_dtype: npt.DTypeLike = np.int16
def meansquared(
    sound: Sound,
    threshold: int = 5000,
    min_dur: float = 0.02,
    min_silent_dur: float = 0.002,
    freq_cutoffs: Iterable = (500, 10000),
    smooth_win: int = 2,
    scale: bool = True,
    scale_val: int | float = 2**15,
    scale_dtype: npt.DTypeLike = np.int16,
) -> Segments:
    """Segment audio by thresholding the mean squared signal.

    Converts audio to the mean squared of the signal
    (using :func:`vocalpy.signal.audio.meansquared`).
    Then finds all continuous periods in the mean squared signal
    above ``threshold``. These periods are considered candidate segments.
    First, any two candidates with a silent gap between them
    that is not longer than ``min_silent_dur`` are merged into
    a single segment; then, segments whose duration is not longer
    than ``min_dur`` are removed.
    The segments remaining after this post-processing are returned
    as a :class:`vocalpy.Segments` instance.

    Note that :func:`vocalpy.signal.audio.meansquared` first filters
    the audio, with :func:`vocalpy.signal.audio.bandpass_filtfilt`,
    using ``freq_cutoffs``, and then computes a running average
    of the squared signal by convolving with a window
    of size ``smooth_win`` milliseconds.

    Parameters
    ----------
    sound : vocalpy.Sound
        An audio signal. Must have a single channel.
    threshold : int
        Value above which mean squared signal is considered part of a segment.
        Default is 5000.
    min_dur : float
        Minimum duration of a segment, in seconds.
        Default is 0.02, i.e. 20 ms.
    min_silent_dur : float
        Minimum duration of silent gap between segments, in seconds.
        Default is 0.002, i.e. 2 ms.
    freq_cutoffs : Iterable
        Cutoff frequencies for bandpass filter.
        List or tuple with two elements, default is ``(500, 10000)``.
    smooth_win : int
        Size of smoothing window in milliseconds. Default is 2.
    scale : bool
        If True, scale the ``sound.data``. Default is True.
        This is needed to replicate the behavior of ``evsonganaly``,
        which assumes the audio data is loaded as 16-bit integers.
        Since the default for :class:`vocalpy.Sound` is to load sounds
        with a numpy dtype of float64, this function defaults to
        multiplying the ``sound.data`` by 2**15,
        and then casting to the int16 dtype.
        This replicates the behavior of the ``evsonganaly`` function,
        given data with dtype float64.
        If you have loaded a sound with a dtype of int16,
        then set this to False.
    scale_val : int or float
        Value to multiply the ``sound.data`` by, to scale the data.
        Default is 2**15. Only used if ``scale`` is ``True``.
        This is needed to replicate the behavior of ``evsonganaly``,
        which assumes the audio data is loaded as 16-bit integers.
    scale_dtype : numpy.dtype
        Numpy Dtype to cast ``sound.data`` to, after scaling.
        Default is ``np.int16``. Only used if ``scale`` is ``True``.
        This is needed to replicate the behavior of ``evsonganaly``,
        which assumes the audio data is loaded as 16-bit integers.

    Returns
    -------
    segments : vocalpy.Segments
        Instance of :class:`vocalpy.Segments` representing the segments found.
        When the mean squared signal never crosses ``threshold``,
        or when no segment survives post-processing,
        the instance is built from empty ``start_inds`` and ``lengths`` arrays.

    Raises
    ------
    ValueError
        If ``sound`` has more than one channel.

    Examples
    --------
    >>> sounds = voc.examples('bfsongrepo', return_type='sound')
    >>> segments = voc.segment.meansquared(sounds[0])
    >>> print(segments)
    """
    if sound.data.shape[0] > 1:
        raise ValueError(
            f"The ``sound`` has {sound.data.shape[0]} channels, but segmentation is not implemented "
            "for sounds with multiple channels. This is because there can be a different number of segments "
            "per channel, which cannot be represented as a rectangular array. To segment each channel, "
            "first split the channels into separate ``vocalpy.Sound`` instances, then pass each to this function. "
            "For example,\n"
            ">>> sound_channels = [sound_ for sound_ in sound] # split with a list comprehension\n"
            ">>> channel_segments = [vocalpy.segment.meansquared(sound_) for sound_ in sound_channels]\n"
        )

    if scale:
        # Scale into a *new* Sound so the caller's data is never mutated;
        # a Sound is required because that is what
        # ``signal.audio.meansquared`` is called with.
        # Imported here, at call time, rather than at module level
        # (at module level the name is only imported under TYPE_CHECKING).
        from .. import Sound

        sound_to_segment = Sound(
            data=(sound.data * scale_val).astype(scale_dtype),
            samplerate=sound.samplerate,
        )
    else:
        sound_to_segment = sound

    meansquared_ = signal.audio.meansquared(
        sound_to_segment, freq_cutoffs, smooth_win
    )
    # The channel dimension is removed only *after* calling
    # ``signal.audio.meansquared``, which returns an array with
    # a leading channel axis (of size 1 here, given the check above).
    meansquared_ = np.squeeze(meansquared_, axis=0)

    above_th = meansquared_ > threshold
    # Convolving with [1, -1] (default "full" mode) yields
    # +1 wherever ``above_th`` changes from 0 to 1 (an onset)
    # and -1 wherever it changes from 1 to 0 (an offset).
    transitions = np.convolve([1, -1], above_th)
    onsets_sample = np.where(transitions > 0)[0]
    offsets_sample = np.where(transitions < 0)[0]

    if onsets_sample.shape[0] < 1 or offsets_sample.shape[0] < 1:
        # Nothing crossed the threshold. Return empty segments, consistent
        # with the result when every candidate is filtered out below,
        # instead of ``None``, which would violate the declared return type.
        return Segments(
            start_inds=np.empty(0, dtype=onsets_sample.dtype),
            lengths=np.empty(0, dtype=onsets_sample.dtype),
            samplerate=sound.samplerate,
        )

    # Durations are compared in seconds, but only the sample indices
    # are needed for the result, so seconds are computed on the fly.
    onsets_s = onsets_sample / sound.samplerate
    offsets_s = offsets_sample / sound.samplerate

    # Merge segments separated by a silent gap that is too short:
    # drop the offset before, and the onset after, every such gap.
    # The first onset and the last offset are always kept.
    silent_gap_durs = onsets_s[1:] - offsets_s[:-1]
    gap_is_long_enough = silent_gap_durs > min_silent_dur
    onsets_sample = np.concatenate(
        (onsets_sample[:1], onsets_sample[1:][gap_is_long_enough])
    )
    offsets_sample = np.concatenate(
        (offsets_sample[:-1][gap_is_long_enough], offsets_sample[-1:])
    )

    # Eliminate segments whose duration is too short.
    segment_durs = (
        offsets_sample / sound.samplerate - onsets_sample / sound.samplerate
    )
    segment_is_long_enough = segment_durs > min_dur
    onsets_sample = onsets_sample[segment_is_long_enough]
    offsets_sample = offsets_sample[segment_is_long_enough]

    return Segments(
        start_inds=onsets_sample,
        lengths=offsets_sample - onsets_sample,
        samplerate=sound.samplerate,
    )