"""Find segments in audio, using algorithm from ``ava`` package."""
from __future__ import annotations
from dataclasses import dataclass
from typing import TYPE_CHECKING
import numpy as np
import numpy.typing as npt
from scipy.ndimage import gaussian_filter
from scipy.signal import stft
from ..params import Params
from ..segments import Segments
if TYPE_CHECKING:
from .. import Sound
EPSILON = 1e-9
[docs]
@dataclass
class AvaParams(Params):
"""Data class that represents parameters
for :func:`vocalpy.segment.ava`.
Constants in this module are instances of this class
that represent parameters used in papers.
Attributes
----------
nperseg : int
Number of samples per segment for Short-Time Fourier Transform.
Default is 1024.
noverlap : int
Number of samples to overlap per segment
for Short-Time Fourier Transform.
Default is 512.
min_freq : int
Minimum frequency. Spectrogram is "cropped"
below this frequency (instead of, e.g.,
bandpass filtering). Default is 30e3.
max_freq : int
Maximum frequency. Spectrogram is "cropped"
above this frequency (instead of, e.g.,
bandpass filtering). Default is 110e3.
spect_min_val : float, optional
Expected minimum value of spectrogram
after transforming to the log of the
magnitude. Used for a min-max scaling:
:math:`(s - s_{min} / (s_{max} - s_{min})`
where ``spect_min_val`` is :math:`s_{min}`.
Default is None, in which case
the minimum value of the spectrogram is used.
spect_max_val : float, optional
Expected maximum value of spectrogram
after transforming to the log of the
magnitude. Used for a min-max scaling:
:math:`(s - s_{min} / (s_{max} - s_{min})`
where ``spect_min_val`` is :math:`s_{min}`.
Default is None, in which case
the maximum value of the spectrogram is used.
thresh_max : float
Threshold used to find local maxima.
thresh_min : float
Threshold used to find local minima,
in relation to local maxima.
Used to find onsets and offsets of segments.
thresh_lowest : float
Lowest threshold used to find onsets and offsets
of segments.
min_dur : float
Minimum duration of a segment, in seconds.
max_dur : float
Maximum duration of a segment, in seconds.
min_isi_dur : float, optional
Minimum duration of inter-segment intervals, in seconds.
If specified, any inter-segment intervals shorter than this value
will be removed, and the adjacent segments merged.
Default is None.
use_softmax_amp : bool
If True, compute summed spectral power from spectrogram
with a softmax operation on each column.
Default is True.
temperature : float
Temperature for softmax. Only used if ``use_softmax_amp`` is True.
smoothing_timescale : float
Timescale to use when smoothing summed spectral power
with a gaussian filter.
The window size will be ``dt - smoothing_timescale / samplerate``,
where ``dt`` is the size of a time bin in the spectrogram.
scale : bool
If True, scale the ``sound.data``.
Default is True.
This is needed to replicate the behavior of ``ava``,
which assumes the audio data is loaded as 16-bit integers.
Since the default for :class:`vocalpy.Sound` is to load sounds
with a numpy dtype of float64, this function defaults to
multiplying the ``sound.data`` by 2**15,
and then casting to the int16 dtype.
This replicates the behavior of the ``ava`` function,
given data with dtype float64.
If you have loaded a sound with a dtype of int16,
then set this to False.
scale_val :
Value to multiply the ``sound.data`` by, to scale the data.
Default is 2**15.
Only used if ``scale`` is ``True``.
This is needed to replicate the behavior of ``ava``,
which assumes the audio data is loaded as 16-bit integers.
scale_dtype : numpy.dtype
Numpy Dtype to cast ``sound.data`` to, after scaling.
Default is ``np.int16``.
Only used if ``scale`` is ``True``.
This is needed to replicate the behavior of ``ava``,
which assumes the audio data is loaded as 16-bit integers.
Examples
--------
>>> jourjineetal2023paths = voc.example('jourjine-et-al-2023')
>>> wav_path = jourjine2023paths[0]
>>> sound = voc.Sound.read(wav_path)
>>> onsets, offsets = voc.segment.ava(sound, **voc.segment.ava.JOURJINEETAL2023)
"""
nperseg: int = 1024
noverlap: int = 512
min_freq: float = 20e3
max_freq: float = 125e3
spect_min_val: float = 0.8
spect_max_val: float = 6.0
thresh_lowest: float = 0.3
thresh_min: float = 0.3
thresh_max: float = 0.35
min_dur: float = 0.015
max_dur: float = 1.0
min_isi_dur: float | None = None
use_softmax_amp: bool = False
temperature: float = 0.01
smoothing_timescale: float = 0.00025
scale: bool = True
scale_val: int | float = 2**15
scale_dtype: npt.DTypeLike = np.int16
# from https://github.com/nickjourjine/peromyscus-pup-vocal-evolution/blob/main/scripts/Segmenting%20and%20UMAP.ipynb
JOURJINEETAL2023 = AvaParams(
nperseg=1024,
noverlap=512,
min_freq=20e3,
max_freq=125e3,
spect_min_val=0.8,
spect_max_val=6.0,
thresh_lowest=0.3,
thresh_min=0.3,
thresh_max=0.35,
min_dur=0.015,
max_dur=1.0,
min_isi_dur=0.004,
use_softmax_amp=False,
temperature=0.01,
smoothing_timescale=0.00025,
)
# from https://github.com/ralphpeterson/gerbil-vocal-dialects/blob/main/figure1-audio-segmenting-example.ipynb
PETERSONETAL2023 = AvaParams(
nperseg=512,
noverlap=256,
min_freq=500.0,
max_freq=62.5e3,
spect_min_val=-8.0,
spect_max_val=-7.25,
thresh_lowest=2,
thresh_min=5,
thresh_max=2,
min_dur=0.03,
max_dur=0.3,
use_softmax_amp=False,
temperature=0.01,
smoothing_timescale=0.007,
)
[docs]
def ava(
sound: Sound,
nperseg: int = 1024,
noverlap: int = 512,
min_freq: int = 30e3,
max_freq: int = 110e3,
spect_min_val: float | None = None,
spect_max_val: float | None = None,
thresh_lowest: float = 0.1,
thresh_min: float = 0.2,
thresh_max: float = 0.3,
min_dur: float = 0.03,
max_dur: float = 0.2,
min_isi_dur: float | None = None,
use_softmax_amp: bool = True,
temperature: float = 0.5,
smoothing_timescale: float = 0.007,
scale: bool = True,
scale_val: int | float = 2**15,
scale_dtype: npt.DTypeLike = np.int16,
) -> Segments:
"""Find segments in audio, using algorithm
from ``ava`` package.
Segments audio by generating a spectrogram from it,
summing power across frequencies, and then thresholding
this summed spectral power as if it were an amplitude trace.
The spectral power is segmented with three thresholds,
``thresh_lowest``, ``thresh_min``, and ``thresh_max``, where
``thresh_lowest <= thresh_min <= thresh_max``.
The segmenting algorithm works as follows:
first detect all local maxima that exceed ``thresh_max``.
Then for each local maximum, find onsets and offsets.
An offset is detected wherever a local maxima
is followed by a subsequent local minimum
in the summed spectral power less than ``thresh_min``,
or when the power is less than ``thresh_lowest``.
Onsets are located in the same way,
by looking for a preceding local minimum
less than ``thresh_min``, or any value less than ``thresh_lowest``.
Parameters
----------
sound : vocalpy.Sound
Sound loaded from an audio file.
nperseg : int
Number of samples per segment for Short-Time Fourier Transform.
Default is 1024.
noverlap : int
Number of samples to overlap per segment
for Short-Time Fourier Transform.
Default is 512.
min_freq : int
Minimum frequency. Spectrogram is "cropped"
below this frequency (instead of, e.g.,
bandpass filtering). Default is 30e3.
max_freq : int
Maximum frequency. Spectrogram is "cropped"
above this frequency (instead of, e.g.,
bandpass filtering). Default is 110e3.
spect_min_val : float, optional
Expected minimum value of spectrogram
after transforming to the log of the
magnitude. Used for a min-max scaling:
:math:`(s - s_{min} / (s_{max} - s_{min})`
where ``spect_min_val`` is :math:`s_{min}`.
Default is None, in which case
the minimum value of the spectrogram is used.
spect_max_val : float, optional
Expected maximum value of spectrogram
after transforming to the log of the
magnitude. Used for a min-max scaling:
:math:`(s - s_{min} / (s_{max} - s_{min})`
where ``spect_min_val`` is :math:`s_{min}`.
Default is None, in which case
the maximum value of the spectrogram is used.
thresh_max : float
Threshold used to find local maxima.
thresh_min : float
Threshold used to find local minima,
in relation to local maxima.
Used to find onsets and offsets of segments.
thresh_lowest : float
Lowest threshold used to find onsets and offsets
of segments.
min_dur : float
Minimum duration of a segment, in seconds.
max_dur : float
Maximum duration of a segment, in seconds.
min_isi_dur : float, optional
Minimum duration of inter-segment intervals, in seconds.
If specified, any inter-segment intervals shorter than this value
will be removed, and the adjacent segments merged.
Default is None.
use_softmax_amp : bool
If True, compute summed spectral power from spectrogram
with a softmax operation on each column.
Default is True.
temperature : float
Temperature for softmax. Only used if ``use_softmax_amp`` is True.
smoothing_timescale : float
Timescale to use when smoothing summed spectral power
with a gaussian filter.
The window size will be ``dt - smoothing_timescale / samplerate``,
where ``dt`` is the size of a time bin in the spectrogram.
scale : bool
If True, scale the ``sound.data``.
Default is True.
This is needed to replicate the behavior of ``ava``,
which assumes the audio data is loaded as 16-bit integers.
Since the default for :class:`vocalpy.Sound` is to load sounds
with a numpy dtype of float64, this function defaults to
multiplying the ``sound.data`` by 2**15,
and then casting to the int16 dtype.
This replicates the behavior of the ``ava`` function,
given data with dtype float64.
If you have loaded a sound with a dtype of int16,
then set this to False.
scale_val :
Value to multiply the ``sound.data`` by, to scale the data.
Default is 2**15.
Only used if ``scale`` is ``True``.
This is needed to replicate the behavior of ``ava``,
which assumes the audio data is loaded as 16-bit integers.
scale_dtype : numpy.dtype
Numpy Dtype to cast ``sound.data`` to, after scaling.
Default is ``np.int16``.
Only used if ``scale`` is ``True``.
This is needed to replicate the behavior of ``ava``,
which assumes the audio data is loaded as 16-bit integers.
Returns
-------
segments : vocalpy.Segments
Instance of :class:`vocalpy.Segments` representing
the segments found.
Examples
--------
>>> jourjineetal2023paths = voc.example('jourjine-et-al-2023')
>>> wav_path = jourjineetal2023paths[0]
>>> sound = voc.Sound.read(wav_path)
>>> params = {**voc.segment.ava.JOURJINEETAL2023}
>>> del params['min_isi_dur']
>>> segments = voc.segment.ava(sound, **params)
>>> spect = voc.spectrogram(sound)
>>> rows = 3; cols = 4
>>> import matplotlib.pyplot as plt
>>> fig, ax_arr = plt.subplots(rows, cols)
>>> start_inds, stop_inds = segments.start_inds, segments.stop_inds
>>> ax_to_use = ax_arr.ravel()[:start_inds.shape[0]]
>>> for start_ind, stop_ind, ax in zip(start_inds, stop_inds, ax_to_use):
... data = sound.data[:, start_ind:stop_ind]
... newsound = voc.Sound(data=data, samplerate=sound.samplerate)
... spect = voc.spectrogram(newsound)
... ax.pcolormesh(spect.times, spect.frequencies, np.squeeze(spect.data))
>>> for ax in ax_arr.ravel()[:start_inds.shape[0]]:
... ax.set_axis_off()
>>> for ax in ax_arr.ravel()[start_inds.shape[0]:]:
... ax.remove()
Notes
-----
Code is adapted from [2]_.
Default parameters are taken from example script here:
https://github.com/pearsonlab/autoencoded-vocal-analysis/blob/master/examples/mouse_sylls_mwe.py
Note that example script suggests tuning these parameters using functionality built into it,
that we do not replicate here.
Versions of this algorithm were also used to segment
rodent vocalizations in [4]_ (see code in [5]_)
and [6]_ (see code in [7]_).
References
----------
.. [1] Goffinet, J., Brudner, S., Mooney, R., & Pearson, J. (2021).
Low-dimensional learned feature spaces quantify individual and group differences in vocal repertoires.
eLife, 10:e67855. https://doi.org/10.7554/eLife.67855
.. [2] https://github.com/pearsonlab/autoencoded-vocal-analysis
.. [3] Goffinet, J., Brudner, S., Mooney, R., & Pearson, J. (2021).
Data from: Low-dimensional learned feature spaces quantify individual
and group differences in vocal repertoires. Duke Research Data Repository.
https://doi.org/10.7924/r4gq6zn8w
.. [4] Nicholas Jourjine, Maya L. Woolfolk, Juan I. Sanguinetti-Scheck, John E. Sabatini,
Sade McFadden, Anna K. Lindholm, Hopi E. Hoekstra,
Two pup vocalization types are genetically and functionally separable in deer mice,
Current Biology, 2023 https://doi.org/10.1016/j.cub.2023.02.045
.. [5] https://github.com/nickjourjine/peromyscus-pup-vocal-evolution/blob/main/src/segmentation.py
.. [6] Peterson, Ralph Emilio, Aman Choudhri, Catalin MItelut, Aramis Tanelus, Athena Capo-Battaglia,
Alex H. Williams, David M. Schneider, and Dan H. Sanes.
"Unsupervised discovery of family specific vocal usage in the Mongolian gerbil."
bioRxiv (2023): 2023-03.
.. [7] https://github.com/ralphpeterson/gerbil-vocal-dialects/blob/main/vocalization_segmenting.py
"""
if sound.data.shape[0] > 1:
raise ValueError(
f"The ``sound`` has {sound.data.shape[0]} channels, but segmentation is not implemented "
"for sounds with multiple channels. This is because there can be a different number of segments "
"per channel, which cannot be represented as a rectangular array. To segment each channel, "
"first split the channels into separate ``vocalpy.Sound`` instances, then pass each to this function."
"For example,\n"
">>> sound_channels = [sound_ for sound_ in sound] # split with a list comprehension\n"
">>> channel_segments = [vocalpy.segment.meansquared(sound_) for sound_ in sound_channels]\n"
)
data = np.squeeze(
sound.data, axis=0
) # get rid of channels dim so we operate on scalars in main loop
if scale:
data = (data * scale_val).astype(scale_dtype)
# ---- compute spectrogram
# TODO: return Spectrogram for each Segment when we return Segments
f, t, spect = stft(
data, sound.samplerate, nperseg=nperseg, noverlap=noverlap
)
i1 = np.searchsorted(f, min_freq)
i2 = np.searchsorted(f, max_freq)
f, spect = f[i1:i2], spect[i1:i2]
spect = np.log(np.abs(spect) + EPSILON)
spect -= spect_min_val
spect /= spect_max_val - spect_min_val
spect = np.clip(spect, 0.0, 1.0)
# we determine `dt` here in case we need it for `amps`
# we also use it below to remove segments shorter than the minimum allowed value
dt = t[1] - t[0]
# ---- calculate amplitude and smooth.
if use_softmax_amp:
temp = np.exp(spect / temperature)
temp /= np.sum(temperature, axis=0) + EPSILON
amps = np.sum(np.multiply(spect, temp), axis=0)
else:
amps = np.sum(spect, axis=0)
amps = gaussian_filter(amps, smoothing_timescale / dt)
# Find local maxima greater than thresh_max.
local_maxima = []
for i in range(1, len(amps) - 1, 1):
if amps[i] > thresh_max and amps[i] == np.max(
amps[i - 1 : i + 2] # noqa: E203
):
local_maxima.append(i)
# Then search to the left and right for onsets and offsets.
onsets, offsets = [], []
for local_max in local_maxima:
# skip this local_max if we are in a region we already declared a segment.
# Note the next line in the original implementation had an off-by-one error,
# that is fixed here to avoid occasionally duplicating the first segment. See:
# https://github.com/pearsonlab/autoencoded-vocal-analysis/issues/12
if len(offsets) > 0 and local_max < offsets[-1]:
continue
# first find onset
i = local_max - 1
while (
i > 0
): # could we do ``while i > min(0, i - max_syl_length)`` to speed up?
# this if-else can be a single `if` with an `or`
# and then I think we can remove the `if len(onsets)` blocks
if amps[i] < thresh_lowest:
onsets.append(i)
break
elif amps[i] < thresh_min and amps[i] == np.min(
amps[i - 1 : i + 2] # noqa: E203
):
onsets.append(i)
break
i -= 1
# if we found multiple onsets because of if/else, then only keep one
if len(onsets) != len(offsets) + 1:
onsets = onsets[: len(offsets)]
continue
# then find offset
i = local_max + 1
while i < len(
amps
): # could we do ``while i > min(amps, i + max_syl_length)`` to speed up?
# this if-else can be a single `if` with an `or`
# and then I think we can remove the `if len(onsets)` blocks
if amps[i] < thresh_lowest:
offsets.append(i)
break
elif amps[i] < thresh_min and amps[i] == np.min(
amps[i - 1 : i + 2] # noqa: E203
):
offsets.append(i)
break
i += 1
# if we found multiple offsets because of if/else, then only keep one
# shouldn't this change offsets though?
if len(onsets) != len(offsets):
onsets = onsets[: len(offsets)]
continue
# Throw away segments that are too long or too short.
min_dur_samples = int(np.floor(min_dur / dt))
max_dur_samples = int(np.ceil(max_dur / dt))
new_onsets, new_offsets = [], []
for i in range(len(offsets)):
t1, t2 = onsets[i], offsets[i]
if t2 - t1 + 1 <= max_dur_samples and t2 - t1 + 1 >= min_dur_samples:
new_onsets.append(t1 * dt)
new_offsets.append(t2 * dt)
onsets = np.array(new_onsets)
offsets = np.array(new_offsets)
if onsets.size == 0 and offsets.size == 0:
# can't throw any intervals away (next code block)
# if there's not any intervals, so, return empty Segments
return Segments(
np.array([]).astype(int),
np.array([]).astype(int),
sound.samplerate,
)
# Throw away inter-segment intervals that are too short, as is done in Jourjine et al., 2023
# Note we do this **after** throwing away segments that are too long or too short.
# We do this to replicate what was done in Jourjine et al., 2023, where they call `ava.get_onsets_offsets`
# and then remove inter-syllable intervals less than a specified duration.
# This means there is the possibility of throwing away some short segments that we might have merged,
# if we'd removed inter-segment intervals *first*, which is what `vocalpy.segment.meansquared` does.
if min_isi_dur is not None:
isi_durs = onsets[1:] - offsets[:-1]
keep_these = isi_durs > min_isi_dur
# we don't keep seconds anymore since we're returning samples
onsets = np.concatenate(
(onsets[0, np.newaxis], onsets[1:][keep_these])
)
offsets = np.concatenate(
(offsets[:-1][keep_these], offsets[-1, np.newaxis])
)
onsets_sample = (onsets * sound.samplerate).astype(int)
offsets_sample = (offsets * sound.samplerate).astype(int)
lengths = offsets_sample - onsets_sample
# Handle edge case where we decide last time bin is offset,
# and the time of the last time bin in `t` (as computed from `dt`) is greater than duration of sound.
# Fixes https://github.com/vocalpy/vocalpy/issues/167
if onsets_sample[-1] + lengths[-1] > sound.samples:
# set length to be "until the end of the sound"
lengths[-1] = sound.samples - onsets_sample[-1]
return Segments(
start_inds=onsets_sample, lengths=lengths, samplerate=sound.samplerate
)