"""Feature extraction for Sound Analysis Toolbox (SAT).
Code adapted from [1]_, [2]_, and [3]_.
.. [1] `Sound Analysis Tools <http://soundanalysispro.com/matlab-sat>`_ for Matlab (SAT) by Ofer Tchernichovski
.. [2] `birdsonganalysis <https://github.com/PaulEcoffet/birdsonganalysis>`_ by Paul Ecoffet
.. [3] `avn <https://github.com/theresekoch/avn/blob/main/avn/acoustics.py>`_
by Therese Koch, specifically the acoustics module
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import librosa
import numpy as np
import numpy.typing as npt
import xarray as xr
if TYPE_CHECKING:
from .. import Features, Sound, Spectrogram
from .. import spectral
# get small number to avoid potential divide by zero errors
EPS = np.finfo(np.double).eps
def goodness_of_pitch(
cepstrogram: npt.NDArray, quefrencies: npt.NDArray, max_F0: float = 1830.0
) -> npt.NDArray:
"""Calculate goodness of pitch
Finds the max in each column of ``cepstrogram``
between ``quefrencies`` greater than ``1 / max_F0``
and less than ``int(np.floor(len(cepstrogram) / 2))``.
Parameters
----------
cepstrogram : numpy.ndarray
Cepstrogram returned by :func:`vocalpy.spectral.sat`,
matrix with dimensions (channels, quefrencies, time bins).
quefrencies : numpy.ndarray
The quefrencies for the cepstrogram.
max_F0 : float
Maximum frequency to consider,
that becomes the lowest ``quefrency``
used when computing goodness of pitch.
Returns
-------
values : numpy.ndarray
The goodness of pitch values for each column in ``cepstrogram``.
Notes
-----
Goodness of pitch is an estimate of harmonic periodicity of a signal.
Higher values indicate a more periodic sound (like a harmonic stack), whereas
lower values indicate less periodic sounds (like noise). Formally, it is the
peak of the cepstrum of the signal for fundamental frequencies below `max_F0`.
Code adapted from [1]_, [2]_, and [3]_.
Docs adapted from [1]_ and [3]_.
References
----------
.. [1] `Sound Analysis Tools <http://soundanalysispro.com/matlab-sat>`_ for Matlab (SAT) by Ofer Tchernichovski
.. [2] `birdsonganalysis <https://github.com/PaulEcoffet/birdsonganalysis>`_ by Paul Ecoffet
.. [3] `avn <https://github.com/theresekoch/avn/blob/main/avn/acoustics.py>`_
by Therese Koch, specifically the acoustics module
"""
if max_F0 <= 0:
raise ValueError(
f"`max_F0` must be greater than zero but was: {max_F0}"
)
quefrency_cutoff = 1 / max_F0
if quefrency_cutoff > quefrencies.max():
raise ValueError(
f"`max_F0` of {max_F0} gives a quefrency cut-off of {quefrency_cutoff}, "
f"but this is greater than the max value in `quefrencies`: {quefrencies.max()}"
)
min_quef_idx = np.min(np.argwhere(quefrencies > quefrency_cutoff)) - 1
max_quef_idx = int(np.floor(cepstrogram.shape[1] / 2))
return np.max(cepstrogram[:, min_quef_idx:max_quef_idx, :], axis=1)
def mean_frequency(
power_spectrogram: Spectrogram,
min_freq: float = 380.0,
max_freq: float = 11025.0,
) -> npt.NDArray:
"""Calculate mean frequency.
Finds the mean for each column in ``power_spectrogram``,
between the frequencies specified by ``min_freq`` and ``max_freq``.
To find the mean, the frequencies are weighted by their power
in ``power_spectrogram`` and then divided by the sum of that power.
Parameters
----------
power_spectrogram : vocalpy.Spectrogram
Spectrogram, returned by :func:`vocalpy.spectral.sat`.
min_freq : float
The minimum frequency to consider in ``power_spectrogram``.
max_freq : float
The maximum frequency to consider in ``power_spectrogram``.
Returned by :func`:vocalpy.spectral.sat`, computing using the
``freq_range`` argument to that function.
Returns
-------
values : numpy.ndarray
The mean frequency of each column in ``power_spectrogram``.
Notes
-----
This is one way to estimate the pitch of a signal.
It is the center of the distribution of power across frequencies in the signal.
For another estimate of pitch, see :func:`vocalpy.features.sat.pitch`.
Code adapted from [1]_, [2]_, and [3]_.
Docs adapted from [1]_ and [3]_.
References
----------
.. [1] `Sound Analysis Tools <http://soundanalysispro.com/matlab-sat>`_ for Matlab (SAT) by Ofer Tchernichovski
.. [2] `birdsonganalysis <https://github.com/PaulEcoffet/birdsonganalysis>`_ by Paul Ecoffet
.. [3] `avn <https://github.com/theresekoch/avn/blob/main/avn/acoustics.py>`_
by Therese Koch, specifically the acoustics module
See Also
--------
pitch
"""
freq_inds = (power_spectrogram.frequencies > min_freq) & (
power_spectrogram.frequencies < max_freq
)
P = power_spectrogram.data[:, freq_inds, :]
P[P == 0.0] += np.finfo(P.dtype).eps
frequencies = power_spectrogram.frequencies[freq_inds]
return np.sum(P * frequencies[:, np.newaxis], axis=1) / np.sum(P, axis=1)
def frequency_modulation(dSdt: npt.NDArray, dSdf: npt.NDArray) -> npt.NDArray:
"""Calculate frequency modulation.
Parameters
----------
dSdt : numpy.ndarray
Derivative of spectrogram with respect to time, returned by
:func:`vocalpy.spectral.sat`.
dSdf : numpy.ndarray
Derivative of spectrogram with respect to frequency, returned by
:func:`vocalpy.spectral.sat`.
Returns
-------
values : numpy.ndarray
The frequency modulation for each column in ``dSdt`` and ``dSdf``.
Notes
-----
Frequency modulation is a measure of the variance of the frequency composition over time
of a signal [4]_.
Code adapted from [1]_, [2]_, and [3]_.
Docs adapted from [1]_ and [3]_.
References
----------
.. [1] `Sound Analysis Tools <http://soundanalysispro.com/matlab-sat>`_ for Matlab (SAT) by Ofer Tchernichovski
.. [2] `birdsonganalysis <https://github.com/PaulEcoffet/birdsonganalysis>`_ by Paul Ecoffet
.. [3] `avn <https://github.com/theresekoch/avn/blob/main/avn/acoustics.py>`_
by Therese Koch, specifically the acoustics module
.. [4] Bradbury, Jack W., and Sandra Lee Vehrencamp. Principles of animal communication. Vol. 132.
Sunderland, MA: Sinauer Associates, 1998.
"""
return np.arctan(
np.max(dSdt, axis=1)
/ (np.max(dSdf, axis=1) + np.finfo(dSdt.dtype).eps)
)
def amplitude_modulation(dSdt: npt.NDArray) -> npt.NDArray:
"""Calculates the amplitude modulation.
Parameters
----------
dSdt : numpy.ndarray
Derivative of spectrogram with respect to time, returned by
:func:`vocalpy.spectral.sat`.
Returns
-------
values : numpy.ndarray
The amplitude modulation for each column in ``dSdt``.
Notes
-----
Amplitude modulation is a measure of the variance in amplitude
over time of a signal [4]_.
Code adapted from [1]_, [2]_, and [3]_.
Docs adapted from [1]_ and [3]_.
References
----------
.. [1] `Sound Analysis Tools <http://soundanalysispro.com/matlab-sat>`_ for Matlab (SAT) by Ofer Tchernichovski
.. [2] `birdsonganalysis <https://github.com/PaulEcoffet/birdsonganalysis>`_ by Paul Ecoffet
.. [3] `avn <https://github.com/theresekoch/avn/blob/main/avn/acoustics.py>`_
by Therese Koch, specifically the acoustics module
.. [4] Bradbury, Jack W., and Sandra Lee Vehrencamp. Principles of animal communication. Vol. 132.
Sunderland, MA: Sinauer Associates, 1998.
"""
return np.sum(dSdt, axis=1)
def entropy(
power_spectrogram: Spectrogram,
min_freq: float = 380.0,
max_freq: float = 11025.0,
) -> npt.NDArray:
"""Calculate Wiener entropy
Computes the Wiener entropy for each column in ``power_spectrogram``,
between the frequencies specified by ``min_freq`` and ``max_freq``.
Returns:
np.array: array containing the of each frame in the song interval.
Parameters
----------
power_spectrogram : vocalpy.Spectrogram
Spectrogram, returned by
:func:`vocalpy.spectral.sat`.
min_freq : float
The minimum frequency to consider in ``power_spectrogram``.
max_freq : float
The maximum frequency to consider in ``power_spectrogram``.
Returned by :func`:vocalpy.spectral.sat`, computing using the
``freq_range`` argument to that function.
Returns
-------
values : numpy.ndarray
The log-scaled Weiner entropy for every column in ``power_spectrogram``.
Notes
------
Wiener entropy is a measure of the uniformity of power spread across frequency bands in a frame of audio.
The output of this function is log-scaled Wiener entropy, which can range in value from 0 to negative
infinity. A score close to 0 indicates broadly spread power across frequency bands, i.e. a less structured
sound like white noise. A large negative score indicates low uniformity across frequency bands, i.e. a more
structured sound like a harmonic stack or pure tone.
Code adapted from [1]_, [2]_, and [3]_.
Docs adapted from [1]_ and [3]_.
References
----------
.. [1] `Sound Analysis Tools <http://soundanalysispro.com/matlab-sat>`_ for Matlab (SAT) by Ofer Tchernichovski
.. [2] `birdsonganalysis <https://github.com/PaulEcoffet/birdsonganalysis>`_ by Paul Ecoffet
.. [3] `avn <https://github.com/theresekoch/avn/blob/main/avn/acoustics.py>`_
by Therese Koch, specifically the acoustics module
"""
freq_inds = (power_spectrogram.frequencies > min_freq) & (
power_spectrogram.frequencies < max_freq
)
P = power_spectrogram.data[:, freq_inds, :]
P[P == 0.0] += np.finfo(P.dtype).eps
# calculate entropy for current frame
sum_log = np.sum(np.log(P), axis=1)
log_sum = np.log(np.sum(P, axis=1) / (P.shape[1] - 1))
return sum_log / (P.shape[1] - 1) - log_sum
def amplitude(
power_spectrogram: Spectrogram,
min_freq: float = 380.0,
max_freq: float = 11025.0,
baseline: float = 70.0,
) -> npt.NDArray:
"""Calculate amplitude.
Sums each column in ``power_spectrogram.data``
for frequencies between ``min_freq`` and ``max_freq``.
This gives the energy, that is then converted
to decibels with ``10 * np.log10``. The ``baseline``
is added to these values.
Parameters
----------
power_spectrogram : vocalpy.Spectrogram
Spectrogram, returned by
:func:`vocalpy.spectral.sat`.
min_freq : float
The minimum frequency to consider in ``power_spectrogram``.
max_freq : float
The maximum frequency to consider in ``power_spectrogram``.
Returned by :func`:vocalpy.spectral.sat`, computing using the
``freq_range`` argument to that function.
baseline : float
The baseline value added, in decibels.
The default is 70.0 dB, the value used by SAT
and SAP.
Returns
-------
values : numpy.ndarray
Notes
-----
Code adapted from [1]_, [2]_, and [3]_.
Docs adapted from [1]_ and [3]_.
References
----------
.. [1] `Sound Analysis Tools <http://soundanalysispro.com/matlab-sat>`_ for Matlab (SAT) by Ofer Tchernichovski
.. [2] `birdsonganalysis <https://github.com/PaulEcoffet/birdsonganalysis>`_ by Paul Ecoffet
.. [3] `avn <https://github.com/theresekoch/avn/blob/main/avn/acoustics.py>`_
by Therese Koch, specifically the acoustics module
"""
freq_inds = (power_spectrogram.frequencies > min_freq) & (
power_spectrogram.frequencies < max_freq
)
P = power_spectrogram.data[:, freq_inds, :]
P[P == 0.0] += np.finfo(P.dtype).eps
return 10 * np.log10(np.sum(P, axis=1)) + baseline
def pitch(
sound: Sound,
fmin: float = 380.0,
fmax_yin: float = 8000.0,
frame_length: int = 400,
hop_length: int = 40,
trough_threshold: float = 0.1,
):
"""Estimates the fundamental frequency (or pitch) using the YIN algorithm [1]_.
The pitch is computed using :func:`librosa.yin`.
Parameters
----------
sound : vocalpy.Sound
A :class:`vocalpy.Sound` instance. Multi-channel is supported.
fmin : float
Minimum frequency in Hertz.
The recommended minimum is ``librosa.note_to_hz('C2')`` (~65 Hz)
though lower values may be feasible.
fmax_yin : float
Maximum frequency in Hertz.
Default is 8000.
frame_length : int
length of the frames in samples.
Default is 400.
hop_length : int
number of audio samples between adjacent YIN predictions.
If ``None``, defaults to ``frame_length // 4``.
Default is 40.
trough_threshold: float
Absolute threshold for peak estimation.
A float greater than 0.
Returns
-------
pitch: np.array
Time series of fundamental frequency in Hertz.
Notes
-----
For more information on the YIN algorithm for fundamental frequency estimation,
please refer to the documentation for :func:`librosa.yin`.
Code adapted from [2]_, [3]_, and [4]_.
Docs adapted from [2]_ and [4]_.
References
----------
.. [1] De Cheveigné, Alain, and Hideki Kawahara.
“YIN, a fundamental frequency estimator for speech and music.”
The Journal of the Acoustical Society of America 111.4 (2002): 1917-1930.
.. [2] `Sound Analysis Tools <http://soundanalysispro.com/matlab-sat>`_ for Matlab (SAT) by Ofer Tchernichovski
.. [3] `birdsonganalysis <https://github.com/PaulEcoffet/birdsonganalysis>`_ by Paul Ecoffet
.. [4] `avn <https://github.com/theresekoch/avn/blob/main/avn/acoustics.py>`_
by Therese Koch, specifically the acoustics module
"""
return librosa.yin(
sound.data,
fmin=fmin,
fmax=fmax_yin,
sr=sound.samplerate,
frame_length=frame_length,
hop_length=hop_length,
trough_threshold=trough_threshold,
)
def _get_cepstral(
spectra1: npt.NDArray, n_fft: int, samplerate: int
) -> tuple[npt.NDArray, npt.NDArray]:
"""Get cepstrogram and quefrencies from a spectrogram
Helper function used by :func:`similarity_features` to compute
:func:`goodness_of_pitch` feature.
Parameters
----------
spectra1
n_fft : int
sound : Sound
Returns
-------
cepstrogram : numpy.ndarray
quefrencies : numpy.ndarray
"""
# ---- make "cepstrogram" and quefrencies
spectra1_for_cepstrum = np.copy(spectra1)
# next line is a fancy way of adding eps to zero values
# so we don't get the enigmatic divide-by-zero error, and we don't get np.inf values
# see https://github.com/numpy/numpy/issues/21560
spectra1_for_cepstrum[spectra1_for_cepstrum == 0.0] += np.finfo(
spectra1_for_cepstrum.dtype
).eps
cepstrogram = np.fft.ifft(
np.log(np.abs(spectra1_for_cepstrum)), n=n_fft, axis=1
).real
quefrencies = np.array(np.arange(n_fft)) / samplerate
return cepstrogram, quefrencies
def _get_spectral_derivatives(
spectra1: npt.NDArray, spectra2: npt.NDArray, max_freq_idx: int
) -> tuple[npt.NDArray, npt.NDArray]:
"""Get derivatives of spectrogram with respect to time and frequency.
Helper function used by :func:`similarity_features` to compute
:func:`amplitude_modulation` and :func:`frequency_modulation` feature
Parameters
----------
spectra1
spectra2
max_freq_idx : int
Returns
-------
dSdt : numpy.ndarray
dSdf : numpy.ndarray
"""
spectra1 = spectra1[:, :max_freq_idx, :]
spectra2 = spectra2[:, :max_freq_idx, :]
# time derivative of spectrum
dSdt = (-spectra1.real * spectra2.real) - (spectra1.imag * spectra2.imag)
# frequency derivative of spectrum
dSdf = (spectra1.imag * spectra2.real) - (spectra1.real * spectra2.imag)
return dSdt, dSdf
[docs]
def sat(
sound: Sound,
n_fft=400,
hop_length=40,
freq_range=0.5,
min_freq: float = 380.0,
amp_baseline: float = 70.0,
max_F0: float = 1830.0,
fmax_yin: float = 8000.0,
trough_threshold: float = 0.1,
) -> Features:
"""Extract all features used to compute similarity with
the Sound Analysis Toolbox for Matlab (SAT).
Parameters
----------
sound : Sound
A :class:`Sound`.
Multi-channel sounds are supported.
n_fft : int
FFT window size.
hop_length : int
Number of audio samples between adjacent STFT columns.
freq_range : float
Range of frequencies to use, given as a value
between zero and one.
Default is 0.5, which means
"Use the first half of the frequencies,
from zero to :math:`f_s/4`
(half the Nyquist frequency)".
min_freq : float
Minimum frequency to consider when extracting features.
amp_baseline : float
The baseline value added, in decibels, to the amplitude feature.
The default is 70.0 dB, the value used by SAT and SAP.
max_F0 : float
Maximum frequency to consider,
that becomes the lowest ``quefrency``
used when computing goodness of pitch.
fmax_yin : float
Maximum frequency in Hertz when computing pitch with YIN algorithm.
Default is 8000.
trough_threshold: float
Absolute threshold for peak estimation.
A float greater than 0.
Used by :func:`pitch`.
Returns
-------
features : vocalpy.Features
:class:`vocalpy.Features` instance with
:attr:`~vocalpy.Features.data` attribute that is
an :class:`xarray.Dataset`,
where the data variables are the features,
and the coordinate is the time for each time bin.
"""
if not 0.0 < freq_range <= 1.0:
raise ValueError(
f"`freq_range` must be a float greater than zero and less than or equal to 1.0, but was: {freq_range}. "
f"Please specify a value between zero and one inclusive specifying the percentage of the frequencies "
f"to use when extracting features with a frequency range"
)
power_spectrogram, spectra1, spectra2 = spectral.sat._sat_multitaper(
sound, n_fft, hop_length
)
# in SAT, freq_range means "use first `freq_range` percent of frequencies". Next line finds that range.
f = power_spectrogram.frequencies
max_freq_idx = int(np.floor(f.shape[0] * freq_range))
max_freq = f[max_freq_idx]
# ---- now extract features
# -------- features that require sound
pitch_ = pitch(
sound,
min_freq,
fmax_yin,
frame_length=n_fft,
hop_length=hop_length,
trough_threshold=trough_threshold,
)
# -------- features that require power spectrogram and max_freq
amp_ = amplitude(power_spectrogram, min_freq, max_freq, amp_baseline)
entropy_ = entropy(power_spectrogram, min_freq, max_freq)
# -------- features that require cepstrogram
cepstrogram, quefrencies = _get_cepstral(spectra1, n_fft, sound.samplerate)
goodness_ = goodness_of_pitch(cepstrogram, quefrencies, max_F0)
# -------- features that spectral derivatives
dSdt, dSdf = _get_spectral_derivatives(spectra1, spectra2, max_freq_idx)
FM = frequency_modulation(dSdt, dSdf)
AM = amplitude_modulation(dSdt)
channels = np.arange(sound.data.shape[0])
data = xr.Dataset(
{
"amplitude": (["channel", "time"], amp_),
"pitch": (["channel", "time"], pitch_),
"goodness_of_pitch": (["channel", "time"], goodness_),
"frequency_modulation": (["channel", "time"], FM),
"amplitude_modulation": (["channel", "time"], AM),
"entropy": (["channel", "time"], entropy_),
},
coords={"channel": channels, "time": power_spectrogram.times},
)
from .. import Features # avoid circular import
features = Features(data=data)
return features