Source code for vocalpy.feature._biosound.features

"""Functions to compute pre-defined acoustic features, as described in [1]_.
Code is adapted from the ``soundsig`` library [2]_, under MIT license.

.. [1] Elie JE and Theunissen FE. "The vocal repertoire of the domesticated zebra finch:
   a data driven approach to decipher the information-bearing acoustic features of communication signals."
   Animal Cognition. 2016. 19(2) 285-315 DOI 10.1007/s10071-015-0933-6
.. [2] https://github.com/theunissenlab/soundsig
"""

from __future__ import annotations

from collections import defaultdict
from typing import TYPE_CHECKING, Literal, Sequence

import matplotlib.mlab
import numpy as np
import numpy.typing as npt
import xarray as xr

from .constants import DEFAULT_DT
from .fundamental import estimate_f0
from .sound import temporal_envelope

if TYPE_CHECKING:
    from vocalpy import Features, Sound


def temporal_envelope_features(
    data: npt.NDArray,
    samplerate: int,
    cutoff_freq: int = 20,
    amp_sample_rate: int = 1000,
) -> dict:
    """Extract pre-defined acoustic features from temporal envelope of a sound,
    as described in [1]_.

    This is a helper function called by
    :func:`vocalpy.feature.biosound`.
    It replicates the result of calling the ``soundsig.BioSound``
    method ``ampenv``.

    Parameters
    ----------
    data : numpy.ndarray
        The data from a single channel of a :class:`Sound`
    samplerate : int
        The sampling rate of the :class:`Sound`.
    cutoff_freq : int
        The cutoff frequency, in Hz.
        Default is 20, as in [1]_.
    amp_sample_rate: int
        The resampling rate to use when computing the amplitude
        envelope.

    Returns
    -------
    features : dict

    Notes
    -----
    Code is adapted from the ``soundsig`` library [2]_, under MIT license.

    References
    ----------
    .. [1] Elie JE and Theunissen FE. "The vocal repertoire of the domesticated zebra finch:
       a data driven approach to decipher the information-bearing acoustic features of communication signals."
       Animal Cognition. 2016. 19(2) 285-315 DOI 10.1007/s10071-015-0933-6
    .. [2] https://github.com/theunissenlab/soundsig
    """
    amp, tdata = temporal_envelope(
        data,
        samplerate,
        cutoff_freq=cutoff_freq,
        resample_rate=amp_sample_rate,
    )

    ampdata = amp / np.sum(amp)
    meantime = np.sum(tdata * ampdata)
    stdtime = np.sqrt(np.sum(ampdata * ((tdata - meantime) ** 2)))
    skewtime = np.sum(ampdata * (tdata - meantime) ** 3)
    skewtime = skewtime / (stdtime**3)
    kurtosistime = np.sum(ampdata * (tdata - meantime) ** 4)
    kurtosistime = kurtosistime / (stdtime**4)
    indpos = np.where(ampdata > 0)[0]
    entropytime = -np.sum(
        ampdata[indpos] * np.log2(ampdata[indpos])
    ) / np.log2(np.size(indpos))

    return {
        "mean_t": meantime,
        "std_t": stdtime,
        "skew_t": skewtime,
        "kurtosis_t": kurtosistime,
        "entropy_t": entropytime,
        "t_amp": tdata,
        "amp": amp,
        "max_amp": max(amp),
    }


def spectral_envelope_features(
    data: npt.NDArray,
    samplerate: int,
    f_high: int = 10000,
    NFFT=1024,
    noverlap=512,
) -> dict:
    """Extract pre-defined acoustic features from spectral envelope of a sound,
    as described in [1]_.

    This is a helper function called by
    :func:`vocalpy.feature.biosound`.
    It replicates the result of calling the ``soundsig.BioSound``
    method ``spectrum``.

    Parameters
    ----------
    data : numpy.ndarray
        The data from a single channel of a :class:`Sound`
    samplerate : int
        The sampling rate of the :class:`Sound`.
    f_high : int
        Highest frequency to use from spectral envelope.
        Default is 10000, as in [1]_.
    NFFT : int
        Length of FFT window, in number of samples.
        Used to compute power spectrum,
        by calling :func:`matplotlb.mlab.psd`.
        Default is 1024.
    noverlap : int
        Overlap of FFT windows, in number of samples.
        Used to compute power spectrum,
        by calling :func:`matplotlb.mlab.psd`.
        Default is 512.

    Returns
    -------
    features : dict

    Notes
    -----
    Code is adapted from the ``soundsig`` library [2]_, under MIT license.

    References
    ----------
    .. [1] Elie JE and Theunissen FE. "The vocal repertoire of the domesticated zebra finch:
       a data driven approach to decipher the information-bearing acoustic features of communication signals."
       Animal Cognition. 2016. 19(2) 285-315 DOI 10.1007/s10071-015-0933-6
    .. [2] https://github.com/theunissenlab/soundsig
    """
    # Calculates power spectrum and features from power spectrum

    # Need to add argument for window size
    # f_high is the upper bound of the frequency for saving power spectrum
    # nwindow = (1000.0*np.size(soundIn)/samprate)/window_len
    #
    Pxx, Freqs = matplotlib.mlab.psd(
        data, Fs=samplerate, NFFT=NFFT, noverlap=noverlap
    )

    # Find quartile power
    cum_power = np.cumsum(Pxx)
    tot_power = np.sum(Pxx)
    quartile_freq = np.zeros(3, dtype="int")
    quartile_values = [0.25, 0.5, 0.75]
    nfreqs = np.size(cum_power)
    iq = 0
    for ifreq in range(nfreqs):
        if cum_power[ifreq] > quartile_values[iq] * tot_power:
            quartile_freq[iq] = ifreq
            iq = iq + 1
            if iq > 2:
                break

    # Find skewness, kurtosis and entropy for power spectrum below f_high
    ind_fmax = np.where(Freqs > f_high)[0][0]

    # Description of spectral shape
    spectdata = Pxx[0:ind_fmax]
    freqdata = Freqs[0:ind_fmax]
    spectdata = spectdata / np.sum(spectdata)
    meanspect = np.sum(freqdata * spectdata)
    stdspect = np.sqrt(np.sum(spectdata * ((freqdata - meanspect) ** 2)))
    skewspect = np.sum(spectdata * (freqdata - meanspect) ** 3)
    skewspect = skewspect / (stdspect**3)
    kurtosisspect = np.sum(spectdata * (freqdata - meanspect) ** 4)
    kurtosisspect = kurtosisspect / (stdspect**4)
    entropyspect = -np.sum(spectdata * np.log2(spectdata)) / np.log2(ind_fmax)

    return {
        "mean_s": meanspect,
        "std_s": stdspect,
        "skew_s": skewspect,
        "kurtosis_s": kurtosisspect,
        "entropy_s": entropyspect,
        "q1": Freqs[quartile_freq[0]],
        "q2": Freqs[quartile_freq[1]],
        "q3": Freqs[quartile_freq[2]],
        "fpsd": freqdata,
        "psd": spectdata,
    }


def fundamental_features(
    data,
    samplerate,
    dt=DEFAULT_DT,
    max_fund: int = 1500,
    min_fund: int = 300,
    low_fc: int = 200,
    high_fc: int = 6000,
    min_saliency: float = 0.5,
    min_formant_freq: int = 500,
    max_formant_bw: int = 500,
    window_formant: float = 0.1,
    method: str = "Stack",
) -> dict:
    """Extract features from the time-varying fundamental frequency,
    as described in [1]_.

    This is a helper function called by
    :func:`vocalpy.feature.biosound`.
    It replicates the result of calling the ``soundsig.BioSound``
    method ``fundest``.

    Parameters
    ----------
    data : numpy.ndarray
        The data from a single channel of a :class:`Sound`
    samplerate : int
        The sampling rate of the :class:`Sound`.
    dt : float
        Size of time step in seconds.
        Default is DEFAULT_DT, the time step
        for the spectrogram computed by
        `soundsig.BioSound.spectroCalc`.
    max_fund : int
        Maximum fundamental frequency to analyze, in Hz.
        Default is 1500, as in [1]_.
    min_fund : int
        Minimum fundamental frequency to analyze, in Hz.
        Default is 300, as in [1]_.
    low_fc : int
        Low frequency cut-off for band-passing the signal
        prior to auto-correlation.
        Default is 200, as in [1]_.
    high_fc : int
        High frequency cut-off for band-passing.
        Default is 6000, as in [1]_.
    min_saliency : int
        Threshold in the auto-correlation for minimum saliency.
        Returns NaN for pitch values if saliency is below this number.
        Default is 0.5, as in [1]_.
    min_formant_freq : int
        Minimum value of first formant, in Hz.
        Default is 500, as in [1]_.
    max_formant_bw : int
        Maximum value of formant bandwidth, in Hz..
        Default is 500, as in [1]_.
    window_formant : float
        Time window for formant calculation, in seconds.
        Includes 5 standard deviations of normal window.
        Default is 0.1, as in [1]_.
    method : str
        One of {``'AC'``, ``'ACA'``, ``'Cep'``, or ``'Stack'``}.
        Default is ``'Stack'``, as in [1]_.
        See Notes for detail.

    Returns
    -------
    fund_features : dict
        With the following key-value pairs:
            f0 : np.ndarray
                The time-varying fundamental in Hz,
                at the same resolution as the spectrogram.
            f0_2 : np.ndarray
                A second peak in the spectrum;
                not a multiple of the fundamental, a sign of a second voice.
            F1 : np.ndarray
                The time-varying first formant, if it exists.
            F2 : np.ndarray
                The time-varying second formant, if it exists.
            F3 : np.ndarray
                The time-varying third formant, if it exists.
            fund : np.ndarray
                The average fundamental
            sal : np.ndarray
                The time-varying saliency.
            meansal : np.ndarray
                The average saliency.
            fund2 : np.ndarray
                The average fundamental of the 2nd peak ``f0_2``.
            voice2percent : np.ndarray
                The average percent of presence of a second peak.

    Notes
    -----
    This function implements four methods for estimating the
    time-varying fundamental frequency, specified by the ``method`` argument:
    * ``'AC'``: Peak of the auto-correlation function
    * ``'ACA'``: Peak of the envelope of the auto-correlation function
    * ``'Cep'``: First peak in cepstrum
    * ``'Stack'``: Fitting of harmonic stacks. This is the default, works well for zebra finches.

    Code is adapted from the ``soundsig`` library [2]_, under MIT license.

    References
    ----------
    .. [1] Elie JE and Theunissen FE. "The vocal repertoire of the domesticated zebra finch:
       a data driven approach to decipher the information-bearing acoustic features of communication signals."
       Animal Cognition. 2016. 19(2) 285-315 DOI 10.1007/s10071-015-0933-6
    .. [2] https://github.com/theunissenlab/soundsig
    """
    # Calculate the fundamental, the formants and parameters related to these
    sal, fund, fund2, form1, form2, form3, lenfund = estimate_f0(
        data,
        samplerate,
        dt,
        max_fund,
        min_fund,
        low_fc,
        high_fc,
        min_saliency,
        min_formant_freq,
        max_formant_bw,
        window_formant,
        method=method,
    )
    goodFund = fund[~np.isnan(fund)]
    goodSal = sal[~np.isnan(sal)]
    goodFund2 = fund2[~np.isnan(fund2)]
    if np.size(goodFund) > 0:
        meanfund = np.mean(goodFund)
    else:
        meanfund = np.nan
    meansal = np.mean(goodSal)
    if np.size(goodFund2) > 0:
        pk2 = np.mean(goodFund2)
    else:
        pk2 = np.nan

    if np.size(goodFund) == 0 or np.size(goodFund2) == 0:
        second_v = 0.0
    else:
        second_v = (
            float(np.size(goodFund2)) / float(np.size(goodFund))
        ) * 100.0

    fund_features = {}
    for name, value in zip(
        (
            "f0",
            "f0_2",
            "F1",
            "F2",
            "F3",
            "mean_f0",
            "sal",
            "mean_sal",
            "pk2",
            "second_v",
        ),
        (
            fund,
            fund2,
            form1,
            form2,
            form3,
            meanfund,
            sal,
            meansal,
            pk2,
            second_v,
        ),
    ):
        fund_features[name] = value
    if np.size(goodFund) > 0:
        for name, value in zip(
            ("max_fund", "min_fund", "cv_fund"),
            (np.max(goodFund), np.min(goodFund), np.std(goodFund) / meanfund),
        ):
            fund_features[name] = value

    return fund_features


SoundsigFeatureGroups = Literal["temporal", "spectral", "fundamental"]


SCALAR_FEATURES = {
    "temporal": [
        "mean_t",
        "std_t",
        "skew_t",
        "kurtosis_t",
        "entropy_t",
        "max_amp",
    ],
    "spectral": [
        "mean_s",
        "std_s",
        "skew_s",
        "kurtosis_s",
        "entropy_s",
        "q1",
        "q2",
        "q3",
    ],
    "fundamental": [
        "mean_f0",
        "mean_sal",
        "second_v",
        "pk2",
        "max_fund",
        "min_fund",
        "cv_fund",
    ],
}



[docs]
def biosound(
    sound: Sound,
    scale: bool = True,
    scale_val: int | float = 2**15,
    scale_dtype: npt.DTypeLike = np.int16,
    ftr_groups: SoundsigFeatureGroups | Sequence[SoundsigFeatureGroups] = (
        "temporal",
        "spectral",
        "fundamental",
    ),
) -> Features:
    """Compute predefined acoustic features (PAFs)
    used to analyze the vocal repertoire of the domesticated zebra finch,
    as described in [1]_.

    Parameters
    ----------
    sound : Sound
        A sound loaded from a file.
    scale : bool
        If True, scale the ``sound.data``.
        Default is True.
        This is needed to replicate the behavior of ``soundsig``,
        which assumes the audio data is loaded as 16-bit integers.
        Since the default for :class:`vocalpy.Sound` is to load sounds
        with a numpy dtype of float64, this function defaults to
        multiplying the ``sound.data`` by 2**15,
        and then casting to the int16 dtype.
        This replicates the behavior of the ``soundsig`` function,
        given data with dtype float64.
        If you have loaded a sound with a dtype of int16,
        then set this to False.
    scale_val :
        Value to multiply the ``sound.data`` by, to scale the data.
        Default is 2**15.
        Only used if ``scale`` is ``True``.
        This is needed to replicate the behavior of ``soundsig``,
        which assumes the audio data is loaded as 16-bit integers.
    scale_dtype : numpy.dtype
        Numpy Dtype to cast ``sound.data`` to, after scaling.
        Default is ``np.int16``.
        Only used if ``scale`` is ``True``.
        This is needed to replicate the behavior of ``soundsig``,
        which assumes the audio data is loaded as 16-bit integers.

    Returns
    -------
    features : vocalpy.Features
        A :class:`vocalpy.Features` instance with
        :attr:`~vocalpy.Features.data` attribute that is
        an :class:`xarray.Dataset`,
        where the data variables are the features,
        and the coordinate is the channel.

    Notes
    -----
    Code is adapted from the ``soundsig`` library [2]_, under MIT license.

    References
    ----------
    .. [1] Elie JE and Theunissen FE. "The vocal repertoire of the domesticated zebra finch:
       a data driven approach to decipher the information-bearing acoustic features of communication signals."
       Animal Cognition. 2016. 19(2) 285-315 DOI 10.1007/s10071-015-0933-6
    .. [2] https://github.com/theunissenlab/soundsig
    """
    if isinstance(ftr_groups, (list, tuple)):
        if not all([isinstance(ftr_group, str) for ftr_group in ftr_groups]):
            bad_types = set(
                [
                    type(ftr_group)
                    for ftr_group in ftr_groups
                    if not isinstance(ftr_groups, str)
                ]
            )
            raise TypeError(
                f"`ftr_groups` must be a list or tuple of strings but some items in sequence were not: {bad_types}"
            )
        if not all(
            [
                ftr_group in ("temporal", "spectral", "fundamental")
                for ftr_group in ftr_groups
            ]
        ):
            raise ValueError(
                'All strings in `ftr_groups` must be one of: "temporal", "spectral", "fundamental", '
                f"but got:\n{ftr_groups}"
            )

    elif isinstance(ftr_groups, str):
        if ftr_groups not in ("temporal", "spectral", "fundamental"):
            raise ValueError(
                'Value for `ftr_groups` must be one of: "temporal", "spectral", "fundamental", '
                f"but got:\n{ftr_groups}"
            )
        ftr_groups = (
            ftr_groups,
        )  # so we can write ``if "string" in ftr_groups``

    if scale:
        from ... import Sound

        sound = Sound(
            data=(sound.data * scale_val).astype(scale_dtype),
            samplerate=sound.samplerate,
        )

    features = defaultdict(list)
    for channel_data in sound.data:
        if "temporal" in ftr_groups:
            t_feat = temporal_envelope_features(channel_data, sound.samplerate)
            for ftr_name in SCALAR_FEATURES["temporal"]:
                features[ftr_name].append(t_feat[ftr_name])
        if "spectral" in ftr_groups:
            s_feat = spectral_envelope_features(channel_data, sound.samplerate)
            for ftr_name in SCALAR_FEATURES["spectral"]:
                features[ftr_name].append(s_feat[ftr_name])
        if "fundamental" in ftr_groups:
            f_feat = fundamental_features(channel_data, sound.samplerate)
            for ftr_name in SCALAR_FEATURES["fundamental"]:
                features[ftr_name].append(f_feat[ftr_name])

    for key, val in features.items():
        features[key] = np.array(val)

    channels = np.arange(sound.data.shape[0])
    data = xr.Dataset(
        {
            feature_name: (["channel"], feature_val)
            for feature_name, feature_val in features.items()
        },
        coords={"channel": channels},
    )

    from ... import Features  # avoid circular import

    features = Features(data=data)
    return features