from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
import numpy.typing as npt
from numpy.lib.stride_tricks import sliding_window_view
from scipy.fftpack import fft, fftfreq, next_fast_len
if TYPE_CHECKING:
from vocalpy import Sound, Spectrogram
class GaussianSpectrumEstimator:
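    """Estimate the spectrum of a signal windowed with a Gaussian.

    Replicates ``soundsig.timefreq.GaussianSpectrumEstimator``.
    """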
def __init__(self, nstd=6):
self.nstd = nstd
self._gauss_window_cache = {}
def get_frequencies(self, signal_length, sample_rate):
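        """Return the non-negative FFT frequencies, in Hz,
        for a signal of ``signal_length`` samples,
        after padding to a fast FFT length."""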
signal_length = next_fast_len(signal_length)
freq = fftfreq(signal_length, d=1.0 / sample_rate)
nz = freq >= 0.0
return freq[nz]
def _get_gauss_window(self, nwinlen):
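        """Return a Gaussian window ``nwinlen`` samples long
        (forced to an odd length), caching the result."""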
if nwinlen in self._gauss_window_cache:
return self._gauss_window_cache[nwinlen]
else:
            if nwinlen % 2 == 0:
                # force an odd window length, so the window is symmetric about its center
                nwinlen += 1
            hnwinlen = nwinlen // 2
            gauss_t = np.arange(-hnwinlen, hnwinlen + 1, 1.0)
            # set the standard deviation so that nstd standard deviations span the window
            gauss_std = float(nwinlen) / float(self.nstd)
gauss_window = np.exp(-(gauss_t**2) / (2.0 * gauss_std**2)) / (
gauss_std * np.sqrt(2 * np.pi)
)
self._gauss_window_cache[nwinlen] = gauss_window
return gauss_window
def estimate(self, signal, samplerate):
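        """Window ``signal`` with a Gaussian and return
        the non-negative FFT frequencies and the complex spectrum."""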
nwinlen = len(signal)
gauss_window = self._get_gauss_window(nwinlen)
        fft_len = next_fast_len(len(signal))
        # window the signal and take the FFT;
        # next_fast_len never shrinks, so the slices below keep the full window
        windowed_slice = signal[:fft_len] * gauss_window[:fft_len]
s_fft = fft(windowed_slice, n=fft_len, overwrite_x=True)
freq = fftfreq(fft_len, d=1.0 / samplerate)
nz = freq >= 0.0
return freq[nz], s_fft[nz]
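
# A minimal sketch of how GaussianSpectrumEstimator could be used on its own;
# the sample rate, tone frequency, and window length here are arbitrary values
# chosen for illustration:
#
#   estimator = GaussianSpectrumEstimator(nstd=6)
#   samplerate = 32000
#   t = np.arange(513) / samplerate  # a single odd-length window of samples
#   tone = np.sin(2 * np.pi * 1000.0 * t)
#   freq, est = estimator.estimate(tone, samplerate)
#   power_db = 20 * np.log10(np.abs(est))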
def soundsig_spectro(
sound: Sound,
spec_sample_rate: int = 1000,
freq_spacing: int = 50,
min_freq: int = 0,
max_freq: int = 10000,
nstd: int = 6,
scale: bool = True,
scale_val: int | float = 2**15,
scale_dtype: npt.DTypeLike = np.int16,
) -> Spectrogram:
# raw string to avoid flake8 complaining about math
r"""Compute a dB-scaled spectrogram using a Gaussian window.
Replicates the result of the method :meth:`soundsig.BioSound.spectroCalc`.
Parameters
----------
sound : vocalpy.Sound
Sound loaded from a file. Multi-channel is supported.
spec_sample_rate : int
Sampling rate for the output spectrogram, in Hz.
Sets the overlap for the windows in the STFT.
freq_spacing : int
The time-frequency scale for the spectrogram, in Hz.
Determines the width of the Gaussian window.
min_freq : int
The minimum frequency to analyze, in Hz.
The returned :class:`Spectrogram` will only contain
        frequencies :math:`\geq` ``min_freq``.
max_freq : int
The maximum frequency to analyze, in Hz.
The returned :class:`Spectrogram` will only contain
        frequencies :math:`\leq` ``max_freq``.
nstd : int
Number of standard deviations of the Gaussian in one window.
scale : bool
If True, scale the ``sound.data``.
Default is True.
This is needed to replicate the behavior of ``soundsig``,
which assumes the audio data is loaded as 16-bit integers.
Since the default for :class:`vocalpy.Sound` is to load sounds
with a numpy dtype of float64, this function defaults to
multiplying the ``sound.data`` by 2**15,
and then casting to the int16 dtype.
This replicates the behavior of the ``soundsig`` function,
given data with dtype float64.
If you have loaded a sound with a dtype of int16,
then set this to False.
    scale_val : int or float
Value to multiply the ``sound.data`` by, to scale the data.
Default is 2**15.
Only used if ``scale`` is ``True``.
This is needed to replicate the behavior of ``soundsig``,
which assumes the audio data is loaded as 16-bit integers.
scale_dtype : numpy.dtype
        numpy dtype to cast ``sound.data`` to after scaling.
Default is ``np.int16``.
Only used if ``scale`` is ``True``.
This is needed to replicate the behavior of ``soundsig``,
which assumes the audio data is loaded as 16-bit integers.

    Returns
-------
spect : Spectrogram
A dB-scaled :class:`Spectrogram` from an STFT
computed with a Gaussian window.
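
    Examples
    --------
    A minimal usage sketch (the file path here is hypothetical):

    >>> import vocalpy as voc
    >>> sound = voc.Sound.read("finch-song.wav")  # hypothetical path
    >>> spect = soundsig_spectro(sound, max_freq=8000)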
"""
if scale:
        from .. import Sound  # avoid circular imports
sound = Sound(
data=(sound.data * scale_val).astype(scale_dtype),
samplerate=sound.samplerate,
path=sound.path,
)
# ---- soundsig.sound.spectrogram
increment = 1.0 / spec_sample_rate
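    # a Gaussian whose nstd standard deviations span the window has spectral
    # standard deviation 1 / (2 * pi * sigma_t), so this choice of window
    # length makes freq_spacing the spectral width of the window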
window_length = nstd / (2.0 * np.pi * freq_spacing)
# ---- soundsig.timefreq.gaussian_stft
spectrum_estimator = GaussianSpectrumEstimator(nstd=nstd)
# ---- soundsig.timefreq.timefreq
# compute lengths in # of samples
nwinlen = int(sound.samplerate * window_length)
if nwinlen % 2 == 0:
nwinlen += 1
hnwinlen = nwinlen // 2
if sound.data.shape[-1] < nwinlen:
raise ValueError(
f"Length of sound.data ({sound.data.shape[-1]}) is less than length of window in samples ({nwinlen})."
)
    # get the values for the frequency axis from the spectrum estimator
full_freq = spectrum_estimator.get_frequencies(nwinlen, sound.samplerate)
freq_index = (full_freq >= min_freq) & (full_freq <= max_freq)
freq = full_freq[freq_index]
nfreq = freq_index.sum()
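    # hop size between successive windows, in samples: each output time bin
    # is approximately 1 / spec_sample_rate seconds apart, up to rounding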
nincrement = int(np.round(sound.samplerate * increment))
spect_channels = []
for channel_data in sound.data:
# pad the signal with zeros
zs = np.zeros([len(channel_data) + 2 * hnwinlen])
zs[hnwinlen:-hnwinlen] = channel_data
windows = sliding_window_view(zs, nwinlen, axis=0)[::nincrement]
nwindows = len(windows)
        # take the FFT of each window; the estimator zero-pads to a fast FFT length as needed
tf = np.zeros([nfreq, nwindows], dtype="complex")
for k, window in enumerate(windows):
spec_freq, est = spectrum_estimator.estimate(
window, sound.samplerate
)
findex = (spec_freq <= max_freq) & (spec_freq >= min_freq)
tf[:, k] = est[findex]
spect_channels.append(tf)
spec = np.array(spect_channels)
    # note the actual time axis can differ slightly from the requested
    # spectrogram sample rate, because the hop size is rounded to whole samples
    t = np.arange(0, nwindows, 1.0) * float(nincrement) / sound.samplerate
    # ---- soundsig.BioSound.spectroCalc
    spec = 20 * np.log10(np.abs(spec))  # convert magnitude to decibels
from vocalpy import Spectrogram # avoid circular dependencies
return Spectrogram(data=spec, times=t, frequencies=freq)