from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
import numpy.typing as npt
from numpy.lib.stride_tricks import sliding_window_view
from scipy.fftpack import fft, fftfreq, next_fast_len
if TYPE_CHECKING:
from vocalpy import Sound, Spectrogram
class GaussianSpectrumEstimator:
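    """Estimate the spectrum of a signal windowed with a Gaussian.

    Replicates ``soundsig.timefreq.GaussianSpectrumEstimator``.
    """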
def __init__(self, nstd=6):
self.nstd = nstd
self._gauss_window_cache = {}
def get_frequencies(self, signal_length, sample_rate):
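        """Return the non-negative FFT frequencies, in Hz,
        for a signal of ``signal_length`` samples,
        after padding to a fast FFT length."""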
signal_length = next_fast_len(signal_length)
freq = fftfreq(signal_length, d=1.0 / sample_rate)
nz = freq >= 0.0
return freq[nz]
def _get_gauss_window(self, nwinlen):
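        """Return a Gaussian window ``nwinlen`` samples long
        (forced to an odd length), caching the result."""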
if nwinlen in self._gauss_window_cache:
return self._gauss_window_cache[nwinlen]
else:
            if nwinlen % 2 == 0:
                # force an odd window length, so the window is symmetric about its center
                nwinlen += 1
            hnwinlen = nwinlen // 2
            gauss_t = np.arange(-hnwinlen, hnwinlen + 1, 1.0)
            # set the standard deviation so that nstd standard deviations span the window
            gauss_std = float(nwinlen) / float(self.nstd)
gauss_window = np.exp(-(gauss_t**2) / (2.0 * gauss_std**2)) / (
gauss_std * np.sqrt(2 * np.pi)
)
self._gauss_window_cache[nwinlen] = gauss_window
return gauss_window
def estimate(self, signal, samplerate):
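        """Window ``signal`` with a Gaussian and return
        the non-negative FFT frequencies and the complex spectrum."""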
nwinlen = len(signal)
gauss_window = self._get_gauss_window(nwinlen)
        fft_len = next_fast_len(len(signal))
        # window the signal and take the FFT;
        # next_fast_len never shrinks, so the slices below keep the full window
        windowed_slice = signal[:fft_len] * gauss_window[:fft_len]
s_fft = fft(windowed_slice, n=fft_len, overwrite_x=True)
freq = fftfreq(fft_len, d=1.0 / samplerate)
nz = freq >= 0.0
return freq[nz], s_fft[nz]
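
# A minimal sketch of how GaussianSpectrumEstimator could be used on its own;
# the sample rate, tone frequency, and window length here are arbitrary values
# chosen for illustration:
#
#   estimator = GaussianSpectrumEstimator(nstd=6)
#   samplerate = 32000
#   t = np.arange(513) / samplerate  # a single odd-length window of samples
#   tone = np.sin(2 * np.pi * 1000.0 * t)
#   freq, est = estimator.estimate(tone, samplerate)
#   power_db = 20 * np.log10(np.abs(est))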
def soundsig_spectro(
sound: Sound,
spec_sample_rate: int = 1000,
freq_spacing: int = 50,
min_freq: int = 0,
max_freq: int = 10000,
nstd: int = 6,
scale: bool = True,
scale_val: int | float = 2**15,
scale_dtype: npt.DTypeLike = np.int16,
) -> Spectrogram:
# raw string to avoid flake8 complaining about math
r"""Compute a dB-scaled spectrogram using a Gaussian window.
Replicates the result of the method :meth:`soundsig.BioSound.spectroCalc`.
Parameters
----------
sound : vocalpy.Sound
Sound loaded from a file. Multi-channel is supported.
spec_sample_rate : int
Sampling rate for the output spectrogram, in Hz.
Sets the overlap for the windows in the STFT.
freq_spacing : int
The time-frequency scale for the spectrogram, in Hz.
Determines the width of the Gaussian window.
min_freq : int
The minimum frequency to analyze, in Hz.
The returned :class:`Spectrogram` will only contain
        frequencies :math:`\geq` ``min_freq``.
max_freq : int
The maximum frequency to analyze, in Hz.
The returned :class:`Spectrogram` will only contain
        frequencies :math:`\leq` ``max_freq``.
nstd : int
Number of standard deviations of the Gaussian in one window.
scale : bool
If True, scale the ``sound.data``.
Default is True.
This is needed to replicate the behavior of ``soundsig``,
which assumes the audio data is loaded as 16-bit integers.
Since the default for :class:`vocalpy.Sound` is to load sounds
with a numpy dtype of float64, this function defaults to
multiplying the ``sound.data`` by 2**15,
and then casting to the int16 dtype.
This replicates the behavior of the ``soundsig`` function,
given data with dtype float64.
If you have loaded a sound with a dtype of int16,
then set this to False.
    scale_val : int or float
Value to multiply the ``sound.data`` by, to scale the data.
Default is 2**15.
Only used if ``scale`` is ``True``.
This is needed to replicate the behavior of ``soundsig``,
which assumes the audio data is loaded as 16-bit integers.
scale_dtype : numpy.dtype
        numpy dtype to cast ``sound.data`` to after scaling.
Default is ``np.int16``.
Only used if ``scale`` is ``True``.
This is needed to replicate the behavior of ``soundsig``,
which assumes the audio data is loaded as 16-bit integers.

    Returns
-------
spect : Spectrogram
A dB-scaled :class:`Spectrogram` from an STFT
computed with a Gaussian window.
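
    Examples
    --------
    A minimal usage sketch (the file path here is hypothetical):

    >>> import vocalpy as voc
    >>> sound = voc.Sound.read("finch-song.wav")  # hypothetical path
    >>> spect = soundsig_spectro(sound, max_freq=8000)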
"""
if scale:
        from .. import Sound  # avoid circular imports
sound = Sound(
data=(sound.data * scale_val).astype(scale_dtype),
samplerate=sound.samplerate,
path=sound.path,
)
# ---- soundsig.sound.spectrogram
increment = 1.0 / spec_sample_rate
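    # a Gaussian whose nstd standard deviations span the window has spectral
    # standard deviation 1 / (2 * pi * sigma_t), so this choice of window
    # length makes freq_spacing the spectral width of the window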
window_length = nstd / (2.0 * np.pi * freq_spacing)
# ---- soundsig.timefreq.gaussian_stft
spectrum_estimator = GaussianSpectrumEstimator(nstd=nstd)
# ---- soundsig.timefreq.timefreq
# compute lengths in # of samples
nwinlen = int(sound.samplerate * window_length)
if nwinlen % 2 == 0:
nwinlen += 1
hnwinlen = nwinlen // 2
if sound.data.shape[-1] < nwinlen:
raise ValueError(
f"Length of sound.data ({sound.data.shape[-1]}) is less than length of window in samples ({nwinlen})."
)
    # get the values for the frequency axis from the spectrum estimator
full_freq = spectrum_estimator.get_frequencies(nwinlen, sound.samplerate)
freq_index = (full_freq >= min_freq) & (full_freq <= max_freq)
freq = full_freq[freq_index]
nfreq = freq_index.sum()
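    # hop size between successive windows, in samples: each output time bin
    # is approximately 1 / spec_sample_rate seconds apart, up to rounding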
nincrement = int(np.round(sound.samplerate * increment))
spect_channels = []
for channel_data in sound.data:
# pad the signal with zeros
zs = np.zeros([len(channel_data) + 2 * hnwinlen])
zs[hnwinlen:-hnwinlen] = channel_data
windows = sliding_window_view(zs, nwinlen, axis=0)[::nincrement]
nwindows = len(windows)
        # take the FFT of each window; the estimator zero-pads to a fast FFT length as needed
tf = np.zeros([nfreq, nwindows], dtype="complex")
for k, window in enumerate(windows):
spec_freq, est = spectrum_estimator.estimate(
window, sound.samplerate
)
findex = (spec_freq <= max_freq) & (spec_freq >= min_freq)
tf[:, k] = est[findex]
spect_channels.append(tf)
spec = np.array(spect_channels)
    # note the actual time axis can differ slightly from the requested
    # spectrogram sample rate, because the hop size is rounded to whole samples
    t = np.arange(0, nwindows, 1.0) * float(nincrement) / sound.samplerate
    # ---- soundsig.BioSound.spectroCalc
    spec = 20 * np.log10(np.abs(spec))  # convert magnitude to decibels
from vocalpy import Spectrogram # avoid circular dependencies
return Spectrogram(data=spec, times=t, frequencies=freq)