"""Metrics for segmentation adapted from information retrieval."""
from __future__ import annotations
import attr
import numpy as np
import numpy.typing as npt
from ... import validators
[docs]
def find_hits(
hypothesis: npt.NDArray,
reference: npt.NDArray,
tolerance: float | int | None = None,
decimals: int | None = None,
) -> tuple[npt.NDArray, npt.NDArray, npt.NDArray]:
r"""Find hits in arrays of event times.
This is a helper function used to compute information retrieval metrics.
Specifically, this function is called by
:func:`~vocalpy.metrics.segmentation.ir.precision_recall_fscore`.
An element in ``hypothesis``, is considered a hit
if its value :math:`t_h` falls within an interval around
any value in ``reference``, :math:`t_0`, plus or minus ``tolerance``
:math:`t_0 - \Delta t < t < t_0 + \Delta t`
This function only allows there to be zero or one hit
for each element in ``reference``, but not more than one.
If the condition :math:`|ref_i - hyp_j| < tolerance`
is true for multiple values :math:`hyp_j` in ``hypothesis``,
then the value with the smallest difference from :math:`ref_i`
is considered a hit.
Both ``hypothesis`` and ``reference`` must be 1-dimensional
arrays of non-negative, strictly increasing values.
If you have two arrays ``onsets`` and ``offsets``,
you can concatenate those into a single valid array
of boundary times using :func:`concat_starts_and_stops`
that you can then pass to this function.
Parameters
----------
hypothesis : numpy.ndarray
Boundaries, e.g., onsets or offsets of segments,
as computed by some method.
reference : numpy.ndarray
Ground truth boundaries that the hypothesized
boundaries ``hypothesis`` are compared to.
tolerance : float or int
Tolerance, in seconds.
Elements in ``hypothesis`` are considered
a true positive if they are within a time interval
around any reference boundary :math:`t_0`
in ``reference`` plus or minus
the ``tolerance``, i.e.,
if a hypothesized boundary :math:`t_h`
is within the interval
:math:`t_0 - \Delta t < t < t_0 + \Delta t`.
Default is None,
in which case it is set to ``0``
(either float or int, depending on the
dtype of ``hypothesis`` and ``reference``).
See notes for more detail.
decimals: int
The number of decimal places to round both
``hypothesis`` and ``reference`` to, using
:func:`numpy.round`. This mitigates inflated
error rates due to floating point error.
Rounding is only applied
if both ``hypothesis`` and ``reference``
are floating point values. To avoid rounding,
e.g. to compute strict precision and recall,
pass in the value ``False``. Default is 3, which
assumes that the values are in seconds
and should be rounded to milliseconds.
Returns
-------
hits_ref : numpy.ndarray
The indices of hits in ``reference``.
hits_hyp : numpy.ndarray
The indices of hits in ``hypothesis``.
diffs : numpy.ndarray
Absolute differences :math:`|hit^{ref}_i - hit^{hyp}_i|`,
i.e., ``np.abs(reference[hits_ref] - hypothesis[hits_hyp])``.
"""
validators.is_valid_boundaries_array(
hypothesis
) # 1-d, non-negative, strictly increasing
validators.is_valid_boundaries_array(reference)
validators.have_same_dtype(hypothesis, reference)
if tolerance is None:
if issubclass(reference.dtype.type, np.floating):
tolerance = 0.0
elif issubclass(reference.dtype.type, np.integer):
tolerance = 0
if tolerance < 0:
raise ValueError(
f"``tolerance`` must be a non-negative number but was: {tolerance}"
)
if decimals and (decimals is not False and not isinstance(decimals, int)):
raise ValueError(
f"``decimals`` must either be ``False`` or an integer but was: {decimals}"
)
if issubclass(reference.dtype.type, np.floating):
if not isinstance(tolerance, float):
raise TypeError(
"If ``hypothesis`` and ``reference`` are floating, tolerance must be a float also, "
f"but type was: {type(tolerance)}"
)
if decimals is None:
decimals = 3
if decimals < 0:
raise ValueError(
f"``decimals`` must be a non-negative number but was: {decimals}"
)
if decimals is not False:
# we assume float values are in units of seconds and round to ``decimals``,
# the default is 3 to indicate "milliseconds"
reference = np.round(reference, decimals=decimals)
hypothesis = np.round(hypothesis, decimals=decimals)
if issubclass(reference.dtype.type, np.integer):
if not isinstance(tolerance, int):
raise TypeError(
"If ``hypothesis`` and ``reference`` are integers, tolerance must be an integer also, "
f"but type was: {type(tolerance)}"
)
if decimals is not None:
raise ValueError(
"Cannot specify a ``decimals`` value when dtype of arrays is int"
)
diffs = np.abs(np.subtract.outer(reference, hypothesis))
in_window = diffs <= tolerance
hits_ref, hits_hyp = np.where(in_window)
# now force there to be only one hit in hyp for each hit in ref;
# we do this by choosing the hit that has the smallest absolute difference
diffs_in_window = diffs[hits_ref, hits_hyp]
hits_diffs = sorted(
zip(hits_ref, hits_hyp, diffs_in_window), key=lambda x: x[2]
)
hits_ref_out = []
hits_hyp_out = []
diffs_out = []
for hit_ref, hit_hyp, diff in hits_diffs:
if hit_ref not in hits_ref_out and hit_hyp not in hits_hyp_out:
hits_ref_out.append(hit_ref)
hits_hyp_out.append(hit_hyp)
diffs_out.append(diff)
hits_ref_out = np.array(hits_ref_out)
sort_inds = np.argsort(hits_ref_out)
hits_ref_out = hits_ref_out[sort_inds]
hits_hyp_out = np.array(hits_hyp_out)[sort_inds]
diffs_out = np.array(diffs_out)[sort_inds]
return hits_ref_out, hits_hyp_out, diffs_out
[docs]
@attr.define
class IRMetricData:
"""Class representing data used to compute
an information retrieval metric.
This class contains data
needed to compute metrics like precision and recall
for estimated event times
compared to reference event times.
The class attributes are the variables
returned by
:func:`vocalpy.metrics.segmentation.find_hits`.
Instances of this class are returned
along with the value of the computed metrics.
The values can be useful when computing
additional statistics,
e.g., the classes of segments that had higher
or lower precision or recall,
or the distribution of
differences between reference times
and estimated times for some class of events.
Attributes
----------
hits_ref : numpy.ndarray
The indices of hits in ``reference``.
hits_hyp : numpy.ndarray
The indices of hits in ``hypothesis``.
diffs : numpy.ndarray
Absolute differences :math:`|hit^{ref}_i - hit^{hyp}_i|`,
i.e., ``np.abs(reference[hits_ref] - hypothesis[hits_hyp])``.
"""
hits_ref: npt.NDArray
hits_hyp: npt.NDArray
diffs: npt.NDArray
[docs]
def precision_recall_fscore(
hypothesis: npt.NDArray,
reference: npt.NDArray,
metric: str,
tolerance: float | int | None = None,
decimals: int | bool | None = None,
) -> tuple[float, int, IRMetricData]:
r"""Helper function that computes precision, recall, and the F-score.
Since all these metrics require computing the number of true positives,
and F-score is a combination of precision and recall,
we rely on this helper function to compute them.
You can compute each directly without needing the ``metric`` argument
that this function requires by calling the appropriate function:
:func:`~vocalpy.metrics.segmentation.ir.precision`,
:func:`~vocalpy.metrics.segmentation.ir.recall`, and
:func:`~vocalpy.metrics.segmentation.ir.fscore`.
See docstrings of those functions for definitions of the metrics
in terms of segmentation algorithms.
Precision, recall, and F-score are computed using hits found with
:func:`vocalpy.metrics.segmentation._ir_helper.find_hits`.
See docstring of that function for more detail on how hits are computed.
Both ``hypothesis`` and ``reference`` must be 1-dimensional
arrays of non-negative, strictly increasing values.
If you have two arrays ``onsets`` and ``offsets``,
you can concatenate those into a single valid array
of boundary times using :func:`concat_starts_and_stops`
that you can then pass to this function.
Parameters
----------
hypothesis : numpy.ndarray
Boundaries, e.g., onsets or offsets of segments,
as computed by some method.
reference : numpy.ndarray
Ground truth boundaries that the hypothesized
boundaries ``hypothesis`` are compared to.
metric : str
The name of the metric to compute.
One of: ``{"precision", "recall", "fscore"}``.
tolerance : float or int
Tolerance, in seconds.
Elements in ``hypothesis`` are considered
a true positive if they are within a time interval
around any reference boundary :math:`t_0`
in ``reference`` plus or minus
the ``tolerance``, i.e.,
if a hypothesized boundary :math:`t_h`
is within the interval
:math:`t_0 - \Delta t < t < t_0 + \Delta t`.
Default is None,
in which case it is set to ``0``
(either float or int, depending on the
dtype of ``hypothesis`` and ``reference``).
See notes for more detail.
decimals: int
The number of decimal places to round both
``hypothesis`` and ``reference`` to, using
:func:`numpy.round`. This mitigates inflated
error rates due to floating point error.
Rounding is only applied
if both ``hypothesis`` and ``reference``
are floating point values. To avoid rounding,
e.g. to compute strict precision and recall,
pass in the value ``False``. Default is 3, which
assumes that the values are in seconds
and should be rounded to milliseconds.
Returns
-------
metric_value : float
Value for ``metric``.
n_tp : int
The number of true positives.
metric_data : IRMetricData
Instance of :class:`IRMetricData`
with indices of hits in both
``hypothesis`` and ``reference``,
and the absolute difference between times
in ``hypothesis`` and ``reference``
for the hits.
Notes
-----
The addition of a tolerance parameter is based on [1]_.
This is also sometimes known as a "collar" [2]_ or "forgiveness collar" [3]_.
The value for the tolerance can be determined by visual inspection
of the distribution; see for example [4]_.
References
----------
.. [1] Kemp, T., Schmidt, M., Whypphal, M., & Waibel, A. (2000, June).
Strategies for automatic segmentation of audio data.
In 2000 ieee international conference on acoustics, speech, and signal processing.
proceedings (cat. no. 00ch37100) (Vol. 3, pp. 1423-1426). IEEE.
.. [2] Jordán, P. G., & Giménez, A. O. (2023).
Advances in Binary and Multiclass Sound Segmentation with Deep Learning Techniques.
.. [3] NIST. (2009). The 2009 (RT-09) Rich Transcription Meeting Recognition Evaluation Plan.
<https://web.archive.org/web/20100606041157if_/http://www.itl.nist.gov/iad/mig/thyps/rt/2009/docs/rt09-meeting-eval-plan-v2.pdf>
.. [4] Du, P., & Troyer, T. W. (2006).
A segmentation algorithm for zebra finch song at the note level.
Neurocomputing, 69(10-12), 1375-1379.
"""
if metric not in {"precision", "recall", "fscore"}:
raise ValueError(
f'``metric`` must be one of: {{"precision", "recall", "fscore"}} but was: {metric}'
)
# edge case: if both reference and hypothesis have a length of zero, we have a score of 1.0
# but no hits. This is to avoid punishing the correct hypothesis that there are no boundaries.
# See https://github.com/vocalpy/vocalpy/issues/170
if len(reference) == 0 and len(hypothesis) == 0:
return (
1.0,
0,
IRMetricData(
hits_ref=np.array([]),
hits_hyp=np.array([]),
diffs=np.array([]),
),
)
# If we have no boundaries, we get no score.
if len(reference) == 0 or len(hypothesis) == 0:
return (
0.0,
0,
IRMetricData(
hits_ref=np.array([]),
hits_hyp=np.array([]),
diffs=np.array([]),
),
)
hits_ref, hits_hyp, diffs = find_hits(
hypothesis, reference, tolerance, decimals
)
metric_data = IRMetricData(hits_ref, hits_hyp, diffs)
n_tp = hits_hyp.size
if metric == "precision":
precision_ = n_tp / hypothesis.size
return precision_, n_tp, metric_data
elif metric == "recall":
recall_ = n_tp / reference.size
return recall_, n_tp, metric_data
elif metric == "fscore":
precision_ = n_tp / hypothesis.size
recall_ = n_tp / reference.size
if np.isclose(precision_, 0.0) and np.isclose(recall_, 0.0):
# avoids divide-by-zero that would give NaN
return 0.0, n_tp, metric_data
fscore_ = 2 * (precision_ * recall_) / (precision_ + recall_)
return fscore_, n_tp, metric_data
[docs]
def precision(
hypothesis: npt.NDArray,
reference: npt.NDArray,
tolerance: float | int | None = None,
decimals: int | bool | None = None,
) -> tuple[float, int, IRMetricData]:
r"""Compute precision :math:`P` for a segmentation.
Computes the metric from a hypothesized vector of boundaries
``hypothesis`` returned by a segmentation algorithm
and a reference vector of boundaries ``reference``,
e.g., boundaries cleaned by a human expert
or boundaries from a benchmark dataset.
Precision is defined as the number of true positives (:math:`T_p`)
over the number of true positives
plus the number of false positives (:math:`F_p`).
:math:`P = \\frac{T_p}{T_p+F_p}`.
The number of true positives ``n_tp`` is computed by calling
:func:`vocalpy.metrics.segmentation.ir.find_hits`.
This function then computes the precision as
``precision = n_tp / hypothesis.size``.
Both ``hypothesis`` and ``reference`` must be 1-dimensional
arrays of non-negative, strictly increasing values.
If you have two arrays ``onsets`` and ``offsets``,
you can concatenate those into a single valid array
of boundary times using :func:`concat_starts_and_stops`
that you can then pass to this function.
Parameters
----------
hypothesis : numpy.ndarray
Boundaries, e.g., onsets or offsets of segments,
as computed by some method.
reference : numpy.ndarray
Ground truth boundaries that the hypothesized
boundaries ``hypothesis`` are compared to.
tolerance : float or int
Tolerance, in seconds.
Elements in ``hypothesis`` are considered
a true positive if they are within a time interval
around any reference boundary :math:`t_0`
in ``reference`` plus or minus
the ``tolerance``, i.e.,
if a hypothesized boundary :math:`t_h`
is within the interval
:math:`t_0 - \Delta t < t < t_0 + \Delta t`.
Default is None,
in which case it is set to ``0``
(either float or int, depending on the
dtype of ``hypothesis`` and ``reference``).
decimals: int
The number of decimal places to round both
``hypothesis`` and ``reference`` to, using
:func:`numpy.round`. This mitigates inflated
error rates due to floating point error.
Rounding is only applied
if both ``hypothesis`` and ``reference``
are floating point values. To avoid rounding,
e.g. to compute strict precision and recall,
pass in the value ``False``. Default is 3, which
assumes that the values are in seconds
and should be rounded to milliseconds.
Returns
-------
precision : float
Value for precision, computed as described above.
n_tp : int
The number of true positives.
metric_data : IRMetricData
Instance of :class:`IRMetricData`
with indices of hits in both
``hypothesis`` and ``reference``,
and the absolute difference between times
in ``hypothesis`` and ``reference``
for the hits.
Examples
--------
>>> hypothesis = np.array([1, 6, 10, 16])
>>> reference = np.array([0, 5, 10, 15])
>>> prec, n_tp, ir_metric_data = vocalpy.metrics.segmentation.ir.precision(hypothesis, reference, tolerance=0)
>>> print(prec)
0.25
>>> print(ir_metric_data.hits_hyp)
np.array([2])
>>> hypothesis = np.array([0, 1, 5, 10])
>>> reference = np.array([0, 5, 10])
>>> fscore, n_tp, metric_data = vocalpy.metrics.segmentation.ir.precision(hypothesis, reference, tolerance=1)
>>> print(fscore)
0.75
>>> print(ir_metric_data.hits_hyp)
np.array([0, 2, 3])
Notes
-----
The addition of a tolerance parameter is based on [1]_.
This is also sometimes known as a "collar" [2]_ or "forgiveness collar" [3]_.
The value for the tolerance can be determined by visual inspection
of the distribution; see for example [4]_.
References
----------
.. [1] Kemp, T., Schmidt, M., Whypphal, M., & Waibel, A. (2000, June).
Strategies for automatic segmentation of audio data.
In 2000 ieee international conference on acoustics, speech, and signal processing.
proceedings (cat. no. 00ch37100) (Vol. 3, pp. 1423-1426). IEEE.
.. [2] Jordán, P. G., & Giménez, A. O. (2023).
Advances in Binary and Multiclass Sound Segmentation with Deep Learning Techniques.
.. [3] NIST. (2009). The 2009 (RT-09) Rich Transcription Meeting Recognition Evaluation Plan.
https://web.archive.org/web/20100606041157if_/http://www.itl.nist.gov/iad/mig/thyps/rt/2009/docs/rt09-meeting-eval-plan-v2.pdf
.. [4] Du, P., & Troyer, T. W. (2006).
A segmentation algorithm for zebra finch song at the note level.
Neurocomputing, 69(10-12), 1375-1379.
"""
return precision_recall_fscore(
hypothesis, reference, "precision", tolerance, decimals
)
[docs]
def recall(
hypothesis: npt.NDArray,
reference: npt.NDArray,
tolerance: float | int | None = None,
decimals: int | bool | None = None,
) -> tuple[float, int, IRMetricData]:
r"""Compute recall :math:`R` for a segmentation.
Computes the metric from a hypothesized vector of boundaries
``hypothesis`` returned by a segmentation algorithm
and a reference vector of boundaries ``reference``,
e.g., boundaries cleaned by a human expert
or boundaries from a benchmark dataset.
Recall (:math:`R`) is defined as the number of true positives (:math:`T_p`)
over the number of true positives plus the number of false negatives
(:math:`F_n`).
:math:`R = \\frac{T_p}{T_p + F_n}`
The number of true positives ``n_tp`` is computed by calling
:func:`vocalpy.metrics.segmentation.ir.find_hits`.
This function then computes the recall as
``recall = n_tp / reference.size``.
Both ``hypothesis`` and ``reference`` must be 1-dimensional
arrays of non-negative, strictly increasing values.
If you have two arrays ``onsets`` and ``offsets``,
you can concatenate those into a single valid array
of boundary times using :func:`concat_starts_and_stops`
that you can then pass to this function.
Parameters
----------
hypothesis : numpy.ndarray
Boundaries, e.g., onsets or offsets of segments,
as computed by some method.
reference : numpy.ndarray
Ground truth boundaries that the hypothesized
boundaries ``hypothesis`` are compared to.
tolerance : float or int
Tolerance, in seconds.
Elements in ``hypothesis`` are considered
a true positive if they are within a time interval
around any reference boundary :math:`t_0`
in ``reference`` plus or minus
the ``tolerance``, i.e.,
if a hypothesized boundary :math:`t_h`
is within the interval
:math:`t_0 - \Delta t < t < t_0 + \Delta t`.
Default is None,
in which case it is set to ``0``
(either float or int, depending on the
dtype of ``hypothesis`` and ``reference``).
decimals: int
The number of decimal places to round both
``hypothesis`` and ``reference`` to, using
:func:`numpy.round`. This mitigates inflated
error rates due to floating point error.
Rounding is only applied
if both ``hypothesis`` and ``reference``
are floating point values. To avoid rounding,
e.g. to compute strict precision and recall,
pass in the value ``False``. Default is 3, which
assumes that the values are in seconds
and should be rounded to milliseconds.
Returns
-------
recall : float
Value for recall, computed as described above.
n_tp : int
The number of true positives.
metric_data : IRMetricData
Instance of :class:`IRMetricData`
with indices of hits in both
``hypothesis`` and ``reference``,
and the absolute difference between times
in ``hypothesis`` and ``reference``
for the hits.
Examples
--------
>>> hypothesis = np.array([1, 6, 10, 16])
>>> reference = np.array([0, 5, 10, 15])
>>> recall, n_tp, ir_metric_data = vocalpy.metrics.segmentation.ir.recall(hypothesis, reference, tolerance=0)
>>> print(recall)
0.25
>>> print(ir_metric_data.hits_hyp)
np.array([2])
>>> hypothesis = np.array([0, 1, 5, 10])
>>> reference = np.array([0, 5, 10])
>>> recall, n_tp, metric_data = vocalpy.metrics.segmentation.ir.recall(hypothesis, reference, tolerance=1)
>>> print(recall)
1.0
>>> print(ir_metric_data.hits_hyp)
np.array([0, 2, 3])
Notes
-----
The addition of a tolerance parameter is based on [1]_.
This is also sometimes known as a "collar" [2]_ or "forgiveness collar" [3]_.
The value for the tolerance can be determined by visual inspection
of the distribution; see for example [4]_.
References
----------
.. [1] Kemp, T., Schmidt, M., Whypphal, M., & Waibel, A. (2000, June).
Strategies for automatic segmentation of audio data.
In 2000 ieee international conference on acoustics, speech, and signal processing.
proceedings (cat. no. 00ch37100) (Vol. 3, pp. 1423-1426). IEEE.
.. [2] Jordán, P. G., & Giménez, A. O. (2023).
Advances in Binary and Multiclass Sound Segmentation with Deep Learning Techniques.
.. [3] NIST. (2009). The 2009 (RT-09) Rich Transcription Meeting Recognition Evaluation Plan.
https://web.archive.org/web/20100606041157if_/http://www.itl.nist.gov/iad/mig/thyps/rt/2009/docs/rt09-meeting-eval-plan-v2.pdf
.. [4] Du, P., & Troyer, T. W. (2006).
A segmentation algorithm for zebra finch song at the note level.
Neurocomputing, 69(10-12), 1375-1379.
"""
return precision_recall_fscore(
hypothesis, reference, "recall", tolerance, decimals
)
[docs]
def fscore(
hypothesis: npt.NDArray,
reference: npt.NDArray,
tolerance: float | int | None = None,
decimals: int | bool | None = None,
) -> tuple[float, int, IRMetricData]:
r"""Compute the F-score for a segmentation.
Computes the metric from a
hypothesized vector of boundaries ``hypothesis``
returned by a segmentation algorithm
and a reference vector of boundaries ``reference``,
e.g., boundaries cleaned by a human expert
or boundaries from a benchmark dataset.
The F-score can be interpreted as a harmonic mean of the precision and
recall, where an F-score reaches its best value at 1 and worst score at 0.
The relative contribution of precision and recall to the F-score are
equal. The formula for the F-score is:
``f_score = 2 * (precision * recall) / (precision + recall)``
Both ``hypothesis`` and ``reference`` must be 1-dimensional
arrays of non-negative, strictly increasing values.
If you have two arrays ``onsets`` and ``offsets``,
you can concatenate those into a single valid array
of boundary times using :func:`concat_starts_and_stops`
that you can then pass to this function.
Parameters
----------
hypothesis : numpy.ndarray
Boundaries, e.g., onsets or offsets of segments,
as computed by some method.
reference : numpy.ndarray
Ground truth boundaries that the hypothesized
boundaries ``hypothesis`` are compared to.
tolerance : float or int
Tolerance, in seconds.
Elements in ``hypothesis`` are considered
a true positive if they are within a time interval
around any reference boundary :math:`t_0`
in ``reference`` plus or minus
the ``tolerance``, i.e.,
if a hypothesized boundary :math:`t_h`
is within the interval
:math:`t_0 - \Delta t < t < t_0 + \Delta t`.
Default is None,
in which case it is set to ``0``
(either float or int, depending on the
dtype of ``hypothesis`` and ``reference``).
decimals: int
The number of decimal places to round both
``hypothesis`` and ``reference`` to, using
:func:`numpy.round`. This mitigates inflated
error rates due to floating point error.
Rounding is only applied
if both ``hypothesis`` and ``reference``
are floating point values. To avoid rounding,
e.g. to compute strict precision and recall,
pass in the value ``False``. Default is 3, which
assumes that the values are in seconds
and should be rounded to milliseconds.
Returns
-------
f_score : float
Value for F-score, computed as described above.
n_tp : int
The number of true positives.
metric_data : IRMetricData
Instance of :class:`IRMetricData`
with indices of hits in both
``hypothesis`` and ``reference``,
and the absolute difference between times
in ``hypothesis`` and ``reference``
for the hits.
Examples
--------
>>> hypothesis = np.array([1, 6, 10, 16])
>>> reference = np.array([0, 5, 10, 15])
>>> prec, n_tp, ir_metric_data = vocalpy.metrics.segmentation.ir.fscore(hypothesis, reference, tolerance=0)
>>> print(prec)
0.25
>>> print(ir_metric_data.hits_hyp)
np.array([2])
>>> hypothesis = np.array([0, 1, 5, 10])
>>> reference = np.array([0, 5, 10])
>>> prec, n_tp, metric_data = vocalpy.metrics.segmentation.ir.fscore(hypothesis, reference, tolerance=1)
>>> print(prec)
0.75
>>> print(ir_metric_data.hits_hyp)
np.array([0, 2, 3])
Notes
-----
The addition of a tolerance parameter is based on [1]_.
This is also sometimes known as a "collar" [2]_ or "forgiveness collar" [3]_.
The value for the tolerance can be determined by visual inspection
of the distribution; see for example [4]_.
References
----------
.. [1] Kemp, T., Schmidt, M., Whypphal, M., & Waibel, A. (2000, June).
Strategies for automatic segmentation of audio data.
In 2000 ieee international conference on acoustics, speech, and signal processing.
proceedings (cat. no. 00ch37100) (Vol. 3, pp. 1423-1426). IEEE.
.. [2] Jordán, P. G., & Giménez, A. O. (2023).
Advances in Binary and Multiclass Sound Segmentation with Deep Learning Techniques.
.. [3] NIST. (2009). The 2009 (RT-09) Rich Transcription Meeting Recognition Evaluation Plan.
https://web.archive.org/web/20100606041157if_/http://www.itl.nist.gov/iad/mig/thyps/rt/2009/docs/rt09-meeting-eval-plan-v2.pdf
.. [4] Du, P., & Troyer, T. W. (2006).
A segmentation algorithm for zebra finch song at the note level.
Neurocomputing, 69(10-12), 1375-1379.
"""
return precision_recall_fscore(
hypothesis, reference, "fscore", tolerance, decimals
)
[docs]
def concat_starts_and_stops(
starts: npt.NDArray, stops: npt.NDArray
) -> npt.NDArray:
"""Concatenate arrays of start and stop times
into a single array of boundary times.
Some segmenting algorithms return lists of segments
denoted by the start and stop times of each segment.
(You may also see these times called "onsets" and "offsets".)
Typically, such segmenting algorithms work by setting a
threshold on some acoustic feature, e.g. the Root-Mean-Square
of the spectral power.
This means the segments will be separated by brief
"silent gaps" (periods below threshold).
To compute metrics for segmentation like precision
and recall, you may want to combine the start and stop
times into a single array of boundary times.
Such an approach is valid if we think of a "silent gaps"
as a segment whose start time is the stop time/offset of the
preceding segment.
If you have arrays of start and stop times,
you can concatenate into a single array of
boundary times with this function.
Both ``starts`` and ``stops`` must be 1-dimensional
arrays of non-negative, strictly increasing values,
with the same ``dtype``.
The two arrays ``starts`` and ``stops``
must be the same length, and all start times
must be less than the corresponding stop times,
i.e., ``np.all(starts < stops)`` should evaluate
to ``True``.
Parameters
----------
starts : numpy.ndarray
Array of start times of segments.
stops : numpy.ndarray
Array of stop times of segments.
Returns
-------
boundaries : numpy.ndarray
The array of boundary times,
concatenated and then sorted,
so that
``np.all(boundaries[1:] > boundaries[:-1]``
evaluates to ``True``.
Examples
--------
>>> starts = np.array([0, 8, 16, 24])
>>> stops = np.array([4, 12, 20, 28])
>>> concat_starts_and_stops(starts, stops)
np.array([0, 4, 8, 12, 16, 20, 24, 28])
>>> starts = np.array([0.000, 8.000, 16.000, 24.000])
>>> stops = np.array([4.000, 12.000, 20.000, 28.000])
>>> concat_starts_and_stops(starts, stops)
np.array([0.000, 4.000, 8.000, 12.000, 16.000, 20.000, 24.000, 28.000])
"""
validators.is_valid_boundaries_array(
starts
) # 1-d, non-negative, strictly increasing
validators.is_valid_boundaries_array(stops)
validators.have_same_dtype(starts, stops)
if not starts.size == stops.size:
raise ValueError(
"Boundary arrays ``starts`` and ``stops`` must have same lengths--"
"every element in ``starts`` must have a corresponding element in ``stops``--"
f"but sizes were different: starts.size={starts.size}, stops.size={stops.size}"
)
if not np.all(starts < stops):
gt = np.where(starts > stops)[0]
raise ValueError(
"Every element in ``starts`` must be less than the corresponding element in ``stops``,"
f"but some values in ``starts`` were greater: values at indices {gt}"
)
return np.sort(np.concatenate((starts, stops)))