mirror of
https://github.com/Mintplex-Labs/pyannote-audio-legacy.git
synced 2026-07-01 20:24:10 -04:00
feat: initial import
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2016 CNRS
|
||||
Copyright (c) 2020 CNRS
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
@@ -0,0 +1,270 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"os.environ['PYANNOTE_DATABASE_CONFIG'] = '/Users/bredin/Development/pyannote/pyannote-audio/tests/data/database.yml'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pyannote.database import get_protocol, FileFinder\n",
|
||||
"protocol = get_protocol('Debug.SpeakerDiarization.Debug', \n",
|
||||
" preprocessors={\"audio\": FileFinder()})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Voice activity detection"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pyannote.audio.tasks.voice_activity_detection.task import VoiceActivityDetection\n",
|
||||
"vad = VoiceActivityDetection(protocol, duration=2., batch_size=32, num_workers=4)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pyannote.audio.models.debug import SimpleSegmentationModel\n",
|
||||
"model = SimpleSegmentationModel(task=vad)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"GPU available: False, used: False\n",
|
||||
"TPU available: False, using: 0 TPU cores\n",
|
||||
"\n",
|
||||
" | Name | Type | Params\n",
|
||||
"------------------------------------------\n",
|
||||
"0 | mfcc | MFCC | 0 \n",
|
||||
"1 | lstm | LSTM | 18 K \n",
|
||||
"2 | classifier | Linear | 130 \n",
|
||||
"3 | activation | LogSoftmax | 0 \n",
|
||||
"/Users/bredin/miniconda3/envs/pyannote-audio-v2/lib/python3.7/site-packages/pytorch_lightning/utilities/distributed.py:45: UserWarning: Your `IterableDataset` has `__len__` defined. In combination with multi-processing data loading (e.g. batch size > 1), this can lead to unintended side effects since the samples will be duplicated.\n",
|
||||
" warnings.warn(*args, **kwargs)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "9a872fa9cfcd4e298bb2d9e3410eb4de",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pytorch_lightning as pl\n",
|
||||
"trainer = pl.Trainer(max_epochs=10)\n",
|
||||
"_ = trainer.fit(model, vad)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Speaker tracking"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pyannote.audio.tasks.speaker_tracking.task import SpeakerTracking\n",
|
||||
"spk = SpeakerTracking(protocol, duration=2., batch_size=32, num_workers=2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = SimpleSegmentationModel(task=spk)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"GPU available: False, used: False\n",
|
||||
"TPU available: False, using: 0 TPU cores\n",
|
||||
"\n",
|
||||
" | Name | Type | Params\n",
|
||||
"---------------------------------------\n",
|
||||
"0 | mfcc | MFCC | 0 \n",
|
||||
"1 | lstm | LSTM | 18 K \n",
|
||||
"2 | classifier | Linear | 1 K \n",
|
||||
"3 | activation | Sigmoid | 0 \n",
|
||||
"/Users/bredin/miniconda3/envs/pyannote-audio-v2/lib/python3.7/site-packages/pytorch_lightning/utilities/distributed.py:45: UserWarning: The dataloader, train dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 4 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n",
|
||||
" warnings.warn(*args, **kwargs)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "368ad052fb504dd3898bd70701b3997f",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"trainer = pl.Trainer(max_epochs=10)\n",
|
||||
"_ = trainer.fit(model, spk)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Speaker embedding"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pyannote.audio.tasks.speaker_verification.task import SpeakerEmbeddingArcFace\n",
|
||||
"emb = SpeakerEmbeddingArcFace(protocol, duration=2., batch_size=32, num_workers=4)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pyannote.audio.models.debug import SimpleEmbeddingModel\n",
|
||||
"model = SimpleEmbeddingModel(task=emb)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"GPU available: False, used: False\n",
|
||||
"TPU available: False, using: 0 TPU cores\n",
|
||||
"\n",
|
||||
" | Name | Type | Params\n",
|
||||
"------------------------------\n",
|
||||
"0 | mfcc | MFCC | 0 \n",
|
||||
"1 | lstm | LSTM | 18 K \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "fba3a039c408450bac0a9d56381b780b",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"trainer = pl.Trainer(max_epochs=10)\n",
|
||||
"_ = trainer.fit(model, emb)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
+8
-14
@@ -1,20 +1,17 @@
|
||||
#!/usr/bin/env python
|
||||
# encoding: utf-8
|
||||
|
||||
# The MIT License (MIT)
|
||||
|
||||
# Copyright (c) 2016-2020 CNRS
|
||||
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2020 CNRS
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
@@ -23,7 +20,4 @@
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
# AUTHORS
|
||||
# Hervé BREDIN - http://herve.niderb.fr
|
||||
|
||||
__import__("pkg_resources").declare_namespace(__name__)
|
||||
|
||||
@@ -1,20 +1,17 @@
|
||||
#!/usr/bin/env python
|
||||
# encoding: utf-8
|
||||
|
||||
# The MIT License (MIT)
|
||||
|
||||
# Copyright (c) 2016-2020 CNRS
|
||||
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2020 CNRS
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
@@ -22,64 +19,3 @@
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
# AUTHORS
|
||||
# Hervé BREDIN - http://herve.niderb.fr
|
||||
|
||||
"""
|
||||
`pyannote.audio` provides
|
||||
|
||||
* speech activity detection
|
||||
* speaker change detection
|
||||
* speaker embedding
|
||||
* speaker diarization pipeline
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
$ pip install pyannote.audio
|
||||
```
|
||||
|
||||
## Citation
|
||||
|
||||
If you use `pyannote.audio` please use the following citations.
|
||||
|
||||
- Speech activity and speaker change detection
|
||||
|
||||
@inproceedings{Yin2017,
|
||||
Author = {Ruiqing Yin and Herv\'e Bredin and Claude Barras},
|
||||
Title = {{Speaker Change Detection in Broadcast TV using Bidirectional Long Short-Term Memory Networks}},
|
||||
Booktitle = {{18th Annual Conference of the International Speech Communication Association, Interspeech 2017}},
|
||||
Year = {2017},
|
||||
Month = {August},
|
||||
Address = {Stockholm, Sweden},
|
||||
Url = {https://github.com/yinruiqing/change_detection}
|
||||
}
|
||||
|
||||
- Speaker embedding
|
||||
|
||||
@inproceedings{Bredin2017,
|
||||
author = {Herv\'{e} Bredin},
|
||||
title = {{TristouNet: Triplet Loss for Speaker Turn Embedding}},
|
||||
booktitle = {42nd IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2017},
|
||||
year = {2017},
|
||||
url = {http://arxiv.org/abs/1609.04301},
|
||||
}
|
||||
|
||||
- Speaker diarization pipeline
|
||||
|
||||
@inproceedings{Yin2018,
|
||||
Author = {Ruiqing Yin and Herv\'e Bredin and Claude Barras},
|
||||
Title = {{Neural Speech Turn Segmentation and Affinity Propagation for Speaker Diarization}},
|
||||
Booktitle = {{19th Annual Conference of the International Speech Communication Association, Interspeech 2018}},
|
||||
Year = {2018},
|
||||
Month = {September},
|
||||
Address = {Hyderabad, India},
|
||||
}
|
||||
|
||||
"""
|
||||
|
||||
from ._version import get_versions
|
||||
|
||||
__version__ = get_versions()["version"]
|
||||
del get_versions
|
||||
|
||||
@@ -0,0 +1,22 @@
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2020 CNRS
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
@@ -0,0 +1,354 @@
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2020 CNRS
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
import warnings
from pathlib import Path
from typing import Optional, Text, Tuple, Union

import librosa
import numpy as np
import soundfile as sf
from pyannote.core import Segment, SlidingWindow, SlidingWindowFeature
from pyannote.core.utils.types import Alignment
from pyannote.database import ProtocolFile
|
||||
|
||||
AudioFile = Union[Path, Text, ProtocolFile, dict]
|
||||
"""
|
||||
Audio files can be provided to the Audio class using different types:
|
||||
- a "str" instance: "/path/to/audio.wav"
|
||||
- a "Path" instance: Path("/path/to/audio.wav")
|
||||
- a ProtocolFile (or regular dict) with an "audio" key:
|
||||
{"audio": Path("/path/to/audio.wav")}
|
||||
- a ProtocolFile (or regular dict) with both "waveform" and "sample_rate" keys:
|
||||
{"waveform": (time, channel) numpy array, "sample_rate": 44100}
|
||||
|
||||
For the last two options, an additional "channel" key can be provided as a zero-indexed
|
||||
integer to load a specific channel:
|
||||
{"audio": Path("/path/to/stereo.wav"), "channel": 0}
|
||||
"""
|
||||
|
||||
|
||||
class Audio:
    """Audio IO

    Parameters
    ----------
    sample_rate: int, optional
        Target sampling rate. Defaults to using native sampling rate.
    mono : bool, optional
        Convert multi-channel to mono. Defaults to True.

    Usage
    -----
    >>> audio = Audio(sample_rate=16000, mono=True)
    >>> features = audio({"audio": "/path/to/audio.wav"})
    >>> assert features.data.shape[1] == 1

    >>> two_seconds_stereo = np.random.rand(44100 * 2, 2).astype(np.float32)
    >>> file = {"waveform": two_seconds_stereo, "sample_rate": 44100}
    >>> waveform, sample_rate = audio.crop(file, Segment(0.5, 1.5))
    >>> assert sample_rate == 16000
    >>> assert waveform.shape[1] == 1
    """

    @staticmethod
    def get_duration(file: AudioFile) -> float:
        """Get audio file duration

        Parameters
        ----------
        file : AudioFile
            Audio file.

        Returns
        -------
        duration : float
            Duration in seconds.

        Raises
        ------
        ValueError
            If `file` provides a "waveform" without its "sample_rate".
        """

        if isinstance(file, (ProtocolFile, dict)):

            # in-memory waveforms take precedence over the "audio" path,
            # like they do in `__call__` and `crop`.
            if "waveform" in file:
                sample_rate = file.get("sample_rate", None)
                if sample_rate is None:
                    raise ValueError(
                        "'waveform' must be provided with their 'sample_rate'."
                    )
                return float(len(file["waveform"])) / sample_rate

            audio = file["audio"]

        else:
            audio = file

        if isinstance(audio, Path):
            audio = str(audio)

        # only the header is needed to get number of frames and sample rate
        with sf.SoundFile(audio, "r") as f:
            return float(f.frames) / f.samplerate

    @staticmethod
    def is_valid(file: AudioFile) -> bool:
        """Check that `file` is a usable AudioFile

        Parameters
        ----------
        file : AudioFile
            Audio file.

        Returns
        -------
        valid : bool
            Always True: invalid files raise instead of returning False.

        Raises
        ------
        ValueError
            If "waveform" is not a 2-dimensional array with at least as many
            rows (time) as columns (channel), if "waveform" is provided
            without "sample_rate", or if a mapping has neither an "audio"
            nor a "waveform" key.
        """

        if isinstance(file, (ProtocolFile, dict)):

            if "waveform" in file:

                waveform = file["waveform"]
                if len(waveform.shape) != 2 or waveform.shape[0] < waveform.shape[1]:
                    raise ValueError(
                        "'waveform' must be provided as a (time, channel) numpy array."
                    )

                if file.get("sample_rate", None) is None:
                    raise ValueError(
                        "'waveform' must be provided with their 'sample_rate'."
                    )

                return True

            if "audio" not in file:
                # TODO improve error message
                raise ValueError("either 'audio' or 'waveform' key must be provided.")

        # should we check here that "audio" file exists?
        # this will slow things down and will fail later anyway.

        return True

    def __init__(self, sample_rate: Optional[int] = None, mono: bool = True):
        super().__init__()
        self.sample_rate = sample_rate
        self.mono = mono

    @staticmethod
    def _unpack(file: AudioFile):
        """Split `file` into (audio, waveform, sample_rate, channel)

        Exactly one of `audio` (path as a str, or whatever non-Path object
        was provided) and `waveform` (in-memory array) is not None.
        `sample_rate` is only known for in-memory waveforms.
        Expects `file` to have gone through `is_valid` first.
        """

        if isinstance(file, (ProtocolFile, dict)):
            channel = file.get("channel", None)
            if "waveform" in file:
                return None, file["waveform"], file.get("sample_rate", None), channel
            audio = file["audio"]
        else:
            audio = file
            channel = None

        if isinstance(audio, Path):
            audio = str(audio)

        return audio, None, None, channel

    @staticmethod
    def _select_channel(waveform: np.ndarray, channel: Optional[int]) -> np.ndarray:
        """Keep only the requested zero-indexed channel (as a (time, 1) array)"""

        if channel is None:
            return waveform

        num_channels = waveform.shape[1]
        if not 0 <= channel < num_channels:
            raise ValueError(
                f"requested channel {channel} but audio only has "
                f"{num_channels} channel(s) (channels are zero-indexed)."
            )

        # slice (rather than index) to keep the channel dimension
        return waveform[:, channel : channel + 1]

    def downmix_and_resample(
        self, waveform: np.ndarray, sample_rate: int
    ) -> Tuple[np.ndarray, int]:
        """Downmix and resample

        Parameters
        ----------
        waveform : (time, channel) np.ndarray
            Waveform.
        sample_rate : int
            Sample rate.

        Returns
        -------
        waveform : (time, channel) np.ndarray
            Remixed and resampled waveform
        sample_rate : int
            New sample rate
        """

        # downmix to mono
        if self.mono and waveform.shape[1] > 1:
            waveform = np.mean(waveform, axis=1, keepdims=True)

        # resample
        if (self.sample_rate is not None) and (self.sample_rate != sample_rate):
            # rates are passed by keyword because recent librosa versions
            # no longer accept them as positional arguments.
            if self.mono:
                # librosa expects mono audio to be of shape (n,), but we have (n, 1).
                waveform = librosa.core.resample(
                    waveform[:, 0], orig_sr=sample_rate, target_sr=self.sample_rate
                )[:, None]
            else:
                waveform = librosa.core.resample(
                    waveform.T, orig_sr=sample_rate, target_sr=self.sample_rate
                ).T
            sample_rate = self.sample_rate

        return waveform, sample_rate

    def __call__(self, file: AudioFile) -> SlidingWindowFeature:
        """Obtain waveform

        Parameters
        ----------
        file : AudioFile

        Returns
        -------
        waveform : `pyannote.core.SlidingWindowFeature`
            Waveform.

        Raises
        ------
        ValueError
            If `file` is not a valid AudioFile or if the requested "channel"
            does not exist.

        See also
        --------
        AudioFile
        """

        self.is_valid(file)

        audio, waveform, sample_rate, channel = self._unpack(file)

        if waveform is None:
            waveform, sample_rate = sf.read(audio, dtype="float32", always_2d=True)

        waveform = self._select_channel(waveform, channel)

        # downmix_and_resample returns both the waveform and its (possibly
        # updated) sample rate: the sliding window must use the latter.
        waveform, sample_rate = self.downmix_and_resample(waveform, sample_rate)

        # one frame per sample, centered on the sample
        sliding_window = SlidingWindow(
            start=-0.5 / sample_rate, duration=1.0 / sample_rate, step=1.0 / sample_rate
        )

        return SlidingWindowFeature(waveform, sliding_window)

    def crop(
        self,
        file: AudioFile,
        segment: Segment,
        mode: Alignment = "center",
        fixed: Optional[float] = None,
    ) -> Tuple[np.ndarray, int]:
        """Fast version of self(file).crop(segment, **kwargs)

        Parameters
        ----------
        file : AudioFile
            Audio file.
        segment : `pyannote.core.Segment`
            Temporal segment to load.
        mode : {'loose', 'strict', 'center'}, optional
            In 'strict' mode, only samples fully included in 'segment' are
            returned. In 'loose' mode, any intersecting frames are returned. In
            'center' mode, first and last frames are chosen to be the ones
            whose centers are the closest to 'focus' start and end times.
            Defaults to 'center'.
        fixed : float, optional
            Overrides `Segment` 'focus' duration and ensures that the number of
            returned frames is fixed (which might otherwise not be the case
            because of rounding errors). Has no effect in 'strict' or 'loose'
            modes.

        Returns
        -------
        waveform : (time, channel) numpy array
            Waveform
        sample_rate : int
            Sample rate

        Raises
        ------
        ValueError
            If `file` is not a valid AudioFile, if the requested chunk lies
            outside of file bounds, or if the requested "channel" does not
            exist.

        TODO: remove support for "mode" option. It is always "center" anyway.

        See also
        --------
        `pyannote.core.SlidingWindowFeature.crop`
        """

        self.is_valid(file)

        audio, waveform, sample_rate, channel = self._unpack(file)

        # read sample rate and number of frames
        if waveform is None:
            with sf.SoundFile(audio, "r") as f:
                sample_rate = f.samplerate
                frames = f.frames
        else:
            frames = len(waveform)

        # infer which samples to load from sample rate and requested chunk
        # TODO: compute start directly instead of using a sliding window
        samples = SlidingWindow(
            start=-0.5 / sample_rate, duration=1.0 / sample_rate, step=1.0 / sample_rate
        )
        ((start, stop),) = samples.crop(
            segment, mode=mode, fixed=fixed, return_ranges=True
        )

        if start < 0 or stop > frames:
            raise ValueError(
                f"requested chunk [{segment.start:.6f}, {segment.end:.6f}] "
                f"lies outside of file bounds [0., {frames / sample_rate:.6f}]."
            )

        if waveform is not None:
            data = waveform[start:stop]

        else:

            with sf.SoundFile(audio, "r") as f:

                try:
                    f.seek(start)
                    data = f.read(stop - start, dtype="float32", always_2d=True)
                except RuntimeError:
                    msg = (
                        f"SoundFile failed to seek-and-read in "
                        f"{audio}: loading the whole file..."
                    )
                    warnings.warn(msg)
                    # fall back to slicing the whole file so that the chunk
                    # goes through the same channel selection, downmixing
                    # and resampling as in the regular code path.
                    whole, _ = sf.read(audio, dtype="float32", always_2d=True)
                    data = whole[start:stop]

        data = self._select_channel(data, channel)

        return self.downmix_and_resample(data, sample_rate)
|
||||
|
||||
|
||||
def normalize(wav):
    """Scale waveform by its root mean square

    A small constant (1e-8) is added to the root mean square before
    dividing, so that an all-zero waveform is returned unchanged instead
    of triggering a division by zero.
    """
    mean_power = np.mean(wav ** 2)
    root_mean_square = np.sqrt(mean_power)
    return wav / (root_mean_square + 1e-8)
|
||||
@@ -0,0 +1,149 @@
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2020 CNRS
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
import pytorch_lightning as pl
|
||||
from pyannote.audio.core.task import Task, Problem
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from pyannote.audio.core.io import Audio
|
||||
|
||||
|
||||
class Model(pl.LightningModule):
    """Base model

    Subclasses are expected to override `forward` (and `build` when they
    need task-dependent layers).

    Parameters
    ----------
    sample_rate : int, optional
        Audio sample rate. Defaults to 16kHz (16000).
    num_channels : int, optional
        Number of channels. Defaults to mono (1).
    task : Task, optional
        Task addressed by the model. Only provided when training the model.
        A model should be `load_from_checkpoint`-able without a task as
        `on_load_checkpoint` hook takes care of calling `setup`.
    """

    def __init__(
        self,
        sample_rate: int = 16000,
        num_channels: int = 1,
        task: Optional[Task] = None,
    ):
        super().__init__()

        # set-up audio IO
        assert (
            num_channels == 1
        ), "Only mono audio is supported for now (num_channels = 1)"
        self.hparams.sample_rate = sample_rate
        self.hparams.num_channels = num_channels
        self.audio = Audio(sample_rate=self.hparams.sample_rate, mono=True)

        # set task attribute when available (i.e. at training time)
        # and also tell the task what kind of audio is expected from
        # the model
        if task is not None:
            self.task = task
            self.task.audio = self.audio

    def build(self):
        """Add task-dependent layers to the model

        Called by `setup`. Does nothing by default.
        """
        # use this method to add task-dependent layers to the model
        # (e.g. the final classification and activation layers)
        pass

    def setup(self, stage=None):
        """Finalize model construction

        Parameters
        ----------
        stage : str, optional
            When "fit", `classes` and `problem` hyper-parameters are copied
            from the task specifications before calling `build`, and the task
            is told about the shape of the model output afterwards.
            Otherwise, only `build` is called.
        """

        if stage == "fit":

            # keep track of the classes here because it is used
            # to setup the final classification layer (even when stage != fit)
            self.hparams.classes = self.task.specifications.classes

            # keep track of the type of problem here because it is used
            # to setup the final activation layer (even when stage != fit)
            self.hparams.problem = self.task.specifications.problem

            # any other common parameters should be saved?
            # maybe the class of the model (and pyannote.audio semantic version?)
            # so that it can be loaded without knowing what type of model it is.
            # this would probably make distributing pretrained models much easier.

        else:
            # should we do something specific when stage != fit?
            # hparams.classes and hparams.problem should already exist
            # because they should have been loaded on_load_checkpoint
            pass

        # add task-dependent layers to the model
        # (e.g. the final classification and activation layers)
        self.build()

        if stage == "fit":

            # let task know about the shape of model output
            # so that its dataloader knows how to generate targets
            self.task.example_output_array = self.forward(
                self.task.example_input_array()
            )

    def on_load_checkpoint(self, checkpoint):
        """Restore `setup`-defined hyper-parameters, then `setup` the model

        Parameters
        ----------
        checkpoint : dict
            Loaded checkpoint. Its "hyper_parameters" entry must contain
            "classes" and "problem" keys.
        """

        # only hyper-parameters defined in __init__ are loaded automatically.
        # therefore, we have to manually load hyper-parameters that were
        # defined during setup()
        self.hparams.classes = checkpoint["hyper_parameters"]["classes"]
        self.hparams.problem = checkpoint["hyper_parameters"]["problem"]
        # TODO: would have to check pytorch-lightning documentation to see
        # if we can get rid of this... it is weird that only "some" parameters
        # in self.hparams are assigned at __init__ time...

        # now that setup()-defined hyper-parameters are available,
        # we can actually setup() the model.
        self.setup()

    def forward(self, waveforms: torch.Tensor) -> torch.Tensor:
        """Forward pass (must be overridden by subclasses)

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """
        # f-string so that the actual subclass name shows up in the message
        msg = f"Class {self.__class__.__name__} should define a `forward` method."
        raise NotImplementedError(msg)

    # convenience function to automate the choice of the final activation function
    def default_activation(self) -> nn.Module:
        """Guess the final activation layer from the type of problem

        Returns
        -------
        activation : nn.Module
            `nn.LogSoftmax(dim=-1)` for mono-label classification,
            `nn.Sigmoid()` for multi-label classification.

        Raises
        ------
        NotImplementedError
            For any other type of problem.
        """

        if self.hparams.problem == Problem.MONO_LABEL_CLASSIFICATION:
            return nn.LogSoftmax(dim=-1)

        elif self.hparams.problem == Problem.MULTI_LABEL_CLASSIFICATION:
            return nn.Sigmoid()

        else:
            msg = "TODO: implement default activation for other types of problems"
            raise NotImplementedError(msg)

    # training step logic is defined by the task because the
    # model does not really need to know how it is being used.
    def training_step(self, batch, batch_idx):
        """Delegate the training step to the task"""
        return self.task.training_step(self, batch, batch_idx)

    # optimizer is defined by the task for the same reason as above
    def configure_optimizers(self):
        """Delegate the choice of optimizer(s) to the task"""
        return self.task.configure_optimizers(self)
|
||||
@@ -0,0 +1,215 @@
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2020 CNRS
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, List, Text
|
||||
|
||||
import pytorch_lightning as pl
|
||||
import torch
|
||||
import torch.optim
|
||||
from torch.utils.data import DataLoader
|
||||
from torch.utils.data import IterableDataset
|
||||
import torch.nn.functional as F
|
||||
from pyannote.database import Protocol
|
||||
|
||||
|
||||
# Type of machine learning problem
|
||||
class Problem(Enum):
    """Type of machine learning problem addressed by a task."""

    # exactly one class per prediction
    MONO_LABEL_CLASSIFICATION = 1
    # several classes may be active at once
    MULTI_LABEL_CLASSIFICATION = 2
    # representation learning (e.g. embeddings)
    REPRESENTATION = 3
    REGRESSION = 4
    # any other we could think of?
|
||||
|
||||
|
||||
# A task takes an audio chunk as input and returns
|
||||
# either a temporal sequence of predictions
|
||||
# or just one prediction for the whole audio chunk
|
||||
class Scale(Enum):
    """Temporal scale of the predictions returned for one audio chunk."""

    # model outputs a sequence of frames
    FRAME = 1
    # model outputs just one vector for the whole chunk
    CHUNK = 2
|
||||
|
||||
|
||||
@dataclass
class TaskSpecification:
    """Description of a task: which problem it is, and at which scale."""

    # type of machine learning problem
    problem: Problem
    # whether predictions are made per frame or per chunk
    scale: Scale

    # names of the classes (for classification tasks only)
    classes: Optional[List[Text]] = None
|
||||
|
||||
|
||||
# note how a task is actually a LightningDataModule
|
||||
class Task(pl.LightningDataModule):
    """Base class of every task.

    A task pairs a "problem" with a "dataset", for instance:
      - voice activity detection on the AMI corpus
      - speaker embedding on the VoxCeleb corpus
      - end-to-end speaker diarization on the VoxConverse corpus

    A "model" solves the task by turning an audio chunk into the
    solution. The task is therefore responsible for generating the
    (input, expected_output) samples the model is trained on.

    This base class reads `self.specifications` (a `TaskSpecification`)
    but never sets it: concrete tasks have to. They also have to
    implement `train__iter__` and `train__len__`.

    Parameters
    ----------
    protocol : Protocol
        pyannote.database protocol
    duration : float, optional
        Chunks duration. Defaults to variable duration (None).
    batch_size : int, optional
        Number of training samples per batch.
    num_workers : int, optional
        Number of workers used for generating training samples.
    """

    def __init__(
        self,
        protocol: Protocol,
        duration: float = None,
        batch_size: int = None,
        num_workers: int = 1,
    ):
        super().__init__()

        # where the data comes from
        self.protocol = protocol

        # how the data is chunked and batched
        self.duration = duration
        self.batch_size = batch_size
        self.num_workers = num_workers

    def prepare_data(self):
        """Do nothing (for now).

        Datasets might eventually be downloaded here and converted so
        that pyannote.database can use them. For the time being, the
        API assumes that a pyannote.database.Protocol is provided
        directly.
        """
        pass

    def train__iter__(self):
        """Iterate over training samples (to be implemented by subclasses).

        Ends up being used as the `__iter__` method of the training
        dataset built in `train_dataloader`.
        """
        msg = f"Missing '{self.__class__.__name__}.train__iter__' method."
        raise NotImplementedError(msg)

    def train__len__(self):
        """Return the size of the training set (to be implemented by subclasses).

        Ends up being used as the `__len__` method of the training
        dataset built in `train_dataloader`.
        """
        msg = f"Missing '{self.__class__.__name__}.train__len__' method."
        raise NotImplementedError(msg)

    def train_dataloader(self) -> DataLoader:
        """Build the training dataloader.

        An `IterableDataset` subclass is created on the fly, whose
        `__iter__` and `__len__` are this task's (bound) `train__iter__`
        and `train__len__` methods.
        """
        namespace = {
            "__iter__": self.train__iter__,
            "__len__": self.train__len__,
        }
        train_dataset_class = type("TrainDataset", (IterableDataset,), namespace)

        # NOTE(review): `batch_size` defaults to None while `drop_last=True`
        # is always passed; torch's DataLoader documents these two as
        # mutually exclusive -- confirm callers always provide a batch size.
        return DataLoader(
            train_dataset_class(),
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            drop_last=True,
        )

    @property
    def example_input_duration(self) -> float:
        """Duration of the example input: `self.duration`, or 2.0 when it is None."""
        if self.duration is None:
            return 2.0
        return self.duration

    def example_input_array(self):
        """Generate a random batch of audio chunks.

        Called in Model.setup, where the output of a forward pass on
        this array is used to automagically infer the temporal
        resolution of the model output (so that the dataloader can
        shape its targets accordingly).

        Since feature extraction is planned to happen on GPU as part of
        the model, the example input is basically always a chunk of
        audio.

        Returns
        -------
        example_input_array : torch.Tensor
            Random (batch_size, num_samples, num_channels) tensor.

        Raises
        ------
        NotImplementedError
            If `self.audio.mono` is falsy.
        """

        # NOTE(review): `self.audio` is not assigned anywhere in this
        # class; presumably it is attached elsewhere before this method
        # is called -- confirm.
        if not self.audio.mono:
            msg = "Only 'mono' audio is supported."
            raise NotImplementedError(msg)
        num_channels = 1

        num_samples = int(self.audio.sample_rate * self.example_input_duration)
        shape = (self.batch_size, num_samples, num_channels)
        return torch.randn(shape)

    # below is a (hacky) way to automagically infer the expected
    # resolution of the target: an example input array goes through a
    # forward pass, and the shape of the resulting output is inspected.
    # the problem with this approach is that weird rounding errors may
    # occur with variable-length chunks.
    # TODO: someone should look at this to make it more robust.

    @property
    def example_output_array(self) -> torch.Tensor:
        """Example model output, as last assigned through the setter."""
        return self.example_output_array_

    @example_output_array.setter
    def example_output_array(self, example_output_array: torch.Tensor):
        """Store the example output and, for frame-scale tasks, the frame duration.

        The frame duration is the example input duration divided by the
        size of the second dimension of the example output.
        """
        self.example_output_array_ = example_output_array

        if self.specifications.scale == Scale.FRAME:
            num_frames = example_output_array.shape[1]
            self.frame_duration_ = self.example_input_duration / num_frames

    @property
    def frame_duration(self) -> float:
        """Duration of one output frame (None unless the task scale is FRAME)."""
        if self.specifications.scale != Scale.FRAME:
            return None
        return self.frame_duration_

    def training_step(self, model: "Model", batch, batch_idx: int):
        """Default training step, provided for convenience.

        Can obviously be overridden by each task.

        Parameters
        ----------
        model : Model
            Model being trained.
        batch : dict
            Must provide the input under "X" and the target under "y".
        batch_idx : int
            Batch index (unused here).

        Returns
        -------
        loss
            Negative log-likelihood for mono-label classification,
            binary cross-entropy for multi-label classification.
            Also logged by the model as "train_loss".

        Raises
        ------
        NotImplementedError
            For any other type of problem.
        """
        X = batch["X"]
        y = batch["y"]

        problem = self.specifications.problem

        if problem == Problem.MONO_LABEL_CLASSIFICATION:
            # flatten every dimension but the class one
            scores = model(X).view(-1, len(self.specifications.classes))
            targets = y.view(-1)
            loss = F.nll_loss(scores, targets)

        elif problem == Problem.MULTI_LABEL_CLASSIFICATION:
            scores = model(X)
            loss = F.binary_cross_entropy(scores, y.float())

        else:
            msg = "TODO: implement for other types of problems"
            raise NotImplementedError(msg)

        model.log("train_loss", loss)
        return loss

    def configure_optimizers(self, model: "Model"):
        """Default optimizer, provided for convenience: Adam with lr=1e-3.

        Can obviously be overridden by each task. For tasks such as
        SpeakerEmbedding, other parameters should be added to the
        optimizer.
        """
        parameters = model.parameters()
        return torch.optim.Adam(parameters, lr=1e-3)
|
||||
@@ -1,20 +1,17 @@
|
||||
#!/usr/bin/env python
|
||||
# encoding: utf-8
|
||||
|
||||
# The MIT License (MIT)
|
||||
|
||||
# Copyright (c) 2019-2020 CNRS
|
||||
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2020 CNRS
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
@@ -22,8 +19,3 @@
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
# AUTHORS
|
||||
# Hervé BREDIN - http://herve.niderb.fr
|
||||
|
||||
from .models import PyanNet, SincTDNN, ACRoPoLiS
|
||||
|
||||
@@ -0,0 +1,147 @@
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2020 CNRS
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
|
||||
from pyannote.audio.core.model import Model
|
||||
from pyannote.audio.core.task import Task
|
||||
from typing import Optional
|
||||
|
||||
|
||||
from torchaudio.transforms import MFCC
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from einops import rearrange, reduce
|
||||
|
||||
|
||||
class SimpleSegmentationModel(Model):
    """Simple segmentation model: MFCC, bidirectional LSTM, then classifier.

    Parameters
    ----------
    sample_rate : int, optional
        Forwarded to `Model.__init__`. Defaults to 16000.
    num_channels : int, optional
        Forwarded to `Model.__init__`. Defaults to 1.
    task : Task, optional
        Forwarded to `Model.__init__`.
    """

    def __init__(
        self,
        sample_rate: int = 16000,
        num_channels: int = 1,
        task: Optional[Task] = None,
    ):
        super().__init__(sample_rate=sample_rate, num_channels=num_channels, task=task)

        # task-independent layers: feature extraction...
        self.mfcc = MFCC(
            sample_rate=self.hparams.sample_rate,
            n_mfcc=40,
            dct_type=2,
            norm="ortho",
            log_mels=False,
        )

        # ... followed by a recurrent layer fed with the MFCCs of all
        # channels stacked together
        self.lstm = nn.LSTM(
            input_size=self.mfcc.n_mfcc * self.hparams.num_channels,
            hidden_size=32,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )

    def build(self):
        """Define task-dependent layers.

        Why here and not in the task's setup()? Because those layers
        are needed at inference time as well. This is in contrast to
        layers that are only needed during training (e.g. the
        `loss_func` of a speaker embedding task): we do not want those
        to be applied at inference.
        """
        # the LSTM is bidirectional: it outputs twice its hidden size
        num_features = 2 * self.lstm.hidden_size
        num_classes = len(self.hparams.classes)
        self.classifier = nn.Linear(num_features, num_classes)
        self.activation = self.default_activation()

    def forward(self, waveforms: torch.Tensor) -> torch.Tensor:
        """Compute frame-wise scores.

        Parameters
        ----------
        waveforms : (batch, time, channel)

        Returns
        -------
        scores : (batch, time, classes)
            Output of the final activation.
        """

        # extract MFCC (channel axis is moved before the time axis first)
        features = self.mfcc(rearrange(waveforms, "b t c -> b c t"))

        # pass the MFCC sequence into the recurrent layer
        sequence = rearrange(features, "b c f t -> b t (c f)")
        output, _ = self.lstm(sequence)

        # apply the final classifier, then the activation
        logits = self.classifier(output)
        return self.activation(logits)
|
||||
|
||||
|
||||
class SimpleEmbeddingModel(Model):
    """Simple embedding model: MFCC, bidirectional LSTM, then temporal mean.

    Parameters
    ----------
    sample_rate : int, optional
        Forwarded to `Model.__init__`. Defaults to 16000.
    num_channels : int, optional
        Forwarded to `Model.__init__`. Defaults to 1.
    task : Task, optional
        Forwarded to `Model.__init__`.
    """

    def __init__(
        self,
        sample_rate: int = 16000,
        num_channels: int = 1,
        task: Optional[Task] = None,
    ):
        super().__init__(sample_rate=sample_rate, num_channels=num_channels, task=task)

        # feature extraction
        self.mfcc = MFCC(
            sample_rate=self.hparams.sample_rate,
            n_mfcc=40,
            dct_type=2,
            norm="ortho",
            log_mels=False,
        )

        # recurrent layer fed with the MFCCs of all channels stacked together
        self.lstm = nn.LSTM(
            input_size=self.mfcc.n_mfcc * self.hparams.num_channels,
            hidden_size=32,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )

        # SpeakerEmbedding.setup needs example_output_array to
        # automagically infer the embedding size, but it is only
        # computed in Model.setup (which is called **after**
        # Task.setup). hence this early forward pass.
        #
        # this is only a problem for embedding tasks. we should find a
        # way to automate this call so that the end user does not
        # forget about it. note that it must be done at the very end
        # of __init__.
        if self.task is not None:
            example_input = self.task.example_input_array()
            self.task.example_output_array = self.forward(example_input)

    def forward(self, waveforms: torch.Tensor) -> torch.Tensor:
        """Compute one embedding per chunk.

        Parameters
        ----------
        waveforms : (batch, time, channel)

        Returns
        -------
        embedding : (batch, dimension)
        """

        # extract MFCC (channel axis is moved before the time axis first)
        features = self.mfcc(rearrange(waveforms, "b t c -> b c t"))

        # pass the MFCC sequence into the recurrent layer
        sequence = rearrange(features, "b c f t -> b t (c f)")
        output, _ = self.lstm(sequence)

        # mean temporal pooling
        return reduce(output, "b t f -> b f", "mean")
|
||||
@@ -0,0 +1,158 @@
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2020 CNRS
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
from pyannote.audio.core.task import TaskSpecification, Problem, Scale, Task
|
||||
from pyannote.database import Protocol
|
||||
|
||||
import math
|
||||
import random
|
||||
from pyannote.core import Segment, Timeline, SlidingWindow
|
||||
from pyannote.core.utils.numpy import one_hot_encoding
|
||||
|
||||
|
||||
class SpeakerTracking(Task):
    """Speaker tracking

    Speaker tracking is the process of determining if and when a
    (previously enrolled) person's voice can be heard in an audio
    recording.

    Here, it is addressed with the same approach as voice activity
    detection, except {"non-speech", "speech"} classes are replaced by
    {"speaker1", ..., "speaker_N"} where N is the number of speakers in
    the training set.

    Parameters
    ----------
    protocol : Protocol
        pyannote.database protocol
    duration : float, optional
        Chunks duration. Defaults to 2.0.
    batch_size : int, optional
        Number of training samples per batch.
    num_workers : int, optional
        Number of workers used for generating training samples.
    """

    def __init__(
        self,
        protocol: Protocol,
        duration: float = 2.0,
        batch_size: int = None,
        num_workers: int = 1,
    ):
        super().__init__(
            protocol, duration=duration, batch_size=batch_size, num_workers=num_workers
        )

        # task specifications depend on the data here: which speakers
        # should be tracked is not known in advance. their definition
        # is therefore postponed to setup().

    def setup(self, stage=None):
        """Load training set metadata and define task specifications.

        Only does something when `stage` is "fit".
        """
        if stage != "fit":
            return

        # loop over the training set, discard annotated regions that
        # are not longer than the chunk duration, and keep track of the
        # reference annotations (used later by the train dataloader).
        # the list of speakers to be tracked is gathered on the way.
        self.train = []
        speakers = set()

        for f in self.protocol.train():
            long_enough = [
                region for region in f["annotated"] if region.duration > self.duration
            ]
            total_duration = sum(region.duration for region in long_enough)

            entry = {
                "annotated": long_enough,
                "annotation": f["annotation"],
                "duration": total_duration,
                "audio": f["audio"],
            }
            self.train.append(entry)

            speakers.update(f["annotation"].labels())

        # now that we know who the speakers are, task specifications
        # can be defined. since multiple speakers can be active at
        # once, the problem is multi-label classification.
        self.specifications = TaskSpecification(
            problem=Problem.MULTI_LABEL_CLASSIFICATION,
            scale=Scale.FRAME,
            classes=sorted(speakers),
        )

    def train__iter__(self):
        """Iterate over training samples

        Yields
        ------
        X: (time, channel)
            Audio chunks.
        y: (frame, num_speakers)
            Frame-level targets. Note that frame < time.
            `frame` is inferred automagically from the
            example model output.
        """

        random.seed()

        while True:

            # draw one file, with probability proportional to its
            # annotated duration
            file_weights = [f["duration"] for f in self.train]
            file = random.choices(self.train, weights=file_weights, k=1)[0]

            # draw one of its annotated regions, with probability
            # proportional to its duration
            regions = file["annotated"]
            region_weights = [s.duration for s in regions]
            segment = random.choices(regions, weights=region_weights, k=1)[0]

            # draw one chunk uniformly within that region
            start_time = random.uniform(segment.start, segment.end - self.duration)
            chunk = Segment(start_time, start_time + self.duration)

            # extract the corresponding audio
            X, _ = self.audio.crop(file, chunk, mode="center", fixed=self.duration)

            # TODO | this one_hot_encoding thing needs to be rewritten into pyannote.audio
            # TODO | to make sure we always return the same number of frames for the same
            # TODO | input duration. we should also support variable-length chunks.
            frames = SlidingWindow(
                start=chunk.start,
                duration=self.frame_duration,
                step=self.frame_duration,
            )

            encoded = one_hot_encoding(
                file["annotation"].crop(chunk),
                Timeline([chunk]),
                frames,
                labels=self.specifications.classes,
                mode="center",
            )
            y = encoded.data

            yield {"X": X, "y": y}

    def train__len__(self):
        """Number of training samples in one epoch.

        Total kept annotated duration divided by the chunk duration,
        rounded up.
        """
        total_duration = sum(file["duration"] for file in self.train)
        return math.ceil(total_duration / self.duration)
|
||||
@@ -0,0 +1,21 @@
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2020 CNRS
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
@@ -0,0 +1,176 @@
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2020 CNRS
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
from pyannote.audio.core.task import TaskSpecification, Problem, Scale, Task
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pyannote.audio.core.model import Model
|
||||
from pyannote.database import Protocol
|
||||
|
||||
import random
|
||||
import math
|
||||
from pyannote.core import Segment
|
||||
|
||||
import pytorch_metric_learning.losses
|
||||
from itertools import chain
|
||||
import torch.optim
|
||||
|
||||
|
||||
class SpeakerEmbeddingArcFace(Task):
    """Speaker embedding task trained with the ArcFace loss.

    Training samples are fixed-duration audio chunks taken from speech
    turns of the training set, labelled with the index of their speaker.

    Parameters
    ----------
    protocol : Protocol
        pyannote.database protocol
    duration : float, optional
        Chunks duration. Defaults to 2.0.
    batch_size : int, optional
        Number of training samples per batch.
    num_workers : int, optional
        Number of workers used for generating training samples.
    """

    def __init__(
        self,
        protocol: Protocol,
        duration: float = 2.0,
        batch_size: int = None,
        num_workers: int = 1,
    ):

        super().__init__(
            protocol, duration=duration, batch_size=batch_size, num_workers=num_workers
        )

        # there is no such thing as a "class" in representation
        # learning, so we do not need to define it here.
        self.specifications = TaskSpecification(
            problem=Problem.REPRESENTATION, scale=Scale.CHUNK
        )

    def setup(self, stage=None):
        """Gather training set metadata and create the loss function.

        Only does something when `stage` is "fit". Relies on
        `self.example_output_array` being already available to infer
        the embedding size.
        """

        if stage == "fit":

            # gather training set metadata:
            # speaker -> list of files in which they (sufficiently) speak
            self.speakers = dict()
            for f in self.protocol.train():

                for speaker in f["annotation"].labels():

                    # keep speaker's (long enough) speech turns...
                    speech_turns = [
                        segment
                        for segment in f["annotation"].label_timeline(speaker)
                        if segment.duration > self.duration
                    ]

                    # skip if there is no speech turn left
                    if not speech_turns:
                        continue

                    # ... and their total duration
                    duration = sum(segment.duration for segment in speech_turns)

                    # add file to the speaker's list (creating it if needed)
                    self.speakers.setdefault(speaker, []).append(
                        {
                            "audio": f["audio"],
                            "duration": duration,
                            "speech_turns": speech_turns,
                        }
                    )

            # for convenience, we keep track of the list of speakers, after all
            self.specifications.classes = sorted(self.speakers)

            num_classes = len(self.speakers)
            # use example_output_array to guess embedding size
            _, embedding_size = self.example_output_array.shape
            self.loss_func = pytorch_metric_learning.losses.ArcFaceLoss(
                num_classes, embedding_size, margin=28.6, scale=64
            )

    def train__iter__(self):
        """Iterate over training samples

        Yields
        ------
        X: (time, channel)
            Audio chunks.
        y: int
            Speaker index.
        """

        random.seed()

        speakers = list(self.speakers)

        # speaker -> index in original sorted order. built once, so that
        # each lookup below is constant-time instead of a linear scan of
        # the list of classes.
        speaker_index = {
            speaker: index
            for index, speaker in enumerate(self.specifications.classes)
        }

        while True:

            # shuffle speakers so that we don't always have the same
            # groups of speakers in a batch (which might be especially
            # problematic for contrast-based losses like contrastive
            # or triplet loss).
            random.shuffle(speakers)

            for speaker in speakers:

                y = speaker_index[speaker]
                files = self.speakers[speaker]

                # three chunks per speaker
                for _ in range(3):

                    # select one file at random (with probability proportional to its speaker duration)
                    file, *_ = random.choices(
                        files, weights=[f["duration"] for f in files], k=1,
                    )

                    # select one speech turn at random (with probability proportional to its duration)
                    speech_turn, *_ = random.choices(
                        file["speech_turns"],
                        weights=[s.duration for s in file["speech_turns"]],
                        k=1,
                    )

                    # select one chunk at random (with uniform distribution)
                    start_time = random.uniform(
                        speech_turn.start, speech_turn.end - self.duration
                    )
                    chunk = Segment(start_time, start_time + self.duration)

                    # extract features
                    X, _ = self.audio.crop(
                        file, chunk, mode="center", fixed=self.duration
                    )

                    yield {"X": X, "y": y}

    def train__len__(self):
        """Number of training samples in one epoch.

        Total duration of kept speech turns (over all speakers and
        files) divided by the chunk duration, rounded up.
        """
        duration = sum(
            datum["duration"] for data in self.speakers.values() for datum in data
        )
        return math.ceil(duration / self.duration)

    def training_step(self, model: "Model", batch, batch_idx: int):
        """Compute the ArcFace loss on one batch and log it as "train_loss".

        Parameters
        ----------
        model : Model
            Model being trained.
        batch : dict
            Must provide the input under "X" and speaker indices under "y".
        batch_idx : int
            Batch index (unused here).
        """
        X, y = batch["X"], batch["y"]
        # NOTE(review): loss_func is owned by the task, not by the model;
        # presumably nothing moves it to the model's device -- confirm.
        loss = self.loss_func(model(X), y)
        model.log("train_loss", loss)
        return loss

    def configure_optimizers(self, model: "Model"):
        """Optimize model and loss function parameters jointly (Adam, lr=1e-3)."""
        parameters = chain(model.parameters(), self.loss_func.parameters())
        return torch.optim.Adam(parameters, lr=1e-3)
|
||||
@@ -0,0 +1,22 @@
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2020 CNRS
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
@@ -0,0 +1,22 @@
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2020 CNRS
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
@@ -0,0 +1,141 @@
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2020 CNRS
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
from pyannote.audio.core.task import TaskSpecification, Problem, Scale, Task
|
||||
from pyannote.database import Protocol
|
||||
|
||||
import numpy as np
|
||||
import math
|
||||
import random
|
||||
from pyannote.core import Segment, Timeline, SlidingWindow
|
||||
from pyannote.core.utils.numpy import one_hot_encoding
|
||||
|
||||
|
||||
class VoiceActivityDetection(Task):
    """Voice activity detection, addressed as frame-level classification.

    Each frame of a fixed-duration audio chunk is assigned one of two
    classes: "non_speech" or "speech".

    Parameters
    ----------
    protocol : Protocol
        pyannote.database protocol. Its `train()` files are read by `setup`.
    duration : float, optional
        Duration of the chunks yielded for training. Defaults to 2.0.
        Expressed in the same unit as `Segment.duration`
        (presumably seconds -- confirm against pyannote.core).
    batch_size : int, optional
        Forwarded unchanged to `Task.__init__`.
    num_workers : int, optional
        Forwarded unchanged to `Task.__init__`. Defaults to 1.
    """

    def __init__(
        self,
        protocol: Protocol,
        duration: float = 2.0,
        batch_size: int = None,
        num_workers: int = 1,
    ):

        super().__init__(
            protocol, duration=duration, batch_size=batch_size, num_workers=num_workers
        )

        # for voice activity detection, task specification
        # does not depend on the data: we can define it in
        # __init__
        self.specifications = TaskSpecification(
            problem=Problem.MONO_LABEL_CLASSIFICATION,
            scale=Scale.FRAME,
            classes=["non_speech", "speech"],
        )

    def setup(self, stage=None):
        """Load training set metadata.

        Only does something when `stage == "fit"`: fills `self.train` with
        one dictionary per training file, with keys "annotated" (regions
        strictly longer than `self.duration`), "annotation" (reference),
        "duration" (total duration of the kept regions) and "audio".

        Parameters
        ----------
        stage : str, optional
            Any value other than "fit" makes this method a no-op.
        """

        if stage != "fit":
            return

        # this is where we load the training set metadata
        # to be used later by the train_dataloader.

        # here, we simply loop over the training set, remove
        # annotated regions shorter than chunk duration, and
        # keep track of the reference annotations.
        self.train = []
        for f in self.protocol.train():
            segments = [
                segment
                for segment in f["annotated"]
                if segment.duration > self.duration
            ]
            duration = sum(segment.duration for segment in segments)
            self.train.append(
                {
                    "annotated": segments,
                    "annotation": f["annotation"],
                    "duration": duration,
                    "audio": f["audio"],
                }
            )

    def train__iter__(self):
        """Iterate over training samples

        Yields
        ------
        X: (time, channel)
            Audio chunks.
        y: (frame, )
            Frame-level targets. Note that frame < time.
            `frame` is inferred automagically from the
            example model output.

        Raises
        ------
        ValueError
            If `self.train` contains no annotated region longer than
            `self.duration` (i.e. there is nothing to sample from).
        """

        random.seed()

        # `self.train` is not modified while iterating, so file-level
        # sampling weights can be computed once, outside of the loop.
        file_weights = [f["duration"] for f in self.train]

        # fail early with an explicit message rather than letting
        # random.choices choke on an empty or all-zero weight list.
        if sum(file_weights) <= 0:
            raise ValueError(
                "Cannot generate training samples: no annotated region is "
                f"longer than the requested chunk duration ({self.duration})."
            )

        while True:

            # select one file at random (with probability proportional to its annotated duration)
            file, *_ = random.choices(self.train, weights=file_weights, k=1)

            # select one annotated region at random (with probability proportional to its duration)
            segment, *_ = random.choices(
                file["annotated"], weights=[s.duration for s in file["annotated"]], k=1,
            )

            # select one chunk at random (with uniform distribution)
            start_time = random.uniform(segment.start, segment.end - self.duration)
            chunk = Segment(start_time, start_time + self.duration)

            # extract features
            # NOTE(review): `self.audio` is not set in this class --
            # presumably provided by `Task` or by the model; confirm.
            X, _ = self.audio.crop(file, chunk, mode="center", fixed=self.duration)

            # note how, contrary to what is currently done in pyannote.audio,
            # y is not precomputed for the whole file at initialization time.
            # here, we stick with pyannote.core.Annotation as long as possible
            # and "one hot" encode the data only when generating training samples.
            # this should allow to train on much larger datasets.

            # TODO | this one_hot_encoding thing needs to be rewritten into pyannote.audio
            # TODO | to make sure we always return the same number of frames for the same
            # TODO | input duration. we should also support variable-length chunks.
            # NOTE(review): `self.frame_duration` is not set in this class
            # either -- presumably inferred from the model output; confirm.
            frames = SlidingWindow(
                start=chunk.start,
                duration=self.frame_duration,
                step=self.frame_duration,
            )
            y = one_hot_encoding(
                file["annotation"].crop(chunk), Timeline([chunk]), frames, mode="center"
            ).data

            # this is the only part of this method that is specific to VAD
            # the rest should also work for any task with Scale.FRAME
            # (a frame is "speech" as soon as at least one label is active)
            y = np.int64(np.sum(y, axis=1) > 0)

            yield {"X": X, "y": y}

    def train__len__(self):
        """Return the number of training samples in one epoch.

        Computed as the total duration of the kept annotated regions
        divided by the chunk duration, rounded up.
        """
        duration = sum(file["duration"] for file in self.train)
        return math.ceil(duration / self.duration)
|
||||
+7
-15
@@ -1,17 +1,9 @@
|
||||
cachetools >= 2.0.0
|
||||
librosa >= 0.8.0
|
||||
pandas >= 0.18.0
|
||||
torch >= 1.6
|
||||
torchaudio >= 0.6
|
||||
pytorch_lightning >= 1.0.0rc4
|
||||
pytorch_metric_learning >= 0.9.93
|
||||
einops >= 0.3.0
|
||||
pyannote.core >= 4.1
|
||||
pyannote.database >= 4.0
|
||||
pyannote.metrics >= 2.3
|
||||
pyannote.pipeline >= 1.5.2
|
||||
pyYAML >= 3.12
|
||||
scikit-learn >= 0.20.2
|
||||
sortedcollections >= 1.0.1
|
||||
sortedcontainers >= 2.0.4
|
||||
pyannote.database >= 4.0.1
|
||||
librosa >= 0.8
|
||||
soundfile >= 0.10.2
|
||||
tqdm >= 4.29.1
|
||||
tensorboard >= 2.0.0
|
||||
typing_extensions >= 3.7.4;python_version < '3.8'
|
||||
pescador >= 2.1.0
|
||||
Pillow >= 6.2.1
|
||||
|
||||
@@ -1,34 +1,3 @@
|
||||
#!/usr/bin/env python
|
||||
# encoding: utf-8
|
||||
|
||||
# The MIT License (MIT)
|
||||
|
||||
# Copyright (c) 2016-2019 CNRS
|
||||
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
# AUTHORS
|
||||
# Hervé BREDIN - http://herve.niderb.fr
|
||||
|
||||
|
||||
import versioneer
|
||||
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
with open("README.md") as f:
|
||||
@@ -42,19 +11,7 @@ setup(
|
||||
namespace_packages=["pyannote"],
|
||||
packages=find_packages(),
|
||||
install_requires=requirements,
|
||||
entry_points={
|
||||
"console_scripts": [
|
||||
"pyannote-audio=pyannote.audio.applications.pyannote_audio:main",
|
||||
"pyannote-speech-feature=pyannote.audio.applications.feature_extraction:main",
|
||||
],
|
||||
"prodigy_recipes": [
|
||||
"pyannote.sad.manual = pyannote.audio.interactive.recipes.sad:sad_manual",
|
||||
"pyannote.dia.binary = pyannote.audio.interactive.recipes.dia:dia_binary",
|
||||
"pyannote.dia.manual = pyannote.audio.interactive.recipes.dia:dia_manual",
|
||||
],
|
||||
},
|
||||
version=versioneer.get_version(),
|
||||
cmdclass=versioneer.get_cmdclass(),
|
||||
entry_points={},
|
||||
description="Neural building blocks for speaker diarization",
|
||||
long_description=long_description,
|
||||
long_description_content_type="text/markdown",
|
||||
|
||||
Reference in New Issue
Block a user