feat: initial import

This commit is contained in:
Hervé Bredin
2020-10-12 16:20:50 +02:00
parent ba0835f44e
commit 4097af53d4
19 changed files with 1730 additions and 162 deletions
+1 -1
View File
@@ -1,6 +1,6 @@
The MIT License (MIT)
Copyright (c) 2016 CNRS
Copyright (c) 2020 CNRS
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
+270
View File
@@ -0,0 +1,270 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ['PYANNOTE_DATABASE_CONFIG'] = '/Users/bredin/Development/pyannote/pyannote-audio/tests/data/database.yml'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from pyannote.database import get_protocol, FileFinder\n",
"protocol = get_protocol('Debug.SpeakerDiarization.Debug', \n",
" preprocessors={\"audio\": FileFinder()})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Voice activity detection"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from pyannote.audio.tasks.voice_activity_detection.task import VoiceActivityDetection\n",
"vad = VoiceActivityDetection(protocol, duration=2., batch_size=32, num_workers=4)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from pyannote.audio.models.debug import SimpleSegmentationModel\n",
"model = SimpleSegmentationModel(task=vad)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"GPU available: False, used: False\n",
"TPU available: False, using: 0 TPU cores\n",
"\n",
" | Name | Type | Params\n",
"------------------------------------------\n",
"0 | mfcc | MFCC | 0 \n",
"1 | lstm | LSTM | 18 K \n",
"2 | classifier | Linear | 130 \n",
"3 | activation | LogSoftmax | 0 \n",
"/Users/bredin/miniconda3/envs/pyannote-audio-v2/lib/python3.7/site-packages/pytorch_lightning/utilities/distributed.py:45: UserWarning: Your `IterableDataset` has `__len__` defined. In combination with multi-processing data loading (e.g. batch size > 1), this can lead to unintended side effects since the samples will be duplicated.\n",
" warnings.warn(*args, **kwargs)\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9a872fa9cfcd4e298bb2d9e3410eb4de",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"import pytorch_lightning as pl\n",
"trainer = pl.Trainer(max_epochs=10)\n",
"_ = trainer.fit(model, vad)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Speaker tracking"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from pyannote.audio.tasks.speaker_tracking.task import SpeakerTracking\n",
"spk = SpeakerTracking(protocol, duration=2., batch_size=32, num_workers=2)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"model = SimpleSegmentationModel(task=spk)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"GPU available: False, used: False\n",
"TPU available: False, using: 0 TPU cores\n",
"\n",
" | Name | Type | Params\n",
"---------------------------------------\n",
"0 | mfcc | MFCC | 0 \n",
"1 | lstm | LSTM | 18 K \n",
"2 | classifier | Linear | 1 K \n",
"3 | activation | Sigmoid | 0 \n",
"/Users/bredin/miniconda3/envs/pyannote-audio-v2/lib/python3.7/site-packages/pytorch_lightning/utilities/distributed.py:45: UserWarning: The dataloader, train dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 4 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n",
" warnings.warn(*args, **kwargs)\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "368ad052fb504dd3898bd70701b3997f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"trainer = pl.Trainer(max_epochs=10)\n",
"_ = trainer.fit(model, spk)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Speaker embedding"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"from pyannote.audio.tasks.speaker_verification.task import SpeakerEmbeddingArcFace\n",
"emb = SpeakerEmbeddingArcFace(protocol, duration=2., batch_size=32, num_workers=4)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from pyannote.audio.models.debug import SimpleEmbeddingModel\n",
"model = SimpleEmbeddingModel(task=emb)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"GPU available: False, used: False\n",
"TPU available: False, using: 0 TPU cores\n",
"\n",
" | Name | Type | Params\n",
"------------------------------\n",
"0 | mfcc | MFCC | 0 \n",
"1 | lstm | LSTM | 18 K \n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "fba3a039c408450bac0a9d56381b780b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"trainer = pl.Trainer(max_epochs=10)\n",
"_ = trainer.fit(model, emb)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
+8 -14
View File
@@ -1,20 +1,17 @@
#!/usr/bin/env python
# encoding: utf-8
# The MIT License (MIT)
# Copyright (c) 2016-2020 CNRS
# MIT License
#
# Copyright (c) 2020 CNRS
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -23,7 +20,4 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# AUTHORS
# Hervé BREDIN - http://herve.niderb.fr
# Register this package as a `pkg_resources`-style namespace package.
# `__import__` is used so that no `pkg_resources` name is left bound in the
# package namespace.
__import__("pkg_resources").declare_namespace(__name__)
+8 -72
View File
@@ -1,20 +1,17 @@
#!/usr/bin/env python
# encoding: utf-8
# The MIT License (MIT)
# Copyright (c) 2016-2020 CNRS
# MIT License
#
# Copyright (c) 2020 CNRS
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -22,64 +19,3 @@
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# AUTHORS
# Hervé BREDIN - http://herve.niderb.fr
"""
`pyannote.audio` provides
* speech activity detection
* speaker change detection
* speaker embedding
* speaker diarization pipeline
## Installation
```bash
$ pip install pyannote.audio
```
## Citation
If you use `pyannote.audio` please use the following citations.
- Speech activity and speaker change detection
@inproceedings{Yin2017,
Author = {Ruiqing Yin and Herv\'e Bredin and Claude Barras},
Title = {{Speaker Change Detection in Broadcast TV using Bidirectional Long Short-Term Memory Networks}},
Booktitle = {{18th Annual Conference of the International Speech Communication Association, Interspeech 2017}},
Year = {2017},
Month = {August},
Address = {Stockholm, Sweden},
Url = {https://github.com/yinruiqing/change_detection}
}
- Speaker embedding
@inproceedings{Bredin2017,
author = {Herv\'{e} Bredin},
title = {{TristouNet: Triplet Loss for Speaker Turn Embedding}},
booktitle = {42nd IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2017},
year = {2017},
url = {http://arxiv.org/abs/1609.04301},
}
- Speaker diarization pipeline
@inproceedings{Yin2018,
Author = {Ruiqing Yin and Herv\'e Bredin and Claude Barras},
Title = {{Neural Speech Turn Segmentation and Affinity Propagation for Speaker Diarization}},
Booktitle = {{19th Annual Conference of the International Speech Communication Association, Interspeech 2018}},
Year = {2018},
Month = {September},
Address = {Hyderabad, India},
}
"""
# Read the package version from the sibling `_version` module, expose it as
# `__version__`, then remove the helper from the package namespace.
from ._version import get_versions
__version__ = get_versions()["version"]
del get_versions
+22
View File
@@ -0,0 +1,22 @@
# MIT License
#
# Copyright (c) 2020 CNRS
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
+354
View File
@@ -0,0 +1,354 @@
# MIT License
#
# Copyright (c) 2020 CNRS
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import warnings
from pathlib import Path
from typing import Optional, Text, Tuple, Union

import librosa
import numpy as np
import soundfile as sf
from pyannote.core import Segment, SlidingWindow, SlidingWindowFeature
from pyannote.core.utils.types import Alignment
from pyannote.database import ProtocolFile
# Type alias for everything the `Audio` class accepts as an "audio file".
# The string below documents the supported forms.
AudioFile = Union[Path, Text, ProtocolFile, dict]
"""
Audio files can be provided to the Audio class using different types:
- a "str" instance: "/path/to/audio.wav"
- a "Path" instance: Path("/path/to/audio.wav")
- a ProtocolFile (or regular dict) with an "audio" key:
{"audio": Path("/path/to/audio.wav")}
- a ProtocolFile (or regular dict) with both "waveform" and "sample_rate" key:
{"waveform": (time, channel) numpy array, "sample_rate": 44100}
For last two options, an additional "channel" key can be provided as a zero-indexed
integer to load a specific channel:
{"audio": Path("/path/to/stereo.wav"), "channel": 0}
"""
class Audio:
    """Audio IO

    Loads waveforms from disk (through `soundfile`) or from in-memory
    (time, channel) numpy arrays, optionally downmixes them to mono and
    resamples them (through `librosa`) to a target sampling rate.

    Parameters
    ----------
    sample_rate: int, optional
        Target sampling rate. Defaults to using native sampling rate.
    mono : bool, optional
        Convert multi-channel to mono. Defaults to True.

    Usage
    -----
    >>> audio = Audio(sample_rate=16000, mono=True)
    >>> features = audio({"audio": "/path/to/audio.wav"})  # SlidingWindowFeature
    >>> stereo = np.random.rand(44100 * 2, 2).astype(np.float32)
    >>> waveform, sample_rate = audio.crop(
    ...     {"waveform": stereo, "sample_rate": 44100}, Segment(0.5, 1.5)
    ... )
    >>> assert sample_rate == 16000
    >>> assert waveform.shape[1] == 1
    """

    @staticmethod
    def get_duration(file: AudioFile) -> float:
        """Get audio file duration

        Parameters
        ----------
        file : AudioFile
            Audio file.

        Returns
        -------
        duration : float
            Duration in seconds.
        """
        if isinstance(file, (ProtocolFile, dict)):
            if "waveform" in file:
                # in-memory audio: duration follows from the number of
                # (time, channel) rows and the provided sample rate
                return float(len(file["waveform"])) / file["sample_rate"]
            audio = file["audio"]
        else:
            audio = file

        if isinstance(audio, Path):
            audio = str(audio)

        with sf.SoundFile(audio, "r") as f:
            return float(f.frames) / f.samplerate

    @staticmethod
    def is_valid(file: AudioFile) -> bool:
        """Check that `file` is a well-formed audio file description

        Parameters
        ----------
        file : AudioFile
            Audio file.

        Returns
        -------
        valid : bool
            Always True (invalid files raise instead).

        Raises
        ------
        ValueError
            If a mapping provides neither "audio" nor "waveform", if
            "waveform" is not a (time, channel) array, or if "waveform" is
            provided without "sample_rate".
        """
        if isinstance(file, (ProtocolFile, dict)):
            if "waveform" in file:
                waveform = file["waveform"]
                if len(waveform.shape) != 2 or waveform.shape[0] < waveform.shape[1]:
                    raise ValueError(
                        "'waveform' must be provided as a (time, channel) numpy array."
                    )

                sample_rate = file.get("sample_rate", None)
                if sample_rate is None:
                    raise ValueError(
                        "'waveform' must be provided with their 'sample_rate'."
                    )
                return True

            if "audio" not in file:
                # TODO improve error message
                raise ValueError("either 'audio' or 'waveform' key must be provided.")

        # should we check here that "audio" file exists?
        # this will slow things down and will fail later anyway.
        return True

    def __init__(self, sample_rate=None, mono=True):
        super().__init__()
        self.sample_rate = sample_rate
        self.mono = mono

    @staticmethod
    def _unpack(file: AudioFile):
        """Split a (validated) `file` into (audio, waveform, sample_rate, channel)

        Exactly one of `audio` (path, as str) and `waveform` (in-memory array)
        is not None. `sample_rate` is only known for in-memory waveforms.
        """
        if isinstance(file, (ProtocolFile, dict)):
            channel = file.get("channel", None)
            if "waveform" in file:
                return None, file["waveform"], file.get("sample_rate", None), channel
            audio = file["audio"]
        else:
            audio, channel = file, None

        if isinstance(audio, Path):
            audio = str(audio)

        return audio, None, None, channel

    @staticmethod
    def _select_channel(waveform: np.ndarray, channel: Optional[int]) -> np.ndarray:
        """Keep only the requested zero-indexed channel (no-op when None)"""
        if channel is None:
            return waveform

        num_channels = waveform.shape[1]
        if not 0 <= channel < num_channels:
            raise ValueError(
                f"requested channel {channel} does not exist "
                f"(audio has {num_channels} channel(s), zero-indexed)."
            )

        # slice (rather than index) to keep the (time, channel) layout
        return waveform[:, channel : channel + 1]

    def downmix_and_resample(
        self, waveform: np.ndarray, sample_rate: int
    ) -> Tuple[np.ndarray, int]:
        """Downmix and resample

        Parameters
        ----------
        waveform : (time, channel) np.ndarray
            Waveform.
        sample_rate : int
            Sample rate.

        Returns
        -------
        waveform : (time, channel) np.ndarray
            Remixed and resampled waveform
        sample_rate : int
            New sample rate
        """
        # downmix to mono
        if self.mono and waveform.shape[1] > 1:
            waveform = np.mean(waveform, axis=1, keepdims=True)

        # resample
        if (self.sample_rate is not None) and (self.sample_rate != sample_rate):
            # sample rates are passed by keyword: they are keyword-only in
            # recent librosa releases and accepted by name in older ones.
            if self.mono:
                # librosa expects mono audio to be of shape (n,), but we have (n, 1).
                waveform = librosa.core.resample(
                    waveform[:, 0], orig_sr=sample_rate, target_sr=self.sample_rate
                )[:, None]
            else:
                waveform = librosa.core.resample(
                    waveform.T, orig_sr=sample_rate, target_sr=self.sample_rate
                ).T
            sample_rate = self.sample_rate

        return waveform, sample_rate

    def __call__(self, file: AudioFile) -> SlidingWindowFeature:
        """Obtain waveform

        Parameters
        ----------
        file : AudioFile

        Returns
        -------
        waveform : `pyannote.core.SlidingWindowFeature`
            Waveform (after optional channel selection, downmixing and
            resampling), with one frame per sample.

        See also
        --------
        AudioFile
        """
        self.is_valid(file)
        audio, waveform, sample_rate, channel = self._unpack(file)

        if waveform is None:
            waveform, sample_rate = sf.read(audio, dtype="float32", always_2d=True)

        waveform = self._select_channel(waveform, channel)

        # `downmix_and_resample` returns both the waveform and its (possibly
        # updated) sample rate: the sliding window must use the latter.
        waveform, sample_rate = self.downmix_and_resample(waveform, sample_rate)

        sliding_window = SlidingWindow(
            start=-0.5 / sample_rate, duration=1.0 / sample_rate, step=1.0 / sample_rate
        )
        return SlidingWindowFeature(waveform, sliding_window)

    def crop(
        self,
        file: AudioFile,
        segment: Segment,
        mode: Alignment = "center",
        fixed: Optional[float] = None,
    ) -> Tuple[np.ndarray, int]:
        """Fast version of self(file).crop(segment, **kwargs)

        Parameters
        ----------
        file : AudioFile
            Audio file.
        segment : `pyannote.core.Segment`
            Temporal segment to load.
        mode : {'loose', 'strict', 'center'}, optional
            In 'strict' mode, only samples fully included in 'segment' are
            returned. In 'loose' mode, any intersecting frames are returned. In
            'center' mode, first and last frames are chosen to be the ones
            whose centers are the closest to 'focus' start and end times.
            Defaults to 'center'.
        fixed : float, optional
            Overrides `Segment` 'focus' duration and ensures that the number of
            returned frames is fixed (which might otherwise not be the case
            because of rounding errors). Has no effect in 'strict' or 'loose'
            modes.

        Returns
        -------
        waveform : (time, channel) numpy array
            Waveform
        sample_rate : int
            Sample rate

        Raises
        ------
        ValueError
            If the requested chunk lies outside of file bounds.

        TODO: remove support for "mode" option. It is always "center" anyway.

        See also
        --------
        `pyannote.core.SlidingWindowFeature.crop`
        """
        self.is_valid(file)
        audio, waveform, sample_rate, channel = self._unpack(file)

        # read sample rate and number of frames
        if waveform is None:
            with sf.SoundFile(audio, "r") as f:
                sample_rate = f.samplerate
                frames = f.frames
        else:
            frames = len(waveform)

        # infer which samples to load from sample rate and requested chunk
        # TODO: compute start directly instead of using a sliding window
        samples = SlidingWindow(
            start=-0.5 / sample_rate, duration=1.0 / sample_rate, step=1.0 / sample_rate
        )
        ((start, stop),) = samples.crop(
            segment, mode=mode, fixed=fixed, return_ranges=True
        )

        if start < 0 or stop > frames:
            raise ValueError(
                f"requested chunk [{segment.start:.6f}, {segment.end:.6f}] "
                f"lies outside of file bounds [0., {frames / sample_rate:.6f}]."
            )

        if waveform is not None:
            data = waveform[start:stop]
        else:
            with sf.SoundFile(audio, "r") as f:
                try:
                    f.seek(start)
                    data = f.read(stop - start, dtype="float32", always_2d=True)
                except RuntimeError:
                    data = None

            if data is None:
                msg = (
                    f"SoundFile failed to seek-and-read in "
                    f"{audio}: loading the whole file..."
                )
                warnings.warn(msg)
                # fall back to reading everything and slicing in memory, so
                # that channel selection, downmixing and resampling below are
                # applied exactly as in the regular code path.
                whole, sample_rate = sf.read(audio, dtype="float32", always_2d=True)
                data = whole[start:stop]

        data = self._select_channel(data, channel)

        return self.downmix_and_resample(data, sample_rate)
def normalize(wav):
    """Scale a waveform by its root-mean-square value

    A small constant (1e-8) is added to the RMS so that an all-zero waveform
    does not trigger a division by zero.
    """
    rms = np.sqrt(np.mean(wav ** 2))
    return wav / (rms + 1e-8)
+149
View File
@@ -0,0 +1,149 @@
# MIT License
#
# Copyright (c) 2020 CNRS
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import pytorch_lightning as pl
from pyannote.audio.core.task import Task, Problem
from typing import Optional
import torch
import torch.nn as nn
from pyannote.audio.core.io import Audio
class Model(pl.LightningModule):
    """Base model

    Parameters
    ----------
    sample_rate : int, optional
        Audio sample rate. Defaults to 16kHz (16000).
    num_channels : int, optional
        Number of channels. Defaults to mono (1).
    task : Task, optional
        Task addressed by the model. Only provided when training the model.
        A model should be `load_from_checkpoint`-able without a task as
        `on_load_checkpoint` hook takes care of calling `setup`.
    """

    def __init__(
        self,
        sample_rate: int = 16000,
        num_channels: int = 1,
        task: Optional[Task] = None,
    ):
        super().__init__()

        # set-up audio IO
        assert (
            num_channels == 1
        ), "Only mono audio is supported for now (num_channels = 1)"
        self.hparams.sample_rate = sample_rate
        self.hparams.num_channels = num_channels
        self.audio = Audio(sample_rate=self.hparams.sample_rate, mono=True)

        # set task attribute when available (i.e. at training time)
        # and also tell the task what kind of audio is expected from
        # the model
        if task is not None:
            self.task = task
            self.task.audio = self.audio

    def build(self):
        """Add task-dependent layers to the model

        Meant to be overridden by subclasses (e.g. to add the final
        classification and activation layers). Does nothing by default.
        """
        pass

    def setup(self, stage=None):
        """Finalize the model once task-dependent information is available

        When `stage` is "fit", classes and problem type are copied from the
        task specifications into `self.hparams` before `build` is called,
        and the task is then given the model output for an example input.
        For any other stage, those hyper-parameters are expected to be
        available already (see `on_load_checkpoint`).
        """
        if stage == "fit":
            # keep track of the classes here because it is used
            # to setup the final classification layer (even when stage != fit)
            self.hparams.classes = self.task.specifications.classes

            # keep track of the type of problem here because it is used
            # to setup the final activation layer (even when stage != fit)
            self.hparams.problem = self.task.specifications.problem

            # any other common parameters should be saved?
            # maybe the class of the model (and pyannote.audio semantic version?)
            # so that it can be loaded without knowing what type of model it is.
            # this would probably make distributing pretrained models much easier.
        else:
            # should we do something specific when stage != fit?
            # hparams.classes and hparams.problem should already exist
            # because they should have been loaded on_load_checkpoint
            pass

        # add task-dependent layers to the model
        # (e.g. the final classification and activation layers)
        self.build()

        if stage == "fit":
            # let task know about the shape of model output
            # so that its dataloader knows how to generate targets
            self.task.example_output_array = self.forward(
                self.task.example_input_array()
            )

    def on_load_checkpoint(self, checkpoint):
        """Restore `setup`-defined hyper-parameters, then `setup` the model"""
        # only hyper-parameters defined in __init__ are loaded automatically.
        # therefore, we have to manually load hyper-parameters that were
        # defined during setup()
        self.hparams.classes = checkpoint["hyper_parameters"]["classes"]
        self.hparams.problem = checkpoint["hyper_parameters"]["problem"]

        # TODO: would have to check pytorch-lightning documentation to see
        # if we can get rid of this... it is weird that only "some" parameters
        # in self.hparams are assigned at __init__ time...

        # now that setup()-defined hyper-parameters are available,
        # we can actually setup() the model.
        self.setup()

    def forward(self, waveforms: torch.Tensor) -> torch.Tensor:
        """Forward pass, to be implemented by subclasses

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """
        # f-string so that the actual subclass name shows up in the message
        msg = f"Class {self.__class__.__name__} should define a `forward` method."
        raise NotImplementedError(msg)

    # convenience function to automate the choice of the final activation function
    def default_activation(self) -> nn.Module:
        """Guess the final activation layer from the type of problem

        Returns
        -------
        activation : nn.Module
            `nn.LogSoftmax(dim=-1)` for mono-label classification,
            `nn.Sigmoid()` for multi-label classification.

        Raises
        ------
        NotImplementedError
            For any other type of problem.
        """
        if self.hparams.problem == Problem.MONO_LABEL_CLASSIFICATION:
            return nn.LogSoftmax(dim=-1)

        elif self.hparams.problem == Problem.MULTI_LABEL_CLASSIFICATION:
            return nn.Sigmoid()

        else:
            msg = "TODO: implement default activation for other types of problems"
            raise NotImplementedError(msg)

    # training step logic is defined by the task because the
    # model does not really need to know how it is being used.
    def training_step(self, batch, batch_idx):
        return self.task.training_step(self, batch, batch_idx)

    # optimizer is defined by the task for the same reason as above
    def configure_optimizers(self):
        return self.task.configure_optimizers(self)
+215
View File
@@ -0,0 +1,215 @@
# MIT License
#
# Copyright (c) 2020 CNRS
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from enum import Enum
from dataclasses import dataclass
from typing import Optional, List, Text
import pytorch_lightning as pl
import torch
import torch.optim
from torch.utils.data import DataLoader
from torch.utils.data import IterableDataset
import torch.nn.functional as F
from pyannote.database import Protocol
# Type of machine learning problem
class Problem(Enum):
    """Kind of machine learning problem addressed by a task"""

    MONO_LABEL_CLASSIFICATION = 1
    MULTI_LABEL_CLASSIFICATION = 2
    REPRESENTATION = 3
    REGRESSION = 4
    # any other we could think of?
# A task takes an audio chunk as input and returns
# either a temporal sequence of predictions
# or just one prediction for the whole audio chunk
class Scale(Enum):
    """Temporal scale of the model output for a given input chunk"""

    FRAME = 1  # model outputs a sequence of frames
    CHUNK = 2  # model outputs just one vector for the whole chunk
@dataclass
class TaskSpecification:
    """Description of a task: type of problem, output scale, and classes"""

    # type of machine learning problem
    problem: Problem
    # whether one output is expected per frame or per chunk
    scale: Scale
    # for classification tasks only
    classes: Optional[List[Text]] = None
# note how a task is actually a LightningDataModule
class Task(pl.LightningDataModule):
    """Base task class

    A task is the combination of a "problem" and a "dataset".
    For example, here are a few tasks:
    - voice activity detection on the AMI corpus
    - speaker embedding on the VoxCeleb corpus
    - end-to-end speaker diarization on the VoxConverse corpus

    A task is expected to be solved by a "model" that takes an
    audio chunk as input and returns the solution. Hence, the
    task is in charge of generating (input, expected_output)
    samples used for training the model.

    Parameters
    ----------
    protocol : Protocol
        pyannote.database protocol
    duration : float, optional
        Chunks duration. Defaults to variable duration (None).
    batch_size : int, optional
        Number of training samples per batch.
    num_workers : int, optional
        Number of workers used for generating training samples.
    """

    def __init__(
        self,
        protocol: Protocol,
        duration: Optional[float] = None,
        batch_size: Optional[int] = None,
        num_workers: int = 1,
    ):
        super().__init__()

        # dataset
        self.protocol = protocol

        # batching
        self.duration = duration
        self.batch_size = batch_size
        self.num_workers = num_workers

    def prepare_data(self):
        # this is where we might end up downloading datasets
        # and transform them so that they are ready to be used
        # with pyannote.database. but for now, the API assumes
        # that we directly provide a pyannote.database.Protocol.
        pass

    def train__iter__(self):
        """Iterate over training samples (to be implemented by each task)"""
        # will become train_dataset.__iter__ method
        msg = f"Missing '{self.__class__.__name__}.train__iter__' method."
        raise NotImplementedError(msg)

    def train__len__(self):
        """Number of training samples (to be implemented by each task)"""
        # will become train_dataset.__len__ method
        msg = f"Missing '{self.__class__.__name__}.train__len__' method."
        raise NotImplementedError(msg)

    def train_dataloader(self) -> DataLoader:
        """Build the training dataloader

        The dataset is an `IterableDataset` subclass created on the fly,
        whose `__iter__` and `__len__` are this task's `train__iter__` and
        `train__len__` bound methods.
        """
        # build train IterableDataset subclass programmatically
        dataset = type(
            "TrainDataset",
            (IterableDataset,),
            {"__iter__": self.train__iter__, "__len__": self.train__len__},
        )

        return DataLoader(
            dataset(),
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            drop_last=True,
        )

    @property
    def example_input_duration(self) -> float:
        """Duration of the example input (2 seconds for variable duration)"""
        return 2.0 if self.duration is None else self.duration

    def example_input_array(self) -> torch.Tensor:
        """Random (batch_size, num_samples, num_channels) example input

        Relies on `self.audio`, which is set by the model when the task is
        attached to it.

        Raises
        ------
        NotImplementedError
            If `self.audio` is not mono.
        """
        # this method is called in Model.setup where it is used
        # to automagically infer the temporal resolution of the
        # model output, and hence allow the dataloader to shape
        # its targets correctly.

        # since we plan to have the feature extraction step done
        # on GPU as part of the model, the example input array is
        # basically always a chunk of audio
        if self.audio.mono:
            num_channels = 1
        else:
            msg = "Only 'mono' audio is supported."
            raise NotImplementedError(msg)

        return torch.randn(
            (
                self.batch_size,
                int(self.audio.sample_rate * self.example_input_duration),
                num_channels,
            )
        )

    # below is a (hacky) way to automagically infer the expected
    # resolution of the target. basically, we do a forward pass
    # of an example input array and look at the resulting shape
    # of the output. the problem with this approach is that we
    # may encounter weird rounding errors in case of variable-length
    # chunks. TODO: someone should look at this to make it more robust.
    @property
    def example_output_array(self) -> torch.Tensor:
        return self.example_output_array_

    @example_output_array.setter
    def example_output_array(self, example_output_array: torch.Tensor):
        self.example_output_array_ = example_output_array
        if self.specifications.scale == Scale.FRAME:
            # second dimension of the output is used as the number of frames
            self.frame_duration_ = (
                self.example_input_duration / example_output_array.shape[1]
            )

    @property
    def frame_duration(self) -> Optional[float]:
        """Duration of one output frame (None for chunk-scale tasks)"""
        if self.specifications.scale == Scale.FRAME:
            return self.frame_duration_
        return None

    # default training_step provided for convenience
    # can obviously be overridden for each task
    def training_step(self, model: "Model", batch, batch_idx: int):
        """Compute and log the training loss for one batch

        Uses `F.nll_loss` for mono-label classification and
        `F.binary_cross_entropy` for multi-label classification.

        Raises
        ------
        NotImplementedError
            For any other type of problem.
        """
        X, y = batch["X"], batch["y"]

        if self.specifications.problem == Problem.MONO_LABEL_CLASSIFICATION:
            # flatten every dimension but the last (classes) one
            loss = F.nll_loss(
                model(X).view(-1, len(self.specifications.classes)), y.view(-1)
            )

        elif self.specifications.problem == Problem.MULTI_LABEL_CLASSIFICATION:
            loss = F.binary_cross_entropy(model(X), y.float())

        else:
            msg = "TODO: implement for other types of problems"
            raise NotImplementedError(msg)

        model.log("train_loss", loss)
        return loss

    # default configure_optimizers provided for convenience
    # can obviously be overridden for each task
    def configure_optimizers(self, model: "Model"):
        # for tasks such as SpeakerEmbedding,
        # other parameters should be added here
        return torch.optim.Adam(model.parameters(), lr=1e-3)
+8 -16
View File
@@ -1,20 +1,17 @@
#!/usr/bin/env python
# encoding: utf-8
# The MIT License (MIT)
# Copyright (c) 2019-2020 CNRS
# MIT License
#
# Copyright (c) 2020 CNRS
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -22,8 +19,3 @@
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# AUTHORS
# Hervé BREDIN - http://herve.niderb.fr
from .models import PyanNet, SincTDNN, ACRoPoLiS
+147
View File
@@ -0,0 +1,147 @@
# MIT License
#
# Copyright (c) 2020 CNRS
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from pyannote.audio.core.model import Model
from pyannote.audio.core.task import Task
from typing import Optional
from torchaudio.transforms import MFCC
import torch
import torch.nn as nn
from einops import rearrange, reduce
class SimpleSegmentationModel(Model):
    """Small debugging model for frame-level tasks: MFCC -> bi-LSTM -> linear -> activation.

    Parameters
    ----------
    sample_rate : int, optional
        Audio sample rate, forwarded to `Model.__init__`. Defaults to 16000.
    num_channels : int, optional
        Number of audio channels, forwarded to `Model.__init__`. Defaults to 1.
    task : Task, optional
        Task addressed by the model, forwarded to `Model.__init__`.
    """

    def __init__(
        self,
        sample_rate: int = 16000,
        num_channels: int = 1,
        task: Optional[Task] = None,
    ):
        super().__init__(sample_rate=sample_rate, num_channels=num_channels, task=task)

        hparams = self.hparams

        # task-independent layers: feature extraction...
        self.mfcc = MFCC(
            sample_rate=hparams.sample_rate,
            n_mfcc=40,
            dct_type=2,
            norm="ortho",
            log_mels=False,
        )

        # ... and recurrent layer. MFCCs of all channels are stacked
        # along the feature dimension before entering the LSTM.
        lstm_input_size = self.mfcc.n_mfcc * hparams.num_channels
        self.lstm = nn.LSTM(
            lstm_input_size,
            32,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )

    def build(self):
        """Create the layers whose shape depends on the task."""

        # these layers live in the model (and not in task.setup()) because
        # they are needed at inference time. this is in contrast with the
        # SpeakerEmbedding.loss_func layers, that are only needed during
        # training and must not be applied at inference.

        # the LSTM is bidirectional, hence the 32 * 2 input features
        num_classes = len(self.hparams.classes)
        self.classifier = nn.Linear(32 * 2, num_classes)
        self.activation = self.default_activation()

    def forward(self, waveforms: torch.Tensor) -> torch.Tensor:
        """Compute frame-level scores

        Parameters
        ----------
        waveforms : (batch, time, channel)

        Returns
        -------
        scores : (batch, frame, classes)
            Output of the activation layer, where `frame` is the
            time dimension of the MFCC sequence.
        """

        # MFCC extraction expects channels before time
        channels_first = rearrange(waveforms, "b t c -> b c t")
        mfcc = self.mfcc(channels_first)

        # stack channels along the feature dimension and pass the
        # resulting MFCC sequence into the recurrent layer
        sequence = rearrange(mfcc, "b c f t -> b t (c f)")
        output, _ = self.lstm(sequence)

        # apply the final classifier, followed by the activation
        return self.activation(self.classifier(output))
class SimpleEmbeddingModel(Model):
    """Small debugging model for embedding tasks: MFCC -> bi-LSTM -> temporal mean pooling.

    Parameters
    ----------
    sample_rate : int, optional
        Audio sample rate, forwarded to `Model.__init__`. Defaults to 16000.
    num_channels : int, optional
        Number of audio channels, forwarded to `Model.__init__`. Defaults to 1.
    task : Task, optional
        Task addressed by the model, forwarded to `Model.__init__`.
        When provided, its `example_output_array` is set at the end of
        `__init__` from a forward pass of its example input.
    """

    def __init__(
        self,
        sample_rate: int = 16000,
        num_channels: int = 1,
        task: Optional[Task] = None,
    ):
        super().__init__(sample_rate=sample_rate, num_channels=num_channels, task=task)

        self.mfcc = MFCC(
            sample_rate=self.hparams.sample_rate,
            n_mfcc=40,
            dct_type=2,
            norm="ortho",
            log_mels=False,
        )

        # MFCCs of all channels are stacked along the feature dimension
        self.lstm = nn.LSTM(
            self.mfcc.n_mfcc * self.hparams.num_channels,
            32,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )

        # this is needed because example_output_array is needed in SpeakerEmbedding.setup
        # to automagically infer the embedding size. but example_output_array is computed
        # in Model.setup (which is called **after** Task.setup).
        # note that this is only a problem for embedding tasks.
        # we should find a way to automate this call so that the
        # end user does not forget to call it. note that this must
        # be called at the end of __init__
        if self.task is not None:
            # the example output is only inspected for its shape: disable
            # gradient tracking so that no autograd graph is recorded and
            # kept alive by the tensor stored on the task.
            with torch.no_grad():
                self.task.example_output_array = self.forward(
                    self.task.example_input_array()
                )

    def forward(self, waveforms: torch.Tensor) -> torch.Tensor:
        """Compute one embedding per waveform

        Parameters
        ----------
        waveforms : (batch, time, channel)

        Returns
        -------
        embedding : (batch, dimension)
        """

        # MFCC extraction expects channels before time
        mfcc = self.mfcc(rearrange(waveforms, "b t c -> b c t"))

        # stack channels along the feature dimension and pass the
        # resulting MFCC sequence into the recurrent layer
        output, _ = self.lstm(rearrange(mfcc, "b c f t -> b t (c f)"))

        # mean temporal pooling
        return reduce(output, "b t f -> b f", "mean")
@@ -0,0 +1,158 @@
# MIT License
#
# Copyright (c) 2020 CNRS
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from pyannote.audio.core.task import TaskSpecification, Problem, Scale, Task
from pyannote.database import Protocol
import math
import random
from pyannote.core import Segment, Timeline, SlidingWindow
from pyannote.core.utils.numpy import one_hot_encoding
class SpeakerTracking(Task):
    """Speaker tracking

    Speaker tracking is the process of determining if and when a (previously
    enrolled) person's voice can be heard in an audio recording.

    Here, it is addressed with the same approach as voice activity detection,
    except {"non-speech", "speech"} classes are replaced by {"speaker1", ...,
    "speaker_N"} where N is the number of speakers in the training set.

    Parameters
    ----------
    protocol : Protocol
        Protocol whose `train()` files are used for training.
    duration : float, optional
        Duration of training chunks, expressed in the same unit as
        `Segment.duration`. Defaults to 2.0.
    batch_size : int, optional
        Forwarded to `Task.__init__`.
    num_workers : int, optional
        Forwarded to `Task.__init__`. Defaults to 1.
    """

    def __init__(
        self,
        protocol: Protocol,
        duration: float = 2.0,
        batch_size: int = None,
        num_workers: int = 1,
    ):
        super().__init__(
            protocol, duration=duration, batch_size=batch_size, num_workers=num_workers
        )

        # for speaker tracking, task specification depends
        # on the data: we do not know in advance which
        # speakers should be tracked. therefore, we postpone
        # the definition of specifications.

    def setup(self, stage=None):
        """Load training set metadata and define task specifications.

        Does nothing unless `stage` is "fit".
        """
        if stage == "fit":

            # this is where we load the training set metadata
            # to be used later by the train_dataloader.

            # here, we simply loop over the training set, remove
            # annotated regions shorter than chunk duration, and
            # keep track of the reference annotations.

            # we also build the list of speakers to be tracked.

            self.train, speakers = [], set()
            for f in self.protocol.train():

                segments = [
                    segment
                    for segment in f["annotated"]
                    if segment.duration > self.duration
                ]
                duration = sum(segment.duration for segment in segments)
                self.train.append(
                    {
                        "annotated": segments,
                        "annotation": f["annotation"],
                        "duration": duration,
                        "audio": f["audio"],
                    }
                )

                speakers.update(f["annotation"].labels())

            # now that we know who the speakers are, we can
            # define the task specifications.

            # note that, since multiple speakers can be active
            # at once, the problem is multi-label classification.
            self.specifications = TaskSpecification(
                problem=Problem.MULTI_LABEL_CLASSIFICATION,
                scale=Scale.FRAME,
                classes=sorted(speakers),
            )

    def train__iter__(self):
        """Iterate over training samples

        Yields
        ------
        X: (time, channel)
            Audio chunks.
        y: (frame, num_speakers)
            Frame-level targets. Note that frame < time.
            `frame` is infered automagically from the
            example model output.
        """

        random.seed()

        # the list of training files does not change while iterating:
        # compute the file sampling weights once, instead of rebuilding
        # them for every single training sample.
        train_files = self.train
        file_weights = [f["duration"] for f in train_files]

        while True:

            # select one file at random (with probability proportional to its annotated duration)
            (file,) = random.choices(train_files, weights=file_weights, k=1)

            # select one annotated region at random (with probability proportional to its duration)
            (segment,) = random.choices(
                file["annotated"],
                weights=[s.duration for s in file["annotated"]],
                k=1,
            )

            # select one chunk at random (with uniform distribution)
            start_time = random.uniform(segment.start, segment.end - self.duration)
            chunk = Segment(start_time, start_time + self.duration)

            # extract features
            X, _ = self.audio.crop(file, chunk, mode="center", fixed=self.duration)

            # TODO | this one_hot_encoding thing needs to be rewritten into pyannote.audio
            # TODO | to make sure we always return the same number of frames for the same
            # TODO | input duration. we should also support variable-length chunks.
            frames = SlidingWindow(
                start=chunk.start,
                duration=self.frame_duration,
                step=self.frame_duration,
            )

            # passing `labels` keeps one target column per tracked speaker,
            # in the order defined by the task specifications
            y = one_hot_encoding(
                file["annotation"].crop(chunk),
                Timeline([chunk]),
                frames,
                labels=self.specifications.classes,
                mode="center",
            ).data

            yield {"X": X, "y": y}

    def train__len__(self):
        """Number of training samples in one epoch

        Defined as the total duration of usable annotated regions
        divided by the chunk duration (rounded up).
        """
        duration = sum(file["duration"] for file in self.train)
        return math.ceil(duration / self.duration)
@@ -0,0 +1,21 @@
# MIT License
#
# Copyright (c) 2020 CNRS
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
@@ -0,0 +1,176 @@
# MIT License
#
# Copyright (c) 2020 CNRS
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from typing import TYPE_CHECKING
from pyannote.audio.core.task import TaskSpecification, Problem, Scale, Task
if TYPE_CHECKING:
from pyannote.audio.core.model import Model
from pyannote.database import Protocol
import random
import math
from pyannote.core import Segment
import pytorch_metric_learning.losses
from itertools import chain
import torch.optim
class SpeakerEmbeddingArcFace(Task):
    """Speaker embedding task trained with `pytorch_metric_learning.losses.ArcFaceLoss`

    Parameters
    ----------
    protocol : Protocol
        Protocol whose `train()` files are used for training.
    duration : float, optional
        Duration of training chunks, expressed in the same unit as
        `Segment.duration`. Defaults to 2.0.
    batch_size : int, optional
        Forwarded to `Task.__init__`.
    num_workers : int, optional
        Forwarded to `Task.__init__`. Defaults to 1.
    """

    def __init__(
        self,
        protocol: Protocol,
        duration: float = 2.0,
        batch_size: int = None,
        num_workers: int = 1,
    ):
        super().__init__(
            protocol, duration=duration, batch_size=batch_size, num_workers=num_workers
        )

        # there is no such thing as a "class" in representation
        # learning, so we do not need to define it here.
        self.specifications = TaskSpecification(
            problem=Problem.REPRESENTATION, scale=Scale.CHUNK
        )

    def setup(self, stage=None):
        """Gather training set metadata and instantiate the loss function.

        Does nothing unless `stage` is "fit". Relies on `example_output_array`
        being already available to infer the embedding size.
        """
        if stage == "fit":

            # gather training set metadata
            self.speakers = dict()
            for f in self.protocol.train():

                for speaker in f["annotation"].labels():

                    # keep speaker's (long enough) speech turns...
                    speech_turns = [
                        segment
                        for segment in f["annotation"].label_timeline(speaker)
                        if segment.duration > self.duration
                    ]

                    # skip if there is no speech turns left
                    if not speech_turns:
                        continue

                    # ... and their total duration
                    duration = sum(segment.duration for segment in speech_turns)

                    # add speaker to the list of speakers
                    if speaker not in self.speakers:
                        self.speakers[speaker] = list()

                    self.speakers[speaker].append(
                        {
                            "audio": f["audio"],
                            "duration": duration,
                            "speech_turns": speech_turns,
                        }
                    )

            # for convenience, we keep track of the list of speakers, after all
            self.specifications.classes = sorted(self.speakers)

            num_classes = len(self.speakers)

            # use example_output_array to guess embedding size
            _, embedding_size = self.example_output_array.shape

            self.loss_func = pytorch_metric_learning.losses.ArcFaceLoss(
                num_classes, embedding_size, margin=28.6, scale=64
            )

    def train__iter__(self):
        """Iterate over training samples

        Yields
        ------
        X: (time, channel)
            Audio chunks.
        y: int
            Speaker index.
        """

        random.seed()

        speakers = list(self.speakers)

        # map each speaker to its index in the original sorted order once,
        # instead of running a linear `list.index` search for every speaker
        # at every pass over the training set.
        speaker_index = {
            speaker: index
            for index, speaker in enumerate(self.specifications.classes)
        }

        while True:

            # shuffle speakers so that we don't always have the same
            # groups of speakers in a batch (which might be especially
            # problematic for contrast-based losses like contrastive
            # or triplet loss).
            random.shuffle(speakers)

            for speaker in speakers:

                # speaker index in original sorted order
                y = speaker_index[speaker]

                # files (and corresponding sampling weights) are the same for
                # the three chunks of a given speaker: compute them only once
                files = self.speakers[speaker]
                file_weights = [f["duration"] for f in files]

                # three chunks per speaker
                for _ in range(3):

                    # select one file at random (with probability proportional to its speaker duration)
                    (file,) = random.choices(files, weights=file_weights, k=1)

                    # select one speech turn at random (with probability proportional to its duration)
                    (speech_turn,) = random.choices(
                        file["speech_turns"],
                        weights=[s.duration for s in file["speech_turns"]],
                        k=1,
                    )

                    # select one chunk at random (with uniform distribution)
                    start_time = random.uniform(
                        speech_turn.start, speech_turn.end - self.duration
                    )
                    chunk = Segment(start_time, start_time + self.duration)

                    # extract features
                    X, _ = self.audio.crop(
                        file, chunk, mode="center", fixed=self.duration
                    )

                    yield {"X": X, "y": y}

    def train__len__(self):
        """Number of training samples in one epoch

        Defined as the total duration of usable speech turns (over all
        speakers) divided by the chunk duration (rounded up).
        """
        duration = sum(
            datum["duration"] for data in self.speakers.values() for datum in data
        )
        return math.ceil(duration / self.duration)

    def training_step(self, model: "Model", batch, batch_idx: int):
        """Compute, log (as "train_loss") and return the loss for one batch.

        The loss function is applied to the model output for `batch["X"]`
        and to the speaker indices `batch["y"]`.
        """
        X, y = batch["X"], batch["y"]
        loss = self.loss_func(model(X), y)
        model.log("train_loss", loss)
        return loss

    def configure_optimizers(self, model: "Model"):
        """Build an Adam optimizer (lr=1e-3) over both model and loss function parameters."""
        # the loss function has parameters of its own: they
        # must be optimized jointly with those of the model
        parameters = chain(model.parameters(), self.loss_func.parameters())
        return torch.optim.Adam(parameters, lr=1e-3)
@@ -0,0 +1,22 @@
# MIT License
#
# Copyright (c) 2020 CNRS
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
@@ -0,0 +1,22 @@
# MIT License
#
# Copyright (c) 2020 CNRS
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
@@ -0,0 +1,141 @@
# MIT License
#
# Copyright (c) 2020 CNRS
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from pyannote.audio.core.task import TaskSpecification, Problem, Scale, Task
from pyannote.database import Protocol
import numpy as np
import math
import random
from pyannote.core import Segment, Timeline, SlidingWindow
from pyannote.core.utils.numpy import one_hot_encoding
class VoiceActivityDetection(Task):
    """Voice activity detection

    Addressed as a frame-level, mono-label classification problem
    with two classes: "non_speech" and "speech".

    Parameters
    ----------
    protocol : Protocol
        Protocol whose `train()` files are used for training.
    duration : float, optional
        Duration of training chunks, expressed in the same unit as
        `Segment.duration`. Defaults to 2.0.
    batch_size : int, optional
        Forwarded to `Task.__init__`.
    num_workers : int, optional
        Forwarded to `Task.__init__`. Defaults to 1.
    """

    def __init__(
        self,
        protocol: Protocol,
        duration: float = 2.0,
        batch_size: int = None,
        num_workers: int = 1,
    ):
        super().__init__(
            protocol, duration=duration, batch_size=batch_size, num_workers=num_workers
        )

        # for voice activity detection, task specification
        # does not depend on the data: we can define it in
        # __init__
        self.specifications = TaskSpecification(
            problem=Problem.MONO_LABEL_CLASSIFICATION,
            scale=Scale.FRAME,
            classes=["non_speech", "speech"],
        )

    def setup(self, stage=None):
        """Load training set metadata.

        Does nothing unless `stage` is "fit".
        """
        if stage == "fit":

            # this is where we load the training set metadata
            # to be used later by the train_dataloader.

            # here, we simply loop over the training set, remove
            # annotated regions shorter than chunk duration, and
            # keep track of the reference annotations.

            self.train = []
            for f in self.protocol.train():

                segments = [
                    segment
                    for segment in f["annotated"]
                    if segment.duration > self.duration
                ]
                duration = sum(segment.duration for segment in segments)
                self.train.append(
                    {
                        "annotated": segments,
                        "annotation": f["annotation"],
                        "duration": duration,
                        "audio": f["audio"],
                    }
                )

    def train__iter__(self):
        """Iterate over training samples

        Yields
        ------
        X: (time, channel)
            Audio chunks.
        y: (frame, )
            Frame-level targets. Note that frame < time.
            `frame` is infered automagically from the
            example model output.
        """

        random.seed()

        # the list of training files does not change while iterating:
        # compute the file sampling weights once, instead of rebuilding
        # them for every single training sample.
        train_files = self.train
        file_weights = [f["duration"] for f in train_files]

        while True:

            # select one file at random (with probability proportional to its annotated duration)
            (file,) = random.choices(train_files, weights=file_weights, k=1)

            # select one annotated region at random (with probability proportional to its duration)
            (segment,) = random.choices(
                file["annotated"],
                weights=[s.duration for s in file["annotated"]],
                k=1,
            )

            # select one chunk at random (with uniform distribution)
            start_time = random.uniform(segment.start, segment.end - self.duration)
            chunk = Segment(start_time, start_time + self.duration)

            # extract features
            X, _ = self.audio.crop(file, chunk, mode="center", fixed=self.duration)

            # note how, contrary to what is currently done in pyannote.audio,
            # y is not precomputed for the whole file at initialization time.
            # here, we stick with pyannote.core.Annotation as long as possible
            # and "one hot" encode the data only when generating training samples.
            # this should allow to train on much larger datasets.

            # TODO | this one_hot_encoding thing needs to be rewritten into pyannote.audio
            # TODO | to make sure we always return the same number of frames for the same
            # TODO | input duration. we should also support variable-length chunks.
            frames = SlidingWindow(
                start=chunk.start,
                duration=self.frame_duration,
                step=self.frame_duration,
            )
            y = one_hot_encoding(
                file["annotation"].crop(chunk), Timeline([chunk]), frames, mode="center"
            ).data

            # this is the only part of this method that is specific to VAD
            # the rest should also work for any task with Scale.FRAME:
            # a frame is marked as speech (1) as soon as at least one
            # label is active, and as non-speech (0) otherwise.
            y = np.int64(np.sum(y, axis=1) > 0)

            yield {"X": X, "y": y}

    def train__len__(self):
        """Number of training samples in one epoch

        Defined as the total duration of usable annotated regions
        divided by the chunk duration (rounded up).
        """
        duration = sum(file["duration"] for file in self.train)
        return math.ceil(duration / self.duration)
+7 -15
View File
@@ -1,17 +1,9 @@
cachetools >= 2.0.0
librosa >= 0.8.0
pandas >= 0.18.0
torch >= 1.6
torchaudio >= 0.6
pytorch_lightning >= 1.0.0rc4
pytorch_metric_learning >= 0.9.93
einops >= 0.3.0
pyannote.core >= 4.1
pyannote.database >= 4.0
pyannote.metrics >= 2.3
pyannote.pipeline >= 1.5.2
pyYAML >= 3.12
scikit-learn >= 0.20.2
sortedcollections >= 1.0.1
sortedcontainers >= 2.0.4
pyannote.database >= 4.0.1
librosa >= 0.8
soundfile >= 0.10.2
tqdm >= 4.29.1
tensorboard >= 2.0.0
typing_extensions >= 3.7.4;python_version < '3.8'
pescador >= 2.1.0
Pillow >= 6.2.1
+1 -44
View File
@@ -1,34 +1,3 @@
#!/usr/bin/env python
# encoding: utf-8
# The MIT License (MIT)
# Copyright (c) 2016-2019 CNRS
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# AUTHORS
# Hervé BREDIN - http://herve.niderb.fr
import versioneer
from setuptools import setup, find_packages
with open("README.md") as f:
@@ -42,19 +11,7 @@ setup(
namespace_packages=["pyannote"],
packages=find_packages(),
install_requires=requirements,
entry_points={
"console_scripts": [
"pyannote-audio=pyannote.audio.applications.pyannote_audio:main",
"pyannote-speech-feature=pyannote.audio.applications.feature_extraction:main",
],
"prodigy_recipes": [
"pyannote.sad.manual = pyannote.audio.interactive.recipes.sad:sad_manual",
"pyannote.dia.binary = pyannote.audio.interactive.recipes.dia:dia_binary",
"pyannote.dia.manual = pyannote.audio.interactive.recipes.dia:dia_manual",
],
},
version=versioneer.get_version(),
cmdclass=versioneer.get_cmdclass(),
entry_points={},
description="Neural building blocks for speaker diarization",
long_description=long_description,
long_description_content_type="text/markdown",