feat: initial import

2026-07-01 20:24:10 -04:00 · 2020-10-12 16:20:50 +02:00
parent ba0835f44e
commit 4097af53d4
19 changed files with 1730 additions and 162 deletions
@@ -1,6 +1,6 @@
 The MIT License (MIT)

-Copyright (c) 2016 CNRS
+Copyright (c) 2020 CNRS

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -0,0 +1,270 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ['PYANNOTE_DATABASE_CONFIG'] = '/Users/bredin/Development/pyannote/pyannote-audio/tests/data/database.yml'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyannote.database import get_protocol, FileFinder\n",
+    "protocol = get_protocol('Debug.SpeakerDiarization.Debug', \n",
+    "                        preprocessors={\"audio\": FileFinder()})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Voice activity detection"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyannote.audio.tasks.voice_activity_detection.task import VoiceActivityDetection\n",
+    "vad = VoiceActivityDetection(protocol, duration=2., batch_size=32, num_workers=4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyannote.audio.models.debug import SimpleSegmentationModel\n",
+    "model = SimpleSegmentationModel(task=vad)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "GPU available: False, used: False\n",
+      "TPU available: False, using: 0 TPU cores\n",
+      "\n",
+      "  | Name       | Type       | Params\n",
+      "------------------------------------------\n",
+      "0 | mfcc       | MFCC       | 0     \n",
+      "1 | lstm       | LSTM       | 18 K  \n",
+      "2 | classifier | Linear     | 130   \n",
+      "3 | activation | LogSoftmax | 0     \n",
+      "/Users/bredin/miniconda3/envs/pyannote-audio-v2/lib/python3.7/site-packages/pytorch_lightning/utilities/distributed.py:45: UserWarning: Your `IterableDataset` has `__len__` defined. In combination with multi-processing data loading (e.g. batch size > 1), this can lead to unintended side effects since the samples will be duplicated.\n",
+      "  warnings.warn(*args, **kwargs)\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9a872fa9cfcd4e298bb2d9e3410eb4de",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pytorch_lightning as pl\n",
+    "trainer = pl.Trainer(max_epochs=10)\n",
+    "_ = trainer.fit(model, vad)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Speaker tracking"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyannote.audio.tasks.speaker_tracking.task import SpeakerTracking\n",
+    "spk = SpeakerTracking(protocol, duration=2., batch_size=32, num_workers=2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = SimpleSegmentationModel(task=spk)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "GPU available: False, used: False\n",
+      "TPU available: False, using: 0 TPU cores\n",
+      "\n",
+      "  | Name       | Type    | Params\n",
+      "---------------------------------------\n",
+      "0 | mfcc       | MFCC    | 0     \n",
+      "1 | lstm       | LSTM    | 18 K  \n",
+      "2 | classifier | Linear  | 1 K   \n",
+      "3 | activation | Sigmoid | 0     \n",
+      "/Users/bredin/miniconda3/envs/pyannote-audio-v2/lib/python3.7/site-packages/pytorch_lightning/utilities/distributed.py:45: UserWarning: The dataloader, train dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 4 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n",
+      "  warnings.warn(*args, **kwargs)\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "368ad052fb504dd3898bd70701b3997f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "trainer = pl.Trainer(max_epochs=10)\n",
+    "_ = trainer.fit(model, spk)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Speaker embedding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyannote.audio.tasks.speaker_verification.task import SpeakerEmbeddingArcFace\n",
+    "emb = SpeakerEmbeddingArcFace(protocol, duration=2., batch_size=32, num_workers=4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyannote.audio.models.debug import SimpleEmbeddingModel\n",
+    "model = SimpleEmbeddingModel(task=emb)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "GPU available: False, used: False\n",
+      "TPU available: False, using: 0 TPU cores\n",
+      "\n",
+      "  | Name | Type | Params\n",
+      "------------------------------\n",
+      "0 | mfcc | MFCC | 0     \n",
+      "1 | lstm | LSTM | 18 K  \n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "fba3a039c408450bac0a9d56381b780b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "trainer = pl.Trainer(max_epochs=10)\n",
+    "_ = trainer.fit(model, emb)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
@@ -1,20 +1,17 @@
-#!/usr/bin/env python
-# encoding: utf-8
-
-# The MIT License (MIT)
-
-# Copyright (c) 2016-2020 CNRS
-
+# MIT License
+#
+# Copyright (c) 2020 CNRS
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -23,7 +20,4 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

-# AUTHORS
-# Hervé BREDIN - http://herve.niderb.fr
-
 __import__("pkg_resources").declare_namespace(__name__)
@@ -1,20 +1,17 @@
-#!/usr/bin/env python
-# encoding: utf-8
-
-# The MIT License (MIT)
-
-# Copyright (c) 2016-2020 CNRS
-
+# MIT License
+#
+# Copyright (c) 2020 CNRS
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -22,64 +19,3 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
-
-# AUTHORS
-# Hervé BREDIN - http://herve.niderb.fr
-
-"""
-`pyannote.audio` provides
-
-  * speech activity detection
-  * speaker change detection
-  * speaker embedding
-  * speaker diarization pipeline
-
-## Installation
-
-```bash
-$ pip install pyannote.audio
-```
-
-## Citation
-
-If you use `pyannote.audio` please use the following citations.
-
-  - Speech  activity and speaker change detection
-
-        @inproceedings{Yin2017,
-          Author = {Ruiqing Yin and Herv\'e Bredin and Claude Barras},
-          Title = {{Speaker Change Detection in Broadcast TV using Bidirectional Long Short-Term Memory Networks}},
-          Booktitle = {{18th Annual Conference of the International Speech Communication Association, Interspeech 2017}},
-          Year = {2017},
-          Month = {August},
-          Address = {Stockholm, Sweden},
-          Url = {https://github.com/yinruiqing/change_detection}
-        }
-
-  - Speaker embedding
-
-        @inproceedings{Bredin2017,
-            author = {Herv\'{e} Bredin},
-            title = {{TristouNet: Triplet Loss for Speaker Turn Embedding}},
-            booktitle = {42nd IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2017},
-            year = {2017},
-            url = {http://arxiv.org/abs/1609.04301},
-        }
-
-  - Speaker diarization pipeline
-
-        @inproceedings{Yin2018,
-          Author = {Ruiqing Yin and Herv\'e Bredin and Claude Barras},
-          Title = {{Neural Speech Turn Segmentation and Affinity Propagation for Speaker Diarization}},
-          Booktitle = {{19th Annual Conference of the International Speech Communication Association, Interspeech 2018}},
-          Year = {2018},
-          Month = {September},
-          Address = {Hyderabad, India},
-        }
-
-"""
-
-from ._version import get_versions
-
-__version__ = get_versions()["version"]
-del get_versions
@@ -0,0 +1,22 @@
+# MIT License
+#
+# Copyright (c) 2020 CNRS
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
@@ -0,0 +1,354 @@
+# MIT License
+#
+# Copyright (c) 2020 CNRS
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import warnings
+from pathlib import Path
+from typing import Optional, Text, Tuple, Union
+
+import librosa
+import numpy as np
+import soundfile as sf
+
+from pyannote.core import Segment, SlidingWindow, SlidingWindowFeature
+from pyannote.core.utils.types import Alignment
+from pyannote.database import ProtocolFile
+
+
+AudioFile = Union[Path, Text, ProtocolFile, dict]
+""" 
+Audio files can be provided to the Audio class using different types:
+    - a "str" instance: "/path/to/audio.wav"
+    - a "Path" instance: Path("/path/to/audio.wav")
+    - a ProtocolFile (or regular dict) with an "audio" key: 
+        {"audio": Path("/path/to/audio.wav")}
+    - a ProtocolFile (or regular dict) with both "waveform" and "sample_rate" key:
+        {"waveform": (time, channel) numpy array, "sample_rate": 44100}
+
+For the last two options, an additional "channel" key can be provided as a
+zero-indexed integer to load a specific channel:
+        {"audio": Path("/path/to/stereo.wav"), "channel": 0}
+"""
+
+
+class Audio:
+    """Audio IO
+
+    Parameters
+    ----------
+    sample_rate: int, optional
+        Target sampling rate. Defaults to using native sampling rate.
+    mono : bool, optional
+        Convert multi-channel to mono. Defaults to True.
+
+    Usage
+    -----
+    >>> audio = Audio(sample_rate=16000, mono=True)
+    >>> waveform, sample_rate = audio({"audio": "/path/to/audio.wav"})
+    >>> assert sample_rate == 16000
+
+    >>> two_seconds_stereo = np.random.rand(44100 * 2, 2).astype(np.float32)
+    >>> waveform, sample_rate = audio({"waveform": two_seconds_stereo, "sample_rate": 44100})
+    >>> assert sample_rate == 16000
+    >>> assert waveform.shape[1] == 1
+    """
+
+    @staticmethod
+    def get_duration(file: AudioFile) -> float:
+        """Get audio file duration
+
+        Parameters
+        ----------
+        file : AudioFile
+            Audio file, or mapping with "waveform" and "sample_rate" keys.
+
+        Returns
+        -------
+        duration : float
+            Duration in seconds.
+        """
+        if isinstance(file, (ProtocolFile, dict)):
+            # in-memory waveforms have no file to open: use their length
+            if "waveform" in file:
+                return len(file["waveform"]) / float(file["sample_rate"])
+            audio = file["audio"]
+        else:
+            audio = file
+        if isinstance(audio, Path):
+            audio = str(audio)
+        with sf.SoundFile(audio, "r") as f:
+            return float(f.frames) / f.samplerate
+
+    @staticmethod
+    def is_valid(file: AudioFile) -> bool:
+        """Check that `file` is a supported audio file description
+
+        Returns
+        -------
+        valid : bool
+            Always True: unsupported inputs raise ValueError instead.
+
+        Raises
+        ------
+        ValueError
+            If a mapping has neither "audio" nor "waveform" key, or if its
+            "waveform" is not 2D, has fewer rows than columns, or comes
+            without "sample_rate".
+        """
+        # file existence is never checked here: loading would fail later anyway
+        if not isinstance(file, (ProtocolFile, dict)):
+            return True
+
+        if "waveform" in file:
+            waveform = file["waveform"]
+            if len(waveform.shape) != 2 or waveform.shape[0] < waveform.shape[1]:
+                raise ValueError(
+                    "'waveform' must be provided as a (time, channel) numpy array."
+                )
+            if file.get("sample_rate", None) is None:
+                raise ValueError(
+                    "'waveform' must be provided with their 'sample_rate'."
+                )
+        elif "audio" not in file:
+            # TODO improve error message
+            raise ValueError("either 'audio' or 'waveform' key must be provided.")
+
+        return True
+
+    def __init__(self, sample_rate=None, mono=True):
+        super().__init__()
+        # sample_rate=None means that no resampling is applied
+        self.sample_rate, self.mono = sample_rate, mono
+
+    def downmix_and_resample(
+        self, waveform: np.ndarray, sample_rate: int
+    ) -> Tuple[np.ndarray, int]:
+        """Downmix and resample
+
+        Parameters
+        ----------
+        waveform : (time, channel) np.ndarray
+            Waveform.
+        sample_rate : int
+            Sample rate.
+
+        Returns
+        -------
+        waveform : (time, channel) np.ndarray
+            Remixed and resampled waveform
+        sample_rate : int
+            New sample rate
+        """
+
+        # downmix to mono
+        if self.mono and waveform.shape[1] > 1:
+            waveform = np.mean(waveform, axis=1, keepdims=True)
+
+        # resample (rates are passed by keyword: recent librosa requires it)
+        if (self.sample_rate is not None) and (self.sample_rate != sample_rate):
+            if self.mono:
+                # librosa expects mono audio to be of shape (n,), but we have (n, 1).
+                waveform = librosa.core.resample(
+                    waveform[:, 0], orig_sr=sample_rate, target_sr=self.sample_rate
+                )[:, None]
+            else:
+                waveform = librosa.core.resample(
+                    waveform.T, orig_sr=sample_rate, target_sr=self.sample_rate
+                ).T
+            sample_rate = self.sample_rate
+
+        return waveform, sample_rate
+
+    def __call__(self, file: AudioFile):
+        """Obtain waveform
+
+        Parameters
+        ----------
+        file : AudioFile
+            Audio file. Mappings may provide a zero-indexed "channel" key
+            to select one channel before downmixing.
+
+        Returns
+        -------
+        waveform : `pyannote.core.SlidingWindowFeature`
+            Downmixed and resampled (time, channel) waveform, with one
+            frame per sample at the output sample rate.
+
+        Raises
+        ------
+        ValueError
+            If `file` is rejected by `is_valid`.
+
+        See also
+        --------
+        AudioFile
+        """
+
+        self.is_valid(file)
+
+        # defaults used when `file` is a plain path
+        audio, waveform, sample_rate, channel = file, None, None, None
+
+        if isinstance(file, (ProtocolFile, dict)):
+            channel = file.get("channel", None)
+            if "waveform" in file:
+                audio = None
+                waveform = file["waveform"]
+                sample_rate = file.get("sample_rate", None)
+            else:
+                # is_valid() guarantees that "audio" is available here
+                audio = file["audio"]
+
+        if isinstance(audio, Path):
+            audio = str(audio)
+
+        if waveform is None:
+            waveform, sample_rate = sf.read(audio, dtype="float32", always_2d=True)
+
+        if channel is not None:
+            # "channel" is zero-indexed: slicing keeps the channel axis
+            waveform = waveform[:, channel : channel + 1]
+
+        # downmix_and_resample returns the new waveform AND its sample
+        # rate: the sliding window below must be built from the latter
+        waveform, sample_rate = self.downmix_and_resample(waveform, sample_rate)
+
+        sliding_window = SlidingWindow(
+            start=-0.5 / sample_rate, duration=1.0 / sample_rate, step=1.0 / sample_rate
+        )
+        return SlidingWindowFeature(waveform, sliding_window)
+
+    def crop(
+        self,
+        file: AudioFile,
+        segment: Segment,
+        mode: Alignment = "center",
+        fixed: Optional[float] = None,
+    ) -> Tuple[np.ndarray, int]:
+        """Fast version of self(file).crop(segment, **kwargs)
+
+        Parameters
+        ----------
+        file : AudioFile
+            Audio file.
+        segment : `pyannote.core.Segment`
+            Temporal segment to load.
+        mode : {'loose', 'strict', 'center'}, optional
+            In 'strict' mode, only samples fully included in 'segment' are
+            returned. In 'loose' mode, any intersecting frames are returned. In
+            'center' mode, first and last frames are chosen to be the ones
+            whose centers are the closest to 'segment' start and end times.
+            Defaults to 'center'.
+        fixed : float, optional
+            Overrides 'segment' duration and ensures that the number of
+            returned frames is fixed (which might otherwise not be the case
+            because of rounding errors). Has no effect in 'strict' or 'loose'
+            modes.
+
+        Returns
+        -------
+        waveform : (time, channel) numpy array
+            Waveform
+        sample_rate : int
+            Sample rate
+
+        Raises
+        ------
+        ValueError
+            If `file` is not valid or `segment` lies outside of file bounds.
+
+        TODO: remove support for "mode" option. It is always "center" anyway.
+
+        See also
+        --------
+        `pyannote.core.SlidingWindowFeature.crop`
+        """
+
+        self.is_valid(file)
+
+        # defaults used when `file` is a plain path
+        audio, waveform, channel = file, None, None
+
+        if isinstance(file, (ProtocolFile, dict)):
+            channel = file.get("channel", None)
+            if "waveform" in file:
+                audio = None
+                waveform = file["waveform"]
+                sample_rate = file.get("sample_rate", None)
+                frames = len(waveform)
+            else:
+                # is_valid() guarantees that "audio" is available here
+                audio = file["audio"]
+
+        if isinstance(audio, Path):
+            audio = str(audio)
+
+        # read sample rate and number of frames
+        if waveform is None:
+            with sf.SoundFile(audio, "r") as f:
+                sample_rate = f.samplerate
+                frames = f.frames
+
+        # infer which samples to load from sample rate and requested chunk
+        #  TODO: compute start directly instead of using a sliding window
+        samples = SlidingWindow(
+            start=-0.5 / sample_rate, duration=1.0 / sample_rate, step=1.0 / sample_rate
+        )
+        ((start, stop),) = samples.crop(
+            segment, mode=mode, fixed=fixed, return_ranges=True
+        )
+
+        if start < 0 or stop > frames:
+            raise ValueError(
+                f"requested chunk [{segment.start:.6f}, {segment.end:.6f}] "
+                f"lies outside of file bounds [0., {frames / sample_rate:.6f}]."
+            )
+
+        if waveform is not None:
+            data = waveform[start:stop]
+        else:
+            with sf.SoundFile(audio, "r") as f:
+                try:
+                    f.seek(start)
+                    data = f.read(stop - start, dtype="float32", always_2d=True)
+                except RuntimeError:
+                    msg = (
+                        f"SoundFile failed to seek-and-read in "
+                        f"{audio}: loading the whole file..."
+                    )
+                    warnings.warn(msg)
+                    # self(file) already selects the channel, downmixes and
+                    # resamples: only cropping and the output rate remain
+                    data = self(file).crop(segment, mode=mode, fixed=fixed)
+                    if self.sample_rate is not None:
+                        sample_rate = self.sample_rate
+                    return data, sample_rate
+
+        if channel is not None:
+            # "channel" is zero-indexed: slicing keeps the channel axis
+            data = data[:, channel : channel + 1]
+
+        return self.downmix_and_resample(data, sample_rate)
+
+
+def normalize(wav):
+    """Divide waveform by its root mean square (offset by 1e-8)"""
+    return wav / (1e-8 + np.sqrt(np.mean(wav ** 2)))
@@ -0,0 +1,149 @@
+# MIT License
+#
+# Copyright (c) 2020 CNRS
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import pytorch_lightning as pl
+from pyannote.audio.core.task import Task, Problem
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from pyannote.audio.core.io import Audio
+
+
+class Model(pl.LightningModule):
+    """Base model
+
+    Parameters
+    ----------
+    sample_rate : int, optional
+        Audio sample rate. Defaults to 16kHz (16000).
+    num_channels : int, optional
+        Number of channels. Defaults to mono (1).
+    task : Task, optional
+        Task addressed by the model. Only provided when training the model.
+        A model should be `load_from_checkpoint`-able without a task as
+        `on_load_checkpoint` hook takes care of calling `setup`.
+    """
+
+    def __init__(
+        self,
+        sample_rate: int = 16000,
+        num_channels: int = 1,
+        task: Optional[Task] = None,
+    ):
+        super().__init__()
+
+        # set-up audio IO
+        assert (
+            num_channels == 1
+        ), "Only mono audio is supported for now (num_channels = 1)"
+        self.hparams.sample_rate = sample_rate
+        self.hparams.num_channels = num_channels
+        self.audio = Audio(sample_rate=self.hparams.sample_rate, mono=True)
+
+        # set task attribute when available (i.e. at training time)
+        # and also tell the task what kind of audio is expected from
+        # the model
+        if task is not None:
+            self.task = task
+            self.task.audio = self.audio
+
+    def build(self):
+        # hook called by setup(): override it to add task-dependent layers
+        # (e.g. the final classification and activation layers)
+        pass
+
+    def setup(self, stage=None):
+        """Collect task-defined hyper-parameters (when fitting) and build layers"""
+        if stage == "fit":
+
+            # keep track of the classes here because it is used
+            # to setup the final classification layer (even when stage != fit)
+            self.hparams.classes = self.task.specifications.classes
+
+            # keep track of the type of problem here because it is used
+            # to setup the final activation layer (even when stage != fit)
+            self.hparams.problem = self.task.specifications.problem
+
+            # any other common parameters should be saved?
+            # maybe the class of the model (and pyannote.audio semantic version?)
+            # so that it can be loaded without knowing what type of model it is.
+            # this would probably make distributing pretrained models much easier.
+
+        else:
+            # should we do something specific when stage != fit?
+            # hparams.classes and hparams.problem should already exist
+            # because they should have been loaded on_load_checkpoint
+            pass
+
+        # add task-dependent layers to the model
+        # (e.g. the final classification and activation layers)
+        self.build()
+
+        if stage == "fit":
+
+            # let task know about the shape of model output
+            # so that its dataloader knows how to generate targets
+            self.task.example_output_array = self.forward(
+                self.task.example_input_array()
+            )
+
+    def on_load_checkpoint(self, checkpoint):
+        """Restore setup()-defined hyper-parameters, then set the model up"""
+        # only hyper-parameters defined in __init__ are loaded automatically.
+        # therefore, we have to manually load hyper-parameters that were
+        # defined during setup()
+        self.hparams.classes = checkpoint["hyper_parameters"]["classes"]
+        self.hparams.problem = checkpoint["hyper_parameters"]["problem"]
+        # TODO: would have to check pytorch-lightning documentation to see
+        # if we can get rid of this... it is weird that only "some" parameters
+        # in self.hparams are assigned at __init__ time...
+
+        # now that setup()-defined hyper-parameters are available,
+        # we can actually setup() the model.
+        self.setup()
+
+    def forward(self, waveforms: torch.Tensor) -> torch.Tensor:
+        msg = f"Class {self.__class__.__name__} should define a `forward` method."
+        raise NotImplementedError(msg)
+
+    # convenience function to automate the choice of the final activation function
+    def default_activation(self) -> nn.Module:
+        """Return the final activation layer matching `hparams.problem`"""
+        if self.hparams.problem == Problem.MONO_LABEL_CLASSIFICATION:
+            return nn.LogSoftmax(dim=-1)
+
+        elif self.hparams.problem == Problem.MULTI_LABEL_CLASSIFICATION:
+            return nn.Sigmoid()
+
+        else:
+            msg = "TODO: implement default activation for other types of problems"
+            raise NotImplementedError(msg)
+
+    # training step logic is defined by the task because the
+    # model does not really need to know how it is being used.
+    def training_step(self, batch, batch_idx):
+        return self.task.training_step(self, batch, batch_idx)
+
+    # optimizer is defined by the task for the same reason as above
+    def configure_optimizers(self):
+        return self.task.configure_optimizers(self)
@@ -0,0 +1,215 @@
+# MIT License
+#
+# Copyright (c) 2020 CNRS
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+
+from enum import Enum
+from dataclasses import dataclass
+from typing import Optional, List, Text
+
+import pytorch_lightning as pl
+import torch
+import torch.optim
+from torch.utils.data import DataLoader
+from torch.utils.data import IterableDataset
+import torch.nn.functional as F
+from pyannote.database import Protocol
+
+
+# Type of machine learning problem
+class Problem(Enum):
+    """Type of machine learning problem addressed by a task."""
+
+    # exactly one class per prediction
+    # (default loss in Task.training_step is F.nll_loss)
+    MONO_LABEL_CLASSIFICATION = 1
+    # several classes may be active at once
+    # (default loss in Task.training_step is F.binary_cross_entropy)
+    MULTI_LABEL_CLASSIFICATION = 2
+    # representation learning (e.g. speaker embedding): no default loss
+    REPRESENTATION = 3
+    # regression: no default loss
+    REGRESSION = 4
+    # any other we could think of?
+
+
+# A task takes an audio chunk as input and returns
+# either a temporal sequence of predictions
+# or just one prediction for the whole audio chunk
+class Scale(Enum):
+    """Temporal scale of the predictions returned for one audio chunk."""
+
+    FRAME = 1  # model outputs a sequence of frames
+    CHUNK = 2  # model outputs just one vector for the whole chunk
+
+
+@dataclass
+class TaskSpecification:
+    """Description of what a task expects from a model.
+
+    Parameters
+    ----------
+    problem : Problem
+        Type of machine learning problem.
+    scale : Scale
+        Whether predictions are made per frame or per chunk.
+    classes : list of str, optional
+        Names of the classes (classification tasks only).
+    """
+
+    problem: Problem
+    scale: Scale
+
+    # for classification tasks only
+    classes: Optional[List[Text]] = None
+
+
+# note how a task is actually a LightningDataModule
+class Task(pl.LightningDataModule):
+    """Base task class
+
+    A task is the combination of a "problem" and a "dataset".
+    For example, here are a few tasks:
+    - voice activity detection on the AMI corpus
+    - speaker embedding on the VoxCeleb corpus
+    - end-to-end speaker diarization on the VoxConverse corpus
+
+    A task is expected to be solved by a "model" that takes an
+    audio chunk as input and returns the solution. Hence, the
+    task is in charge of generating (input, expected_output)
+    samples used for training the model.
+
+    Parameters
+    ----------
+    protocol : Protocol
+        pyannote.database protocol
+    duration : float, optional
+        Chunks duration. Defaults to variable duration (None).
+    batch_size : int, optional
+        Number of training samples per batch.
+    num_workers : int, optional
+        Number of workers used for generating training samples.
+        Defaults to 1.
+
+    Notes
+    -----
+    Several methods read `self.specifications` (a TaskSpecification),
+    which this base class never assigns: subclasses are expected to
+    define it.
+    NOTE(review): `example_input_array` also reads `self.audio`, which
+    is not assigned anywhere in this class -- presumably attached from
+    outside before use; confirm.
+    """
+
+    def __init__(
+        self,
+        protocol: Protocol,
+        duration: float = None,
+        batch_size: int = None,
+        num_workers: int = 1,
+    ):
+        super().__init__()
+
+        # dataset
+        self.protocol = protocol
+
+        # batching
+        self.duration = duration
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+
+    def prepare_data(self):
+        """Do nothing (placeholder for future dataset preparation)."""
+        # this is where we might end up downloading datasets
+        # and transform them so that they are ready to be used
+        # with pyannote.database. but for now, the API assumes
+        # that we directly provide a pyannote.database.Protocol.
+        pass
+
+    def train__iter__(self):
+        """Iterate over training samples (to be implemented by subclasses).
+
+        Raises
+        ------
+        NotImplementedError
+            Always, in this base class.
+        """
+        # will become train_dataset.__iter__ method
+        msg = f"Missing '{self.__class__.__name__}.train__iter__' method."
+        raise NotImplementedError(msg)
+
+    def train__len__(self):
+        """Number of training samples (to be implemented by subclasses).
+
+        Raises
+        ------
+        NotImplementedError
+            Always, in this base class.
+        """
+        # will become train_dataset.__len__ method
+        msg = f"Missing '{self.__class__.__name__}.train__len__' method."
+        raise NotImplementedError(msg)
+
+    def train_dataloader(self) -> DataLoader:
+        """Build the training dataloader.
+
+        Returns
+        -------
+        dataloader : DataLoader
+            Dataloader over a dataset whose `__iter__` and `__len__` are
+            this task's `train__iter__` and `train__len__`.
+        """
+        # build train IterableDataset subclass programmatically.
+        # `self.train__iter__` and `self.train__len__` are stored as
+        # (already bound) methods, so they keep operating on this task.
+        dataset = type(
+            "TrainDataset",
+            (IterableDataset,),
+            {"__iter__": self.train__iter__, "__len__": self.train__len__},
+        )
+
+        return DataLoader(
+            dataset(),
+            batch_size=self.batch_size,
+            num_workers=self.num_workers,
+            drop_last=True,
+        )
+
+    @property
+    def example_input_duration(self) -> float:
+        """Duration of the example input: `self.duration`, or 2.0 if it is None."""
+        return 2.0 if self.duration is None else self.duration
+
+    def example_input_array(self):
+        """Generate a random batch of audio chunks to be used as example input.
+
+        Returns
+        -------
+        example_input_array : torch.Tensor
+            Random tensor of shape (batch_size, num_samples, num_channels)
+            with num_samples = int(sample_rate * example_input_duration).
+
+        Raises
+        ------
+        NotImplementedError
+            When `self.audio.mono` is falsy.
+        """
+        # this method is called in Model.setup where it is used
+        # to automagically infer the temporal resolution of the
+        # model output, and hence allow the dataloader to shape
+        # its targets correctly.
+
+        # since we plan to have the feature extraction step done
+        # on GPU as part of the model, the example input array is
+        # basically always a chunk of audio
+
+        if self.audio.mono:
+            num_channels = 1
+        else:
+            msg = "Only 'mono' audio is supported."
+            raise NotImplementedError(msg)
+
+        # NOTE(review): `batch_size` defaults to None in __init__, which
+        # is not a valid dimension here -- confirm that callers always
+        # provide an actual batch size.
+        return torch.randn(
+            (
+                self.batch_size,
+                int(self.audio.sample_rate * self.example_input_duration),
+                num_channels,
+            )
+        )
+
+    # below is a (hacky) way to automagically infer the expected
+    # resolution of the target. basically, we do a forward pass
+    # of an example input array and look at the resulting shape
+    # of the output. the problem with this approach is that we
+    # may encounter weird rounding errors in case of variable-length
+    # chunks. TODO: someone should look at this to make it more robust.
+
+    @property
+    def example_output_array(self) -> torch.Tensor:
+        """Example model output, as stored by the setter below."""
+        return self.example_output_array_
+
+    @example_output_array.setter
+    def example_output_array(self, example_output_array: torch.Tensor):
+        # keep the example output around and, for frame-level tasks,
+        # derive the duration of one output frame from it.
+        self.example_output_array_ = example_output_array
+        if self.specifications.scale == Scale.FRAME:
+            # assumes dimension 1 of the output is the frame axis,
+            # i.e. a (batch, frame, ...) layout -- TODO confirm
+            self.frame_duration_ = (
+                self.example_input_duration / example_output_array.shape[1]
+            )
+
+    @property
+    def frame_duration(self) -> float:
+        """Duration of one output frame.
+
+        Only defined for frame-level tasks: for any other scale, this
+        property falls through and evaluates to None.
+        """
+        if self.specifications.scale == Scale.FRAME:
+            return self.frame_duration_
+
+    # default training_step provided for convenience
+    # can obviously be overridden for each task
+    def training_step(self, model: "Model", batch, batch_idx: int):
+        """Compute (and log) the training loss for one batch.
+
+        Parameters
+        ----------
+        model : Model
+            Model being trained.
+        batch : dict
+            Batch with "X" (model input) and "y" (target) entries.
+        batch_idx : int
+            Batch index (unused here).
+
+        Returns
+        -------
+        loss
+            Loss value, also logged as "train_loss".
+
+        Raises
+        ------
+        NotImplementedError
+            For problems other than mono- or multi-label classification.
+        """
+        X, y = batch["X"], batch["y"]
+        if self.specifications.problem == Problem.MONO_LABEL_CLASSIFICATION:
+            # flatten predictions to (-1, num_classes) and targets to (-1,).
+            # assumes the model outputs log-probabilities (as with
+            # Model.default_activation's LogSoftmax) -- TODO confirm
+            loss = F.nll_loss(
+                model(X).view(-1, len(self.specifications.classes)), y.view(-1)
+            )
+
+        elif self.specifications.problem == Problem.MULTI_LABEL_CLASSIFICATION:
+            # assumes the model outputs probabilities (as with
+            # Model.default_activation's Sigmoid) -- TODO confirm
+            loss = F.binary_cross_entropy(model(X), y.float())
+
+        else:
+            msg = "TODO: implement for other types of problems"
+            raise NotImplementedError(msg)
+
+        model.log("train_loss", loss)
+        return loss
+
+    # default configure_optimizers provided for convenience
+    # can obviously be overridden for each task
+    def configure_optimizers(self, model: "Model"):
+        """Return an Adam optimizer (lr=1e-3) over the model parameters."""
+        # for tasks such as SpeakerEmbedding,
+        # other parameters should be added here
+        return torch.optim.Adam(model.parameters(), lr=1e-3)
@@ -1,20 +1,17 @@
-#!/usr/bin/env python
-# encoding: utf-8
-
-# The MIT License (MIT)
-
-# Copyright (c) 2019-2020 CNRS
-
+# MIT License
+#
+# Copyright (c) 2020 CNRS
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -22,8 +19,3 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
-
-# AUTHORS
-# Hervé BREDIN - http://herve.niderb.fr
-
-from .models import PyanNet, SincTDNN, ACRoPoLiS
@@ -0,0 +1,147 @@
+# MIT License
+#
+# Copyright (c) 2020 CNRS
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+
+from pyannote.audio.core.model import Model
+from pyannote.audio.core.task import Task
+from typing import Optional
+
+
+from torchaudio.transforms import MFCC
+import torch
+import torch.nn as nn
+from einops import rearrange, reduce
+
+
+class SimpleSegmentationModel(Model):
+    """Simple segmentation model: MFCC > bidirectional LSTM > linear classifier
+
+    Parameters
+    ----------
+    sample_rate : int, optional
+        Audio sample rate. Defaults to 16000.
+    num_channels : int, optional
+        Number of audio channels. Defaults to 1.
+    task : Task, optional
+        Task addressed by the model.
+    """
+
+    def __init__(
+        self,
+        sample_rate: int = 16000,
+        num_channels: int = 1,
+        task: Optional[Task] = None,
+    ):
+
+        super().__init__(sample_rate=sample_rate, num_channels=num_channels, task=task)
+
+        # feature extraction (40 MFCCs per channel)
+        self.mfcc = MFCC(
+            sample_rate=self.hparams.sample_rate,
+            n_mfcc=40,
+            dct_type=2,
+            norm="ortho",
+            log_mels=False,
+        )
+
+        # one bidirectional LSTM layer with 32 hidden units per direction,
+        # fed with the MFCCs of all channels stacked together
+        self.lstm = nn.LSTM(
+            self.mfcc.n_mfcc * self.hparams.num_channels,
+            32,
+            num_layers=1,
+            batch_first=True,
+            bidirectional=True,
+        )
+
+    def build(self):
+        """Define task-dependent layers (classifier and final activation)."""
+        # define task-dependent layers
+        # (32 * 2 is the size of the bidirectional LSTM output)
+        self.classifier = nn.Linear(32 * 2, len(self.hparams.classes))
+        self.activation = self.default_activation()
+
+        # why do we define those layers here and not in task.setup()?
+        # because, at inference time, we need those layers.
+
+        # this is in contrast of SpeakerEmbedding.loss_func layers below
+        # that are only needed during training -- we don't want them to
+        # be applied at inference.
+
+    def forward(self, waveforms: torch.Tensor) -> torch.Tensor:
+        """Compute frame-level scores
+
+        Parameters
+        ----------
+        waveforms : (batch, time, channel)
+
+        Returns
+        -------
+        scores : (batch, frame, classes)
+            One score vector per MFCC frame.
+        """
+
+        # extract MFCC
+        mfcc = self.mfcc(rearrange(waveforms, "b t c -> b c t"))
+        # pass MFCC sequence into the recurrent layer
+        output, hidden = self.lstm(rearrange(mfcc, "b c f t -> b t (c f)"))
+        # apply the final classifier, followed by the final activation
+        return self.activation(self.classifier(output))
+
+
+class SimpleEmbeddingModel(Model):
+    """Simple embedding model: MFCC > bidirectional LSTM > mean temporal pooling
+
+    Parameters
+    ----------
+    sample_rate : int, optional
+        Audio sample rate. Defaults to 16000.
+    num_channels : int, optional
+        Number of audio channels. Defaults to 1.
+    task : Task, optional
+        Task addressed by the model.
+    """
+
+    def __init__(
+        self,
+        sample_rate: int = 16000,
+        num_channels: int = 1,
+        task: Optional[Task] = None,
+    ):
+
+        super().__init__(sample_rate=sample_rate, num_channels=num_channels, task=task)
+
+        # feature extraction (40 MFCCs per channel)
+        self.mfcc = MFCC(
+            sample_rate=self.hparams.sample_rate,
+            n_mfcc=40,
+            dct_type=2,
+            norm="ortho",
+            log_mels=False,
+        )
+
+        # one bidirectional LSTM layer with 32 hidden units per direction,
+        # fed with the MFCCs of all channels stacked together
+        self.lstm = nn.LSTM(
+            self.mfcc.n_mfcc * self.hparams.num_channels,
+            32,
+            num_layers=1,
+            batch_first=True,
+            bidirectional=True,
+        )
+
+        # this is needed because example_output_array is needed in SpeakerEmbedding.setup
+        # to automagically infer the embedding size. but example_output_array is computed
+        # in Model.setup (which is called **after** Task.setup).
+
+        # note that this is only a problem for embedding tasks.
+        # we should find a way to automate this call so that the
+        # end user does not forget to call it. note that this must
+        # be called at the end of __init__
+        if self.task is not None:
+            self.task.example_output_array = self.forward(
+                self.task.example_input_array()
+            )
+
+    def forward(self, waveforms: torch.Tensor) -> torch.Tensor:
+        """Compute one embedding per audio chunk
+
+        Parameters
+        ----------
+        waveforms : (batch, time, channel)
+
+        Returns
+        -------
+        embedding : (batch, dimension)
+        """
+
+        # extract MFCC
+        mfcc = self.mfcc(rearrange(waveforms, "b t c -> b c t"))
+        # pass MFCC sequence into the recurrent layer
+        output, hidden = self.lstm(rearrange(mfcc, "b c f t -> b t (c f)"))
+        # mean temporal pooling
+        return reduce(output, "b t f -> b f", "mean")
@@ -0,0 +1,158 @@
+# MIT License
+#
+# Copyright (c) 2020 CNRS
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from pyannote.audio.core.task import TaskSpecification, Problem, Scale, Task
+from pyannote.database import Protocol
+
+import math
+import random
+from pyannote.core import Segment, Timeline, SlidingWindow
+from pyannote.core.utils.numpy import one_hot_encoding
+
+
+class SpeakerTracking(Task):
+    """Speaker tracking
+    
+    Speaker tracking is the process of determining if and when a (previously 
+    enrolled) person's voice can be heard in an audio recording.
+    
+    Here, it is addressed with the same approach as voice activity detection,
+    except {"non-speech", "speech"} classes are replaced by {"speaker1", ...,
+    "speaker_N"} where N is the number of speakers in the training set.
+    """
+
+    def __init__(
+        self,
+        protocol: Protocol,
+        duration: float = 2.0,
+        batch_size: int = None,
+        num_workers: int = 1,
+    ):
+
+        super().__init__(
+            protocol, duration=duration, batch_size=batch_size, num_workers=num_workers
+        )
+
+        # for speaker tracking, task specification depends
+        # on the data: we do not know in advance which
+        # speakers should be tracked. therefore, we postpone
+        # the definition of specifications.
+
+    def setup(self, stage=None):
+
+        if stage == "fit":
+
+            # this is where we load the training set metadata
+            # to be used later by the train_dataloader.
+
+            # here, we simply loop over the training set, remove
+            # annotated regions shorter than chunk duration, and
+            # keep track of the reference annotations.
+
+            # we also build the list of speakers to be tracked.
+
+            self.train, speakers = [], set()
+            for f in self.protocol.train():
+                segments = [
+                    segment
+                    for segment in f["annotated"]
+                    if segment.duration > self.duration
+                ]
+                duration = sum(segment.duration for segment in segments)
+                self.train.append(
+                    {
+                        "annotated": segments,
+                        "annotation": f["annotation"],
+                        "duration": duration,
+                        "audio": f["audio"],
+                    }
+                )
+                speakers.update(f["annotation"].labels())
+
+        # now that we now who the speakers are, we can
+        # define the task specifications.
+
+        # note that, since multiple speakers can be active
+        # at once, the problem is multi-label classification.
+        self.specifications = TaskSpecification(
+            problem=Problem.MULTI_LABEL_CLASSIFICATION,
+            scale=Scale.FRAME,
+            classes=sorted(speakers),
+        )
+
+    def train__iter__(self):
+        """Iterate over training samples
+        
+        Yields
+        ------
+        X: (time, channel)
+            Audio chunks.
+        y: (frame, num_speakers)
+            Frame-level targets. Note that frame < time.
+            `frame` is infered automagically from the 
+            example model output.
+        """
+
+        random.seed()
+
+        while True:
+
+            # select one file at random (with probability proportional to its annotated duration)
+            file, *_ = random.choices(
+                self.train, weights=[f["duration"] for f in self.train], k=1,
+            )
+
+            # select one annotated region at random (with probability proportional to its duration)
+            segment, *_ = random.choices(
+                file["annotated"], weights=[s.duration for s in file["annotated"]], k=1,
+            )
+
+            # select one chunk at random (with uniform distribution)
+            start_time = random.uniform(segment.start, segment.end - self.duration)
+            chunk = Segment(start_time, start_time + self.duration)
+
+            # extract features
+            X, _ = self.audio.crop(file, chunk, mode="center", fixed=self.duration)
+
+            # TODO | this one_hot_encoding thing needs to be rewritten into pyannote.audio
+            # TODO | to make sure we always return the same number of frames for the same
+            # TODO | input duration. we should also support variable-length chunks.
+            frames = SlidingWindow(
+                start=chunk.start,
+                duration=self.frame_duration,
+                step=self.frame_duration,
+            )
+
+            y = one_hot_encoding(
+                file["annotation"].crop(chunk),
+                Timeline([chunk]),
+                frames,
+                labels=self.specifications.classes,
+                mode="center",
+            ).data
+
+            yield {"X": X, "y": y}
+
+    def train__len__(self):
+        # Number of training samples in one epoch
+        duration = sum(file["duration"] for file in self.train)
+        return math.ceil(duration / self.duration)
@@ -0,0 +1,21 @@
+# MIT License
+#
+# Copyright (c) 2020 CNRS
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
@@ -0,0 +1,176 @@
+# MIT License
+#
+# Copyright (c) 2020 CNRS
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from typing import TYPE_CHECKING
+from pyannote.audio.core.task import TaskSpecification, Problem, Scale, Task
+
+if TYPE_CHECKING:
+    from pyannote.audio.core.model import Model
+from pyannote.database import Protocol
+
+import random
+import math
+from pyannote.core import Segment
+
+import pytorch_metric_learning.losses
+from itertools import chain
+import torch.optim
+
+
+class SpeakerEmbeddingArcFace(Task):
+    def __init__(
+        self,
+        protocol: Protocol,
+        duration: float = 2.0,
+        batch_size: int = None,
+        num_workers: int = 1,
+    ):
+
+        super().__init__(
+            protocol, duration=duration, batch_size=batch_size, num_workers=num_workers
+        )
+
+        # there is no such thing as a "class" in representation
+        # learning, so we do not need to define it here.
+        self.specifications = TaskSpecification(
+            problem=Problem.REPRESENTATION, scale=Scale.CHUNK
+        )
+
+    def setup(self, stage=None):
+
+        if stage == "fit":
+
+            # gather training set metadata
+            self.speakers = dict()
+            for f in self.protocol.train():
+
+                for speaker in f["annotation"].labels():
+
+                    # keep speaker's (long enough) speech turns...
+                    speech_turns = [
+                        segment
+                        for segment in f["annotation"].label_timeline(speaker)
+                        if segment.duration > self.duration
+                    ]
+
+                    # skip if there is no speech turns left
+                    if not speech_turns:
+                        continue
+
+                    # ... and their total duration
+                    duration = sum(segment.duration for segment in speech_turns)
+
+                    # add speaker to the list of speakers
+                    if speaker not in self.speakers:
+                        self.speakers[speaker] = list()
+
+                    self.speakers[speaker].append(
+                        {
+                            "audio": f["audio"],
+                            "duration": duration,
+                            "speech_turns": speech_turns,
+                        }
+                    )
+
+            # for convenience, we keep track of the list of speakers, after all
+            self.specifications.classes = sorted(self.speakers)
+
+            num_classes = len(self.speakers)
+            # use example_output_array to guess embedding size
+            _, embedding_size = self.example_output_array.shape
+            self.loss_func = pytorch_metric_learning.losses.ArcFaceLoss(
+                num_classes, embedding_size, margin=28.6, scale=64
+            )
+
+    def train__iter__(self):
+        """Iterate over training samples
+        
+        Yields
+        ------
+        X: (time, channel)
+            Audio chunks.
+        y: int
+            Speaker index.
+        """
+
+        random.seed()
+
+        speakers = list(self.speakers)
+
+        while True:
+
+            # shuffle speakers so that we don't always have the same
+            # groups of speakers in a batch (which might be especially
+            # problematic for contrast-based losses like contrastive
+            # or triplet loss.
+            random.shuffle(speakers)
+
+            for speaker in speakers:
+
+                # speaker index in original sorted order
+                y = self.specifications.classes.index(speaker)
+
+                # three chunks per speaker
+                for _ in range(3):
+
+                    # select one file at random (with probability proportional to its speaker duration)
+                    file, *_ = random.choices(
+                        self.speakers[speaker],
+                        weights=[f["duration"] for f in self.speakers[speaker]],
+                        k=1,
+                    )
+
+                    # select one speech turn at random (with probability proportional to its duration)
+                    speech_turn, *_ = random.choices(
+                        file["speech_turns"],
+                        weights=[s.duration for s in file["speech_turns"]],
+                        k=1,
+                    )
+
+                    # select one chunk at random (with uniform distribution)
+                    start_time = random.uniform(
+                        speech_turn.start, speech_turn.end - self.duration
+                    )
+                    chunk = Segment(start_time, start_time + self.duration)
+
+                    # extract features
+                    X, _ = self.audio.crop(
+                        file, chunk, mode="center", fixed=self.duration
+                    )
+
+                    yield {"X": X, "y": y}
+
+    def train__len__(self):
+        duration = sum(
+            datum["duration"] for data in self.speakers.values() for datum in data
+        )
+        return math.ceil(duration / self.duration)
+
+    def training_step(self, model: "Model", batch, batch_idx: int):
+        X, y = batch["X"], batch["y"]
+        loss = self.loss_func(model(X), y)
+        model.log("train_loss", loss)
+        return loss
+
+    def configure_optimizers(self, model: "Model"):
+        parameters = chain(model.parameters(), self.loss_func.parameters())
+        return torch.optim.Adam(parameters, lr=1e-3)
@@ -0,0 +1,22 @@
+# MIT License
+#
+# Copyright (c) 2020 CNRS
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
@@ -0,0 +1,22 @@
+# MIT License
+#
+# Copyright (c) 2020 CNRS
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
@@ -0,0 +1,141 @@
+# MIT License
+#
+# Copyright (c) 2020 CNRS
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from pyannote.audio.core.task import TaskSpecification, Problem, Scale, Task
+from pyannote.database import Protocol
+
+import numpy as np
+import math
+import random
+from pyannote.core import Segment, Timeline, SlidingWindow
+from pyannote.core.utils.numpy import one_hot_encoding
+
+
+class VoiceActivityDetection(Task):
+    def __init__(
+        self,
+        protocol: Protocol,
+        duration: float = 2.0,
+        batch_size: int = None,
+        num_workers: int = 1,
+    ):
+
+        super().__init__(
+            protocol, duration=duration, batch_size=batch_size, num_workers=num_workers
+        )
+
+        # for voice activity detection, task specification
+        # does not depend on the data: we can define it in
+        # __init__
+        self.specifications = TaskSpecification(
+            problem=Problem.MONO_LABEL_CLASSIFICATION,
+            scale=Scale.FRAME,
+            classes=["non_speech", "speech"],
+        )
+
+    def setup(self, stage=None):
+        if stage == "fit":
+            # this is where we load the training set metadata
+            # to be used later by the train_dataloader.
+
+            # here, we simply loop over the training set, remove
+            # annotated regions shorter than chunk duration, and
+            # keep track of the reference annotations.
+            self.train = []
+            for f in self.protocol.train():
+                segments = [
+                    segment
+                    for segment in f["annotated"]
+                    if segment.duration > self.duration
+                ]
+                duration = sum(segment.duration for segment in segments)
+                self.train.append(
+                    {
+                        "annotated": segments,
+                        "annotation": f["annotation"],
+                        "duration": duration,
+                        "audio": f["audio"],
+                    }
+                )
+
+    def train__iter__(self):
+        """Iterate over training samples
+        
+        Yields
+        ------
+        X: (time, channel)
+            Audio chunks.
+        y: (frame, )
+            Frame-level targets. Note that frame < time.
+            `frame` is infered automagically from the 
+            example model output.
+        """
+
+        random.seed()
+
+        while True:
+
+            # select one file at random (with probability proportional to its annotated duration)
+            file, *_ = random.choices(
+                self.train, weights=[f["duration"] for f in self.train], k=1,
+            )
+
+            # select one annotated region at random (with probability proportional to its duration)
+            segment, *_ = random.choices(
+                file["annotated"], weights=[s.duration for s in file["annotated"]], k=1,
+            )
+
+            # select one chunk at random (with uniform distribution)
+            start_time = random.uniform(segment.start, segment.end - self.duration)
+            chunk = Segment(start_time, start_time + self.duration)
+
+            # extract features
+            X, _ = self.audio.crop(file, chunk, mode="center", fixed=self.duration)
+
+            # note how, contrary to what is currently done in pyannote.audio,
+            # y is not precomputed for the whole file at initialization time.
+            # here, we stick with pyannote.core.Annotation as long as possible
+            # and "one hot" encode the data only when generating training samples.
+            # this should allow to train on much larger datasets.
+
+            # TODO | this one_hot_encoding thing needs to be rewritten into pyannote.audio
+            # TODO | to make sure we always return the same number of frames for the same
+            # TODO | input duration. we should also support variable-length chunks.
+            frames = SlidingWindow(
+                start=chunk.start,
+                duration=self.frame_duration,
+                step=self.frame_duration,
+            )
+            y = one_hot_encoding(
+                file["annotation"].crop(chunk), Timeline([chunk]), frames, mode="center"
+            ).data
+
+            # this is the only part of this method that is specific to VAD
+            # the rest should also work for any task with Scale.FRAME
+            y = np.int64(np.sum(y, axis=1) > 0)
+
+            yield {"X": X, "y": y}
+
+    def train__len__(self):
+        # Number of training samples in one epoch
+        duration = sum(file["duration"] for file in self.train)
+        return math.ceil(duration / self.duration)
@@ -1,17 +1,9 @@
-cachetools >= 2.0.0
-librosa >= 0.8.0
-pandas >= 0.18.0
+torch >= 1.6
+torchaudio >= 0.6
+pytorch_lightning >= 1.0.0rc4
+pytorch_metric_learning >= 0.9.93
+einops >= 0.3.0
 pyannote.core >= 4.1
-pyannote.database >= 4.0
-pyannote.metrics >= 2.3
-pyannote.pipeline >= 1.5.2
-pyYAML >= 3.12
-scikit-learn >= 0.20.2
-sortedcollections >= 1.0.1
-sortedcontainers >= 2.0.4
+pyannote.database >= 4.0.1
+librosa >= 0.8
 soundfile >= 0.10.2
-tqdm >= 4.29.1
-tensorboard >= 2.0.0
-typing_extensions >= 3.7.4;python_version < '3.8'
-pescador >= 2.1.0
-Pillow >= 6.2.1
@@ -1,34 +1,3 @@
-#!/usr/bin/env python
-# encoding: utf-8
-
-# The MIT License (MIT)
-
-# Copyright (c) 2016-2019 CNRS
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-# AUTHORS
-# Hervé BREDIN - http://herve.niderb.fr
-
-
-import versioneer
-
 from setuptools import setup, find_packages

 with open("README.md") as f:
@@ -42,19 +11,7 @@ setup(
    namespace_packages=["pyannote"],
    packages=find_packages(),
    install_requires=requirements,
-    entry_points={
-        "console_scripts": [
-            "pyannote-audio=pyannote.audio.applications.pyannote_audio:main",
-            "pyannote-speech-feature=pyannote.audio.applications.feature_extraction:main",
-        ],
-        "prodigy_recipes": [
-            "pyannote.sad.manual = pyannote.audio.interactive.recipes.sad:sad_manual",
-            "pyannote.dia.binary = pyannote.audio.interactive.recipes.dia:dia_binary",
-            "pyannote.dia.manual = pyannote.audio.interactive.recipes.dia:dia_manual",
-        ],
-    },
-    version=versioneer.get_version(),
-    cmdclass=versioneer.get_cmdclass(),
+    entry_points={},
    description="Neural building blocks for speaker diarization",
    long_description=long_description,
    long_description_content_type="text/markdown",