Skip to content

Commit

Permalink
speaker identification: use spectral clustering
Browse files Browse the repository at this point in the history
This switches from agglomerative clustering to spectral clustering.
Of the "standard" clustering methods, it achieves the best speaker
identification for my test data. Furthermore this should closely match
what the original paper on speaker identification using the ECAPA-TDNN
model uses [1].

I can get better clustering by combining something like t-SNE with a
"standard" clustering method; however, t-SNE and similar methods do not
preserve distances and therefore do not seem like a general solution.

[1] Dawalatabad, Nauman, et al. "ECAPA-TDNN embeddings for speaker diarization."
  • Loading branch information
rroohhh committed Jan 5, 2024
1 parent a08f701 commit 79c38f8
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 14 deletions.
49 changes: 48 additions & 1 deletion worker/pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions worker/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ authors = [
]

dependencies = [
"spectralcluster>=0.2.21",
"numpy>=1.23.5",
"pydantic[dotenv]>=1.10.7",
"transformers>=4.26.1",
Expand Down
50 changes: 37 additions & 13 deletions worker/transcribee_worker/identify_speakers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,36 @@
import numpy as np
import numpy.typing as npt
import torch
from sklearn.cluster import AgglomerativeClustering
from spectralcluster import refinement, spectral_clusterer
from speechbrain.pretrained import EncoderClassifier
from transcribee_proto.document import Document
from transcribee_worker.types import ProgressCallbackType
from transcribee_worker.util import alist, async_task

from .config import settings

# Short local aliases for the spectralcluster names used in this module.
RefinementName = refinement.RefinementName
RefinementOptions = refinement.RefinementOptions
ThresholdType = refinement.ThresholdType
SymmetrizeType = refinement.SymmetrizeType
SpectralClusterer = spectral_clusterer.SpectralClusterer

# Refinement steps, in the order listed, passed as `refinement_sequence` to
# the RefinementOptions below.
# NOTE(review): the name suggests this mirrors the ICASSP 2018 configuration
# shipped with spectralcluster -- confirm against the library's own configs.
ICASSP2018_REFINEMENT_SEQUENCE = [
RefinementName.CropDiagonal,
RefinementName.RowWiseThreshold,
RefinementName.Symmetrize,
RefinementName.Diffuse,
RefinementName.RowWiseNormalize,
]

# Refinement settings handed to SpectralClusterer via `refinement_options`.
# NOTE(review): the sequence above contains no GaussianBlur step, so
# `gaussian_blur_sigma` presumably has no effect here -- confirm.
icassp2018_refinement_options = RefinementOptions(
gaussian_blur_sigma=5,
p_percentile=0.95,
thresholding_soft_multiplier=0.05,
thresholding_type=ThresholdType.RowMax,
refinement_sequence=ICASSP2018_REFINEMENT_SEQUENCE,
)


async def identify_speakers(
number_of_speakers: int | None,
Expand Down Expand Up @@ -75,24 +97,26 @@ def time_to_sample(time: float | None):
progress=len(segments) / (len(segments) + 1),
)

clustering = AgglomerativeClustering(
compute_full_tree=True, # type: ignore
linkage="complete",
n_clusters=number_of_speakers, # type: ignore
# distance_threshold courtesy of
# https://huggingface.co/pyannote/speaker-diarization/blob/369ac1852c71759894a48c9bb1c6f499a54862fe/config.yaml#L15
distance_threshold=0.7153 if number_of_speakers is None else None,
metric="cosine",
clusterer = SpectralClusterer(
min_clusters=1 if number_of_speakers is None else number_of_speakers,
max_clusters=100
if number_of_speakers is None
else number_of_speakers, # TODO(robin): arbitrary upper limit
autotune=None,
laplacian_type=None,
refinement_options=icassp2018_refinement_options,
custom_dist="cosine",
)
clustering.fit(np.array(embeddings))

labels = clusterer.predict(np.vstack(embeddings))

# we now re-shuffle the labels so that the first occurring speaker is 1, the second is 2, ...
label_map = {}
for label in clustering.labels_:
for label in labels:
if label not in label_map:
label_map[label] = str(len(label_map) + 1)

for para, label in zip(doc.children, clustering.labels_):
for para, label in zip(doc.children, labels):
para.speaker = automerge.Text(label_map[label])

await alist(aiter(async_task(work)))
return await alist(aiter(async_task(work)))

0 comments on commit 79c38f8

Please sign in to comment.