From f306d52ec451853e9495f5959f12c60a6341bfa3 Mon Sep 17 00:00:00 2001
From: mrq
Date: Mon, 10 Feb 2025 21:45:23 -0600
Subject: [PATCH] tweaks

---
 vall_e/emb/process.py |  2 +-
 vall_e/emb/similar.py | 24 +++++++++++++++++++-----
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/vall_e/emb/process.py b/vall_e/emb/process.py
index 11426c1..1272ff8 100644
--- a/vall_e/emb/process.py
+++ b/vall_e/emb/process.py
@@ -312,7 +312,7 @@ def process(
 		elif language == "chinese":
 			language = "zh"
 
-		if strict_language and language not in ["en", "ja", "fr", "de", "ko", "zh"]:
+		if strict_languages and language not in ["en", "ja", "fr", "de", "ko", "zh"]:
 			language = "auto"
 
 		if len(metadata[filename]["segments"]) == 0 or not use_slices:
diff --git a/vall_e/emb/similar.py b/vall_e/emb/similar.py
index c56aeb0..92caca5 100644
--- a/vall_e/emb/similar.py
+++ b/vall_e/emb/similar.py
@@ -22,6 +22,7 @@ import torchaudio.functional as F
 import torchaudio.transforms as T
 
 from ..config import cfg
+from ..data import _load_artifact
 from ..utils import truncate_json, coerce_dtype
 from ..utils.io import json_read, json_write
 
@@ -198,10 +199,12 @@ def batch_similar_utterances(
 			# treat embeddings as features, if provided quantized audio
 			if extension not in artifact_extension:
 				continue
-			artifact = np.load(f'./{speaker_path}/{filename}.{extension}', allow_pickle=True)[()]
-			duration = artifact["metadata"]["original_length"] / artifact["metadata"]["sample_rate"]
+
+			qnt, metadata = _load_artifact(f'./{speaker_path}/{filename}.{extension}', return_metadata=True)
 
 			"""
+			duration = metadata["original_length"] / metadata["sample_rate"]
+
 			if 0 < min_duration and duration < min_duration:
 				continue
 
@@ -209,8 +212,6 @@
 				continue
 			"""
 
-			qnt = torch.from_numpy(artifact["codes"].astype(int))[0].t().to(dtype=torch.int16, device=device)
-
 			if trim_duration > 0:
 				qnt = trim( qnt, int( cfg.dataset.frames_per_second * trim_duration ) )
 
@@ -307,10 +308,23 @@
 """
 def sort_similarities(
 	path,
+	num_speakers,
 	out_path=None,
 	threshold=0.8,
 	orphan_threshold=0.6,
 ):
+	from sklearn.cluster import KMeans
+
+	folders = [ "1", "2", "3", "4", "5", "6-7", "8", "9", "10", "11", "12", "14", "15" ]
+	embeddings = json_read(path / "0" / "embeddings.json")
+
+	for filename, embedding in embeddings.items():
+		embeddings[filename] = np.array(embedding)
+
+	embeddings_array = np.stack( list( embeddings.values() ) )
+	kmeans = KMeans(n_clusters=num_speakers).fit(embeddings_array)
+
+	"""
 	if not out_path:
 		out_path = path.parent / "speakers.json"
 
@@ -371,7 +385,7 @@ def sort_similarities(
 			continue
 
 		speakers[target].append(filename)
-
+	"""
 	json_write( speakers, out_path )
 
 
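
Note on the new clustering path: the KMeans code added to sort_similarities() above stops at fitting the model, and everything after it is fenced off behind the newly opened triple-quoted string, so no speaker assignment is produced yet. The sketch below is illustrative only and not part of the patch; it assumes embeddings.json maps each filename to a plain list of floats, and the cluster_speakers() helper and its layout are assumptions rather than repository API.

import json
from pathlib import Path

import numpy as np
from sklearn.cluster import KMeans

def cluster_speakers( path: Path, num_speakers: int ):
	# assumed layout of embeddings.json: { "filename": [ float, ... ], ... }
	embeddings = json.loads( ( path / "embeddings.json" ).read_text() )
	filenames = list( embeddings.keys() )
	embeddings_array = np.stack( [ np.array( v, dtype=np.float32 ) for v in embeddings.values() ] )

	# fit KMeans and bucket filenames by their assigned cluster label
	labels = KMeans( n_clusters=num_speakers ).fit_predict( embeddings_array )
	speakers = { str( label ): [] for label in range( num_speakers ) }
	for filename, label in zip( filenames, labels ):
		speakers[ str( label ) ].append( filename )
	return speakers

The patch itself only keeps the fitted kmeans object and leaves the grouping for later, so the per-label bucketing above is just one plausible way to finish that step.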