Fix `vall_e.data --action=hdf5` running the transcription step instead of creating the HDF5 dataset; past me completely forgot I had already tried putting the transcribe/process dataset scripts inside the module before

2024-08-08 07:51:42 -05:00 · 2024-08-08 07:51:42 -05:00 · 79a6781c9e
commit 79a6781c9e
parent 949339a3fa
2 changed files with 2 additions and 109 deletions
--- a/vall_e/data.py
+++ b/vall_e/data.py
@ -1498,111 +1498,6 @@ def create_dataset_hdf5( skip_existing=True ):
 	hf.create_dataset('symmap', data=json.dumps(symmap))
 	hf.close()

-def transcribe_dataset():
-	import os
-	import json
-	import torch
-	import torchaudio
-	import whisperx
-
-	from tqdm.auto import tqdm
-	from pathlib import Path
-
-	# to-do: use argparser
-	batch_size = 16
-	device = "cuda" 
-	dtype = "float16"
-	model_name = "large-v3"
-
-	input_audio = "voices"
-	output_dataset = "training/metadata"
-
-	skip_existing = True
-	diarize = False
-
-	# 
-	model = whisperx.load_model(model_name, device, compute_type=dtype)
-	align_model, align_model_metadata, align_model_language = (None, None, None)
-	if diarize:
-		diarize_model = whisperx.DiarizationPipeline(device=device)
-	else:
-		diarize_model = None
-
-	def pad(num, zeroes):
-		return str(num).zfill(zeroes+1)
-
-	for dataset_name in os.listdir(f'./{input_audio}/'):
-		if not os.path.isdir(f'./{input_audio}/{dataset_name}/'):
-			continue
-
-		for speaker_id in tqdm(os.listdir(f'./{input_audio}/{dataset_name}/'), desc="Processing speaker"):
-			if not os.path.isdir(f'./{input_audio}/{dataset_name}/{speaker_id}'):
-				continue
-
-			outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/whisper.json')
-
-			if outpath.exists():
-				metadata = json.loads(open(outpath, 'r', encoding='utf-8').read())
-			else:
-				os.makedirs(f'./{output_dataset}/{dataset_name}/{speaker_id}/', exist_ok=True)
-				metadata = {}
-
-			for filename in tqdm(os.listdir(f'./{input_audio}/{dataset_name}/{speaker_id}/'), desc=f"Processing speaker: {speaker_id}"):
-
-				if skip_existing and filename in metadata:
-					continue
-
-				if ".json" in filename:
-					continue
-
-				inpath = f'./{input_audio}/{dataset_name}/{speaker_id}/{filename}'
-
-				if os.path.isdir(inpath):
-					continue
-				
-				metadata[filename] = {
-					"segments": [],
-					"language": "",
-					"text": "",
-					"start": 0,
-					"end": 0,
-				}
-
-				audio = whisperx.load_audio(inpath)
-				result = model.transcribe(audio, batch_size=batch_size)
-				language = result["language"]
-
-				if language[:2] not in ["ja"]:
-					language = "en"
-
-				if align_model_language != language:
-					tqdm.write(f'Loading language: {language}')
-					align_model, align_model_metadata = whisperx.load_align_model(language_code=language, device=device)
-					align_model_language = language
-
-				result = whisperx.align(result["segments"], align_model, align_model_metadata, audio, device, return_char_alignments=False)
-
-				metadata[filename]["segments"] = result["segments"]
-				metadata[filename]["language"] = language
-
-				if diarize_model is not None:
-					diarize_segments = diarize_model(audio)
-					result = whisperx.assign_word_speakers(diarize_segments, result)
-
-				text = []
-				start = 0
-				end = 0
-				for segment in result["segments"]:
-					text.append( segment["text"] )
-					start = min( start, segment["start"] )
-					end = max( end, segment["end"] )
-
-				metadata[filename]["text"] = " ".join(text).strip()
-				metadata[filename]["start"] = start
-				metadata[filename]["end"] = end
-
-				open(outpath, 'w', encoding='utf-8').write(json.dumps(metadata))
-
 if __name__ == "__main__":
 	import argparse

@ -1622,8 +1517,6 @@ if __name__ == "__main__":
 	_logger = LoggerOveride()

 	if args.action == "hdf5":
-		transcribe_dataset()
-	elif args.action == "hdf5":
 		create_dataset_hdf5()
 	elif args.action == "list-dataset":
 		dataset = []
--- a/vall_e/emb/process.py
+++ b/vall_e/emb/process.py
@ -100,7 +100,7 @@ def process(
 		amp=False,
 	):
 	# prepare from args
-	cfg.set_audio_backend(args.audio_backend)
+	cfg.set_audio_backend(audio_backend)
 	audio_extension = cfg.audio_backend_extension

 	cfg.inference.weight_dtype = dtype # "bfloat16"
@ -117,7 +117,7 @@ def process(
 	only_groups = [] # only process these groups
 	only_speakers = [] # only process these speakers

-	always_slice_groups = [] # always slice from this group
+	always_slice_groups = ["Audiobooks", "LibriVox"] # always slice from this group
 	audio_only = ["Noise"] # special pathway for processing audio only (without a transcription)

 	missing = {