from dataclasses import dataclass
from typing import Any

import random

import torch
from torch.utils.data import Sampler

from .distributed import global_rank, local_rank, world_size

# Randomly picks an index from an array of indices
# (draws without replacement until the pool is exhausted, then refills it)
class PoolSampler():
	def __init__( self, pool = [], keep_all = False, shuffle = False ):
		self.length = len(pool)
		self.shuffle = shuffle
		self.global_pool = pool if keep_all else None
		self.global_indices = [ i for i in range(self.length) ]
		self.reset()

	def reset(self):
		self.current_pool = [ i for i in self.global_indices ]
		if self.shuffle:
			random.shuffle(self.current_pool)

	def sample(self, pool = None):
		if pool is None:
			pool = self.global_pool
		# draw a random index from what remains in the pool
		index = random.choice( self.current_pool )
		# remove from pool
		self.current_pool.remove(index)
		# reset if needed
		if len(self.current_pool) == 0:
			self.reset()
		# map indices to our real values
		return pool[index] if pool is not None else index

	def __len__(self):
		return self.length # len(self.current_pool)

	def __iter__(self):
		while len(self.current_pool) > 0:
			yield self.sample()

	def __call__(self, *args, **kwargs):
		return self.sample(*args, **kwargs)

	def index(self):
		# how many entries have been drawn from the current pass
		return len(self.global_indices) - len(self.current_pool)

	def get_state(self):
		return { "length": self.length, "global_pool": self.global_pool, "global_indices": self.global_indices, "current_pool": self.current_pool }

	def set_state(self, state):
		self.length = state["length"]
		self.global_pool = state["global_pool"]
		self.global_indices = state["global_indices"]
		self.current_pool = state["current_pool"]
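
# A minimal usage sketch (commented out, not part of the module); the speaker paths
# below are hypothetical stand-ins for whatever values the caller pools over:
#
#   sampler = PoolSampler( ["spkr_a.wav", "spkr_b.wav", "spkr_c.wav"], keep_all=True, shuffle=True )
#   picks = [ sampler() for _ in range(5) ]   # draws without replacement, refilling once exhausted
#   state = sampler.get_state()               # plain dict, safe to stash in a checkpoint
#   sampler.set_state( state )                # restores the remaining pool exactly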

# "Samples" through a fixed sequence from 0 to length
# Necessary for our "shuffle+sort by duration+interleave" sampling method
# Allows saving and loading state
class OrderedSampler(Sampler):
	def __init__( self, length ):
		self.position = 0
		self.length = length

	def __len__(self):
		return self.length

	def __iter__(self):
		if self.position >= self.length:
			self.position = 0

		while self.position < self.length:
			yield self.position
			self.position += 1

	def index(self):
		return self.position

	def get_state(self):
		return { "position": self.position, "length": self.length }

	def set_state(self, state):
		self.position = state["position"]
		self.length = state["length"]
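
# A minimal usage sketch (commented out, not part of the module); `dataset` and
# `collate_fn` are hypothetical placeholders for the caller's own objects:
#
#   sampler = OrderedSampler( len(dataset) )
#   dataloader = torch.utils.data.DataLoader( dataset, batch_size=8, sampler=sampler, collate_fn=collate_fn )
#   ...
#   state = sampler.get_state()   # { "position": ..., "length": ... }
#   sampler.set_state( state )    # later: resume the walk from roughly where it left off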

# Like the above, but batches based on duration (as a proxy for token count) and/or a maximum batch size
class BatchedOrderedSampler(Sampler):
	def __init__( self, buckets, max_duration=0, max_batch_size=0, shuffle=False, drop_last=True, use_max_size=True ):
		self.position = 0
		self.batches = []
		self.shuffle = shuffle

		assert max_duration != 0 or max_batch_size != 0, "max_duration and max_batch_size cannot both be 0"

		current_batch = []
		current_index = 0
		current_duration = 0

		for key, bucket in buckets.items():
			for path, duration in bucket:
				# flush the batch if adding this entry would exceed either limit
				should_flush = False
				if max_duration > 0 and current_duration + duration > max_duration:
					should_flush = True
				elif max_batch_size > 0 and len(current_batch) >= max_batch_size:
					should_flush = True

				if should_flush and len(current_batch) > 0:
					self.batches.append( current_batch )
					current_batch = []
					current_duration = 0

				current_batch.append( current_index )
				current_index += 1

				# as long as durations are ordered, the longest-so-far duration times the batch
				# size upper-bounds the (padded) batch cost; otherwise just accumulate durations
				if use_max_size:
					current_duration = duration * len(current_batch)
				else:
					current_duration += duration

		if not drop_last and current_batch:
			self.batches.append( current_batch )

		if self.shuffle:
			random.shuffle(self.batches)

	def __len__(self):
		return len(self.batches)

	def __iter__(self):
		if self.position >= len(self.batches):
			self.position = 0
			if self.shuffle:
				random.shuffle(self.batches)

		while self.position < len(self.batches):
			yield self.batches[self.position]
			self.position += 1

	def index(self):
		return self.position

	def get_state(self):
		return { "position": self.position, "batches": self.batches }

	def set_state(self, state):
		self.position = state["position"]
		self.batches = state["batches"]
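
# A minimal usage sketch (commented out, not part of the module); the bucket contents are
# hypothetical. `buckets` maps a duration bucket key to (path, duration) pairs, and each
# yielded batch is a list of dataset indices, so this plugs into DataLoader's
# `batch_sampler` argument (assuming the dataset is ordered like the concatenated buckets):
#
#   buckets = {
#       2: [ ("utt_0.wav", 1.7), ("utt_1.wav", 1.9) ],
#       4: [ ("utt_2.wav", 3.2), ("utt_3.wav", 3.8), ("utt_4.wav", 4.0) ],
#   }
#   sampler = BatchedOrderedSampler( buckets, max_duration=8 )
#   dataloader = torch.utils.data.DataLoader( dataset, batch_sampler=sampler, collate_fn=collate_fn )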

# Randomly samples indices from a given sequence from 0 to length
# Allows saving and loading state
class RandomSampler(Sampler):
	def __init__( self, length ):
		self.position = 0
		self.length = length

		self.generator = torch.Generator()
		self.perm = torch.randperm(self.length, generator=self.generator)

	def __len__(self):
		return self.length

	def __iter__(self):
		if self.position >= self.length:
			self.position = 0
			self.perm = torch.randperm(self.length, generator=self.generator)

		while self.position < self.length:
			yield self.perm[self.position]
			self.position += 1

	def index(self):
		return self.position

	def get_state(self):
		return { "position": self.position, "length": self.length, "perm": self.perm, "generator": self.generator.get_state() }

	def set_state(self, state):
		self.position = state["position"]
		self.length = state["length"]
		self.perm = state["perm"]
		self.generator.set_state(state["generator"])
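
# A minimal usage sketch (commented out, not part of the module): the permutation and the
# torch.Generator state are both captured, so a restored sampler replays the same order.
#
#   sampler = RandomSampler( 10 )
#   first_epoch = [ int(i) for i in sampler ]   # one full pass over a shuffled permutation
#   state = sampler.get_state()
#   restored = RandomSampler( 10 )
#   restored.set_state( state )
#   # both samplers now generate identical permutations for their next epoch
#   assert [ int(i) for i in sampler ] == [ int(i) for i in restored ]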