Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,6 @@ def init_parameters(self, cfg: DictConfig) -> None:
self.right_padding = self.padding_mode is FeatureBufferPaddingMode.RIGHT
self.return_tail_result = cfg.return_tail_result

- # Keep small amount of extra padding
- self.tail_padding_in_samples = max(int(self.chunk_size * self.sample_rate * 0.45), 6400)
self.zero_log_probs = self.init_zero_log_probs() if self.right_padding else None

def init_endpointer(self) -> None:
Expand Down Expand Up @@ -239,10 +237,10 @@ def get_logprobs_given_raw_signals(
buffers.append(buffer.unsqueeze_(0))

# Only final frames have right padding
- # Keep some amount of extra padding to avoid the performance degradation
- right_paddings = torch.tensor(
- [frame.size - frame.valid_size - self.tail_padding_in_samples for frame in frames], device=self.device
- ).clamp(min=0)
+ # Calculate right paddings
+ right_paddings = torch.tensor([frame.size - frame.valid_size for frame in frames], device=self.device).clamp(
+ min=0
+ )

# Create and adjust the buffer lens
buffer_lens = torch.tensor([buffers[0].size(1)] * len(buffers), device=self.device)
Expand Down Expand Up @@ -442,6 +440,5 @@ def get_request_generator(self) -> ContinuousBatchedRequestStreamer:
device=self.device,
pad_last_frame=True,
right_pad_features=self.right_padding,
- tail_padding_in_samples=self.tail_padding_in_samples,
)
return request_generator
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,6 @@ def init_parameters(self, cfg: DictConfig) -> None:
self.return_tail_result = cfg.return_tail_result
self.tokens_to_move = self.punctuation_ids.union(self.language_token_ids)

- # Keep small amount of extra padding
- self.tail_padding_in_samples = max(int(self.chunk_size * self.sample_rate * 0.45), 6400)
self.zero_encoded = self.init_zero_enc() if self.right_padding else None

def init_endpointer(self) -> None:
Expand Down Expand Up @@ -314,10 +312,10 @@ def encode_raw_signals(
buffers.append(buffer.unsqueeze_(0))

# Only final frames have right padding
- # Keep some amount of extra padding to avoid the performance degradation
- right_paddings = torch.tensor(
- [frame.size - frame.valid_size - self.tail_padding_in_samples for frame in frames], device=self.device
- ).clamp(min=0)
+ # Calculate right paddings
+ right_paddings = torch.tensor([frame.size - frame.valid_size for frame in frames], device=self.device).clamp(
+ min=0
+ )

# Create and adjust the buffer lens
buffer_lens = torch.tensor([buffers[0].size(1)] * len(buffers), device=self.device)
Expand Down Expand Up @@ -808,6 +806,5 @@ def get_request_generator(self) -> ContinuousBatchedRequestStreamer:
device=self.device,
pad_last_frame=True,
right_pad_features=self.right_padding,
- tail_padding_in_samples=self.tail_padding_in_samples,
)
return request_generator
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ def __init__(
preprocessor_cfg: DictConfig,
device: torch.device,
fill_value: float = LOG_MEL_ZERO,
- right_padding_ratio: float = 0.8,
):
"""
Args:
Expand All @@ -55,7 +54,6 @@ def __init__(
preprocessor_cfg (DictConfig): preprocessor configuration
device (torch.device): device
fill_value (float): fill value for the feature buffer
- right_padding_ratio (float): right padding ratio
"""
if buffer_size_in_secs < chunk_size_in_secs:
raise ValueError(
Expand All @@ -68,7 +66,6 @@ def __init__(
self.chunk_size_in_secs = chunk_size_in_secs
self.preprocessor_cfg = preprocessor_cfg
self.device = device
- self.right_padding_ratio = right_padding_ratio

self.is_buffer_size_equal_to_chunk_size = math.isclose(self.buffer_size_in_secs, self.chunk_size_in_secs)
self.plus_one = 0 if self.is_buffer_size_equal_to_chunk_size else 1
Expand Down Expand Up @@ -142,12 +139,12 @@ def preprocess(
"""
signals = torch.vstack(audio_buffers).to(self.device) # B x T
signals_len = torch.tensor([signals.shape[1]] * signals.shape[0], device=self.device, dtype=torch.long) # B
- right_paddings = right_paddings * self.right_padding_ratio
signals_len = signals_len - right_paddings.long()
- features, _ = self.preprocessor(input_signal=signals, length=signals_len)
+ features, feature_lens = self.preprocessor(input_signal=signals, length=signals_len)
if features.shape[2] > expected_feat_len:
features = features[:, :, :expected_feat_len] # B x F x T
- right_padding = torch.floor(right_paddings / self.sample_rate / self.timestep_duration) # B
+ feature_lens = feature_lens.clamp(max=expected_feat_len)
+ right_padding = (features.shape[2] - feature_lens).clamp(min=0).to(torch.long)
return features, right_padding

def _update_feature_buffer(self, slot_ids: list[int], feat_chunk: Tensor) -> None:
Expand Down
11 changes: 3 additions & 8 deletions nemo/collections/asr/inference/streaming/framing/multi_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,6 @@ def __init__(
device: torch.device = None,
pad_last_frame: bool = False,
right_pad_features: bool = False,
- tail_padding_in_samples: int = 0,
):
"""
Args:
Expand All @@ -257,7 +256,6 @@ def __init__(
device (torch.device): The device to use, required for request type FEATURE_BUFFER
pad_last_frame (bool): Whether to pad the last frame
right_pad_features (bool): Whether to right pad the features, optional for request type FEATURE_BUFFER
- tail_padding_in_samples (int): The tail padding in samples, optional for request type FEATURE_BUFFER
"""

if request_type is RequestType.FEATURE_BUFFER:
Expand All @@ -284,7 +282,6 @@ def __init__(
sample_rate=sample_rate, buffer_size_in_secs=buffer_size_in_secs
)
self.right_pad_features = right_pad_features
- self.tail_padding_in_samples = tail_padding_in_samples

def set_audio_filepaths(self, audio_filepaths: list[str], options: list[RequestOptions]) -> None:
"""
Expand Down Expand Up @@ -351,11 +348,9 @@ def to_feature_buffers(self, frames: list[Frame]) -> list[FeatureBuffer]:
buffer_lens = torch.tensor([buffers[0].size(1)] * len(buffers), device=self.device)

# Calculate right paddings and subtract from buffer lens
- # tail_padding_in_samples is used to keep some amount of padding at the end of the buffer
- # some models perform better with this padding
- right_paddings = torch.tensor(
- [frame.size - frame.valid_size - self.tail_padding_in_samples for frame in frames], device=self.device
- ).clamp(min=0)
+ right_paddings = torch.tensor([frame.size - frame.valid_size for frame in frames], device=self.device).clamp(
+ min=0
+ )

# Subtract right paddings from buffer lens
buffer_lens = buffer_lens - right_paddings
Expand Down
Loading