From 2768558bbf7094429acee1450878142c04547f44 Mon Sep 17 00:00:00 2001 From: hkwon Date: Tue, 13 Aug 2024 13:30:35 -0700 Subject: [PATCH 01/15] Wav2Vec2 upgrade with Conv1D options --- include/ctranslate2/layers/wav2vec2.h | 71 ++++++++++- python/ctranslate2/converters/transformers.py | 39 ++++-- python/ctranslate2/specs/wav2vec2_spec.py | 35 ++++-- python/tests/test_transformers.py | 81 ++---------- src/layers/wav2vec2.cc | 117 +++++++++++++----- src/models/wav2vec2.cc | 3 +- 6 files changed, 230 insertions(+), 116 deletions(-) diff --git a/include/ctranslate2/layers/wav2vec2.h b/include/ctranslate2/layers/wav2vec2.h index 4c25c941a..2d6a7d12c 100644 --- a/include/ctranslate2/layers/wav2vec2.h +++ b/include/ctranslate2/layers/wav2vec2.h @@ -5,6 +5,68 @@ namespace ctranslate2 { namespace layers { + class Wav2Vec2LayerNormConvLayer0 : public Layer { + public: + Wav2Vec2LayerNormConvLayer0(const models::Model& model, const std::string& scope); + + void operator()(const StorageView& input, StorageView& output) const; + + DataType output_type() const override { + return _conv.output_type(); + } + + dim_t output_size() const override { + return _conv.output_size(); + } + + private: + const Conv1D _conv; + const LayerNorm _output_norm; + const ops::Transpose _transpose; + const ops::GELU _gelu; + }; + + class Wav2Vec2LayerNormConvLayer : public Layer { + public: + Wav2Vec2LayerNormConvLayer(const models::Model& model, const std::string& scope); + + void operator()(const StorageView& input, StorageView& output) const; + + DataType output_type() const override { + return _conv.output_type(); + } + + dim_t output_size() const override { + return _conv.output_size(); + } + + private: + const Conv1D _conv; + const LayerNorm _output_norm; + const ops::Transpose _transpose; + const ops::GELU _gelu; + }; + + class Wav2Vec2PosConvLayer : public Layer { + public: + Wav2Vec2PosConvLayer(const models::Model& model, const std::string& scope); + + void operator()(const StorageView& input, StorageView& output) const; + + DataType output_type() const override { + return _conv.output_type(); + } + + dim_t output_size() const override { + return _conv.output_size(); + } + + private: + const Conv1D _conv; + const ops::Transpose _transpose; + const ops::GELU _gelu; + }; + class Wav2Vec2Encoder : public Layer { public: Wav2Vec2Encoder(const models::Model& model, const std::string& scope); @@ -35,12 +97,17 @@ namespace ctranslate2 { } private: + const Wav2Vec2LayerNormConvLayer0 _feat_layer0; + const std::vector> _feat_layers; + const LayerNorm _fp_norm; + const Dense _fp_ff; + const Wav2Vec2PosConvLayer _pos_conv_embed; + const ops::Transpose _transpose; const ops::GELU _gelu; - // wav2vec2.encoder modules except pos_conv_embed due to groups=16 being not supported - //const ops::Transpose _transpose; const dim_t _num_heads; const std::vector> _layers; const LayerNorm _output_norm; + const Dense _lm_head; }; } diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index a6985b9d1..3c3cd6d2d 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -992,9 +992,8 @@ def architecture_name(self): return "Wav2Vec2ForCTC" def get_model_spec(self, model): - # Wav2Vec2 encoder Wav2Vec2PositionalConvEmbedding conv1d has groups 16 - # that doesn't look available here so we make Wav2Vec2 encoder layers only spec = wav2vec2_spec.Wav2Vec2Spec( + model.wav2vec2.config.num_feat_extract_layers, model.wav2vec2.encoder.config.num_hidden_layers, model.wav2vec2.encoder.config.num_attention_heads, ) @@ -1007,9 +1006,7 @@ def get_model_spec(self, model): layer.fc1 = layer.feed_forward.intermediate_dense layer.fc2 = layer.feed_forward.output_dense - self.set_encoder(spec.encoder, model.wav2vec2.encoder) - self.set_linear(spec.lm_head, model.lm_head) - # only for Wav2Vec2Spec.get_vocabulary_size() + self.set_encoder(spec.encoder, model, model.wav2vec2.config) return spec def set_config(self, config, model, tokenizer): @@ -1021,8 +1018,36 @@ def get_vocabulary(self, model, tokenizer): def set_vocabulary(self, spec, tokens): spec.register_vocabulary(tokens) - def set_encoder(self, spec, encoder): - super().set_encoder(spec, encoder) + def set_feature_extractor(self, spec, feature_extractor): + spec.feat_layer0.conv.weight = feature_extractor.conv_layers[0].conv.weight + spec.feat_layer0.conv.bias = feature_extractor.conv_layers[0].conv.bias + self.set_layer_norm(spec.feat_layer0.layer_norm, feature_extractor.conv_layers[0].layer_norm) + for spec_layer, module_layer in zip(spec.feat_layer, feature_extractor.conv_layers[1:]): + spec_layer.conv.weight = module_layer.conv.weight + spec_layer.conv.bias = module_layer.conv.bias + self.set_layer_norm(spec_layer.layer_norm, module_layer.layer_norm) + + def set_feature_projection(self, spec, feature_projection): + self.set_layer_norm(spec.fp_layer_norm, feature_projection.layer_norm) + self.set_linear(spec.fp_projection, feature_projection.projection) + + def set_pos_conv_embed(self, spec, encoder, config): + # forcing parameters to be set because some transformers version initializes garbage numbers + # conv parameters are float16 so force float32 for the loading + encoder.pos_conv_embed.conv.weight.data = encoder.pos_conv_embed.conv.weight.data.float() + encoder.pos_conv_embed.conv.bias.data = encoder.pos_conv_embed.conv.bias.float() + for param in encoder.pos_conv_embed.parameters(): + param.data = param.data.float() + tmp = encoder.pos_conv_embed(torch.randn((1,1,config.hidden_size))) + spec.pos_conv_embed.conv.weight = encoder.pos_conv_embed.conv.weight + spec.pos_conv_embed.conv.bias = encoder.pos_conv_embed.conv.bias + + def set_encoder(self, spec, model, config): + self.set_feature_extractor(spec, model.wav2vec2.feature_extractor) + self.set_feature_projection(spec, model.wav2vec2.feature_projection) + self.set_pos_conv_embed(spec, model.wav2vec2.encoder, config) + super().set_encoder(spec, model.wav2vec2.encoder) + self.set_linear(spec.lm_head, model.lm_head) def set_common_layers(self, spec, module): self.set_layer_norm(spec.layer_norm, module.layer_norm) diff --git a/python/ctranslate2/specs/wav2vec2_spec.py b/python/ctranslate2/specs/wav2vec2_spec.py index 78b2ffa84..161899d8b 100644 --- a/python/ctranslate2/specs/wav2vec2_spec.py +++ b/python/ctranslate2/specs/wav2vec2_spec.py @@ -1,5 +1,6 @@ from typing import List, Optional, Tuple +import torch.nn as nn import numpy as np from ctranslate2.specs import common_spec, model_spec, transformer_spec @@ -11,12 +12,14 @@ class Wav2Vec2Config(model_spec.ModelConfig): def __init__(self): return - class Wav2Vec2Spec(model_spec.LanguageModelSpec): - def __init__(self, num_layers, num_heads): + def __init__(self, feat_layers, num_layers, num_heads): super().__init__() - self.encoder = Wav2Vec2EncoderSpec(num_layers, num_heads) - self.lm_head = common_spec.LinearSpec() + self.encoder = Wav2Vec2EncoderSpec( + feat_layers, + num_layers, + num_heads + ) @property def name(self): @@ -30,14 +33,32 @@ def get_default_config(self): return Wav2Vec2Config() def get_vocabulary_size(self): - return self.lm_head.weight.shape[0] + return self.encoder.lm_head.weight.shape[0] + + +class Wav2Vec2LayerNormConvLayer(model_spec.LayerSpec): + def __init__(self): + self.conv = common_spec.Conv1DSpec() + self.layer_norm = common_spec.LayerNormSpec() + + +class Wav2Vec2PosEmbedConvLayer(model_spec.LayerSpec): + def __init__(self): + self.conv = common_spec.Conv1DSpec() class Wav2Vec2EncoderSpec(model_spec.LayerSpec): - def __init__(self, num_layers, num_heads): + def __init__(self, feat_layers, num_layers, num_heads): self.num_heads = np.dtype("int16").type(num_heads) - # wav2vec2.encoder modules except pos_conv_embed due to groups=16 being not supported + self.feat_layer0 = Wav2Vec2LayerNormConvLayer() + self.feat_layer = [ + Wav2Vec2LayerNormConvLayer() for i in range(feat_layers-1) + ] + self.fp_layer_norm = common_spec.LayerNormSpec() + self.fp_projection = common_spec.LinearSpec() + self.pos_conv_embed = Wav2Vec2PosEmbedConvLayer() self.layer_norm = common_spec.LayerNormSpec() self.layer = [ transformer_spec.TransformerEncoderLayerSpec() for _ in range(num_layers) ] + self.lm_head = common_spec.LinearSpec() diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index f27bd6ca2..e85f12657 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -979,24 +979,15 @@ def test_transformers_wav2vec2( ) output_dir = str(tmp_dir.join("ctranslate2_model")) output_dir = converter.convert(output_dir) - # 24 x Wav2Vec2EncoderLayerStableLayerNorm converted & saved - w2v2_model = transformers.Wav2Vec2ForCTC.from_pretrained(model_name) - del w2v2_model.wav2vec2.encoder.layers - del w2v2_model.wav2vec2.encoder.layer_norm - w2v2_model.save_pretrained(output_dir + "/wav2vec2_partial.bin") w2v2_processor = transformers.Wav2Vec2Processor.from_pretrained(model_name) - torch.save(w2v2_processor, output_dir + "/wav2vec2_processor.bin") + w2v2_processor.save_pretrained(output_dir + "/wav2vec2_processor") + processor = transformers.AutoProcessor.from_pretrained(output_dir+"/wav2vec2_processor") + model = ctranslate2.models.Wav2Vec2(output_dir, device=device, device_index=[0], compute_type="int8") device = "cuda" if os.environ.get("CUDA_VISIBLE_DEVICES") else "cpu" cpu_threads = int(os.environ.get("OMP_NUM_THREADS", 0)) - w2v2_model = transformers.Wav2Vec2ForCTC.from_pretrained( - output_dir + "/wav2vec2_partial.bin" - ).to(device) - del w2v2_model.wav2vec2.encoder.layers - del w2v2_model.wav2vec2.encoder.layer_norm - w2v2_processor = torch.load(output_dir + "/wav2vec2_processor.bin") - ct2_w2v2_model = ctranslate2.models.Wav2Vec2( + model = ctranslate2.models.Wav2Vec2( output_dir, device=device, device_index=[0], @@ -1015,66 +1006,16 @@ def test_transformers_wav2vec2( sampling_rate=16000, ).input_values - with torch.no_grad(): - extract_features = w2v2_model.wav2vec2.feature_extractor( - input_values.to(w2v2_model.device) - ).transpose(1, 2) - hidden_states, extract_features = w2v2_model.wav2vec2.feature_projection( - extract_features - ) - position_embeddings = w2v2_model.wav2vec2.encoder.pos_conv_embed( - hidden_states - ) - hidden_states = position_embeddings + hidden_states - # hidden_states = w2v2_model.encoder.dropout(hidden_states) - # Dropout(p=0.0, inplace=False) bypassed - - if ct2_w2v2_model.device == "cuda": - hidden_states = hidden_states.cpu() - else: - hidden_states.numpy() - - hidden_states = np.ascontiguousarray(hidden_states) + hidden_states = np.ascontiguousarray(input_values.unsqueeze(0)) hidden_states = ctranslate2.StorageView.from_array(hidden_states) - to_cpu = ( - ct2_w2v2_model.device == "cuda" and len(ct2_w2v2_model.device_index) > 1 - ) - ct2_output = ct2_w2v2_model.encode( - hidden_states, - to_cpu=to_cpu, - ) # 24 x Wav2Vec2EncoderLayerStableLayerNorm processed - if ct2_w2v2_model.device == "cuda": - hidden_states = torch.as_tensor( - ct2_output, - device=ct2_w2v2_model.device, - ) + to_cpu = (model.device == "cuda" and len(model.device_index) > 1) + output = model.encode(hidden_states,to_cpu=to_cpu) + if model.device=="cuda": + logits = torch.as_tensor(output, device=ct2_model.device)[0] else: - hidden_states = torch.as_tensor( - np.array(ct2_output), - dtype=torch.float32, - device=ct2_w2v2_model.device, - ) - - encoder_outputs = transformers.modeling_outputs.BaseModelOutput( - last_hidden_state=hidden_states, - hidden_states=None, - attentions=None, - ) - hidden_states = encoder_outputs[0] - outputs = transformers.modeling_outputs.Wav2Vec2BaseModelOutput( - last_hidden_state=hidden_states, - extract_features=extract_features, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - hidden_states = outputs[0] - # hidden_states = w2v2_model.dropout(hidden_states) - # Dropout(p=0.0, inplace=False) bypassed - - with torch.no_grad(): - logits = w2v2_model.lm_head(hidden_states.to(torch.float32))[0] + logits = torch.as_tensor(np.array(output), dtype=torch.float32, device=model.device)[0] predicted_ids = torch.argmax(logits, dim=-1) - transcription = w2v2_processor.decode(predicted_ids, output_word_offsets=True) + transcription = processor.decode(predicted_ids, output_word_offsets=True) assert transcription[0] == expected_transcription[0] diff --git a/src/layers/wav2vec2.cc b/src/layers/wav2vec2.cc index 237c77fad..04ccb077e 100644 --- a/src/layers/wav2vec2.cc +++ b/src/layers/wav2vec2.cc @@ -1,16 +1,80 @@ #include "ctranslate2/layers/wav2vec2.h" +#include namespace ctranslate2 { namespace layers { + + Wav2Vec2LayerNormConvLayer0::Wav2Vec2LayerNormConvLayer0(const models::Model& model, const std::string& scope) + : _conv(model, scope + "/conv", /*stride=*/5, /*padding=*/0) + , _transpose({0, 2, 1}) + , _output_norm(model, scope + "/layer_norm") { + } + + void Wav2Vec2LayerNormConvLayer0::operator()(const StorageView& input, StorageView& output) const{ + PROFILE("Wav2Vec2LayerNormConvLayer0"); + + StorageView buffer(input.dtype(), input.device()); + buffer = std::move(input); + _conv(buffer, output); + _transpose(output, buffer); + _output_norm(buffer, output); + _transpose(output, buffer); + _gelu(buffer, output); + } + + Wav2Vec2LayerNormConvLayer::Wav2Vec2LayerNormConvLayer(const models::Model& model, const std::string& scope) + : _conv(model, scope + "/conv", /*stride=*/2, /*padding=*/0) + , _transpose({0, 2, 1}) + , _output_norm(model, scope + "/layer_norm") { + } + + void Wav2Vec2LayerNormConvLayer::operator()(const StorageView& input, StorageView& output) const{ + PROFILE("Wav2Vec2LayerNormConvLayer"); + + StorageView buffer(input.dtype(), input.device()); + buffer = std::move(input); + _conv(buffer, output); + _transpose(output, buffer); + _output_norm(buffer, output); + _transpose(output, buffer); + _gelu(buffer, output); + } + + Wav2Vec2PosConvLayer::Wav2Vec2PosConvLayer(const models::Model& model, const std::string& scope) + : _conv(model, scope + "/conv", /*stride=*/1, /*padding=*/64, /*dilation*/1, /*groups*/16) + , _transpose({0, 2, 1}) { + } + + void Wav2Vec2PosConvLayer::operator()(const StorageView& input, StorageView& output) const{ + PROFILE("Wav2Vec2PosConvLayer"); + + StorageView buffer(input.dtype(), input.device()); + StorageView buffer2(input.dtype(), input.device()); + _transpose(input, buffer); + _conv(buffer, buffer2); + ops::Split(2, {buffer.dim(2), 1})(buffer2, buffer, output); + _gelu(buffer, buffer); + _transpose(buffer, buffer2); + ops::Add()(input, buffer2, output); + } + Wav2Vec2Encoder::Wav2Vec2Encoder(const models::Model& model, const std::string& scope) - : _num_heads(model.get_attribute_with_default(scope + "/num_heads", 8)) + : _feat_layer0(model, scope + "/feat_layer0") + , _feat_layers(build_layers_list(model, + scope + "/feat_layer")) + , _fp_norm(model, scope + "/fp_layer_norm") + , _fp_ff(model, scope + "/fp_projection", nullptr, true) + , _pos_conv_embed(model, scope + "/pos_conv_embed") + , _num_heads(model.get_attribute_with_default(scope + "/num_heads", 8)) + , _transpose({0, 2, 1}) , _layers(build_layers_list(model, scope + "/layer", _num_heads, /*pre_norm=*/true, ops::ActivationType::GELU)) , _output_norm(model, scope + "/layer_norm") + , _lm_head(model, scope + "/lm_head", nullptr, true) { } @@ -18,40 +82,37 @@ namespace ctranslate2 { PROFILE("Wav2Vec2Encoder"); // SAD in front-end handles the input length - //const dim_t expected_depth = 1024; - //const dim_t expected_time = 406; - if (features.rank() != 3) throw std::invalid_argument("Expected input features to have 3 dimensions, but got " + std::to_string(features.rank()) + " dimension(s) instead"); - /* //may need to limit the input lenght - if (features.dim(1) != expected_depth || features.dim(2) != expected_time) - throw std::invalid_argument("Invalid input features shape: expected an input with shape (" - + std::to_string(features.dim(0)) - + ", " - + std::to_string(expected_depth) - + ", " - + std::to_string(expected_time) - + "), but got an input with shape (" - + std::to_string(features.dim(0)) - + ", " - + std::to_string(features.dim(1)) - + ", " - + std::to_string(features.dim(2)) - + ") instead;; _conv1.output_size() " - + std::to_string(_conv1.output_size())); - //+ ") instead"); - */ - - StorageView input(output_type(), features.device()); - input = features; + + // Wav2Vec2FeatureExtractor------------------------------------ + StorageView feat_buffer(features.dtype(), features.device()); + StorageView feat_buffer2(features.dtype(), features.device()); + feat_buffer = std::move(features); + _feat_layer0(feat_buffer, output); + feat_buffer = std::move(output); + for (dim_t l = 0; l < _feat_layers.size(); l++) { + (*_feat_layers[l])(feat_buffer, output); + if (l < _feat_layers.size() - 1 ) { + feat_buffer = std::move(output); + } + } + _transpose(output, feat_buffer); + // Wav2Vec2FeatureProjection----------------------------------- + _fp_norm(feat_buffer, output); + _fp_ff(output, feat_buffer); + // Wav2Vec2PositionalConvEmbedding----------------------------- + _pos_conv_embed(feat_buffer, feat_buffer2); + // Wav2Vec2EncoderLayerStableLayerNorm------------------------- for (const auto& layer : _layers) { - (*layer)(input, nullptr, output); - input = std::move(output); + (*layer)(feat_buffer2, nullptr, feat_buffer); + feat_buffer2 = std::move(feat_buffer); } + _output_norm(feat_buffer2, feat_buffer); - _output_norm(input, output); + _lm_head(feat_buffer, output); } } diff --git a/src/models/wav2vec2.cc b/src/models/wav2vec2.cc index 79a7a40d4..7309f6eb6 100644 --- a/src/models/wav2vec2.cc +++ b/src/models/wav2vec2.cc @@ -35,8 +35,7 @@ namespace ctranslate2 { } bool Wav2Vec2Model::is_quantizable(const std::string& variable_name) const { - return (Model::is_quantizable(variable_name) - && variable_name.find("conv") == std::string::npos); + return Model::is_quantizable(variable_name); } bool Wav2Vec2Model::is_linear_weight(const std::string& variable_name) const { From 7d0513c3dbfd208b3a8d1ef6b22cb70e6eae5399 Mon Sep 17 00:00:00 2001 From: hkwon Date: Tue, 13 Aug 2024 13:53:41 -0700 Subject: [PATCH 02/15] refining scripts --- python/ctranslate2/converters/transformers.py | 2 +- python/ctranslate2/specs/wav2vec2_spec.py | 3 ++- python/tests/test_transformers.py | 9 ++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 3c3cd6d2d..af8944b7e 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -1038,7 +1038,7 @@ def set_pos_conv_embed(self, spec, encoder, config): encoder.pos_conv_embed.conv.bias.data = encoder.pos_conv_embed.conv.bias.float() for param in encoder.pos_conv_embed.parameters(): param.data = param.data.float() - tmp = encoder.pos_conv_embed(torch.randn((1,1,config.hidden_size))) + encoder.pos_conv_embed(torch.randn((1, 1, config.hidden_size))) spec.pos_conv_embed.conv.weight = encoder.pos_conv_embed.conv.weight spec.pos_conv_embed.conv.bias = encoder.pos_conv_embed.conv.bias diff --git a/python/ctranslate2/specs/wav2vec2_spec.py b/python/ctranslate2/specs/wav2vec2_spec.py index 161899d8b..68bbc8c3a 100644 --- a/python/ctranslate2/specs/wav2vec2_spec.py +++ b/python/ctranslate2/specs/wav2vec2_spec.py @@ -12,6 +12,7 @@ class Wav2Vec2Config(model_spec.ModelConfig): def __init__(self): return + class Wav2Vec2Spec(model_spec.LanguageModelSpec): def __init__(self, feat_layers, num_layers, num_heads): super().__init__() @@ -52,7 +53,7 @@ def __init__(self, feat_layers, num_layers, num_heads): self.num_heads = np.dtype("int16").type(num_heads) self.feat_layer0 = Wav2Vec2LayerNormConvLayer() self.feat_layer = [ - Wav2Vec2LayerNormConvLayer() for i in range(feat_layers-1) + Wav2Vec2LayerNormConvLayer() for i in range(feat_layers - 1) ] self.fp_layer_norm = common_spec.LayerNormSpec() self.fp_projection = common_spec.LinearSpec() diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index e85f12657..d985541d5 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -982,8 +982,7 @@ def test_transformers_wav2vec2( w2v2_processor = transformers.Wav2Vec2Processor.from_pretrained(model_name) w2v2_processor.save_pretrained(output_dir + "/wav2vec2_processor") - processor = transformers.AutoProcessor.from_pretrained(output_dir+"/wav2vec2_processor") - model = ctranslate2.models.Wav2Vec2(output_dir, device=device, device_index=[0], compute_type="int8") + processor = transformers.AutoProcessor.from_pretrained(output_dir + "/wav2vec2_processor") device = "cuda" if os.environ.get("CUDA_VISIBLE_DEVICES") else "cpu" cpu_threads = int(os.environ.get("OMP_NUM_THREADS", 0)) @@ -1009,9 +1008,9 @@ def test_transformers_wav2vec2( hidden_states = np.ascontiguousarray(input_values.unsqueeze(0)) hidden_states = ctranslate2.StorageView.from_array(hidden_states) to_cpu = (model.device == "cuda" and len(model.device_index) > 1) - output = model.encode(hidden_states,to_cpu=to_cpu) - if model.device=="cuda": - logits = torch.as_tensor(output, device=ct2_model.device)[0] + output = model.encode(hidden_states, to_cpu=to_cpu) + if model.device == "cuda": + logits = torch.as_tensor(output, device=model.device)[0] else: logits = torch.as_tensor(np.array(output), dtype=torch.float32, device=model.device)[0] From 2d4670f69ac104470daca59312129ca355f38794 Mon Sep 17 00:00:00 2001 From: hkwon Date: Tue, 13 Aug 2024 13:57:04 -0700 Subject: [PATCH 03/15] refining script again --- python/ctranslate2/converters/transformers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index af8944b7e..7ec06276b 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -1021,7 +1021,10 @@ def set_vocabulary(self, spec, tokens): def set_feature_extractor(self, spec, feature_extractor): spec.feat_layer0.conv.weight = feature_extractor.conv_layers[0].conv.weight spec.feat_layer0.conv.bias = feature_extractor.conv_layers[0].conv.bias - self.set_layer_norm(spec.feat_layer0.layer_norm, feature_extractor.conv_layers[0].layer_norm) + self.set_layer_norm( + spec.feat_layer0.layer_norm, + feature_extractor.conv_layers[0].layer_norm + ) for spec_layer, module_layer in zip(spec.feat_layer, feature_extractor.conv_layers[1:]): spec_layer.conv.weight = module_layer.conv.weight spec_layer.conv.bias = module_layer.conv.bias From 85654ece202e5e8cafb949c66b180313c89e5600 Mon Sep 17 00:00:00 2001 From: hkwon Date: Tue, 13 Aug 2024 14:10:43 -0700 Subject: [PATCH 04/15] fix the formats --- python/ctranslate2/converters/transformers.py | 11 +++++++---- python/ctranslate2/specs/wav2vec2_spec.py | 10 ++-------- python/tests/test_transformers.py | 10 +++++++--- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 7ec06276b..d98c65860 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -1022,10 +1022,11 @@ def set_feature_extractor(self, spec, feature_extractor): spec.feat_layer0.conv.weight = feature_extractor.conv_layers[0].conv.weight spec.feat_layer0.conv.bias = feature_extractor.conv_layers[0].conv.bias self.set_layer_norm( - spec.feat_layer0.layer_norm, - feature_extractor.conv_layers[0].layer_norm + spec.feat_layer0.layer_norm, feature_extractor.conv_layers[0].layer_norm ) - for spec_layer, module_layer in zip(spec.feat_layer, feature_extractor.conv_layers[1:]): + for spec_layer, module_layer in zip( + spec.feat_layer, feature_extractor.conv_layers[1:] + ): spec_layer.conv.weight = module_layer.conv.weight spec_layer.conv.bias = module_layer.conv.bias self.set_layer_norm(spec_layer.layer_norm, module_layer.layer_norm) @@ -1037,7 +1038,9 @@ def set_feature_projection(self, spec, feature_projection): def set_pos_conv_embed(self, spec, encoder, config): # forcing parameters to be set because some transformers version initializes garbage numbers # conv parameters are float16 so force float32 for the loading - encoder.pos_conv_embed.conv.weight.data = encoder.pos_conv_embed.conv.weight.data.float() + encoder.pos_conv_embed.conv.weight.data = ( + encoder.pos_conv_embed.conv.weight.data.float() + ) encoder.pos_conv_embed.conv.bias.data = encoder.pos_conv_embed.conv.bias.float() for param in encoder.pos_conv_embed.parameters(): param.data = param.data.float() diff --git a/python/ctranslate2/specs/wav2vec2_spec.py b/python/ctranslate2/specs/wav2vec2_spec.py index 68bbc8c3a..761a6f490 100644 --- a/python/ctranslate2/specs/wav2vec2_spec.py +++ b/python/ctranslate2/specs/wav2vec2_spec.py @@ -16,11 +16,7 @@ def __init__(self): class Wav2Vec2Spec(model_spec.LanguageModelSpec): def __init__(self, feat_layers, num_layers, num_heads): super().__init__() - self.encoder = Wav2Vec2EncoderSpec( - feat_layers, - num_layers, - num_heads - ) + self.encoder = Wav2Vec2EncoderSpec(feat_layers, num_layers, num_heads) @property def name(self): @@ -52,9 +48,7 @@ class Wav2Vec2EncoderSpec(model_spec.LayerSpec): def __init__(self, feat_layers, num_layers, num_heads): self.num_heads = np.dtype("int16").type(num_heads) self.feat_layer0 = Wav2Vec2LayerNormConvLayer() - self.feat_layer = [ - Wav2Vec2LayerNormConvLayer() for i in range(feat_layers - 1) - ] + self.feat_layer = [Wav2Vec2LayerNormConvLayer() for i in range(feat_layers - 1)] self.fp_layer_norm = common_spec.LayerNormSpec() self.fp_projection = common_spec.LinearSpec() self.pos_conv_embed = Wav2Vec2PosEmbedConvLayer() diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index d985541d5..5a788ef86 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -982,7 +982,9 @@ def test_transformers_wav2vec2( w2v2_processor = transformers.Wav2Vec2Processor.from_pretrained(model_name) w2v2_processor.save_pretrained(output_dir + "/wav2vec2_processor") - processor = transformers.AutoProcessor.from_pretrained(output_dir + "/wav2vec2_processor") + processor = transformers.AutoProcessor.from_pretrained( + output_dir + "/wav2vec2_processor" + ) device = "cuda" if os.environ.get("CUDA_VISIBLE_DEVICES") else "cpu" cpu_threads = int(os.environ.get("OMP_NUM_THREADS", 0)) @@ -1007,12 +1009,14 @@ def test_transformers_wav2vec2( hidden_states = np.ascontiguousarray(input_values.unsqueeze(0)) hidden_states = ctranslate2.StorageView.from_array(hidden_states) - to_cpu = (model.device == "cuda" and len(model.device_index) > 1) + to_cpu = model.device == "cuda" and len(model.device_index) > 1 output = model.encode(hidden_states, to_cpu=to_cpu) if model.device == "cuda": logits = torch.as_tensor(output, device=model.device)[0] else: - logits = torch.as_tensor(np.array(output), dtype=torch.float32, device=model.device)[0] + logits = torch.as_tensor( + np.array(output), dtype=torch.float32, device=model.device + )[0] predicted_ids = torch.argmax(logits, dim=-1) transcription = processor.decode(predicted_ids, output_word_offsets=True) From 0b125a9c192d2c90f11daf376724cab16879175b Mon Sep 17 00:00:00 2001 From: hkwon Date: Tue, 13 Aug 2024 14:13:40 -0700 Subject: [PATCH 05/15] fix the isort format --- python/ctranslate2/specs/wav2vec2_spec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ctranslate2/specs/wav2vec2_spec.py b/python/ctranslate2/specs/wav2vec2_spec.py index 761a6f490..0cc6227d5 100644 --- a/python/ctranslate2/specs/wav2vec2_spec.py +++ b/python/ctranslate2/specs/wav2vec2_spec.py @@ -1,7 +1,7 @@ from typing import List, Optional, Tuple -import torch.nn as nn import numpy as np +import torch.nn as nn from ctranslate2.specs import common_spec, model_spec, transformer_spec From d844362e68b74261c7b35225960842603903540d Mon Sep 17 00:00:00 2001 From: hkwon Date: Tue, 13 Aug 2024 16:19:45 -0700 Subject: [PATCH 06/15] refining the library --- python/ctranslate2/specs/wav2vec2_spec.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/ctranslate2/specs/wav2vec2_spec.py b/python/ctranslate2/specs/wav2vec2_spec.py index 0cc6227d5..7b9b9cfe4 100644 --- a/python/ctranslate2/specs/wav2vec2_spec.py +++ b/python/ctranslate2/specs/wav2vec2_spec.py @@ -1,7 +1,6 @@ from typing import List, Optional, Tuple import numpy as np -import torch.nn as nn from ctranslate2.specs import common_spec, model_spec, transformer_spec From a1c112f19fc06308f7b562e321bc7a9ff19e135b Mon Sep 17 00:00:00 2001 From: hkwon Date: Wed, 14 Aug 2024 11:01:34 -0700 Subject: [PATCH 07/15] update based on the suggestions --- include/ctranslate2/layers/wav2vec2.h | 30 ++++++----------------- src/layers/wav2vec2.cc | 34 +++++++++------------------ 2 files changed, 18 insertions(+), 46 deletions(-) diff --git a/include/ctranslate2/layers/wav2vec2.h b/include/ctranslate2/layers/wav2vec2.h index 2d6a7d12c..29dea9783 100644 --- a/include/ctranslate2/layers/wav2vec2.h +++ b/include/ctranslate2/layers/wav2vec2.h @@ -5,30 +5,12 @@ namespace ctranslate2 { namespace layers { - class Wav2Vec2LayerNormConvLayer0 : public Layer { - public: - Wav2Vec2LayerNormConvLayer0(const models::Model& model, const std::string& scope); - - void operator()(const StorageView& input, StorageView& output) const; - - DataType output_type() const override { - return _conv.output_type(); - } - - dim_t output_size() const override { - return _conv.output_size(); - } - - private: - const Conv1D _conv; - const LayerNorm _output_norm; - const ops::Transpose _transpose; - const ops::GELU _gelu; - }; - class Wav2Vec2LayerNormConvLayer : public Layer { public: - Wav2Vec2LayerNormConvLayer(const models::Model& model, const std::string& scope); + Wav2Vec2LayerNormConvLayer(const models::Model& model, + const std::string& scope, + dim_t stride, + dim_t padding); void operator()(const StorageView& input, StorageView& output) const; @@ -41,6 +23,8 @@ namespace ctranslate2 { } private: + dim_t _stride; + dim_t _padding; const Conv1D _conv; const LayerNorm _output_norm; const ops::Transpose _transpose; @@ -97,7 +81,7 @@ namespace ctranslate2 { } private: - const Wav2Vec2LayerNormConvLayer0 _feat_layer0; + const Wav2Vec2LayerNormConvLayer _feat_layer0; const std::vector> _feat_layers; const LayerNorm _fp_norm; const Dense _fp_ff; diff --git a/src/layers/wav2vec2.cc b/src/layers/wav2vec2.cc index 04ccb077e..defbf0d84 100644 --- a/src/layers/wav2vec2.cc +++ b/src/layers/wav2vec2.cc @@ -1,30 +1,16 @@ #include "ctranslate2/layers/wav2vec2.h" -#include namespace ctranslate2 { namespace layers { - Wav2Vec2LayerNormConvLayer0::Wav2Vec2LayerNormConvLayer0(const models::Model& model, const std::string& scope) - : _conv(model, scope + "/conv", /*stride=*/5, /*padding=*/0) - , _transpose({0, 2, 1}) - , _output_norm(model, scope + "/layer_norm") { - } - - void Wav2Vec2LayerNormConvLayer0::operator()(const StorageView& input, StorageView& output) const{ - PROFILE("Wav2Vec2LayerNormConvLayer0"); - - StorageView buffer(input.dtype(), input.device()); - buffer = std::move(input); - _conv(buffer, output); - _transpose(output, buffer); - _output_norm(buffer, output); - _transpose(output, buffer); - _gelu(buffer, output); - } - - Wav2Vec2LayerNormConvLayer::Wav2Vec2LayerNormConvLayer(const models::Model& model, const std::string& scope) - : _conv(model, scope + "/conv", /*stride=*/2, /*padding=*/0) + Wav2Vec2LayerNormConvLayer::Wav2Vec2LayerNormConvLayer(const models::Model& model, + const std::string& scope, + dim_t stride, + dim_t padding) + : _stride(stride) + , _padding(padding) + , _conv(model, scope + "/conv", _stride, _padding) , _transpose({0, 2, 1}) , _output_norm(model, scope + "/layer_norm") { } @@ -60,9 +46,11 @@ namespace ctranslate2 { } Wav2Vec2Encoder::Wav2Vec2Encoder(const models::Model& model, const std::string& scope) - : _feat_layer0(model, scope + "/feat_layer0") + : _feat_layer0(model, scope + "/feat_layer0", /*stride=*/5, /*padding=*/0) , _feat_layers(build_layers_list(model, - scope + "/feat_layer")) + scope + "/feat_layer", + /*stride=*/2, + /*padding=*/0)) , _fp_norm(model, scope + "/fp_layer_norm") , _fp_ff(model, scope + "/fp_projection", nullptr, true) , _pos_conv_embed(model, scope + "/pos_conv_embed") From e5a2a469b0e34bbcf332c99d406134dff0dff9c1 Mon Sep 17 00:00:00 2001 From: hkwon Date: Wed, 14 Aug 2024 13:14:51 -0700 Subject: [PATCH 08/15] update the variable name --- python/tests/test_transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index 5a788ef86..d3319f82a 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -1000,7 +1000,7 @@ def test_transformers_wav2vec2( speech_array = np.load( os.path.join(test_utils.get_data_dir(), "audio", "mr_quilter.npy") ) - input_values = w2v2_processor( + input_values = processor( speech_array, padding=True, return_tensors="pt", From 065b240bfc706cd77f5f8b877d2e664b4207e1b6 Mon Sep 17 00:00:00 2001 From: hkwon Date: Wed, 14 Aug 2024 15:08:50 -0700 Subject: [PATCH 09/15] adding unk_token removal for the Python testing --- python/tests/test_transformers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index d3319f82a..a5d0706b4 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -1020,5 +1020,6 @@ def test_transformers_wav2vec2( predicted_ids = torch.argmax(logits, dim=-1) transcription = processor.decode(predicted_ids, output_word_offsets=True) + transcription = transcription[0].replace(processor.tokenizer.unk_token,'') - assert transcription[0] == expected_transcription[0] + assert transcription == expected_transcription[0] From 785a0ad08458bf4185d503e53a4043e261942026 Mon Sep 17 00:00:00 2001 From: hkwon Date: Wed, 14 Aug 2024 15:37:13 -0700 Subject: [PATCH 10/15] adding whitespace --- python/tests/test_transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index a5d0706b4..5e8c2dba8 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -1020,6 +1020,6 @@ def test_transformers_wav2vec2( predicted_ids = torch.argmax(logits, dim=-1) transcription = processor.decode(predicted_ids, output_word_offsets=True) - transcription = transcription[0].replace(processor.tokenizer.unk_token,'') + transcription = transcription[0].replace(processor.tokenizer.unk_token, '') assert transcription == expected_transcription[0] From 4c2b38cf536a57b2e5ac24a024a5d8aa1ef8d98d Mon Sep 17 00:00:00 2001 From: hkwon Date: Wed, 14 Aug 2024 15:40:09 -0700 Subject: [PATCH 11/15] update Python format --- python/tests/test_transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index 5e8c2dba8..3c35445fa 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -1020,6 +1020,6 @@ def test_transformers_wav2vec2( predicted_ids = torch.argmax(logits, dim=-1) transcription = processor.decode(predicted_ids, output_word_offsets=True) - transcription = transcription[0].replace(processor.tokenizer.unk_token, '') + transcription = transcription[0].replace(processor.tokenizer.unk_token, "") assert transcription == expected_transcription[0] From 47f38a785aa6effc7d3d18041c37bf6cb6d94b79 Mon Sep 17 00:00:00 2001 From: hkwon Date: Wed, 14 Aug 2024 16:57:14 -0700 Subject: [PATCH 12/15] update variables --- python/tests/test_transformers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index 3c35445fa..db00ca228 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -1020,6 +1020,6 @@ def test_transformers_wav2vec2( predicted_ids = torch.argmax(logits, dim=-1) transcription = processor.decode(predicted_ids, output_word_offsets=True) - transcription = transcription[0].replace(processor.tokenizer.unk_token, "") + transcriptions = transcription[0].replace(processor.tokenizer.unk_token, "") - assert transcription == expected_transcription[0] + assert transcriptions == expected_transcription[0] From ae69b23ea34412eaae48c1f30c4fa1a2659d21ac Mon Sep 17 00:00:00 2001 From: hkwon Date: Wed, 14 Aug 2024 18:51:29 -0700 Subject: [PATCH 13/15] update variables --- python/tests/test_transformers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index db00ca228..3c35445fa 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -1020,6 +1020,6 @@ def test_transformers_wav2vec2( predicted_ids = torch.argmax(logits, dim=-1) transcription = processor.decode(predicted_ids, output_word_offsets=True) - transcriptions = transcription[0].replace(processor.tokenizer.unk_token, "") + transcription = transcription[0].replace(processor.tokenizer.unk_token, "") - assert transcriptions == expected_transcription[0] + assert transcription == expected_transcription[0] From 62d879902f84c24f864f43d62571b24c4218b34e Mon Sep 17 00:00:00 2001 From: hkwon Date: Wed, 14 Aug 2024 20:23:05 -0700 Subject: [PATCH 14/15] update variables --- python/tests/test_transformers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index 3c35445fa..db00ca228 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -1020,6 +1020,6 @@ def test_transformers_wav2vec2( predicted_ids = torch.argmax(logits, dim=-1) transcription = processor.decode(predicted_ids, output_word_offsets=True) - transcription = transcription[0].replace(processor.tokenizer.unk_token, "") + transcriptions = transcription[0].replace(processor.tokenizer.unk_token, "") - assert transcription == expected_transcription[0] + assert transcriptions == expected_transcription[0] From 3d76464481ee01ae9b98c706db76a7d8fbaf2c94 Mon Sep 17 00:00:00 2001 From: hkwon Date: Thu, 15 Aug 2024 09:03:54 -0700 Subject: [PATCH 15/15] update variables --- python/tests/test_transformers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index db00ca228..3c35445fa 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -1020,6 +1020,6 @@ def test_transformers_wav2vec2( predicted_ids = torch.argmax(logits, dim=-1) transcription = processor.decode(predicted_ids, output_word_offsets=True) - transcriptions = transcription[0].replace(processor.tokenizer.unk_token, "") + transcription = transcription[0].replace(processor.tokenizer.unk_token, "") - assert transcriptions == expected_transcription[0] + assert transcription == expected_transcription[0]