Merge pull request #233 from jiangyuxiaoxiao/master
fix: support inference for 2.1 models (compatibility bug fixes)
Showing 8 changed files with 746 additions and 22 deletions.
28 changes: 28 additions & 0 deletions
emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/.gitattributes
@@ -0,0 +1,28 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zstandard filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
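These patterns route the model's large binaries through Git LFS. If the repository is cloned without git-lfs installed, the tracked files are checked out as small pointer stubs rather than real weights and then fail to load. As an illustrative sketch (this helper is not part of the commit), a quick check for that situation could look like:

```python
from pathlib import Path

# Git LFS pointer stubs are tiny text files that start with this header.
LFS_POINTER_PREFIX = b"version https://git-lfs.github.com/spec/v1"


def is_lfs_pointer(path: Path) -> bool:
    """Return True if `path` is an un-fetched Git LFS pointer stub."""
    try:
        with open(path, "rb") as f:
            return f.read(len(LFS_POINTER_PREFIX)) == LFS_POINTER_PREFIX
    except OSError:
        return False


model_dir = Path("emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim")
for p in sorted(model_dir.glob("*")):
    if p.is_file() and is_lfs_pointer(p):
        print(f"{p} is an LFS pointer - run `git lfs pull` to fetch the real file")
```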
437 changes: 437 additions & 0 deletions
emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/LICENSE
Large diffs are not rendered by default.
127 changes: 127 additions & 0 deletions
emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/README.md
@@ -0,0 +1,127 @@
---
language: en
datasets:
- msp-podcast
inference: true
tags:
- speech
- audio
- wav2vec2
- audio-classification
- emotion-recognition
license: cc-by-nc-sa-4.0
pipeline_tag: audio-classification
---

# Model for Dimensional Speech Emotion Recognition based on Wav2vec 2.0

The model expects a raw audio signal as input and outputs predictions for arousal, dominance and valence in a range of approximately 0...1. In addition, it provides the pooled states of the last transformer layer. The model was created by fine-tuning [Wav2Vec2-Large-Robust](https://huggingface.co/facebook/wav2vec2-large-robust) on [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) (v1.7). The model was pruned from 24 to 12 transformer layers before fine-tuning. An [ONNX](https://onnx.ai/) export of the model is available from [doi:10.5281/zenodo.6221127](https://zenodo.org/record/6221127). Further details are given in the associated [paper](https://arxiv.org/abs/2203.07378) and [tutorial](https://github.com/audeering/w2v2-how-to).

# Usage

```python
import numpy as np
import torch
import torch.nn as nn
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)


class RegressionHead(nn.Module):
    r"""Regression head."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)

        return x


class EmotionModel(Wav2Vec2PreTrainedModel):
    r"""Speech emotion classifier."""

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)
        self.init_weights()

    def forward(
        self,
        input_values,
    ):
        outputs = self.wav2vec2(input_values)
        hidden_states = outputs[0]
        hidden_states = torch.mean(hidden_states, dim=1)
        logits = self.classifier(hidden_states)

        return hidden_states, logits


# load model from hub
device = 'cpu'
model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = EmotionModel.from_pretrained(model_name)

# dummy signal
sampling_rate = 16000
signal = np.zeros((1, sampling_rate), dtype=np.float32)


def process_func(
    x: np.ndarray,
    sampling_rate: int,
    embeddings: bool = False,
) -> np.ndarray:
    r"""Predict emotions or extract embeddings from raw audio signal."""

    # run through processor to normalize signal
    # always returns a batch, so we just get the first entry
    # then we put it on the device
    y = processor(x, sampling_rate=sampling_rate)
    y = y['input_values'][0]
    y = y.reshape(1, -1)
    y = torch.from_numpy(y).to(device)

    # run through model
    with torch.no_grad():
        y = model(y)[0 if embeddings else 1]

    # convert to numpy
    y = y.detach().cpu().numpy()

    return y


print(process_func(signal, sampling_rate))
#  Arousal    dominance  valence
# [[0.5460754  0.6062266  0.40431657]]

print(process_func(signal, sampling_rate, embeddings=True))
# Pooled hidden states of last transformer layer
# [[-0.00752167  0.0065819  -0.00746342 ...  0.00663632  0.00848748
#    0.00599211]]
```
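The dummy signal above is all zeros; for real recordings the model still expects mono 16 kHz input. As a minimal sketch (reusing `process_func` from the example above; `librosa` and the file name `speech.wav` are illustrative assumptions, not part of this commit), one way to prepare an arbitrary audio file is:

```python
import librosa
import numpy as np

# Load an arbitrary audio file as mono and resample it to the 16 kHz
# rate the processor expects (the path is a placeholder).
signal, sampling_rate = librosa.load("speech.wav", sr=16000, mono=True)

# The README example feeds a (1, num_samples) float32 array.
signal = signal.reshape(1, -1).astype(np.float32)

print(process_func(signal, sampling_rate))                          # arousal, dominance, valence
print(process_func(signal, sampling_rate, embeddings=True).shape)   # (1, 1024) pooled embedding
```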
122 changes: 122 additions & 0 deletions
emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/config.json
@@ -0,0 +1,122 @@
{
  "_name_or_path": "torch",
  "activation_dropout": 0.1,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForSpeechClassification"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
  "feat_proj_dropout": 0.1,
  "feat_quantizer_dropout": 0.0,
  "final_dropout": 0.1,
  "finetuning_task": "wav2vec2_reg",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.1,
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "arousal",
    "1": "dominance",
    "2": "valence"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "arousal": 0,
    "dominance": 1,
    "valence": 2
  },
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.1,
  "mask_feature_length": 10,
  "mask_feature_min_masks": 0,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_masks": 2,
  "mask_time_prob": 0.05,
  "model_type": "wav2vec2",
  "num_adapter_layers": 3,
  "num_attention_heads": 16,
  "num_codevector_groups": 2,
  "num_codevectors_per_group": 320,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 12,
  "num_negatives": 100,
  "output_hidden_size": 1024,
  "pad_token_id": 0,
  "pooling_mode": "mean",
  "problem_type": "regression",
  "proj_codevector_dim": 768,
  "tdnn_dilation": [
    1,
    2,
    3,
    1,
    1
  ],
  "tdnn_dim": [
    512,
    512,
    512,
    512,
    1500
  ],
  "tdnn_kernel": [
    5,
    3,
    3,
    1,
    1
  ],
  "torch_dtype": "float32",
  "transformers_version": "4.17.0.dev0",
  "use_weighted_layer_sum": false,
  "vocab_size": null,
  "xvector_output_dim": 512
}
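This configuration describes a 12-layer, 1024-dimensional wav2vec2 encoder with three regression targets (arousal, dominance, valence). As a small sketch (not code from this commit), the vendored directory can be inspected with the standard transformers config API:

```python
from transformers import Wav2Vec2Config

# Load the config shipped with this commit from the local directory.
config = Wav2Vec2Config.from_pretrained(
    "emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim"
)

print(config.num_hidden_layers)  # 12 (pruned from the 24-layer wav2vec2-large-robust)
print(config.hidden_size)        # 1024
print(config.id2label)           # arousal, dominance, valence
```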
9 changes: 9 additions & 0 deletions
emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/preprocessor_config.json
@@ -0,0 +1,9 @@
{
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}
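This feature-extractor config is what performs the signal normalization mentioned in the README example (`do_normalize`) at the expected 16 kHz rate. A minimal sketch of loading it directly (illustrative only, not code from this commit):

```python
import numpy as np
from transformers import Wav2Vec2FeatureExtractor

# Load the feature extractor from the vendored directory.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    "emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim"
)

# One second of random "audio" at the configured 16 kHz rate.
signal = np.random.randn(16000).astype(np.float32)
inputs = feature_extractor(signal, sampling_rate=16000)

normalized = inputs["input_values"][0]
print(normalized.mean(), normalized.std())  # roughly 0 and 1 after do_normalize
```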
1 change: 1 addition & 0 deletions
emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/vocab.json
@@ -0,0 +1 @@
{}