Commit

Merge pull request #233 from jiangyuxiaoxiao/master
fix: support infer 2.1 models (compatibility bug fix)
jiangyuxiaoxiao authored Dec 13, 2023
2 parents ce26468 + 79efeec commit 2a37dbd
Showing 8 changed files with 746 additions and 22 deletions.
@@ -0,0 +1,28 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zstandard filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
437 changes: 437 additions & 0 deletions emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/LICENSE

Large diffs are not rendered by default.

127 changes: 127 additions & 0 deletions emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/README.md
@@ -0,0 +1,127 @@
---
language: en
datasets:
- msp-podcast
inference: true
tags:
- speech
- audio
- wav2vec2
- audio-classification
- emotion-recognition
license: cc-by-nc-sa-4.0
pipeline_tag: audio-classification
---

# Model for Dimensional Speech Emotion Recognition based on Wav2vec 2.0

The model expects a raw audio signal as input and outputs predictions for arousal, dominance and valence in a range of approximately 0...1. In addition, it provides the pooled states of the last transformer layer. The model was created by fine-tuning [Wav2Vec2-Large-Robust](https://huggingface.co/facebook/wav2vec2-large-robust) on [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) (v1.7). The model was pruned from 24 to 12 transformer layers before fine-tuning. An [ONNX](https://onnx.ai/) export of the model is available from [doi:10.5281/zenodo.6221127](https://zenodo.org/record/6221127). Further details are given in the associated [paper](https://arxiv.org/abs/2203.07378) and [tutorial](https://github.com/audeering/w2v2-how-to).

# Usage

```python
import numpy as np
import torch
import torch.nn as nn
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)


class RegressionHead(nn.Module):
    r"""Regression head."""

    def __init__(self, config):

        super().__init__()

        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):

        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)

        return x


class EmotionModel(Wav2Vec2PreTrainedModel):
    r"""Speech emotion classifier."""

    def __init__(self, config):

        super().__init__(config)

        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)
        self.init_weights()

    def forward(
        self,
        input_values,
    ):

        outputs = self.wav2vec2(input_values)
        hidden_states = outputs[0]
        hidden_states = torch.mean(hidden_states, dim=1)
        logits = self.classifier(hidden_states)

        return hidden_states, logits


# load model from hub
device = 'cpu'
model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = EmotionModel.from_pretrained(model_name)

# dummy signal
sampling_rate = 16000
signal = np.zeros((1, sampling_rate), dtype=np.float32)


def process_func(
    x: np.ndarray,
    sampling_rate: int,
    embeddings: bool = False,
) -> np.ndarray:
    r"""Predict emotions or extract embeddings from raw audio signal."""

    # run through processor to normalize signal
    # always returns a batch, so we just get the first entry
    # then we put it on the device
    y = processor(x, sampling_rate=sampling_rate)
    y = y['input_values'][0]
    y = y.reshape(1, -1)
    y = torch.from_numpy(y).to(device)

    # run through model
    with torch.no_grad():
        y = model(y)[0 if embeddings else 1]

    # convert to numpy
    y = y.detach().cpu().numpy()

    return y


print(process_func(signal, sampling_rate))
#  Arousal    dominance  valence
# [[0.5460754  0.6062266  0.40431657]]

print(process_func(signal, sampling_rate, embeddings=True))
# Pooled hidden states of last transformer layer
# [[-0.00752167  0.0065819  -0.00746342 ...  0.00663632  0.00848748
#    0.00599211]]
```
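For reference, a minimal sketch of running the same pipeline on a real recording rather than the dummy signal. It assumes the snippet above has already been executed (so `np` and `process_func` exist), that `librosa` is installed, and that `"speech.wav"` is a placeholder path, not a file shipped with this repository.

```python
# Hedged usage sketch (not part of the upstream README); "speech.wav" is a placeholder.
import librosa

waveform, sr = librosa.load("speech.wav", sr=16000, mono=True)  # resample to 16 kHz mono
waveform = waveform[np.newaxis, :]                              # add a batch dimension

print(process_func(waveform, sr))                   # arousal, dominance, valence
print(process_func(waveform, sr, embeddings=True))  # pooled hidden states
```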
122 changes: 122 additions & 0 deletions emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/config.json
@@ -0,0 +1,122 @@
{
  "_name_or_path": "torch",
  "activation_dropout": 0.1,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForSpeechClassification"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
  "feat_proj_dropout": 0.1,
  "feat_quantizer_dropout": 0.0,
  "final_dropout": 0.1,
  "finetuning_task": "wav2vec2_reg",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.1,
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "arousal",
    "1": "dominance",
    "2": "valence"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "arousal": 0,
    "dominance": 1,
    "valence": 2
  },
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.1,
  "mask_feature_length": 10,
  "mask_feature_min_masks": 0,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_masks": 2,
  "mask_time_prob": 0.05,
  "model_type": "wav2vec2",
  "num_adapter_layers": 3,
  "num_attention_heads": 16,
  "num_codevector_groups": 2,
  "num_codevectors_per_group": 320,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 12,
  "num_negatives": 100,
  "output_hidden_size": 1024,
  "pad_token_id": 0,
  "pooling_mode": "mean",
  "problem_type": "regression",
  "proj_codevector_dim": 768,
  "tdnn_dilation": [
    1,
    2,
    3,
    1,
    1
  ],
  "tdnn_dim": [
    512,
    512,
    512,
    512,
    1500
  ],
  "tdnn_kernel": [
    5,
    3,
    3,
    1,
    1
  ],
  "torch_dtype": "float32",
  "transformers_version": "4.17.0.dev0",
  "use_weighted_layer_sum": false,
  "vocab_size": null,
  "xvector_output_dim": 512
}
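The configuration above records the pruned 12-layer setup and the three regression targets. As a hedged sketch (assuming `transformers` is installed, network access to the Hub, and the model name used in the README above), the relevant fields can be read back directly:

```python
# Sketch only: reads the shipped configuration back via transformers.
from transformers import Wav2Vec2Config

config = Wav2Vec2Config.from_pretrained(
    "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
)
print(config.num_hidden_layers)  # 12 (pruned from 24)
print(config.id2label)           # {0: 'arousal', 1: 'dominance', 2: 'valence'}
print(config.problem_type)       # 'regression'
```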
@@ -0,0 +1,9 @@
{
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}
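These feature-extractor settings (16 kHz input, zero-mean/unit-variance normalization, right-side padding) are what the processor applies to the raw waveform before it reaches the model. A minimal illustration, assuming only that `numpy` and `transformers` are installed; the constructor values mirror the JSON above:

```python
# Illustration of the settings above applied to one second of random audio.
import numpy as np
from transformers import Wav2Vec2FeatureExtractor

extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)
out = extractor(np.random.randn(16000).astype(np.float32), sampling_rate=16000)
normalized = out["input_values"][0]
print(normalized.mean(), normalized.std())  # ~0.0 and ~1.0 after normalization
```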
@@ -0,0 +1 @@
{}
42 changes: 21 additions & 21 deletions infer.py
@@ -29,7 +29,7 @@
from oldVersion.V101.models import SynthesizerTrn as V101SynthesizerTrn
from oldVersion.V101.text import symbols as V101symbols

-from oldVersion import V111, V110, V101, V200
+from oldVersion import V111, V110, V101, V200, V210

# Current version info
latest_version = "2.2"
@@ -157,9 +157,9 @@ def infer(
):
    # Parameter positions changed in version 2.2
    # Version 2.1 adds the parameters emotion, reference_audio, skip_start, skip_end
-    # inferMap_V3 = {
-    #     "2.1": V210.infer,
-    # }
+    inferMap_V3 = {
+        "2.1": V210.infer,
+    }
    # Versions supporting Chinese, Japanese and English
    inferMap_V2 = {
        "2.0.2-fix": V200.infer,
@@ -180,23 +180,23 @@ def infer(
    version = hps.version if hasattr(hps, "version") else latest_version
    # Not the latest version: pick the matching infer by version number
    if version != latest_version:
-        # if version in inferMap_V3.keys():
-        #     return inferMap_V3[version](
-        #         text,
-        #         sdp_ratio,
-        #         noise_scale,
-        #         noise_scale_w,
-        #         length_scale,
-        #         sid,
-        #         language,
-        #         hps,
-        #         net_g,
-        #         device,
-        #         reference_audio,
-        #         emotion,
-        #         skip_start,
-        #         skip_end,
-        #     )
+        if version in inferMap_V3.keys():
+            return inferMap_V3[version](
+                text,
+                sdp_ratio,
+                noise_scale,
+                noise_scale_w,
+                length_scale,
+                sid,
+                language,
+                hps,
+                net_g,
+                device,
+                reference_audio,
+                emotion,
+                skip_start,
+                skip_end,
+            )
        if version in inferMap_V2.keys():
            return inferMap_V2[version](
                text,
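The net effect of this change is that checkpoints whose `hps.version` reads "2.1" are routed to `V210.infer`, which already takes the extra `emotion`, `reference_audio`, `skip_start` and `skip_end` arguments, instead of falling through. A self-contained, hedged sketch of the dispatch pattern; the stub function and the `hps` object below are placeholders, not the repository's real objects:

```python
# Illustrative only: mirrors the version dispatch re-enabled by this commit.
from types import SimpleNamespace

latest_version = "2.2"

def v210_infer_stub(*args, **kwargs):
    """Stands in for oldVersion.V210.infer in this sketch."""
    return "routed to the 2.1 code path"

inferMap_V3 = {"2.1": v210_infer_stub}

hps = SimpleNamespace(version="2.1")  # as read from a 2.1 checkpoint's config
version = getattr(hps, "version", latest_version)
if version != latest_version and version in inferMap_V3:
    print(inferMap_V3[version]())  # 2.1 models now resolve to the old entry point
```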
2 changes: 1 addition & 1 deletion oldVersion/V210/models.py
@@ -13,7 +13,7 @@
from vector_quantize_pytorch import VectorQuantize

from commons import init_weights, get_padding
-from text import symbols, num_tones, num_languages
+from .text import symbols, num_tones, num_languages


class DurationDiscriminator(nn.Module): # vits2
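This one-line fix makes the archived 2.1 code import the symbol tables bundled under `oldVersion/V210/text` instead of the project-root `text` package, so 2.1 checkpoints keep the token set they were trained with. A hedged, runnable sketch of why the relative form matters; the package layout and names below are placeholders, not the repository's actual tree:

```python
# Builds a tiny throwaway package to show what the ".text" relative import resolves to.
import importlib
import sys
import tempfile
from pathlib import Path

root = Path(tempfile.mkdtemp())
(root / "text.py").write_text("symbols = 'root copy'\n")        # newer module at project root
pkg = root / "pkg"
pkg.mkdir()
(pkg / "__init__.py").write_text("")
(pkg / "text.py").write_text("symbols = 'bundled copy'\n")      # copy archived with the package
(pkg / "models.py").write_text("from .text import symbols\n")   # the relative form used in the fix

sys.path.insert(0, str(root))
models = importlib.import_module("pkg.models")
print(models.symbols)  # 'bundled copy' -- the package-local module wins
```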
