Add Whisper and FLEURS #10

Merged · 53 commits · Aug 29, 2023

Commits
All 53 commits are by saattrupdan.

08fad7c  refactor: Move `load_model_setup` to separate `model_setup` module (Jul 31, 2023)
5859c15  style: Use getattr for Processor.tokenizer (Jul 31, 2023)
ecd9f8e  style: Remove comment (Jul 31, 2023)
cf07771  style: Remove tokenizer attribute from Processor protocol (Jul 31, 2023)
dfc5575  chore: Rename script names to avoid name conflict (Jul 31, 2023)
d22ae95  chore: Do not import `finetune` function in `__init__` (Jul 31, 2023)
f6b2333  feat: Enable `save_safetensors` (Jul 31, 2023)
7deef7d  chore: Update lock file (Jul 31, 2023)
6be03a3  feat: Set up whisper finetuning (Jul 31, 2023)
50cf83b  chore: More whisper configs, split up whisper/wav2vec2 specific options (Jul 31, 2023)
92cccf8  chore: Only use sampling_rate in model config (Jul 31, 2023)
d923798  feat: Add Nota config (Jul 31, 2023)
a54715c  chore: Change config (Jul 31, 2023)
582a54c  feat: Set up more dataset configs, and set seed in config (Jul 31, 2023)
9e032a8  chore: Ignore wandb folder (Jul 31, 2023)
748c787  docs: Update code coverage (Jul 31, 2023)
8152ca4  chore: Update configs (Jul 31, 2023)
d871012  tests: Add `finetune` test (Jul 31, 2023)
a199df8  tests: Fix logging test (Jul 31, 2023)
62a73ef  fix: Wandb import (Jul 31, 2023)
d28000c  fix: In preprocessing, allow both `input_values` and `input_features` (Jul 31, 2023)
8ab3d62  fix: Do not apply processor in wav2vec2 data collator, as it has been… (Jul 31, 2023)
7871e38  docs: Update coverage badge (Jul 31, 2023)
3058598  chore: Config renaming (Jul 31, 2023)
4b47821  tests: Test whisper models too (Jul 31, 2023)
071b0f3  docs: Update coverage badge (Jul 31, 2023)
5252792  chore: Allow Python 3.11 (Jul 31, 2023)
b56a1ab  chore: Add Python 3.11 to CI (Jul 31, 2023)
389b4c9  chore: Update lock file (Jul 31, 2023)
d1e17dd  chore: KenLM installation (Jul 31, 2023)
f611731  chore: Try using `use_auth_token` instead of `token` (Jul 31, 2023)
a414038  fix: Default `token` to True in `train_ngram_model` (Jul 31, 2023)
47c5494  chore: Revert back to `token` (Jul 31, 2023)
d39a421  tests: Rename test dataset name to `test_dataset` (Jul 31, 2023)
d42fefb  fix: Do not keep datasets in memory (Jul 31, 2023)
1b2a67d  chore: Update configs (Aug 1, 2023)
67b77af  chore: Update `make tree` (Aug 1, 2023)
7c8269d  fix: Allow 2-dimensional inputs to compute_metrics, to cater to Whisp… (Aug 1, 2023)
f19d9fb  chore: Disable `accelerate` logging (Aug 1, 2023)
f887d40  fix: Add `mask_time_length` to Wav2Vec2ForCTC (Aug 1, 2023)
93c30cb  fix: Add `mask_time_length` and `apply_spec_augment` to WhisperForCon… (Aug 1, 2023)
44f0ead  tests: Add `mask_time_length` to test model configs (Aug 1, 2023)
8cec6b5  tests: Add `dropout` to whisper test model config (Aug 1, 2023)
51a3ce7  fix: Add `dropout` to whisper models config (Aug 1, 2023)
04f92a8  feat: Disable tqdm during training (Aug 1, 2023)
fef542d  chore: Change config (Aug 7, 2023)
1421262  fix: Handle WANDB (Aug 7, 2023)
7db7bb7  style: Load WhisperProcessor directly from pretrained (Aug 7, 2023)
d8c15dc  tests: Disable fp16 while testing (Aug 7, 2023)
e6093cd  fix: Config name typo (Aug 9, 2023)
efecdf6  Merge branch 'main' of github.com:alexandrainst/CoRal-models into fea… (Aug 24, 2023)
016d346  feat: Add ü to vocab (Aug 29, 2023)
5c02af3  Merge branch 'main' into feat/add-whisper (Aug 29, 2023)
Files changed
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
@@ -17,7 +17,7 @@ jobs:
strategy:
  matrix:
    os: [windows-latest, macos-latest, ubuntu-latest]
-    python-version: ["3.10"]
+    python-version: ["3.10", "3.11"]
runs-on: ${{ matrix.os }}
steps:
  - uses: actions/checkout@v3
2 changes: 1 addition & 1 deletion .gitignore
@@ -106,4 +106,4 @@ data/
models/

# Weights and Biases experiment tracking
-wandb/
\ No newline at end of file
+wandb/
2 changes: 1 addition & 1 deletion README.md
@@ -6,7 +6,7 @@ ______________________________________________________________________
[![Documentation](https://img.shields.io/badge/docs-passing-green)](https://alexandrainst.github.io/CoRal-models/coral_models.html)
[![License](https://img.shields.io/github/license/alexandrainst/CoRal-models)](https://github.com/alexandrainst/CoRal-models/blob/main/LICENSE)
[![LastCommit](https://img.shields.io/github/last-commit/alexandrainst/CoRal-models)](https://github.com/alexandrainst/CoRal-models/commits/main)
-[![Code Coverage](https://img.shields.io/badge/Coverage-53%25-orange.svg)](https://github.com/alexandrainst/CoRal-models/tree/main/tests)
+[![Code Coverage](https://img.shields.io/badge/Coverage-61%25-yellow.svg)](https://github.com/alexandrainst/CoRal-models/tree/main/tests)


Developers:
16 changes: 12 additions & 4 deletions config/config.yaml
@@ -11,17 +11,25 @@ dirs:
  final: final
  models: models

seed: 4242

# Model parameters
pipeline_id: ${model.name}-${dataset.name}
hub_id: alexandrainst/${pipeline_id}
model_dir: ${dirs.models}/${pipeline_id}
push_to_hub: false

# Data parameters
characters_to_keep: 'abcdefghijklmnopqrstuvwxyzæøå0123456789é|'

# Training parameters
resume_from_checkpoint: false
ignore_data_skip: false
wandb: false
-wandb_name: default
wandb_project: CoRal
wandb_group: null
+wandb_name: ${pipeline_id}
logging_steps: 10
eval_steps: 100
save_steps: 100
save_total_limit: 2
early_stopping: true
early_stopping_patience: 10
fp16: true
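
The `${...}` references in this file are OmegaConf/Hydra interpolations, so a single `pipeline_id` drives the Hub ID, the model directory and the wandb run name. A minimal sketch of how they resolve, using plain omegaconf with example model/dataset names (the project presumably loads this file through Hydra, which resolves them the same way):

from omegaconf import OmegaConf

yaml_snippet = """
dirs:
  models: models
model:
  name: whisper_small
dataset:
  name: fleurs_da
pipeline_id: ${model.name}-${dataset.name}
hub_id: alexandrainst/${pipeline_id}
model_dir: ${dirs.models}/${pipeline_id}
wandb_name: ${pipeline_id}
"""
cfg = OmegaConf.create(yaml_snippet)

print(cfg.pipeline_id)  # whisper_small-fleurs_da
print(cfg.hub_id)       # alexandrainst/whisper_small-fleurs_da
print(cfg.model_dir)    # models/whisper_small-fleurs_da
print(cfg.wandb_name)   # whisper_small-fleurs_da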
1 change: 0 additions & 1 deletion config/dataset/common_voice_da.yaml
@@ -5,4 +5,3 @@ train_name: train
val_name: validation
test_name: test
text_column: sentence
-sampling_rate: 16_000
7 changes: 7 additions & 0 deletions config/dataset/common_voice_nn.yaml
@@ -0,0 +1,7 @@
name: common_voice_nn
id: mozilla-foundation/common_voice_13_0
subset: nn-NO
train_name: train
val_name: validation
test_name: test
text_column: sentence
7 changes: 7 additions & 0 deletions config/dataset/common_voice_sv.yaml
@@ -0,0 +1,7 @@
name: common_voice_sv
id: mozilla-foundation/common_voice_13_0
subset: sv-SE
train_name: train
val_name: validation
test_name: test
text_column: sentence
7 changes: 7 additions & 0 deletions config/dataset/fleurs_da.yaml
@@ -0,0 +1,7 @@
name: fleurs_da
id: google/fleurs
subset: da_dk
train_name: train
val_name: validation
test_name: test
text_column: raw_transcription
7 changes: 7 additions & 0 deletions config/dataset/fleurs_nb.yaml
@@ -0,0 +1,7 @@
name: fleurs_nb
id: google/fleurs
subset: nb_no
train_name: train
val_name: validation
test_name: test
text_column: raw_transcription
7 changes: 7 additions & 0 deletions config/dataset/fleurs_sv.yaml
@@ -0,0 +1,7 @@
name: fleurs_sv
id: google/fleurs
subset: sv_se
train_name: train
val_name: validation
test_name: test
text_column: raw_transcription
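
Each dataset config maps directly onto a Hugging Face `datasets` call. A sketch of how such a config is presumably consumed, using the `fleurs_da` values above — `load_dataset` and its `keep_in_memory` flag are real `datasets` API, but treating the YAML fields exactly this way is an assumption about the project's loading code:

from datasets import load_dataset

# Field values from config/dataset/fleurs_da.yaml above.
cfg = {
    "id": "google/fleurs",
    "subset": "da_dk",
    "train_name": "train",
    "text_column": "raw_transcription",
}

# keep_in_memory=False matches the "Do not keep datasets in memory" commit.
train = load_dataset(
    cfg["id"], name=cfg["subset"], split=cfg["train_name"], keep_in_memory=False
)
print(train[0][cfg["text_column"]])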
1 change: 0 additions & 1 deletion config/dataset/ftspeech.yaml
@@ -5,4 +5,3 @@ train_name: train
val_name: dev_balanced
test_name: test_balanced
text_column: sentence
-sampling_rate: 16_000
7 changes: 7 additions & 0 deletions config/dataset/nota.yaml
@@ -0,0 +1,7 @@
name: nota
id: arpelarpe/nota
subset: null
train_name: train
val_name: null
test_name: null
text_column: sentence
@@ -1,8 +1,7 @@
-name: test
+name: test_dataset
id: alexandrainst/audio_test_dataset
subset: null
train_name: train
val_name: validation
test_name: test
text_column: sentence
-sampling_rate: 16_000
11 changes: 6 additions & 5 deletions config/model/test.yaml → config/model/test_wav2vec2.yaml
@@ -1,8 +1,12 @@
-name: wav2vec2-300m-ngram
+name: test_wav2vec2
type: wav2vec2
pretrained_model_id: chcaa/xls-r-300m-danish
freeze_feature_encoder: true

+# Data hyperparameters
+clean_dataset: true
+characters_to_keep: 'abcdefghijklmnopqrstuvwxyzæøå0123456789é|'

# Model hyperparameters
sampling_rate: 16_000
activation_dropout: 0.1
@@ -11,6 +15,7 @@ hidden_dropout: 0.1
feat_proj_dropout: 0.1
final_dropout: 0.1
mask_time_prob: 0.075
+mask_time_length: 10
mask_feature_prob: 0.075
mask_feature_length: 10
layerdrop: 0.1
@@ -28,7 +33,3 @@ warmup_steps: 1
early_stopping: true
early_stopping_patience: 5
fp16: false
-eval_steps: 500
-save_steps: 500
-logging_steps: 100
-save_total_limit: 2
28 changes: 28 additions & 0 deletions config/model/test_whisper.yaml
@@ -0,0 +1,28 @@
name: test_whisper
type: whisper
pretrained_model_id: openai/whisper-tiny
freeze_feature_encoder: true

# Data hyperparameters
clean_dataset: false

# Model hyperparameters
sampling_rate: 16_000
dropout: 0.1
activation_dropout: 0.1
attention_dropout: 0.1
mask_time_prob: 0.5
mask_time_length: 10
mask_feature_prob: 0.5
mask_feature_length: 64

# Training hyperparameters
batch_size: 1
gradient_accumulation: 1
max_steps: 3
learning_rate: 4e-5
warmup_steps: 1
early_stopping: true
early_stopping_patience: 5
fp16: false
generation_max_length: 1
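
The masking hyperparameters in the whisper configs correspond to SpecAugment fields on `transformers`' `WhisperConfig` (cf. the "Add `mask_time_length` and `apply_spec_augment` to WhisperForCon…" commit). A sketch of how they would plausibly be applied when loading a checkpoint — the keyword arguments are real `WhisperConfig` attributes, but the exact wiring in this repo is an assumption:

from transformers import WhisperForConditionalGeneration

# Overriding config attributes via from_pretrained kwargs; values taken from
# config/model/test_whisper.yaml above. SpecAugment-style masking only runs
# during training, and only when apply_spec_augment is True.
model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-tiny",
    dropout=0.1,
    activation_dropout=0.1,
    attention_dropout=0.1,
    apply_spec_augment=True,
    mask_time_prob=0.5,
    mask_time_length=10,
    mask_feature_prob=0.5,
    mask_feature_length=64,
)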
23 changes: 13 additions & 10 deletions config/model/wav2vec2.yaml
@@ -1,8 +1,12 @@
-name: wav2vec2-300m-ngram
+name: wav2vec2
type: wav2vec2
pretrained_model_id: chcaa/xls-r-300m-danish
freeze_feature_encoder: false

+# Data hyperparameters
+clean_dataset: true
+characters_to_keep: 'abcdefghijklmnopqrstuvwxyzæøå0123456789é|ü'

# Model hyperparameters
sampling_rate: 16_000
activation_dropout: 0.1
@@ -11,24 +15,23 @@ hidden_dropout: 0.1
feat_proj_dropout: 0.1
final_dropout: 0.1
mask_time_prob: 0.5
+mask_time_length: 10
mask_feature_prob: 0.5
mask_feature_length: 64
layerdrop: 0.1
ctc_loss_reduction: sum

# Decoder hyperparameters
-language_model_decoder: null
+language_model_decoder: ngram
+decoder:
+  dataset_id: DDSC/reddit-da-asr-preprocessed
+  dataset_subset: null
+  dataset_split: train
+  n: 5

# Training hyperparameters
batch_size: 2
gradient_accumulation: 16
max_steps: 120_000
learning_rate: 3e-5
-warmup_steps: 1000
-early_stopping: true
-early_stopping_patience: 10
-fp16: true
-eval_steps: 1000
-save_steps: 1000
-logging_steps: 100
-save_total_limit: 2
+warmup_steps: 500
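
`language_model_decoder: ngram` together with the `decoder` block points at a 5-gram KenLM model trained on `DDSC/reddit-da-asr-preprocessed` (cf. the `train_ngram_model` and "KenLM installation" commits). A sketch of how such a decoder is typically attached to a wav2vec2 processor — `pyctcdecode` and `Wav2Vec2ProcessorWithLM` are real APIs, but the processor directory and ARPA path below are hypothetical stand-ins:

from pyctcdecode import build_ctcdecoder
from transformers import Wav2Vec2Processor, Wav2Vec2ProcessorWithLM

# Hypothetical path to a finetuned wav2vec2 model directory with a saved processor.
processor = Wav2Vec2Processor.from_pretrained("models/wav2vec2-fleurs_da")

# pyctcdecode expects the CTC labels in vocabulary-index order.
vocab = processor.tokenizer.get_vocab()
labels = [token for token, _ in sorted(vocab.items(), key=lambda kv: kv[1])]

decoder = build_ctcdecoder(
    labels,
    kenlm_model_path="5gram.arpa",  # hypothetical path to the trained n-gram model
)
processor_with_lm = Wav2Vec2ProcessorWithLM(
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    decoder=decoder,
)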
39 changes: 0 additions & 39 deletions config/model/wav2vec2_with_lm.yaml

This file was deleted.

1 change: 0 additions & 1 deletion config/model/whisper.yaml

This file was deleted.

25 changes: 25 additions & 0 deletions config/model/whisper_large.yaml
@@ -0,0 +1,25 @@
name: whisper_large
type: whisper
pretrained_model_id: openai/whisper-large-v2
freeze_feature_encoder: false

# Data hyperparameters
clean_dataset: false

# Model hyperparameters
sampling_rate: 16_000
dropout: 0.1
activation_dropout: 0.1
attention_dropout: 0.1
mask_time_prob: 0.5
mask_time_length: 10
mask_feature_prob: 0.5
mask_feature_length: 64

# Training hyperparameters
batch_size: 1
gradient_accumulation: 32
max_steps: 120_000
learning_rate: 3e-5
warmup_steps: 500
generation_max_length: 225
25 changes: 25 additions & 0 deletions config/model/whisper_medium.yaml
@@ -0,0 +1,25 @@
name: whisper_medium
type: whisper
pretrained_model_id: openai/whisper-medium
freeze_feature_encoder: false

# Data hyperparameters
clean_dataset: false

# Model hyperparameters
sampling_rate: 16_000
dropout: 0.1
activation_dropout: 0.1
attention_dropout: 0.1
mask_time_prob: 0.5
mask_time_length: 10
mask_feature_prob: 0.5
mask_feature_length: 64

# Training hyperparameters
batch_size: 8
gradient_accumulation: 4
max_steps: 120_000
learning_rate: 3e-5
warmup_steps: 500
generation_max_length: 225
25 changes: 25 additions & 0 deletions config/model/whisper_small.yaml
@@ -0,0 +1,25 @@
name: whisper_small
type: whisper
pretrained_model_id: openai/whisper-small
freeze_feature_encoder: false

# Data hyperparameters
clean_dataset: false

# Model hyperparameters
sampling_rate: 16_000
dropout: 0.1
activation_dropout: 0.1
attention_dropout: 0.1
mask_time_prob: 0.5
mask_time_length: 10
mask_feature_prob: 0.5
mask_feature_length: 64

# Training hyperparameters
batch_size: 32
gradient_accumulation: 1
max_steps: 120_000
learning_rate: 3e-5
warmup_steps: 500
generation_max_length: 225
25 changes: 25 additions & 0 deletions config/model/whisper_xsmall.yaml
@@ -0,0 +1,25 @@
name: whisper_xsmall
type: whisper
pretrained_model_id: openai/whisper-base
freeze_feature_encoder: false

# Data hyperparameters
clean_dataset: false

# Model hyperparameters
sampling_rate: 16_000
dropout: 0.1
activation_dropout: 0.1
attention_dropout: 0.1
mask_time_prob: 0.5
mask_time_length: 10
mask_feature_prob: 0.5
mask_feature_length: 64

# Training hyperparameters
batch_size: 32
gradient_accumulation: 1
max_steps: 120_000
learning_rate: 3e-5
warmup_steps: 500
generation_max_length: 225
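
Across the four whisper configs only `batch_size` and `gradient_accumulation` vary, and they are balanced so the effective batch size stays constant: 1 × 32 (large), 8 × 4 (medium) and 32 × 1 (small, xsmall) all give 32 samples per optimiser step, with a shared schedule of `max_steps: 120_000`, `learning_rate: 3e-5` and `warmup_steps: 500`.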