diff --git a/docs/src/pages/operators/audio-vec-embedding.mdx b/docs/src/pages/operators/audio-vec-embedding.mdx
new file mode 100644
index 00000000..584ceb06
--- /dev/null
+++ b/docs/src/pages/operators/audio-vec-embedding.mdx
@@ -0,0 +1,27 @@
+## Documentation for Audio Vector Embeddings
+
+[[Article Link](https://www.elastic.co/blog/searching-by-music-leveraging-vector-search-audio-information-retrieval)] [[GitHub](https://github.com/salgado/music-search)]
+
+Given an audio file, this operator computes a 2048-dimensional vector using PANNs (Pretrained Audio Neural Networks). PANNs are CNNs pre-trained on a large collection of audio recordings and are used for audio tagging and sound event detection. They have also been fine-tuned for several other audio pattern recognition tasks, where they outperform several state-of-the-art systems.
+
+### Embeddings for vector audio search
+
+Audio embeddings are often generated from spectrograms or other audio signal features. Feature extraction from spectrograms is a crucial step in audio signal processing for machine learning; a spectrogram is a visual representation of the frequency content of an audio signal over time. Three feature types are commonly used (a short librosa sketch for computing them appears at the end of this page):
+- Mel-frequency cepstral coefficients (MFCCs): MFCCs describe the short-term spectral envelope of a signal on the perceptually motivated mel scale.
+- Chroma features: Chroma features represent the 12 distinct pitch classes of the musical octave and are particularly useful in music-related tasks.
+- Spectral contrast: Spectral contrast captures the difference between spectral peaks and valleys in each frequency band of an audio signal.
+
+## Remaining implementation work
+To run this operator, `wget` must also be installed. It is not currently installed in the Dockerfile, so a step such as the following is needed:
+```sh
+apt-get install wget
+```
+This ensures the pre-trained model checkpoint can be downloaded when the operator is initialized.
+
+## How to Run the Test
+The operator and its test live in the `src/api/core/operators` folder of the codebase. The operator is `audio_vec_embedding.py` and the test file is `test_audio_vec_embedding.py`.
+
+To run the test, run the test file:
+```sh
+python -m unittest test_audio_vec_embedding.py
+```
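+
+### Example: computing spectrogram features with librosa
+
+The snippet below is a minimal sketch, not part of the operator itself (the operator relies on PANNs embeddings), showing how the three feature types listed above could be computed with librosa. The file path is an illustrative placeholder.
+
+```python
+import librosa
+import numpy as np
+
+# Illustrative input path; replace with any local audio file.
+audio_path = "sample_data/audio.wav"
+
+# Load the audio as a mono time series at its native sample rate.
+y, sr = librosa.load(audio_path, sr=None)
+
+# Mel-frequency cepstral coefficients: shape (n_mfcc, frames).
+mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
+
+# Chroma features: energy in each of the 12 pitch classes, shape (12, frames).
+chroma = librosa.feature.chroma_stft(y=y, sr=sr)
+
+# Spectral contrast: peak/valley difference per frequency band, shape (n_bands + 1, frames).
+contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
+
+# One simple way to obtain a fixed-length vector: average each feature over time.
+feature_vector = np.concatenate(
+    [mfcc.mean(axis=1), chroma.mean(axis=1), contrast.mean(axis=1)]
+)
+print(feature_vector.shape)
+```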
\ No newline at end of file
diff --git a/src/api/core/operators/audio_vec_embedding.py b/src/api/core/operators/audio_vec_embedding.py
new file mode 100644
index 00000000..ff69c749
--- /dev/null
+++ b/src/api/core/operators/audio_vec_embedding.py
@@ -0,0 +1,46 @@
+def initialize(param):
+    global model
+    global librosa
+    global np
+
+    import numpy as np
+    import librosa
+    from panns_inference import AudioTagging
+
+    # load the default model into cpu.
+    model = AudioTagging(checkpoint_path=None, device='cpu')
+    print('model successfully downloaded')
+
+# Function to normalize a vector. Normalizing a vector means adjusting the values measured in different scales to a common scale.
+def normalize(v):
+    # np.linalg.norm computes the vector's norm (magnitude). The norm is the total length of all vectors in a space.
+    norm = np.linalg.norm(v)
+    if norm == 0:
+        return v
+    # Return the normalized vector.
+    return v / norm
+
+def run(audio_file):
+    # Load the audio file using librosa's load function, which returns an audio time series and its corresponding sample rate.
+    a, _ = librosa.load(audio_file, sr=44100)
+    # Reshape the audio time series to have an extra dimension, which is required by the model's inference function.
+    query_audio = a[None, :]
+    # Perform inference on the reshaped audio using the model. This returns an embedding of the audio.
+    _, emb = model.inference(query_audio)
+    # Normalize the embedding. This scales the embedding to have a length (magnitude) of 1, while maintaining its direction.
+    normalized_v = normalize(emb[0])
+    # Return the normalized embedding required for dot_product elastic similarity dense vector
+    return normalized_v
+
+# if __name__ == "__main__":
+#     import json
+#     import os
+#     audio_file_path = r'sample_data/google-dataset/a-cappella-chorus.wav'
+#     initialize(param={})
+#     audio_filename = os.path.splitext(os.path.basename(audio_file_path))[0]
+#     audio_emb = run(audio_file_path)
+#     audio_emb_list = audio_emb.tolist()
+#     print(audio_emb_list)
+#     json_filename = fr"sample_data/jsons/{audio_filename}_emb.json"
+#     with open(json_filename, 'w') as f:
+#         json.dump(audio_emb_list, f)
\ No newline at end of file
diff --git a/src/api/core/operators/test_audio_vec_embedding.py b/src/api/core/operators/test_audio_vec_embedding.py
new file mode 100644
index 00000000..9e9e1cdd
--- /dev/null
+++ b/src/api/core/operators/test_audio_vec_embedding.py
@@ -0,0 +1,19 @@
+import unittest
+from unittest.case import skip
+import audio_vec_embedding
+
+class Test(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        # initialize operator
+        audio_vec_embedding.initialize(param={})
+
+    @classmethod
+    def tearDownClass(cls):
+        # delete config files
+        pass
+
+    def test_sample_audio_from_disk(self):
+        audio_file_path = r'sample_data/audio.wav'
+        audio_emb = audio_vec_embedding.run(audio_file_path)
+        self.assertEqual(2048, len(audio_emb))
\ No newline at end of file
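+
+    # A sketch of an additional check, not part of the original test: run() applies
+    # normalize() before returning, so the embedding's L2 norm should be close to 1.
+    # Assumes numpy is installed and run() returns a numpy array.
+    def test_embedding_is_normalized(self):
+        import numpy as np
+        audio_emb = audio_vec_embedding.run(r'sample_data/audio.wav')
+        self.assertAlmostEqual(1.0, float(np.linalg.norm(audio_emb)), places=4)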