-
Notifications
You must be signed in to change notification settings - Fork 0
/
dVectorSA.py
84 lines (64 loc) · 2.62 KB
/
dVectorSA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import streamlit as st
import torch
from sklearn.metrics.pairwise import cosine_similarity
import librosa
import numpy as np
import pinecone
# Initialize Pinecone with API key
pinecone.init(api_key="f9571b23-70be-4556-893a-7342b0bb51d1", environment="us-central1-gcp")
pc = pinecone
# Create an index in Pinecone
index = pc.Index('id-index')
def compute_dvector(audio_file):
st.write(audio_file)
# Extract speaker name from the audio file name
speaker_name = (audio_file.name.split('/')[-1]).split('.')[0]
# Load the audio file and set the sampling rate to 16000
audio, sr = librosa.load(audio_file, sr=16000)
# Defining parameters for feature extraction
frame_length = int(sr * 0.025) # 25 ms
hop_length = int(sr * 0.010) # 10 ms
n_fft = 512
n_mels = 40
n_components = 10
# Computing the mel spectrogram of the audio signal
S = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
# Apply logarithm to stabilize the spectrogram
S = np.log(S + 1e-9)
# Compute delta and delta-delta features
delta = librosa.feature.delta(S, width=3)
delta_delta = librosa.feature.delta(S, order=2, width=3)
# Feature Contatenation
features = np.concatenate((S, delta, delta_delta), axis=0)
# The mean and covariance of the features
mean = np.mean(features, axis=1)
cov = np.cov(features)
# Singular value decomposition (SVD) on the covariance matrix
U, s, Vh = np.linalg.svd(cov)
# Construct the affine transformation matrix A
A = np.dot(np.dot(U, np.diag(np.sqrt(s))), U.T)
# Compute the d-vector using the affine transformation and mean features
x = np.mean(features, axis=1)
d = np.dot(np.dot(A, U.T), (x))
# Query the Pinecone index to find the nearest speaker
dvector1 = index.query(
vector=list(d),
top_k=1,
include_values=True
)['matches'][0]
# Return the nearest speaker's ID and score
return "Nearest Speaker Found :: " + dvector1['id'] + " with score of :: " + str(dvector1['score'])
# Define Streamlit app
st.title("Audio Analysis")
st.write("Comparison of two audio samples using d-vectors.")
# Create audio input component for the first audio file
audio_file1 = st.file_uploader("Choose 1st audio file", type=["mp3", "wav", "flac", "m4a"])
if audio_file1 is not None:
# Compute d-vector for the first audio file
rs = compute_dvector(audio_file1)
try:
# Display the nearest speaker ID and score
st.write("Nearest Speaker Found :: " + dvector1[1] + " with score of :: " + dvector1[0])
st.write(rs)
except:
pass