MFCCs¶

How to extract meaningful features from audio¶

(from "A tutoral on Text-Independent Speaker Verification" by Bimbot et al., 2004)¶

Loading and Preprocessing Audio Data¶

This code cell demonstrates how to load and plot audio data using the librosa library. It starts by importing necessary modules, including librosa for audio processing, numpy for numerical operations, and matplotlib for data visualization.

Next, it sets the path to an audio file and uses the librosa.load() function to read the audio data into a numpy array. If the file cannot be found, a clear message is printed before the exception is re-raised.

Finally, the code generates a plot of the audio waveform using matplotlib's plot() function, and sets the title and axis labels using title(), xlabel(), and ylabel(). The resulting plot is displayed using show().

In [1]:
# Import necessary modules
import librosa
import numpy as np
import matplotlib.pyplot as plt

# Set file path and load audio data
wavsFolder = "../../Wavs/NormalizedAudio/"
fileName = "usc_timit_033.wav"
filePath = wavsFolder + fileName

try:
    audio, sr = librosa.load(filePath, sr=None, mono=True)
except FileNotFoundError:
    print(f"File not found: {filePath}")
    raise

# Build the time axis (one point per sample)
time = np.arange(len(audio)) / sr

# Plot audio waveform
plt.figure(figsize=(14, 5))
plt.plot(time, audio)
plt.title(f"Audio waveform: {fileName}")
plt.xlabel("Time [s]")
plt.ylabel("Amplitude")
plt.show()
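
As an aside, librosa ships a plotting helper that produces the same kind of waveform plot with the time axis handled automatically. A minimal sketch, assuming a librosa version that provides librosa.display.waveshow (0.9 or later):

In [ ]:
# Alternative waveform plot using librosa's helper (assumes librosa >= 0.9,
# where librosa.display.waveshow is available)
import librosa.display

plt.figure(figsize=(14, 5))
librosa.display.waveshow(audio, sr=sr)
plt.title(f"Audio waveform: {fileName}")
plt.xlabel("Time [s]")
plt.ylabel("Amplitude")
plt.show()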

Speech Parametrization: Pre-emphasis (Optional)¶

This code cell defines a function called preEmphasis() that applies pre-emphasis to an audio signal using a pre-emphasis coefficient a. Pre-emphasis is a filtering step used in speech processing to balance the frequency spectrum of a signal, making the higher frequencies more pronounced in the subsequent analysis.

The function takes the audio data as input and creates a new numpy array called audio_preEmphasis with the same shape as the input, using np.zeros_like(). The for loop applies the pre-emphasis to each sample of the audio signal using the difference equation y[t] = x[t] - a*x[t-1], where x is the input audio and y is the pre-emphasized audio; the first sample is copied unchanged. The resulting pre-emphasized audio is returned as output.

To make the code more robust, the function checks that the input audio is a numpy array and that the pre-emphasis coefficient a lies in the valid range [0, 1]. A docstring documents the purpose, parameters, and return value of the function.

In [2]:
def preEmphasis(audio, a=0.95):
    """
    Applies pre-emphasis to an audio signal.

    Parameters:
    audio (numpy.ndarray): Audio signal data.
    a (float): Pre-emphasis coefficient.

    Returns:
    numpy.ndarray: Pre-emphasized audio signal data.
    """
    if not isinstance(audio, np.ndarray):
        raise TypeError("Input audio must be a numpy array.")

    if a < 0 or a > 1:
        raise ValueError("Pre-emphasis coefficient must be between 0 and 1.")

    audio_preEmphasis = np.zeros_like(audio)
    audio_preEmphasis[0] = audio[0]
    for i in range(1, len(audio)):
        audio_preEmphasis[i] = audio[i] - a*audio[i-1]
    return audio_preEmphasis
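
The loop above is easy to read but slow in Python for long signals. The same filter can be written with vectorized numpy operations; librosa also provides librosa.effects.preemphasis for this purpose (its handling of the first sample may differ slightly). A minimal sketch of the vectorized version:

In [ ]:
def preEmphasisVectorized(audio, a=0.95):
    # y[0] = x[0], y[t] = x[t] - a*x[t-1] for t >= 1, in a single numpy expression
    return np.concatenate(([audio[0]], audio[1:] - a * audio[:-1]))

# Quick check against the loop-based implementation
assert np.allclose(preEmphasisVectorized(audio), preEmphasis(audio))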

This code cell plots the original audio signal and its amplitude spectrum, together with the pre-emphasized signal and its amplitude spectrum. Each signal is windowed with a Hamming window and its amplitude spectrum is computed with the Fast Fourier Transform (FFT). The preEmphasis function acts as a first-order high-pass filter, boosting the high frequencies relative to the low ones; comparing the two spectra makes this effect visible.

In [3]:
from scipy.signal.windows import hamming  # scipy.signal.hamming was moved to scipy.signal.windows in recent SciPy versions

winLen = len(audio)
win = hamming(winLen)
n = 2 ** int(np.ceil(np.log2(len(audio))))

# Plot of the original audio file
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(16,8))
axs[0][0].plot(time, audio, color='#008000', label='original audio')
axs[0][0].legend()
axs[0][0].set_xlabel("Time [s]")
axs[0][0].set_ylabel("Audio amplitude")
axs[0][0].set_title("Original audio")

# Amplitude Spectrum of the original audio file
audio_spectrum = np.fft.fft(audio*win,n) # computes the n-point DFT
freqAxis = np.fft.fftfreq(n, 1.0/sr)[0:n//2+1] # returns the n-point frequency axis, given a certain sample spacing
nyqFreq = sr/2
freqAxis[-1] = nyqFreq
axs[1][0].plot(freqAxis, 2*np.abs(audio_spectrum)[0:n//2+1], color='#008000', label='original audio')
axs[1][0].set_xlabel("Freq [Hz]")
axs[1][0].set_ylabel("Amplitude Spectrum")

# Pre-emphasis
audio_preEmphasis = preEmphasis(audio)
axs[0][1].plot(time, audio_preEmphasis, color='#FFA500', label='pre-emphasized audio')
axs[0][1].legend()
axs[0][1].set_xlabel("Time [s]")
axs[0][1].set_ylabel("Audio amplitude")
axs[0][1].set_title("Pre-emphasized audio")
ylim = np.max(np.abs(np.concatenate([audio, audio_preEmphasis])))
axs[0][0].set_ylim([-ylim,ylim])
axs[0][1].set_ylim([-ylim,ylim])

# Amplitude Spectrum of the pre-emphasized audio
audio_preEmphasis_spectrum = np.fft.fft(audio_preEmphasis*win,n) # computes the n-point DFT
axs[1][1].plot(freqAxis, 2*np.abs(audio_preEmphasis_spectrum)[0:n//2+1], color='#FFA500', label='pre-emphasized audio')
axs[1][1].set_xlabel("Freq [Hz]")
axs[1][1].set_ylabel("Amplitude Spectrum")
plt.show()

The purpose of pre-emphasis¶

Voiced speech has a natural downward spectral tilt of roughly -6 dB per octave, caused by the glottal source and lip radiation. Pre-emphasis compensates for this tilt so that the higher formants carry comparable weight in the subsequent spectral analysis.
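
The filter y[t] = x[t] - a*x[t-1] has transfer function H(z) = 1 - a*z^-1. A minimal sketch using scipy.signal.freqz that plots its magnitude response, making the low-frequency attenuation and high-frequency boost explicit (the fs keyword of freqz, available in recent SciPy versions, puts the axis in Hz):

In [ ]:
# Magnitude response of the pre-emphasis filter H(z) = 1 - a*z^{-1} (a sketch)
from scipy.signal import freqz

coef = 0.95
w, h = freqz(b=[1.0, -coef], a=[1.0], worN=2048, fs=sr)  # fs maps the axis to Hz
plt.figure(figsize=(8, 4))
plt.plot(w, 20 * np.log10(np.abs(h) + 1e-12))
plt.title("Pre-emphasis filter magnitude response (a = 0.95)")
plt.xlabel("Freq [Hz]")
plt.ylabel("Gain [dB]")
plt.show()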

Speech Parametrization: Computing Spectrogram¶

In order to analyze the spectral content of the audio signal over time, we compute its spectrogram using a Hamming window. The compute_spectrogram function defined below takes as input the audio signal, the sampling rate, the length of the Hamming window and the window overlap (both in seconds), and the FFT length (optional; it defaults to the next power of two greater than or equal to the window length). It returns the spectrogram as a 2D array of amplitude values, along with the FFT length and the corresponding time and frequency axes.

We will use this function to compute spectrograms for our audio data; these spectrograms are the starting point for the MFCC features extracted later.

In [4]:
# Note: any trailing samples that do not fill a complete window are discarded
def compute_spectrogram(audio, sr, winLen_seconds=0.02, winOverlap_seconds=0.01, n=None):
    """
    Computes the spectrogram of an audio signal.

    Args:
        audio (array): The input audio signal.
        sr (int): The sampling rate in Hz.
        winLen_seconds (float): The length of the analysis window in seconds.
        winOverlap_seconds (float): The overlap between adjacent windows in seconds.
        n (int): The FFT length. If None, the next power of two greater than or equal to the window length is used.

    Returns:
        The spectrogram, the FFT length, the time axis, and the frequency axis of the audio signal.
    """
    # Determine the window length and overlap in samples
    winLen = int(winLen_seconds * sr)
    winOverlap = int(winOverlap_seconds * sr)
    
    # Use default value for n if it is not specified
    if n is None:
        n = 2 ** int(np.ceil(np.log2(winLen)))
    
    # Create a Hamming window of the appropriate length
    win = hamming(winLen)
    nWin = (len(audio) - winLen) // (winLen - winOverlap) + 1  # number of complete analysis windows
    
    # Initialize the spectrogram array
    spectrogram = np.zeros((nWin, n // 2 + 1))    

    # Compute the spectrogram for each window
    for i in range(nWin):
        # Extract the windowed segment from the audio signal
        start = (winLen - winOverlap)*i
        stop = start + winLen
        audio_segment = audio[start:stop]

        # Apply the Hamming window to the audio segment
        audio_segment_windowed = audio_segment * win
        # Compute the fft of the windowed segment and take the single-sided spectrum
        audio_segment_windowed_spectrum = np.fft.fft(audio_segment_windowed,n)[0:n // 2 + 1]

        # Store the Amplitude Spectrum in the spectrogram
        spectrogram[i,:] = np.abs(audio_segment_windowed_spectrum)
    
    # Compute the frequency and time axes (time points are the window centers)
    audio_segment_windowed_freqAxis = np.fft.fftfreq(n, 1.0/sr)[0:n // 2 + 1]
    audio_segment_windowed_freqAxis[-1] = sr / 2  # fftfreq gives -Nyquist here; use +Nyquist instead
    audio_segment_windowed_timeAxis = (np.arange(nWin) * (winLen - winOverlap) + winLen / 2) / sr
    return spectrogram, n, audio_segment_windowed_timeAxis, audio_segment_windowed_freqAxis
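
As a rough cross-check, scipy.signal.spectrogram computes a comparable short-time magnitude spectrogram. The values will not match compute_spectrogram exactly (scipy applies its own scaling and detrending, and defaults the FFT length to the window length), but the shapes and overall structure should agree. A minimal sketch:

In [ ]:
# Cross-check with scipy.signal.spectrogram (a sketch; values differ by scaling)
from scipy.signal import spectrogram as scipy_spectrogram

f_check, t_check, S_check = scipy_spectrogram(audio, fs=sr, window='hamming',
                                              nperseg=int(0.02 * sr),
                                              noverlap=int(0.01 * sr),
                                              mode='magnitude')
print(S_check.shape)  # (n_freq_bins, n_frames): transposed w.r.t. compute_spectrogram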

In this code cell, we create spectrograms for the original and pre-emphasized audio signals using the compute_spectrogram function with a window length of 20 ms and an overlap of 10 ms. The returned n_fft value is the number of FFT points, chosen by the function as the next power of two greater than or equal to the window length.

We plot the spectrograms side-by-side using the subplots function to create a figure with two subplots. We use the imshow function to display the spectrograms and set the appropriate labels and titles for each subplot. We also set the extent of the plots to match the time and frequency ranges of the spectrograms. Finally, we call tight_layout to avoid overlapping between the subplots.

In [5]:
audio_spectrogram, n_fft, audio_windowed_time, audio_windowed_freq = compute_spectrogram(audio, sr)
audio_preEmphasis_spectrogram, _,  _, _ = compute_spectrogram(audio_preEmphasis, sr)

# Create a figure with two subplots side-by-side
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))

# Original audio
axs[0].set_title("Audio-Original Spectrogram")
axs[0].set_xlabel("Time [s]")
axs[0].set_ylabel("Frequency [KHz]")
audioSpecrogramImage = axs[0].imshow(20*np.log10(audio_spectrogram.T),origin='lower',aspect="auto",
                                     extent=[audio_windowed_time[0],audio_windowed_time[-1], audio_windowed_freq[0]/1000, audio_windowed_freq.max()/1000]) # log for dB

# Emphasized audio
axs[1].set_title("Audio-Emphasized Spectrogram")
axs[1].set_xlabel("Time [s]")
axs[1].set_ylabel("Frequency [KHz]")
audioEmphasizedSpectrogramImage = axs[1].imshow(20*np.log10(audio_preEmphasis_spectrogram.T),origin="lower",aspect="auto",
                                                extent=[audio_windowed_time[0],audio_windowed_time[-1], audio_windowed_freq[0]/1000, audio_windowed_freq.max()/1000]) # log for dB

# Set a tight layout to avoid overlapping
fig.tight_layout()

Speech Parametrization: Mel-Scale Filterbank Design¶

This code cell defines the functions hertzToMel, melToHertz, and designFilterbank, which convert frequencies between Hertz and the Mel scale and design a Mel-scale filterbank for use in speech analysis.

The hertzToMel and melToHertz functions implement the approximation mel(f) = 1000 * log2(1 + f/1000) and its inverse, so that 1000 Hz maps to 1000 mel.

The designFilterbank function takes the sampling frequency, the number of FFT points, the optional lower and upper frequency limits, and the number of Mel bands; it returns a matrix of nMel overlapping triangular filters (one per row, defined over the single-sided FFT bins) together with the Mel-scale band edges.

In [6]:
def hertzToMel(f):
    """
    Convert frequency in Hertz to Mel scale
    
    Parameters:
    -----------
    f: float
        Frequency in Hertz
    
    Returns:
    --------
    float
        Frequency in Mel scale
    """
    return (np.log10(1+f/1000) / np.log10(2)) * 1000

def melToHertz(m):
    """
    Convert frequency in Mel scale to Hertz
    
    Parameters:
    -----------
    m: float
        Frequency in Mel scale
    
    Returns:
    --------
    float
        Frequency in Hertz
    """
    return (10**((m/1000)*(np.log10(2))) - 1) * 1000

def designFilterbank(sr, n_fft, fMax=None, fMin=0, nMel=24):
    """
    Design a filterbank in the Mel domain
    
    Parameters:
    -----------
    sr: int
        Sampling frequency in Hz
    n_fft: int
        Number of points in the FFT
    fMax: int, optional
        Upper limit frequency in Hz; default is None, meaning the Nyquist frequency (half the sampling frequency)
    fMin: int, optional
        Lower limit frequency in Hz, default is 0
    nMel: int, optional
        Number of Mel bands to use, default is 24
        
    Returns:
    --------
    filterbank: ndarray
        A matrix of shape (nMel, n_fft // 2 + 1) representing the filterbank
    melPoints: ndarray
        The nMel + 2 band-edge frequencies on the Mel scale
    """
    if fMax is None:
        fMax = sr/2
    melMin = hertzToMel(fMin)
    melMax = hertzToMel(fMax)
    melPoints = np.linspace(melMin, melMax, nMel+2) # I want nMel overlapped triangular filters, so I need nMel + 2 points on the x-axis
    # Filterbank design in the Hertz domain
    hertzPoints = melToHertz(melPoints)
    # FFT bins per Hertz on the single-sided spectrum (used to map band edges to bin indices)
    pointsPerHertz = (n_fft // 2) / (sr / 2)
    fftBins = np.floor(hertzPoints * pointsPerHertz).astype(int)
    filterbank = np.zeros((nMel, n_fft // 2 + 1))
    
    for i in range(1,nMel+1):
        left = fftBins[i-1]
        center = fftBins[i]
        right = fftBins[i+1]

        # Computation of triangular filter
        for j in range(left, center):  # left and center are already integer bin indices
            filterbank[i-1,j] = (center - j) / (center - left)
        for j in range(center, right):
            filterbank[i-1,j] = (right - j) / (right - center)
    
    return filterbank, melPoints
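
It is useful to look at the filters themselves before applying them. A minimal sketch that plots each row of the filterbank against the single-sided FFT frequency axis (reusing the n_fft value returned by compute_spectrogram above):

In [ ]:
# Visualize the triangular mel-scale filters (a sketch)
fb_check, _ = designFilterbank(sr, n_fft, nMel=24)
freqBins = np.fft.fftfreq(n_fft, 1.0/sr)[0:n_fft // 2 + 1]
freqBins[-1] = sr / 2  # replace -Nyquist with +Nyquist

plt.figure(figsize=(14, 4))
for filt in fb_check:
    plt.plot(freqBins, filt)
plt.title("Mel-scale triangular filterbank (24 filters)")
plt.xlabel("Freq [Hz]")
plt.ylabel("Filter gain")
plt.show()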

This code cell applies the Mel-scale filterbank to the spectrograms computed earlier. The filterbank designed with designFilterbank is applied (as a matrix product) to both the original and the pre-emphasized spectrograms, producing mel spectrograms: each FFT frame is reduced to nMel band energies, i.e. the frequency axis is warped from Hz to the Mel scale and pooled by the triangular filters. The mel spectrograms are then converted to decibels (dB) with 20*np.log10 (a small constant is added to avoid taking the log of zero) and plotted with matplotlib's imshow function.

In [7]:
# Number of filters in the filterbank: 24 roughly matches the number of critical bands
# of the cochlea over the speech bandwidth; nMel is a hyperparameter
nMel = 24

# Design the filterbank
filterbank, melPoints = designFilterbank(sr, n_fft, nMel=nMel)

# Apply the filterbank to the spectrogram
audio_melSpectrogram = audio_spectrogram@filterbank.T
audio_preEmphasis_melSpectrogram = audio_preEmphasis_spectrogram@filterbank.T

# Convert in dB
audio_melSpectrogram = 20*np.log10(audio_melSpectrogram + 1e-9)
audio_preEmphasis_melSpectrogram = 20*np.log10(audio_preEmphasis_melSpectrogram + 1e-9)

# Plot spectrogram
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(20,30))
aspect = audio_windowed_time[-1]/melPoints[-1]

axs[0].set_title("Audio Mel Spectrogram")
axs[0].set_xlabel("Time [s]")
axs[0].set_ylabel("Mel Scale")
axs[0].imshow(audio_melSpectrogram.T, aspect=aspect, origin='lower', extent=[audio_windowed_time[0],audio_windowed_time[-1], melPoints[0], melPoints[-1]])

axs[1].set_title("Audio Emphasized Mel Spectrogram")
axs[1].set_xlabel("Time [s]")
axs[1].set_ylabel("Mel Scale")
axs[1].imshow(audio_preEmphasis_melSpectrogram.T, aspect=aspect, origin='lower', extent=[audio_windowed_time[0],audio_windowed_time[-1], melPoints[0], melPoints[-1]])

# Set a tight layout to avoid overlapping
fig.tight_layout()
plt.show()
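
For comparison, librosa provides an off-the-shelf mel spectrogram. Note that librosa uses a different mel-scale formula and filter normalization than the designFilterbank function above, so the values will not match exactly; this sketch only shows the equivalent one-line API, with the hop and window lengths mirroring the 20 ms / 10 ms framing used here:

In [ ]:
# Off-the-shelf mel spectrogram with librosa (a sketch; librosa uses a different
# mel scale and filter normalization, so values differ from the manual pipeline)
S_librosa = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=n_fft,
                                           win_length=int(0.02 * sr),
                                           hop_length=int(0.01 * sr),
                                           window='hamming', n_mels=24, power=1.0)
S_librosa_db = 20 * np.log10(S_librosa + 1e-9)
print(S_librosa_db.shape)  # (n_mels, n_frames)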

Speech Parametrization: Extracting MFCCs from Mel Spectrogram¶

Now that we have computed the Mel spectrogram, we can extract the Mel-Frequency Cepstral Coefficients (MFCCs) which are commonly used in speech recognition tasks. MFCCs capture the spectral envelope of a speech signal and are obtained by applying a Discrete Cosine Transform (DCT) to the log-amplitude Mel spectrogram. The resulting coefficients are usually normalized and only the first few coefficients are retained as they contain the most relevant information for speech recognition.

In [8]:
def dctBasis(n, m):
    # Build an n x m matrix of DCT basis functions: row i contains the cosine arguments
    # (i+1) * (j + 0.5) * pi / m for j = 0..m-1, so coefficient 0 (the energy term) is skipped
    basis = np.zeros((n,m))
    for i in range(n):
        basis[i,:] = (i+1) * ((np.arange(m) + 0.5) * (np.pi/m))
    return np.cos(basis)

def computeMFCC(audio_melSpectrogram, n_mfcc=13):
    # Number of mel filters
    nMel = audio_melSpectrogram.shape[1]
    # Generate the DCT basis
    dctBasisFunctions = dctBasis(n_mfcc, nMel)
    # Compute the DCT of the log-amplitude mel spectrogram
    audio_mfccs = dctBasisFunctions@audio_melSpectrogram.T
    # Cepstral mean and variance normalization (CMVN): normalize each coefficient to zero
    # mean and unit variance across frames; this reduces channel/session effects, although
    # it can also remove some speaker-specific information
    for i in range(audio_mfccs.shape[0]):
        audio_mfccs[i,:] = (audio_mfccs[i,:] - np.mean(audio_mfccs[i,:])) / np.sqrt(np.var(audio_mfccs[i,:]))
    return audio_mfccs

In [9]:
audio_mfccs = computeMFCC(audio_melSpectrogram)
audio_preEmphasis_mfccs = computeMFCC(audio_preEmphasis_melSpectrogram)

fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(20,30))

axs[0].set_title("Audio MFCCs")
axs[0].set_xlabel("Frame index")
axs[0].set_ylabel("MFCC index")
axs[0].imshow(audio_mfccs, origin="lower", aspect=20)

axs[1].set_title("Audio Emphasized MFCCs")
axs[1].set_xlabel("Frame index")
axs[1].set_ylabel("MFCC index")
axs[1].imshow(audio_preEmphasis_mfccs, origin="lower", aspect=20)
# Set a tight layout to avoid overlapping
fig.tight_layout()
plt.show()
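
As a sanity check, the same projection can be computed with scipy's type-II DCT. The basis built by dctBasis matches the DCT-II cosines for coefficient indices 1..13 up to a constant factor of 2 per coefficient, and that factor is removed by the mean/variance normalization, so after CMVN the two versions should coincide. A minimal sketch, assuming a SciPy version that provides scipy.fft:

In [ ]:
# MFCCs via scipy's DCT-II (a sketch); keep coefficients 1..13, skipping c0
from scipy.fft import dct

audio_mfccs_scipy = dct(audio_melSpectrogram, type=2, axis=1)[:, 1:14].T  # (13, nFrames)
# Apply the same per-coefficient mean/variance normalization as computeMFCC
audio_mfccs_scipy = (audio_mfccs_scipy - audio_mfccs_scipy.mean(axis=1, keepdims=True)) \
                    / audio_mfccs_scipy.std(axis=1, keepdims=True)
print(np.allclose(audio_mfccs_scipy, audio_mfccs))  # should print True (up to float error)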

Bi-Gaussian Model of Feature Vectors¶

We can use a bi-Gaussian model to fit the distribution of feature vectors, and then use this model to discard feature vectors that correspond to silence or background noise. A bi-Gaussian model assumes that the distribution of feature vectors can be described by a combination of two Gaussian distributions, one for the background noise and one for the speech signal.

To fit the bi-Gaussian model we use the Expectation-Maximization (EM) algorithm. The means and covariances of the two Gaussians are initialized randomly; the E-step computes, for every frame, the posterior probability of belonging to the first component, and the M-step re-estimates the means and covariances from those posteriors. The steps are repeated until the log-likelihood of the data stops improving. Once the model has converged, each feature vector can be compared against the two components, and the frames that are more likely to belong to the background component can be discarded.

Below is the code to estimate the bi-Gaussian model of the 13-dimensional MFCC feature vectors (one vector per frame):

In [ ]:
from scipy.stats import multivariate_normal

# Arrange the MFCCs as one feature vector per row: shape (nFrames, nDims)
mfccs = audio_mfccs.T
nDims = mfccs.shape[1]

# Initialize the means and covariances of the Gaussians randomly
mu1 = np.random.randn(nDims)
cov1 = np.eye(nDims)
mu2 = np.random.randn(nDims)
cov2 = np.eye(nDims)

# Run the EM algorithm
logLikelihood = -np.inf
tolerance = 1e-4
converged = False
while not converged:
    # E-step: compute the posterior probabilities (equal mixture weights are assumed)
    p1 = multivariate_normal.pdf(mfccs, mean=mu1, cov=cov1)
    p2 = multivariate_normal.pdf(mfccs, mean=mu2, cov=cov2)
    gamma = p1 / (p1 + p2)
    
    # M-step: update the parameters of the Gaussians
    mu1 = np.sum(gamma[:, np.newaxis] * mfccs, axis=0) / np.sum(gamma)
    cov1 = np.dot((gamma[:, np.newaxis] * (mfccs - mu1)).T, (mfccs - mu1)) / np.sum(gamma)
    mu2 = np.sum((1 - gamma)[:, np.newaxis] * mfccs, axis=0) / np.sum(1 - gamma)
    cov2 = np.dot(((1 - gamma)[:, np.newaxis] * (mfccs - mu2)).T, (mfccs - mu2)) / np.sum(1 - gamma)
    
    # Compute the log-likelihood of the data
    newLogLikelihood = np.sum(np.log(p1 + p2))
    
    # Check for convergence
    if np.abs(newLogLikelihood - logLikelihood) < tolerance:
        converged = True
    else:
        logLikelihood = newLogLikelihood

# Keep only the frames assigned to the first Gaussian component; which of the two
# components corresponds to speech and which to silence/noise still has to be checked
# (e.g. by comparing the component means against the frame energies)
threshold = 0.5
p1 = multivariate_normal.pdf(mfccs, mean=mu1, cov=cov1)
p2 = multivariate_normal.pdf(mfccs, mean=mu2, cov=cov2)
gamma = p1 / (p1 + p2)
mfccsFiltered = mfccs[gamma >= threshold, :]
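
The hand-rolled EM loop above is useful for understanding the algorithm, but in practice a library implementation is more robust, since it handles mixture weights, covariance regularization, and multiple restarts. A minimal sketch with scikit-learn's GaussianMixture, assuming scikit-learn is installed (it is not used elsewhere in this notebook):

In [ ]:
# Two-component GMM with scikit-learn (a sketch; assumes scikit-learn is installed)
from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=0)
gmm.fit(mfccs)                            # mfccs: (nFrames, nDims), as above
posteriors = gmm.predict_proba(mfccs)     # (nFrames, 2) posterior probabilities
# Which component corresponds to speech still has to be decided (e.g. by comparing
# the components against the frame energies); here we simply keep component 0.
mfccsFiltered_sklearn = mfccs[posteriors[:, 0] >= 0.5, :]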