Source code for rythm_forge.core.feature_functions

import numpy as np
from ..exceptions.exceptions import RythmForgeValueError
from .._lib import rythm_forge_core_cpp as core_backend
from .fft_functions import stft



[docs]
def resample(y: np.ndarray, sr: int, new_sr=8000) -> tuple[np.ndarray, int]:
    """
    Resample an audio time series from ``sr`` to ``new_sr``.

    The actual rate conversion is delegated to the compiled backend
    (``core_backend.resample``); this function only forwards the arguments
    and hands the backend's result pair back to the caller.

    :param y: np.ndarray
        Input audio samples: a 1D array, or a 2D array in which each row is
        a separate channel.
    :param sr: int
        Sampling rate at which ``y`` was acquired.
    :param new_sr: int
        Target sampling rate (8000 by default).
    :return: tuple[np.ndarray, int]
        ``(y_hat, new_sr)`` - the resampled signal and the sampling rate the
        backend reports for it.
    """
    # Unpack explicitly so the caller always receives a plain 2-tuple.
    y_hat, out_sr = core_backend.resample(y, sr, new_sr)
    return y_hat, out_sr




[docs]
def mel_filter_bank(sr: int, n_fft: int, n_mel: int) -> np.ndarray:
    """
    Build a Mel filter bank.

    The result is a linear transformation matrix that projects FFT bins onto
    Mel-frequency bins. The filters are computed by the compiled backend
    (``core_backend.mel_filter_bank``) and the backend's matrix is returned
    transposed.

    :param sr: int > 0 [scalar]
        Sampling rate of the incoming signal.
    :param n_fft: int > 0 [scalar]
        Number of FFT components.
    :param n_mel: int > 0 [scalar]
        Number of Mel bands to generate.
    :return: np.ndarray
        Mel transform matrix. Documented shape is ``(n_mel, 1 + n_fft/2)``
        (NOTE(review): shape depends on the backend layout - confirm).
    """
    # The backend's orientation is flipped so that Mel bands come first.
    return core_backend.mel_filter_bank(sr, n_fft, n_mel).T




[docs]
def hz_to_mel(array: np.ndarray):
    """
    Convert frequencies expressed in Hz to the Mel scale.

    The conversion is performed by the compiled backend
    (``core_backend.hz_to_mel``).

    :param array: np.ndarray
        Values in Hz to be converted.
    :return: the backend's result holding the corresponding Mel values.
    """
    mels = core_backend.hz_to_mel(array)
    return mels




[docs]
def mel_to_hz(array: np.ndarray):
    """
    Convert values on the Mel scale back to frequencies in Hz.

    The conversion is performed by the compiled backend
    (``core_backend.mel_to_hz``).

    :param array: np.ndarray
        Values in Mels to be converted.
    :return: the backend's result holding the corresponding Hz values.
    """
    hertz = core_backend.mel_to_hz(array)
    return hertz




[docs]
def magnitude(complex_matrix: np.ndarray):
    """
    Return the element-wise magnitude of a complex-valued array.

    Equivalent to ``np.abs(complex_matrix)``.

    :param complex_matrix: np.ndarray
        Array of complex values, most often the output of ``stft``.
    :return: np.ndarray
        Array of the same shape holding the absolute value of every element.
    """
    # np.absolute is the full name of np.abs; both are the same ufunc.
    moduli = np.absolute(complex_matrix)
    return moduli



def find_peaks(y: np.ndarray):
    """
    Return the indices of the peak samples of a series.

    The search itself is done by the compiled backend
    (``core_backend.find_peaks``); a peak is documented as a sample whose
    neighbours before and after it are both smaller.

    :param y: np.ndarray
        Series to scan, e.g. an onset-strength envelope. The guard below only
        lets 2-dimensional arrays through.
        NOTE(review): presumably one series per row, like the channel layout
        used elsewhere in this module - confirm against the backend.
    :return: np.ndarray
        Indices of the samples that are peaks in the series.
    :raises ValueError: if ``y`` is not a 2-dimensional array.
    """
    if y.ndim != 2:
        # State the dimensionality this guard really enforces; a message
        # asking for "1D" input while rejecting 1D arrays is contradictory.
        raise ValueError(
            f"Given array y must be 2D, but an array with ndim={y.ndim} was given"
        )
    return core_backend.find_peaks(y)


def onset_strength(y: np.ndarray, sr: int):
    """
    Compute the onset strength envelope of an audio signal.

    The envelope measures sudden increases of energy across Mel bands, which
    typically coincide with note onsets or other transients.

    Processing steps:
      1. resample the signal to 8000 Hz,
      2. take the magnitude of its STFT,
      3. project it onto 40 Mel bands and convert to dB,
      4. take the positive part of the difference between frames 3 apart,
      5. left-pad with zeros and sum along the first axis.

    :param y: np.ndarray
        The input audio signal (documented as a 2D numpy array).
    :param sr: int
        The sample rate of the input audio signal.
    :return: np.ndarray
        The onset strength envelope.
        NOTE(review): the resulting rank depends on the shape ``stft``
        returns, which is not visible here - confirm.
    """
    # Fixed analysis settings. The two values handed to ``stft`` are
    # presumably the FFT size and the hop length - confirm against
    # fft_functions.stft.
    target_sr = 8000
    n_fft = 2048
    hop_length = 512
    n_mels = 40
    lag = 3

    y_resampled, sr = resample(y, sr, target_sr)
    spectrum = magnitude(stft(y_resampled, n_fft, hop_length))

    # Project onto Mel bands; the filter gets a leading singleton axis for
    # np.dot, which is dropped again right after the product.
    mel_basis = mel_filter_bank(sr, n_fft, n_mels)
    mel_spec = np.abs(np.dot(mel_basis[np.newaxis, :, :], spectrum)[0])
    mel_db = power_to_dB(mel_spec)

    # Half-wave rectified difference between frames ``lag`` apart.
    flux = np.maximum(0.0, mel_db[..., lag:] - mel_db[..., :-lag])

    # Zero-pad the front of the last axis by the lag plus n_fft // (2 * hop).
    lead = lag + n_fft // (2 * hop_length)
    pad_spec = [(0, 0)] * (flux.ndim - 1) + [(int(lead), 0)]
    flux = np.pad(flux, pad_spec, mode="constant")

    return flux.sum(axis=0)


def tempo_estimation(y: np.ndarray, sr: int):
    """
    Estimate the tempo of a signal in beats per minute.

    The tempo is derived from the mean spacing between consecutive peaks of
    the onset strength envelope.

    Experimental: the original author marked this estimator as unreliable;
    validate the results before relying on them.

    :param y: np.ndarray
        The input audio signal.
    :param sr: int
        The sample rate of the input audio signal.
    :return: float
        Estimated tempo in bpm; NaN when fewer than two peaks are found.
    """
    # These must mirror the settings hard-coded in onset_strength: the signal
    # is resampled to 8000 Hz and analysed with a hop of 512 samples.
    envelope_sr = 8000
    hop_length = 512

    envelope = onset_strength(y, sr)
    peaks = find_peaks(envelope)

    # Peaks index envelope *frames*, not samples of the original signal, so
    # the spacing in seconds is frames * hop_length / envelope_sr rather
    # than frames / sr.
    peak_intervals = np.diff(peaks) * (hop_length / envelope_sr)
    if np.size(peak_intervals) == 0:
        # No interval to average; avoid the mean-of-empty-array warning.
        return np.nan
    return 60 / np.mean(peak_intervals)


def beat_estimation(y: np.ndarray, sr: int):
    """
    Estimate beat locations of a signal.

    Experimental: the original author marked this function as not working.

    NOTE(review): the beat interval in seconds is *divided* by 8000 before
    truncation to int, which looks like an inverted seconds-to-samples
    conversion, and peaks are searched in the raw signal rather than in the
    onset envelope. The intended algorithm is not clear from this file, so
    the computation is kept as-is - confirm with the author.

    :param y: np.ndarray
        The input audio signal.
    :param sr: int
        The sample rate of the input audio signal.
    :return: list
        One value per peak found in ``y``: the peak index multiplied by the
        truncated beat interval.
    """
    bpm = tempo_estimation(y, sr)
    seconds_per_beat = 60 / bpm
    step = int(seconds_per_beat / 8000)
    return [peak * step for peak in find_peaks(y)]



[docs]
def amplitude_to_dB(A, ref=1.0, amin=1e-10, top_db=80.0):
    """
    Convert an amplitude spectrogram to decibel (dB) units.

    Computes ``20 * log10(max(amin, A) / max(amin, ref))`` and then clips the
    result so that no value lies more than ``top_db`` below the peak.

    :param A: np.ndarray
        Input amplitude spectrogram (anything accepted by ``np.asarray``).
    :param ref: float or callable
        Reference value. If scalar, amplitude is scaled relative to `ref`.
        If callable, the reference value is computed as `ref(A)`.
    :param amin: float
        Minimum threshold for `A` and `ref`; avoids taking the log of zero.
    :param top_db: float
        Threshold the output at `top_db` below the peak.
    :return: np.ndarray
        The dB-scaled spectrogram.
    """
    A = np.asarray(A)
    ref_value = ref(A) if callable(ref) else ref

    # Amplitude (not power) quantities use a factor of 20.
    log_spec = 20.0 * np.log10(np.maximum(amin, A) / np.maximum(amin, ref_value))
    # Limit the dynamic range to top_db below the loudest value.
    log_spec = np.maximum(log_spec, log_spec.max() - top_db)

    return log_spec




[docs]
def melspectrogram(
    stft_matrix: np.ndarray, n_fft=2048, sr=44100, n_mels=128
) -> np.ndarray:
    """
    Project an STFT matrix onto the Mel scale.

    A Mel filter bank is built with ``mel_filter_bank(sr, n_fft, n_mels)``
    and applied along the frequency axis of ``stft_matrix``, replacing the
    frequency bins with Mel bands.

    :param stft_matrix: np.ndarray
        STFT matrix of shape ``(n_freqs, n_times)``. Only 2-dimensional
        input is accepted.
    :param n_fft: int, optional, default=2048
        Number of FFT components used to build the filter bank.
    :param sr: int, optional, default=44100
        Sample rate of the audio signal, used to build the filter bank.
    :param n_mels: int, optional, default=128
        Number of Mel bands to generate.
    :return: np.ndarray
        Mel spectrogram of shape ``(n_mels, n_times)``.
    :raises RythmForgeValueError: if ``stft_matrix`` is not 2-dimensional.
    """
    is_two_dimensional = stft_matrix.ndim == 2
    if not is_two_dimensional:
        raise RythmForgeValueError(
            "Wrong STFT matrix dim number! STFT should have ndim=2"
        )

    mel_basis = mel_filter_bank(sr, n_fft, n_mels)

    # Contract the frequency axis (f) of the STFT with that of the filters.
    return np.einsum("...ft,mf->...mt", stft_matrix, mel_basis, optimize=True)




[docs]
def power_to_dB(S, ref=1.0, amin=1e-10, top_db=80.0):
    """
    Convert a power spectrogram to decibel (dB) units.

    Computes ``10 * log10(max(amin, S) / max(amin, ref))`` and then clips the
    result so that no value lies more than ``top_db`` below the peak.

    :param S: np.ndarray
        Input power spectrogram (anything accepted by ``np.asarray``).
    :param ref: float or callable
        Reference value. If scalar, power is scaled relative to `ref`.
        If callable, the reference value is computed as `ref(S)`.
    :param amin: float
        Minimum threshold for `S` and `ref`; avoids taking the log of zero.
    :param top_db: float
        Threshold the output at `top_db` below the peak.
    :return: np.ndarray
        The dB-scaled spectrogram.
    """
    S = np.asarray(S)
    ref_value = ref(S) if callable(ref) else ref

    # Power quantities use a factor of 10.
    ratio = np.maximum(amin, S) / np.maximum(amin, ref_value)
    decibels = 10.0 * np.log10(ratio)

    # Limit the dynamic range to top_db below the loudest value.
    floor = decibels.max() - top_db
    return np.maximum(decibels, floor)