[speech signal processing] 3 speech signal visualization -- prosody
2022-07-01 12:10:00 【Yang SiCheng】
Speech feature extraction and visualization
1. Preparation
This post adds prosody-related feature extraction on top of the earlier work. For the background, see: [Speech Signal Processing] 1 Speech Signal Visualization -- Time Domain, Frequency Domain, Spectrogram, MFCC: Detailed Ideas, Calculations, and Differences.
Install this library:
pip install praat-parselmouth
A few other packages are also used; install whichever ones you are missing:
pip install pydub
pip install python_speech_features
pip install librosa==0.9
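Before running the full script below, you may want a quick sanity check that parselmouth is installed correctly. The following minimal sketch (not part of the original post) loads a WAV file and prints a few basic prosodic statistics; the file name sanity_check.wav is a placeholder, point it at any short recording.
import numpy as np
import parselmouth as pm

snd = pm.Sound("sanity_check.wav")             # placeholder path: any short WAV recording
pitch = snd.to_pitch(time_step=0.005)          # F0 track sampled every 5 ms
intensity = snd.to_intensity(time_step=0.005)  # intensity (dB) track
f0 = pitch.selected_array['frequency']         # unvoiced frames are reported as 0.0
print("duration (s):", snd.get_total_duration())
print("median voiced F0 (Hz):", np.median(f0[f0 > 0]))
print("mean intensity (dB):", intensity.values.mean())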
2. Final result

3. Code
"""
This script contains supporting functions for data processing.
It is used in several other scripts:
for generating bvh files, aligning sequences, and calculating speech features
"""
import librosa
import librosa.display
from pydub import AudioSegment # TODO(RN) add dependency!
import parselmouth as pm # TODO(RN) add dependency!
from python_speech_features import mfcc
import scipy.io.wavfile as wav
import numpy as np
import scipy
NFFT = 1024
MFCC_INPUTS=26 # How many features we will store for each MFCC vector
WINDOW_LENGTH = 0.1 #s
SUBSAMPL_RATE = 9
def derivative(x, f):
    """ Calculate numerical derivative (by FDM) of a 1d array
    Args:
        x: input space x
        f: Function of x
    Returns:
        der: numerical derivative of f wrt x
    """
    x = 1000 * x  # from seconds to milliseconds
    # Normalization:
    dx = x[1] - x[0]
    cf = np.convolve(f, [1, -1]) / dx
    # Remove unstable values
    der = cf[:-1].copy()
    der[0] = 0
    return der
def create_bvh(filename, prediction, frame_time):
    """
    Create BVH File
    Args:
        filename: file, in which motion in bvh format should be written
        prediction: motion sequences, to be written into file
        frame_time: frame rate of the motion
    Returns:
        nothing, writes motion to the file
    """
    with open('hformat.txt', 'r') as ftemp:
        hformat = ftemp.readlines()
    with open(filename, 'w') as fo:
        prediction = np.squeeze(prediction)
        print("output vector shape: " + str(prediction.shape))
        offset = [0, 60, 0]
        offset_line = "\tOFFSET " + " ".join("{:.6f}".format(x) for x in offset) + '\n'
        fo.write("HIERARCHY\n")
        fo.write("ROOT Hips\n")
        fo.write("{\n")
        fo.write(offset_line)
        fo.writelines(hformat)
        fo.write("MOTION\n")
        fo.write("Frames: " + str(len(prediction)) + '\n')
        fo.write("Frame Time: " + frame_time + "\n")
        for row in prediction:
            row[0:3] = 0
            legs = np.zeros(24)
            row = np.concatenate((row, legs))
            label_line = " ".join("{:.6f}".format(x) for x in row) + " "
            fo.write(label_line + '\n')
        print("bvh generated")
def shorten(arr1, arr2, min_len=0):
    if min_len == 0:
        min_len = min(len(arr1), len(arr2))
    arr1 = arr1[:min_len]
    arr2 = arr2[:min_len]
    return arr1, arr2
def average(arr, n):
    """ Replace every "n" values by their average
    Args:
        arr: input array
        n: number of elements to average on
    Returns:
        resulting array
    """
    end = n * int(len(arr) / n)
    return np.mean(arr[:end].reshape(-1, n), 1)
def calculate_spectrogram(audio_filename):
    """ Calculate spectrogram for the audio file
    Args:
        audio_filename: audio file name
    Returns:
        log spectrogram values
    """
    DIM = 64
    audio, sample_rate = librosa.load(audio_filename)
    # Make stereo audio mono
    if len(audio.shape) == 2:
        audio = (audio[:, 0] + audio[:, 1]) / 2
    # note: scipy.signal.hanning is removed in newer SciPy releases; scipy.signal.windows.hann is the replacement
    spectr = librosa.feature.melspectrogram(y=audio, sr=sample_rate, window=scipy.signal.hanning,
                                            # win_length=int(WINDOW_LENGTH * sample_rate),
                                            hop_length=int(WINDOW_LENGTH * sample_rate / 2),
                                            fmax=7500, fmin=100, n_mels=DIM)
    # Shift into the log scale
    eps = 1e-10
    log_spectr = np.log(abs(spectr) + eps)
    return np.transpose(log_spectr)
def calculate_mfcc(audio_filename):
    """
    Calculate MFCC features for the audio in a given file
    Args:
        audio_filename: file name of the audio
    Returns:
        feature_vectors: MFCC feature vector for the given audio file
    """
    fs, audio = wav.read(audio_filename)
    # Make stereo audio mono
    if len(audio.shape) == 2:
        audio = (audio[:, 0] + audio[:, 1]) / 2
    # Calculate MFCC features with the window frame it was designed for
    input_vectors = mfcc(audio, winlen=0.02, winstep=0.01, samplerate=fs, numcep=MFCC_INPUTS, nfft=NFFT)
    input_vectors = [average(input_vectors[:, i], 5) for i in range(MFCC_INPUTS)]
    feature_vectors = np.transpose(input_vectors)
    return feature_vectors
def extract_prosodic_features(audio_filename):
    """
    Extract all 5 prosodic features
    Args:
        audio_filename: file name for the audio to be used
    Returns:
        pros_feature: energy, energy_der, pitch, pitch_der, pitch_ind
    """
    WINDOW_LENGTH = 5
    # Read audio from file
    sound = AudioSegment.from_file(audio_filename, format="wav")
    # Alternative prosodic features
    pitch, energy = compute_prosody(audio_filename, WINDOW_LENGTH / 1000)
    duration = len(sound) / 1000
    t = np.arange(0, duration, WINDOW_LENGTH / 1000)
    energy_der = derivative(t, energy)
    pitch_der = derivative(t, pitch)
    # Average everything in order to match the frequency
    energy = average(energy, 10)
    energy_der = average(energy_der, 10)
    pitch = average(pitch, 10)
    pitch_der = average(pitch_der, 10)
    # Cut them to the same size
    min_size = min(len(energy), len(energy_der), len(pitch), len(pitch_der))
    energy = energy[:min_size]
    energy_der = energy_der[:min_size]
    pitch = pitch[:min_size]
    pitch_der = pitch_der[:min_size]
    # Stack them all together
    pros_feature = np.stack((energy, energy_der, pitch, pitch_der))  # , pitch_ind))
    # And reshape
    pros_feature = np.transpose(pros_feature)
    return pros_feature
def compute_prosody(audio_filename, time_step=0.05):
    print(pm.__file__)
    audio = pm.Sound(audio_filename)
    # Extract pitch and intensity
    pitch = audio.to_pitch(time_step=time_step)
    intensity = audio.to_intensity(time_step=time_step)
    # Evenly spaced time steps
    times = np.arange(0, audio.get_total_duration() - time_step, time_step)
    # Compute prosodic features at each time step
    pitch_values = np.nan_to_num(
        np.asarray([pitch.get_value_at_time(t) for t in times]))
    intensity_values = np.nan_to_num(
        np.asarray([intensity.get_value(t) for t in times]))
    intensity_values = np.clip(
        intensity_values, np.finfo(intensity_values.dtype).eps, None)
    # Normalize features [Chiu '11]
    pitch_norm = np.clip(np.log(pitch_values + 1) - 4, 0, None)
    intensity_norm = np.clip(np.log(intensity_values) - 3, 0, None)
    return pitch_norm, intensity_norm
def read_wav(path_wav):
    import wave
    f = wave.open(path_wav, 'rb')
    params = f.getparams()
    nchannels, sampwidth, framerate, nframes = params[:4]  # number of channels, sample width in bytes, sampling rate, number of frames
    voiceStrData = f.readframes(nframes)
    waveData = np.frombuffer(voiceStrData, dtype=np.short)  # convert the raw bytes to integers (frombuffer instead of the deprecated fromstring)
    waveData = waveData * 1.0 / max(abs(waveData))  # normalize the audio data
    waveData = np.reshape(waveData, [nframes, nchannels]).T  # transpose so each row holds the samples of one channel (nchannels rows in total)
    f.close()
    return waveData, nframes, framerate
import matplotlib.pyplot as plt
def draw_time_domain_image(x1, waveData, nframes, framerate):  # time-domain features
    time = np.arange(0, nframes) * (1.0 / framerate)
    # plt.figure(1)
    x1.plot(time, waveData[0, :], c='b')
    plt.xlabel('time')
    plt.ylabel('am')
    # plt.show()
def draw_frequency_domain_image(x2, waveData):  # frequency-domain features
    fftdata = np.fft.fft(waveData[0, :])
    fftdata = abs(fftdata)
    hz_axis = np.arange(0, len(fftdata))
    # plt.figure(2)
    x2.plot(hz_axis, fftdata, c='b')
    plt.xlabel('hz')
    plt.ylabel('am')
    # plt.show()
def draw_Spectrogram(x3, waveData, framerate):  # spectrogram
    framelength = 0.025  # frame length, typically 20~30 ms
    framesize = framelength * framerate  # points per frame N = t*fs, usually 256 or 512; it should equal NFFT, and NFFT is best chosen as a power of 2, so framesize is rounded to a power of 2
    nfftdict = {}
    lists = [32, 64, 128, 256, 512, 1024]
    for i in lists:  # find the power of 2 closest to the current framesize
        nfftdict[i] = abs(framesize - i)
    sortlist = sorted(nfftdict.items(), key=lambda x: x[1])  # sort by distance to the current framesize, ascending
    framesize = int(sortlist[0][0])  # take the power of 2 closest to the current framesize as the new framesize
    NFFT = framesize  # NFFT must equal the number of points per frame (framesize), i.e. an FFT without zero padding
    overlapSize = 1.0 / 3 * framesize  # number of overlapping samples, usually 1/3~1/2 of the points per frame
    overlapSize = int(round(overlapSize))  # round to an integer
    spectrum, freqs, ts, fig = x3.specgram(waveData[0], NFFT=NFFT, Fs=framerate, window=np.hanning(M=framesize),
                                           noverlap=overlapSize, mode='default', scale_by_freq=True, sides='default',
                                           scale='dB', xextent=None)  # plot the spectrogram
    plt.ylabel('Frequency')
    plt.xlabel('Time(s)')
    plt.title('Spectrogram')
    # plt.show()
def mfcc_librosa(ax, path):
    y, sr = librosa.load(path, sr=None)
    '''
    librosa.feature.mfcc(y=None, sr=22050, S=None, n_mfcc=20, dct_type=2, norm='ortho', **kwargs)
    y: time-domain sequence of the audio signal
    sr: sampling rate (default 22050)
    S: log-power Mel spectrogram (default None)
    n_mfcc: number of Mel cepstral coefficients (default 20)
    dct_type: type of discrete cosine transform (DCT) (default type 2)
    norm: if the DCT type is 2 or 3, setting this to "ortho" uses an orthonormal DCT basis; normalization is not supported for DCT type 1
    kwargs: when processing a time-series input, see melspectrogram
    Returns:
        M: MFCC sequence
    '''
    mfcc_data = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    ax.matshow(mfcc_data)
    plt.title('MFCC')
    # plt.show()
from scipy.io import wavfile
from python_speech_features import mfcc, logfbank
def mfcc_python_speech_features(ax, path):
    sampling_freq, audio = wavfile.read(path)  # read the input audio file
    mfcc_features = mfcc(audio, sampling_freq)  # extract MFCC features
    filterbank_features = logfbank(audio, sampling_freq)  # extract filter bank features, numpy.ndarray of shape (num_windows, 26)
    print(filterbank_features.shape)
    print('\nMFCC:\nNumber of windows =', mfcc_features.shape[0])
    print('Length of each feature =', mfcc_features.shape[1])
    print('\nFilter bank:\nNumber of windows =', filterbank_features.shape[0])
    print('Length of each feature =', filterbank_features.shape[1])
    mfcc_features = mfcc_features.T  # transpose so the time axis is horizontal, then visualize the MFCC features
    ax.matshow(mfcc_features)
    plt.title('MFCC')
    filterbank_features = filterbank_features.T  # transpose so the time axis is horizontal, then visualize the filter bank features
    ax.matshow(filterbank_features)
    plt.title('Filter bank')
    plt.show()
if __name__ == "__main__":
    Debug = 1
    if Debug:
        audio_filename = "your path.wav"
        # feature = calculate_spectrogram(audio_filename)
        waveData, nframes, framerate = read_wav(audio_filename)
        ax1 = plt.subplot(3, 3, 1)
        draw_time_domain_image(ax1, waveData, nframes, framerate)
        ax2 = plt.subplot(3, 3, 2)
        draw_frequency_domain_image(ax2, waveData)
        ax3 = plt.subplot(3, 3, 3)
        draw_Spectrogram(ax3, waveData, framerate)
        ax4 = plt.subplot(3, 3, 4)
        mfcc_librosa(ax4, audio_filename)
        x = calculate_spectrogram(audio_filename)
        print(x.shape)  # (145, 64)
        ax5 = plt.subplot(3, 3, 5)
        ax5.plot(x)
        x = calculate_mfcc(audio_filename)
        print(x.shape)  # (143, 26)
        ax6 = plt.subplot(3, 3, 6)
        ax6.plot(x)
        x = extract_prosodic_features(audio_filename)
        print(x.shape)  # (143, 4)
        ax7 = plt.subplot(3, 3, 7)
        ax7.plot(x)
        x, y = compute_prosody(audio_filename, time_step=0.05)
        print(x.shape)  # (143,)
        print(y.shape)  # (143,)
        ax8 = plt.subplot(3, 3, 8)
        ax8.plot(x)
        ax9 = plt.subplot(3, 3, 9)
        ax9.plot(y)
        plt.tight_layout()
        plt.savefig('1.jpg')