Speech Features
In [1]:
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import style
import librosa
import IPython.display
import librosa.display
import os
import random
from matplotlib.pyplot import specgram
import glob
Tonnetz
A Tonnetz is a network (lattice) of tonal relationships. The tonnetz features computed here are the six tonal centroid coordinates — projections of the chroma vector onto "central" tones — introduced by Harte et al. in "Detecting Harmonic Change in Musical Audio". They capture variation in the tonal content of a signal.
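A minimal sketch of computing these with librosa. The one-second synthetic sine tone is an assumption for illustration; any mono audio array works:

import numpy as np
import librosa

sr = 22050
y = 0.5 * np.sin(2 * np.pi * 440.0 * np.arange(sr) / sr)  # 1 s of A4

# take the harmonic component first, as the extract_feature function below does
ton = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)
print(ton.shape)  # (6, n_frames): six tonal centroid dimensions per frame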
STFT
Short-time Fourier transform. Segments the signal into short, overlapping frames and computes the Fourier transform of each frame, giving a time-frequency representation.
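A sketch of the call used throughout this notebook, again on a synthetic tone (the frame and hop sizes shown are librosa's defaults):

import numpy as np
import librosa

sr = 22050
y = 0.5 * np.sin(2 * np.pi * 440.0 * np.arange(sr) / sr)

D = librosa.stft(y, n_fft=2048, hop_length=512)  # complex spectrogram
S = np.abs(D)                                    # magnitude, as in extract_feature below
print(S.shape)                  # (1 + n_fft/2, n_frames) = (1025, n_frames)
# bin spacing is sr / n_fft ~ 10.8 Hz, so the 440 Hz tone should peak near bin 41
print(S.mean(axis=1).argmax())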
Spectral Contrast
Relative distribution of spectral energies: for each frequency sub-band, the difference (in dB) between the spectral peaks and the spectral valleys. Tonal sounds show high contrast; broadband, noise-like sounds show low contrast.
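A sketch contrasting a pure tone with white noise (synthetic signals assumed for illustration); the tonal signal should typically show the higher mean contrast:

import numpy as np
import librosa

sr = 22050
tone = 0.5 * np.sin(2 * np.pi * 440.0 * np.arange(sr) / sr)
noise = 0.1 * np.random.randn(sr)

for name, y in [('tone', tone), ('noise', noise)]:
    S = np.abs(librosa.stft(y))
    c = librosa.feature.spectral_contrast(S=S, sr=sr)  # (n_bands + 1, n_frames) = (7, n_frames)
    print(name, c.mean())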
Chromagrams
Chromagrams project the spectral energy onto the 12 pitch classes (chroma) of the equal-tempered scale, describing harmonic content independently of octave.
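A quick sanity check on the same synthetic A4 tone: the chroma bin for pitch class A should dominate (librosa's chroma bins start at C, so A is index 9):

import numpy as np
import librosa

sr = 22050
y = 0.5 * np.sin(2 * np.pi * 440.0 * np.arange(sr) / sr)  # A4

chroma = librosa.feature.chroma_stft(S=np.abs(librosa.stft(y)), sr=sr)
print(chroma.shape)                  # (12, n_frames): one row per pitch class
print(chroma.mean(axis=1).argmax())  # expected 9, i.e. pitch class A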
In [2]:
def extract_feature(file_name):
    X, sample_rate = librosa.load(file_name)
    X = np.nan_to_num(X)  # nan_to_num returns a new array, so the result must be kept
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz

def parse_audio_files(parent_dir, sub_dirs, classes, file_ext='*.ogg'):
    features, labels = np.empty((0, 193)), np.empty(0)
    for label, sub_dir in enumerate(sub_dirs):
        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
            mfccs, chroma, mel, contrast, tonnetz = extract_feature(fn)
            ext_features = np.hstack([mfccs, chroma, mel, contrast, tonnetz])
            features = np.vstack([features, ext_features])
            labels = np.append(labels, classes.get(sub_dir))
    return np.array(features), np.array(labels, dtype=int)

def one_hot_encode(labels):
    n_labels = len(labels)
    n_unique_labels = len(np.unique(labels))
    one_hot_encode = np.zeros((n_labels, n_unique_labels))
    one_hot_encode[np.arange(n_labels), labels] = 1
    return one_hot_encode

In [3]:
data_dir = '../data/esc-50'
sample_dir = os.path.join(data_dir, 'sample')
train_dir = os.path.join(data_dir, 'train')
test_dir = os.path.join(data_dir, 'test')

print 'Sound Sample Classes'
print '--------------------'
for d in os.listdir(sample_dir):
    print d

Sound Sample Classes
--------------------
rooster
coughing
insects
laughing

In [4]:
samples_dict = dict()
for d in os.listdir(sample_dir):
    sample_class_dir = os.path.join(sample_dir, d)
    samples_dict[d] = [os.path.join(sample_class_dir, f) for f in os.listdir(sample_class_dir)]

In [5]:
mfccs, chroma, mel, contrast, tonnetz = extract_feature(samples_dict.get('insects')[0])
ext_features = np.hstack([mfccs, chroma, mel, contrast, tonnetz])

In [6]:
mfccs, chroma, mel, contrast, tonnetz = extract_feature(samples_dict.get('insects')[0])
ext_features = np.hstack([mfccs, chroma, mel, contrast, tonnetz])
print len(ext_features)

193

In [7]:
features = np.empty((0, 193))
print features.shape

(0, 193)

In [8]:
features = np.vstack([features, ext_features])
print features

[[ -2.71008667e+02   1.11177134e+02  -2.66969894e+01 ...,  -2.03544718e-01
    5.38986268e-02   1.56129083e-02]]
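Why 193? With the parameters used above, each feature block contributes a fixed number of per-frame coefficients (then averaged over time): 40 MFCCs (n_mfcc=40), 12 chroma bins, 128 mel bands (librosa's default), 7 spectral contrast values, and 6 tonnetz dimensions. A quick check:

n_mfcc, n_chroma, n_mel, n_contrast, n_tonnetz = 40, 12, 128, 7, 6
print(n_mfcc + n_chroma + n_mel + n_contrast + n_tonnetz)  # 193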
In [9]:
sample_dir = os.path.join(data_dir, 'sample')
sub_dirs = ['laughing', 'coughing', 'insects', 'rooster']
classes = {'laughing': 0, 'coughing': 1, 'insects': 2, 'rooster': 3}
features, labels = parse_audio_files(sample_dir, sub_dirs, classes)

In [10]:
print features.shape

(156, 193)

In [11]:
one_hot = one_hot_encode(labels)

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, one_hot, test_size=0.15, random_state=42)
print X_train.shape
print y_train.shape

(132, 193)
(132, 4)

In [56]:
n_hidden_units_one = 50
n_hidden_units_two = 50
n_classes = 4
n_dim = X_train.shape[1]

In [57]:
from keras.models import Sequential
from keras.layers import Dense
import numpy

In [58]:
model = Sequential()
model.add(Dense(n_hidden_units_one, input_dim=n_dim, kernel_initializer='uniform', activation='relu'))
model.add(Dense(n_hidden_units_two, kernel_initializer='uniform', activation='relu'))
model.add(Dense(n_classes, kernel_initializer='uniform', activation='softmax'))

In [59]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [60]:
model.fit(X_train, y_train, nb_epoch=10, batch_size=20)

Epoch 1/10
132/132 [==============================] - 0s - loss: 1.3210 - acc: 0.3636
Epoch 2/10
132/132 [==============================] - 0s - loss: 1.1869 - acc: 0.5530
Epoch 3/10
132/132 [==============================] - 0s - loss: 1.0305 - acc: 0.6136
Epoch 4/10
132/132 [==============================] - 0s - loss: 0.8609 - acc: 0.6970
Epoch 5/10
132/132 [==============================] - 0s - loss: 0.7235 - acc: 0.7500
Epoch 6/10
132/132 [==============================] - 0s - loss: 0.6231 - acc: 0.7879
Epoch 7/10
132/132 [==============================] - 0s - loss: 0.5371 - acc: 0.8258
Epoch 8/10
132/132 [==============================] - 0s - loss: 0.4258 - acc: 0.8788
Epoch 9/10
132/132 [==============================] - 0s - loss: 0.3891 - acc: 0.8712
Epoch 10/10
132/132 [==============================] - 0s - loss: 0.3196 - acc: 0.9015

Out[60]: <keras.callbacks.History at 0x7f03e13c67d0>

In [61]:
scores = model.evaluate(X_test, y_test)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))

24/24 [==============================] - 0s
acc: 83.33%
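Note that nb_epoch is the Keras 1 spelling of this argument. A minimal sketch of the same training and evaluation calls under Keras 2 and later, assuming the model and data splits defined above:

# Keras 2 renamed nb_epoch to epochs; evaluate still returns [loss, accuracy]
model.fit(X_train, y_train, epochs=10, batch_size=20)
loss, acc = model.evaluate(X_test, y_test)
print('accuracy: %.2f%%' % (acc * 100))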
In [14]:
from sklearn.decomposition import PCA
from sklearn import preprocessing
import pandas as pd

features_df = pd.DataFrame(features)
data_scaled = pd.DataFrame(preprocessing.scale(features_df), columns=features_df.columns)

pca = PCA(n_components=5)
pca_results = pca.fit_transform(data_scaled)

print 'Shape of the transformed feature vector:', pca_results.shape
print 'Original training sample:', list(features_df.loc[0].values)
print 'Training sample after PCA:', list(pca_results[0])
print '\n'

# Percentage of variance explained by each component
print 'Explained variance ratio (first five components)'
print '------------------------------------------------'
for idx, r in enumerate(pca.explained_variance_ratio_):
    print 'Principal Component', idx, ':', r

Shape of the transformed feature vector: (156, 5)
Original training sample: [-211.95342873967343, 126.77311337329317, -34.790516564858109, ..., -0.0038187426081049991, -0.0027186458454636149]
Training sample after PCA: [-2.6068373184520963, 2.4242388793647009, -0.65683308107782468, 4.2573390545248051, 1.079539326884505]

Explained variance ratio (first five components)
------------------------------------------------
Principal Component 0 : 0.18766387691
Principal Component 1 : 0.0877785919013
Principal Component 2 : 0.071878311726
Principal Component 3 : 0.0639192643987
Principal Component 4 : 0.0450622371579

In [18]:
from ggplot import *

df_pca = features_df.copy()
df_pca['x-pca'] = pca_results[:,0]
df_pca['y-pca'] = pca_results[:,1]
df_pca['label'] = labels

chart = ggplot(df_pca, aes(x='x-pca', y='y-pca', color='label')) \
    + geom_point(size=75, alpha=0.8) \
    + ggtitle("First and Second Principal Components colored by class")
chart

Out[18]: <ggplot: (8730928897673)>

In [22]:
from sklearn.manifold import TSNE

print("Computing t-SNE embedding")
tsne = TSNE(n_components=2, verbose=1, perplexity=50, n_iter=500)
tsne_results = tsne.fit_transform(features_df)

Computing t-SNE embedding
[t-SNE] Computing pairwise distances...
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 156 / 156
[t-SNE] Mean sigma: 46.734307
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.395291
[t-SNE] Error after 400 iterations: 1.395291

In [23]:
df_tsne = features_df.copy()
df_tsne['x-tsne'] = tsne_results[:,0]
df_tsne['y-tsne'] = tsne_results[:,1]
df_tsne['label'] = labels

chart = ggplot(df_tsne, aes(x='x-tsne', y='y-tsne', color='label')) \
    + geom_point(size=70, alpha=0.1) \
    + ggtitle("t-SNE dimensions colored by class")
chart

Out[23]: <ggplot: (8730928702225)>

In [28]:
from sklearn.manifold import TSNE

print("Computing t-SNE embedding")
tsne = TSNE(n_components=2, verbose=1, perplexity=50, n_iter=1000)
tsne_results = tsne.fit_transform(pca_results)

Computing t-SNE embedding
[t-SNE] Computing pairwise distances...
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 156 / 156
[t-SNE] Mean sigma: 3.671794
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.414827
[t-SNE] Error after 325 iterations: 1.414827

In [29]:
df_tsne = features_df.copy()
df_tsne['x-tsne'] = tsne_results[:,0]
df_tsne['y-tsne'] = tsne_results[:,1]
df_tsne['label'] = labels

chart = ggplot(df_tsne, aes(x='x-tsne', y='y-tsne', color='label')) \
    + geom_point(size=70, alpha=0.1) \
    + ggtitle("t-SNE dimensions colored by class")
chart

Out[29]: <ggplot: (8730928573909)>
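One detail worth flagging: In [22] embeds the raw features, whereas PCA was fit on the standardized ones. A minimal sketch of running t-SNE on the scaled features instead, using the same scikit-learn API as above (whether this improves the embedding here is untested):

from sklearn.manifold import TSNE
from sklearn import preprocessing

scaled = preprocessing.scale(features)  # zero mean, unit variance per column
tsne = TSNE(n_components=2, perplexity=50, n_iter=500, verbose=1)
tsne_scaled = tsne.fit_transform(scaled)
print(tsne_scaled.shape)  # (156, 2)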
Content source: sanket-patil/speech-vectors