Speech Features
In [1]:
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import style
import librosa
import IPython.display
import librosa.display
import os
import random
from matplotlib.pyplot import specgram
import glob
Tonnetz
A Tonnetz is a network (lattice) of tonal relationships. The tonnetz features computed here are the six tonal centroid coordinates — projections of the chroma vector onto "central" tones — introduced by Harte et al. in "Detecting Harmonic Change in Musical Audio". They capture variation in the tonal content of a signal.
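A minimal sketch of computing these with librosa. The one-second synthetic sine tone is an assumption for illustration; any mono audio array works:

import numpy as np
import librosa

sr = 22050
y = 0.5 * np.sin(2 * np.pi * 440.0 * np.arange(sr) / sr)  # 1 s of A4

# take the harmonic component first, as the extract_feature function below does
ton = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)
print(ton.shape)  # (6, n_frames): six tonal centroid dimensions per frame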
STFT
Short-time Fourier transform. Segments the signal into short, overlapping frames and computes the Fourier transform of each frame, giving a time-frequency representation.
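A sketch of the call used throughout this notebook, again on a synthetic tone (the frame and hop sizes shown are librosa's defaults):

import numpy as np
import librosa

sr = 22050
y = 0.5 * np.sin(2 * np.pi * 440.0 * np.arange(sr) / sr)

D = librosa.stft(y, n_fft=2048, hop_length=512)  # complex spectrogram
S = np.abs(D)                                    # magnitude, as in extract_feature below
print(S.shape)                  # (1 + n_fft/2, n_frames) = (1025, n_frames)
# bin spacing is sr / n_fft ~ 10.8 Hz, so the 440 Hz tone should peak near bin 41
print(S.mean(axis=1).argmax())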
Spectral Contrast
Relative distribution of spectral energies: for each frequency sub-band, the difference (in dB) between the spectral peaks and the spectral valleys. Tonal sounds show high contrast; broadband, noise-like sounds show low contrast.
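A sketch contrasting a pure tone with white noise (synthetic signals assumed for illustration); the tonal signal should typically show the higher mean contrast:

import numpy as np
import librosa

sr = 22050
tone = 0.5 * np.sin(2 * np.pi * 440.0 * np.arange(sr) / sr)
noise = 0.1 * np.random.randn(sr)

for name, y in [('tone', tone), ('noise', noise)]:
    S = np.abs(librosa.stft(y))
    c = librosa.feature.spectral_contrast(S=S, sr=sr)  # (n_bands + 1, n_frames) = (7, n_frames)
    print(name, c.mean())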
Chromagrams
Chromagrams project the spectral energy onto the 12 pitch classes (chroma) of the equal-tempered scale, describing harmonic content independently of octave.
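A quick sanity check on the same synthetic A4 tone: the chroma bin for pitch class A should dominate (librosa's chroma bins start at C, so A is index 9):

import numpy as np
import librosa

sr = 22050
y = 0.5 * np.sin(2 * np.pi * 440.0 * np.arange(sr) / sr)  # A4

chroma = librosa.feature.chroma_stft(S=np.abs(librosa.stft(y)), sr=sr)
print(chroma.shape)                  # (12, n_frames): one row per pitch class
print(chroma.mean(axis=1).argmax())  # expected 9, i.e. pitch class A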
In [2]:
def extract_feature(file_name):
    X, sample_rate = librosa.load(file_name)
    X = np.nan_to_num(X)  # nan_to_num returns a new array, so the result must be kept
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz

def parse_audio_files(parent_dir, sub_dirs, classes, file_ext='*.ogg'):
    features, labels = np.empty((0, 193)), np.empty(0)
    for label, sub_dir in enumerate(sub_dirs):
        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
            mfccs, chroma, mel, contrast, tonnetz = extract_feature(fn)
            ext_features = np.hstack([mfccs, chroma, mel, contrast, tonnetz])
            features = np.vstack([features, ext_features])
            labels = np.append(labels, classes.get(sub_dir))
    return np.array(features), np.array(labels, dtype=int)

def one_hot_encode(labels):
    n_labels = len(labels)
    n_unique_labels = len(np.unique(labels))
    one_hot_encode = np.zeros((n_labels, n_unique_labels))
    one_hot_encode[np.arange(n_labels), labels] = 1
    return one_hot_encode

In [3]:
data_dir = '../data/esc-50'
sample_dir = os.path.join(data_dir, 'sample')
train_dir = os.path.join(data_dir, 'train')
test_dir = os.path.join(data_dir, 'test')

print 'Sound Sample Classes'
print '--------------------'
for d in os.listdir(sample_dir):
    print d

Sound Sample Classes
--------------------
rooster
coughing
insects
laughing

In [4]:
samples_dict = dict()
for d in os.listdir(sample_dir):
    sample_class_dir = os.path.join(sample_dir, d)
    samples_dict[d] = [os.path.join(sample_class_dir, f) for f in os.listdir(sample_class_dir)]

In [5]:
mfccs, chroma, mel, contrast, tonnetz = extract_feature(samples_dict.get('insects')[0])
ext_features = np.hstack([mfccs, chroma, mel, contrast, tonnetz])

In [6]:
mfccs, chroma, mel, contrast, tonnetz = extract_feature(samples_dict.get('insects')[0])
ext_features = np.hstack([mfccs, chroma, mel, contrast, tonnetz])
print len(ext_features)

193

In [7]:
features = np.empty((0, 193))
print features.shape

(0, 193)

In [8]:
features = np.vstack([features, ext_features])
print features

[[ -2.71008667e+02   1.11177134e+02  -2.66969894e+01 ...,  -2.03544718e-01
    5.38986268e-02   1.56129083e-02]]
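Why 193? With the parameters used above, each feature block contributes a fixed number of per-frame coefficients (then averaged over time): 40 MFCCs (n_mfcc=40), 12 chroma bins, 128 mel bands (librosa's default), 7 spectral contrast values, and 6 tonnetz dimensions. A quick check:

n_mfcc, n_chroma, n_mel, n_contrast, n_tonnetz = 40, 12, 128, 7, 6
print(n_mfcc + n_chroma + n_mel + n_contrast + n_tonnetz)  # 193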
In [9]:
sample_dir = os.path.join(data_dir, 'sample')
sub_dirs = ['laughing', 'coughing', 'insects', 'rooster']
classes = {'laughing': 0, 'coughing': 1, 'insects': 2, 'rooster': 3}
features, labels = parse_audio_files(sample_dir, sub_dirs, classes)

In [10]:
print features.shape

(156, 193)

In [11]:
one_hot = one_hot_encode(labels)

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, one_hot, test_size=0.15, random_state=42)
print X_train.shape
print y_train.shape

(132, 193)
(132, 4)

In [56]:
n_hidden_units_one = 50
n_hidden_units_two = 50
n_classes = 4
n_dim = X_train.shape[1]

In [57]:
from keras.models import Sequential
from keras.layers import Dense
import numpy

In [58]:
model = Sequential()
model.add(Dense(n_hidden_units_one, input_dim=n_dim, kernel_initializer='uniform', activation='relu'))
model.add(Dense(n_hidden_units_two, kernel_initializer='uniform', activation='relu'))
model.add(Dense(n_classes, kernel_initializer='uniform', activation='softmax'))

In [59]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [60]:
model.fit(X_train, y_train, nb_epoch=10, batch_size=20)

Epoch 1/10
132/132 [==============================] - 0s - loss: 1.3210 - acc: 0.3636
Epoch 2/10
132/132 [==============================] - 0s - loss: 1.1869 - acc: 0.5530
Epoch 3/10
132/132 [==============================] - 0s - loss: 1.0305 - acc: 0.6136
Epoch 4/10
132/132 [==============================] - 0s - loss: 0.8609 - acc: 0.6970
Epoch 5/10
132/132 [==============================] - 0s - loss: 0.7235 - acc: 0.7500
Epoch 6/10
132/132 [==============================] - 0s - loss: 0.6231 - acc: 0.7879
Epoch 7/10
132/132 [==============================] - 0s - loss: 0.5371 - acc: 0.8258
Epoch 8/10
132/132 [==============================] - 0s - loss: 0.4258 - acc: 0.8788
Epoch 9/10
132/132 [==============================] - 0s - loss: 0.3891 - acc: 0.8712
Epoch 10/10
132/132 [==============================] - 0s - loss: 0.3196 - acc: 0.9015

Out[60]: <keras.callbacks.History at 0x7f03e13c67d0>

In [61]:
scores = model.evaluate(X_test, y_test)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))

24/24 [==============================] - 0s
acc: 83.33%
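Note that nb_epoch is the Keras 1 spelling of this argument. A minimal sketch of the same training and evaluation calls under Keras 2 and later, assuming the model and data splits defined above:

# Keras 2 renamed nb_epoch to epochs; evaluate still returns [loss, accuracy]
model.fit(X_train, y_train, epochs=10, batch_size=20)
loss, acc = model.evaluate(X_test, y_test)
print('accuracy: %.2f%%' % (acc * 100))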
In [14]:
from sklearn.decomposition import PCA
from sklearn import preprocessing
import pandas as pd

features_df = pd.DataFrame(features)
data_scaled = pd.DataFrame(preprocessing.scale(features_df), columns=features_df.columns)

pca = PCA(n_components=5)
pca_results = pca.fit_transform(data_scaled)

print 'Shape of the transformed feature vector:', pca_results.shape
print 'Original training sample:', list(features_df.loc[0].values)
print 'Training sample after PCA:', list(pca_results[0])
print '\n'

# Percentage of variance explained by each component
print 'Explained variance ratio (first five components)'
print '------------------------------------------------'
for idx, r in enumerate(pca.explained_variance_ratio_):
    print 'Principal Component', idx, ':', r

Shape of the transformed feature vector: (156, 5)
Original training sample: [-211.95342873967343, 126.77311337329317, -34.790516564858109, ..., -0.0038187426081049991, -0.0027186458454636149]
Training sample after PCA: [-2.6068373184520963, 2.4242388793647009, -0.65683308107782468, 4.2573390545248051, 1.079539326884505]

Explained variance ratio (first five components)
------------------------------------------------
Principal Component 0 : 0.18766387691
Principal Component 1 : 0.0877785919013
Principal Component 2 : 0.071878311726
Principal Component 3 : 0.0639192643987
Principal Component 4 : 0.0450622371579

In [18]:
from ggplot import *

df_pca = features_df.copy()
df_pca['x-pca'] = pca_results[:,0]
df_pca['y-pca'] = pca_results[:,1]
df_pca['label'] = labels

chart = ggplot(df_pca, aes(x='x-pca', y='y-pca', color='label')) \
    + geom_point(size=75, alpha=0.8) \
    + ggtitle("First and Second Principal Components colored by class")
chart

Out[18]: <ggplot: (8730928897673)>

In [22]:
from sklearn.manifold import TSNE

print("Computing t-SNE embedding")
tsne = TSNE(n_components=2, verbose=1, perplexity=50, n_iter=500)
tsne_results = tsne.fit_transform(features_df)

Computing t-SNE embedding
[t-SNE] Computing pairwise distances...
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 156 / 156
[t-SNE] Mean sigma: 46.734307
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.395291
[t-SNE] Error after 400 iterations: 1.395291

In [23]:
df_tsne = features_df.copy()
df_tsne['x-tsne'] = tsne_results[:,0]
df_tsne['y-tsne'] = tsne_results[:,1]
df_tsne['label'] = labels

chart = ggplot(df_tsne, aes(x='x-tsne', y='y-tsne', color='label')) \
    + geom_point(size=70, alpha=0.1) \
    + ggtitle("t-SNE dimensions colored by class")
chart

Out[23]: <ggplot: (8730928702225)>

In [28]:
from sklearn.manifold import TSNE

print("Computing t-SNE embedding")
tsne = TSNE(n_components=2, verbose=1, perplexity=50, n_iter=1000)
tsne_results = tsne.fit_transform(pca_results)

Computing t-SNE embedding
[t-SNE] Computing pairwise distances...
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 156 / 156
[t-SNE] Mean sigma: 3.671794
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.414827
[t-SNE] Error after 325 iterations: 1.414827

In [29]:
df_tsne = features_df.copy()
df_tsne['x-tsne'] = tsne_results[:,0]
df_tsne['y-tsne'] = tsne_results[:,1]
df_tsne['label'] = labels

chart = ggplot(df_tsne, aes(x='x-tsne', y='y-tsne', color='label')) \
    + geom_point(size=70, alpha=0.1) \
    + ggtitle("t-SNE dimensions colored by class")
chart

Out[29]: <ggplot: (8730928573909)>
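One detail worth flagging: In [22] embeds the raw features, whereas PCA was fit on the standardized ones. A minimal sketch of running t-SNE on the scaled features instead, using the same scikit-learn API as above (whether this improves the embedding here is untested):

from sklearn.manifold import TSNE
from sklearn import preprocessing

scaled = preprocessing.scale(features)  # zero mean, unit variance per column
tsne = TSNE(n_components=2, perplexity=50, n_iter=500, verbose=1)
tsne_scaled = tsne.fit_transform(scaled)
print(tsne_scaled.shape)  # (156, 2)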
Content source: sanket-patil/speech-vectors