Source code for gnes.encoder.audio.vggish_cores.vggish_postprocess

# ==============================================================================

"""Post-process embeddings from VGGish."""

import numpy as np
from ..vggish_cores import vggish_params


class Postprocessor(object):
    """Post-processes VGGish embeddings.

    The initial release of AudioSet included 128-D VGGish embeddings for each
    segment of AudioSet. These released embeddings were produced by applying
    a PCA transformation (technically, a whitening transform is included as
    well) and 8-bit quantization to the raw embedding output from VGGish, in
    order to stay compatible with the YouTube-8M project which provides visual
    embeddings in the same format for a large set of YouTube videos.

    This class applies the same PCA (with whitening) transformation, then
    clips and rescales the result onto the [0.0, 255.0] range. Note that
    `postprocess` returns the rescaled values as float32; it does not round
    or cast them to 8-bit integers.
    """

    def __init__(self, pca_params_npz_path):
        """Constructs a postprocessor.

        Args:
          pca_params_npz_path: Path to a NumPy-format .npz file that contains
            the PCA parameters used in postprocessing.

        Raises:
          AssertionError: If the loaded PCA matrix is not of shape
            (EMBEDDING_SIZE, EMBEDDING_SIZE) or the loaded means cannot be
            viewed as an (EMBEDDING_SIZE, 1) column vector.
        """
        # np.load() on an .npz archive returns a lazily-read NpzFile that keeps
        # the underlying file open. Pull both arrays into memory inside the
        # context manager so the handle is closed as soon as we are done.
        with np.load(pca_params_npz_path) as params:
            self._pca_matrix = params[vggish_params.PCA_EIGEN_VECTORS_NAME]
            # Load means into a column vector for easier broadcasting later.
            self._pca_means = params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1)
        assert self._pca_matrix.shape == (
            vggish_params.EMBEDDING_SIZE, vggish_params.EMBEDDING_SIZE), (
                'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,))
        assert self._pca_means.shape == (vggish_params.EMBEDDING_SIZE, 1), (
            'Bad PCA means shape: %r' % (self._pca_means.shape,))

    def postprocess(self, embeddings_batch):
        """Applies postprocessing to a batch of embeddings.

        Args:
          embeddings_batch: An nparray of shape [batch_size, embedding_size]
            containing output from the embedding layer of VGGish.

        Returns:
          An nparray of the same shape as the input and of type float32,
          containing the PCA-transformed input clipped to
          [QUANTIZE_MIN_VAL, QUANTIZE_MAX_VAL] and linearly rescaled to the
          range [0.0, 255.0]. Values are not rounded to integers.

        Raises:
          AssertionError: If `embeddings_batch` is not 2-d or its second
            dimension is not EMBEDDING_SIZE.
        """
        assert len(embeddings_batch.shape) == 2, (
            'Expected 2-d batch, got %r' % (embeddings_batch.shape,))
        assert embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE, (
            'Bad batch shape: %r' % (embeddings_batch.shape,))

        # Apply PCA.
        # - Embeddings come in as [batch_size, embedding_size].
        # - Transpose to [embedding_size, batch_size].
        # - Subtract pca_means column vector from each column.
        # - Premultiply by PCA matrix of shape [output_dims, input_dims]
        #   where both are equal to embedding_size in our case.
        # - Transpose result back to [batch_size, embedding_size].
        pca_applied = np.dot(self._pca_matrix,
                             (embeddings_batch.T - self._pca_means)).T

        # Quantize by:
        # - clipping to [min, max] range
        clipped_embeddings = np.clip(
            pca_applied, vggish_params.QUANTIZE_MIN_VAL,
            vggish_params.QUANTIZE_MAX_VAL)
        # - rescaling linearly onto the 8-bit value range [0.0, 255.0]
        quantized_embeddings = (
            (clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) *
            (255.0 /
             (vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL)))
        # - returning float32 (not uint8): no rounding or integer cast is
        #   applied, so fractional values are preserved for the caller.
        quantized_embeddings = quantized_embeddings.astype(np.float32)

        return quantized_embeddings