Source code for gnes.encoder.video.yt8m_feature_extractor

#  Tencent is pleased to support the open source community by making GNES available.
#
#  Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

from typing import List
import numpy as np
from PIL import Image
from ..base import BaseVideoEncoder
from ...helper import batching, get_first_available_gpu


class YouTube8MFeatureExtractor(BaseVideoEncoder):
    """Extracts YouTube8M features for RGB video frames.

    The extractor runs a pre-trained Inception V3 network on each frame and
    projects the selected activation (by default 'PreLogits') through the
    YouTube8M PCA matrix. `model_dir` must point to the Inception V3
    checkpoint and `pca_dir` to the directory holding the PCA files; both
    arguments are required and nothing is downloaded automatically.

    Note: OpenCV reverses the order of channels (i.e. orders channels as BGR
    instead of RGB). If you read frames with OpenCV, reverse the last
    (i.e. channel) dimension before encoding:

        im = im[:, :, ::-1]  # BGR -> RGB
    """

    batch_size = 64

    def __init__(self,
                 model_dir: str,
                 pca_dir: str,
                 select_layer: str = 'PreLogits',
                 ignore_audio_feature: bool = True,
                 *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model_dir = model_dir
        self.pca_dir = pca_dir
        self.select_layer = select_layer
        self.ignore_audio_feature = ignore_audio_feature
        self.audio_dim = 128
        self.incep_dim = 2048
        self.pca_dim = 1024
        self.inception_size_x = 299
        self.inception_size_y = 299
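    # `post_init` below expects `pca_dir` to contain three numpy files
    # (`mean.npy`, `eigenvals.npy`, `eigenvecs.npy`) and passes `model_dir`
    # straight to `tf.train.Saver.restore`, so it should be the checkpoint
    # path of the Inception V3 weights. The heavy TensorFlow setup is
    # deferred to `post_init`, which GNES is expected to invoke after
    # construction.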
    def post_init(self):
        import tensorflow as tf
        from .yt8m_feature_extractor_cores.inception_v3 import inception_v3
        from .yt8m_feature_extractor_cores.inception_utils import inception_arg_scope
        import os

        os.environ['CUDA_VISIBLE_DEVICES'] = str(get_first_available_gpu())

        # PCA parameters computed on YouTube8M: the per-dimension mean, the
        # top `pca_dim` eigenvalues and the matching eigenvectors
        self.pca_mean = np.load(os.path.join(self.pca_dir, 'mean.npy'))[:, 0]
        self.pca_eigenvals = np.load(os.path.join(self.pca_dir, 'eigenvals.npy'))[:self.pca_dim, 0]
        self.pca_eigenvecs = np.load(os.path.join(self.pca_dir, 'eigenvecs.npy')).T[:, :self.pca_dim]

        # build the Inception V3 inference graph and restore its weights
        g = tf.Graph()
        with g.as_default():
            arg_scope = inception_arg_scope()
            inception_v3.default_image_size = self.inception_size_x
            self.inputs = tf.placeholder(
                tf.float32, (None, self.inception_size_x, self.inception_size_y, 3))
            with tf.contrib.slim.arg_scope(arg_scope):
                self.logits, self.end_points = inception_v3(
                    self.inputs, num_classes=1001, is_training=False,
                    dropout_keep_prob=1.0)
            config = tf.ConfigProto(log_device_placement=False)
            if self.on_gpu:
                config.gpu_options.allow_growth = True
            self.sess = tf.Session(config=config)
            self.saver = tf.train.Saver()
            self.saver.restore(self.sess, self.model_dir)
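    # The `_pca` helper in `encode` below applies the transform loaded above:
    # for a 2048-d 'PreLogits' vector x,
    #
    #     y = ((x - pca_mean) @ pca_eigenvecs) / sqrt(pca_eigenvals + 1e-4)
    #
    # which yields the 1024-d whitened frame feature used by YouTube8M; the
    # 1e-4 term guards against division by near-zero eigenvalues.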
    def encode(self, data: List['np.ndarray'], *args, **kwargs) -> List['np.ndarray']:
        # cumulative frame counts, used below to split the flat frame batch
        # back into one feature matrix per video
        video_length = [len(d) for d in data]
        for i in range(1, len(video_length)):
            video_length[i] = video_length[i] + video_length[i - 1]

        # flatten all videos into a single list of frames, resize each frame
        # to 299x299 and rescale pixels from [0, 255] to [-1, 1] for Inception
        data = [(np.array(
            Image.fromarray(im).resize((self.inception_size_x,
                                        self.inception_size_y)),
            dtype=np.float32) * 2 / 255. - 1.)
            for video in data for im in video]
        data = np.stack(data, axis=0)

        @batching
        def _encode(_, data):
            def _pca(data):
                # 'PreLogits' has shape (n, 1, 1, 2048): drop the spatial
                # axes, then centre, project and whiten
                data = np.squeeze(data, axis=(1, 2))
                data = (data - self.pca_mean).reshape((len(data), self.incep_dim))
                data = np.matmul(data, self.pca_eigenvecs)
                data = data / np.sqrt(self.pca_eigenvals + 1e-4)
                return data

            _, end_points_ = self.sess.run(
                (self.logits, self.end_points), feed_dict={self.inputs: data})
            data = _pca(end_points_[self.select_layer])
            return data

        def _fill_audio_feature(data):
            # pad each video's features with zeros in place of the 128-d
            # audio feature expected by downstream YouTube8M models
            return [np.concatenate(
                (x, np.zeros(shape=(x.shape[0], self.audio_dim))), axis=1)
                for x in data]

        data = _encode(self, data)
        data = np.split(data, video_length[:-1])
        if self.ignore_audio_feature:
            return _fill_audio_feature(data)
        else:
            # np.split returns a list, so cast each video's features separately
            return [d.astype(np.float32) for d in data]
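
# A minimal usage sketch under assumptions: the checkpoint and PCA paths below
# are placeholders, and `post_init` is called explicitly here in case the
# surrounding GNES framework does not trigger it.
if __name__ == '__main__':
    extractor = YouTube8MFeatureExtractor(
        model_dir='/path/to/inception_v3.ckpt',  # hypothetical checkpoint path
        pca_dir='/path/to/yt8m_pca')             # hypothetical PCA directory
    extractor.post_init()

    # two dummy "videos": uint8 RGB frame stacks of 3 and 5 frames
    videos = [np.random.randint(0, 255, (3, 360, 480, 3), dtype=np.uint8),
              np.random.randint(0, 255, (5, 360, 480, 3), dtype=np.uint8)]
    feats = extractor.encode(videos)
    # with ignore_audio_feature=True each entry has shape (n_frames, 1024 + 128)
    print([f.shape for f in feats])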