Source code for gnes.score_fn.chunk

#  Tencent is pleased to support the open source community by making GNES available.
#
#  Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

from .base import get_unary_score, CombinedScoreFn
from typing import List, Tuple
import numpy as np


class WeightedChunkScoreFn(CombinedScoreFn):
    """score = d_chunk.weight * relevance * q_chunk.weight"""

    def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score',
                 q_chunk: 'gnes_pb2.Chunk',
                 d_chunk: 'gnes_pb2.Chunk', *args, **kwargs):
        q_chunk_weight = get_unary_score(value=q_chunk.weight,
                                         name='query chunk weight',
                                         offset=q_chunk.offset)
        d_chunk_weight = get_unary_score(value=d_chunk.weight,
                                         name='document chunk weight',
                                         doc_id=d_chunk.doc_id,
                                         offset=d_chunk.offset)
        return super().__call__(last_score, q_chunk_weight, d_chunk_weight)
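
# The worked example below is not part of the original source; it is a hedged
# sketch of how WeightedChunkScoreFn combines its operands, assuming the
# default multiplicative combination mode of CombinedScoreFn:
#
#   relevance (last_score) = 0.8, q_chunk.weight = 0.5, d_chunk.weight = 0.25
#   combined score         = 0.25 * 0.8 * 0.5 = 0.1
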
class WeightedChunkOffsetScoreFn(CombinedScoreFn):
    """
    score = d_chunk.weight * relevance * offset_divergence * q_chunk.weight

    offset_divergence is calculated based on doc_type:
        TEXT && VIDEO && AUDIO: offset is 1-D
        IMAGE: offset is 2-D
    """

    def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score',
                 q_chunk: 'gnes_pb2.Chunk',
                 d_chunk: 'gnes_pb2.Chunk', *args, **kwargs):
        q_chunk_weight = get_unary_score(value=q_chunk.weight,
                                         name='query chunk weight',
                                         offset=str(q_chunk.offset))
        d_chunk_weight = get_unary_score(value=d_chunk.weight,
                                         name='document chunk weight',
                                         doc_id=d_chunk.doc_id,
                                         offset=str(d_chunk.offset))
        offset_divergence = get_unary_score(value=self._cal_divergence(q_chunk, d_chunk),
                                            name='offset divergence')
        return super().__call__(last_score, q_chunk_weight, d_chunk_weight, offset_divergence)

    @staticmethod
    def _cal_divergence(q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk'):
        if q_chunk.offset_nd and d_chunk.offset_nd:
            return 1 / (1 + np.sqrt((q_chunk.offset_nd[0] - d_chunk.offset_nd[0]) ** 2 +
                                    (q_chunk.offset_nd[1] - d_chunk.offset_nd[1]) ** 2))
        else:
            return np.abs(q_chunk.offset - d_chunk.offset)
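
# Illustration (not in the original source) of _cal_divergence above, with
# hypothetical offsets:
#
#   2-D (IMAGE):  q_chunk.offset_nd = (1, 2), d_chunk.offset_nd = (4, 6)
#                 euclidean distance = sqrt(3**2 + 4**2) = 5
#                 divergence = 1 / (1 + 5) ~= 0.167
#
#   1-D (TEXT/VIDEO/AUDIO):  q_chunk.offset = 3, d_chunk.offset = 5
#                            divergence = |3 - 5| = 2
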
class CoordChunkScoreFn(CombinedScoreFn):
    """
    score = relevance * query_coordination

    query_coordination: #chunks returned / #chunks in this doc (the query doc)
    """

    def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score',
                 q_chunk: 'gnes_pb2.Chunk',
                 d_chunk: 'gnes_pb2.Chunk',
                 queried_results: List[List[Tuple]],
                 *args, **kwargs):
        query_coordination = get_unary_score(value=self._cal_query_coord(d_chunk, queried_results),
                                             name='query coordination')
        return super().__call__(last_score, query_coordination)

    def _cal_query_coord(self, d_chunk: 'gnes_pb2.Chunk', queried_results: List[List[Tuple]]):
        doc_id = d_chunk.doc_id
        total_chunks = self._context.num_chunks_in_doc(doc_id)
        queried_doc_id, _, _, _ = zip(*(queried_results[0]))
        recall_chunks = queried_doc_id.count(doc_id)
        return recall_chunks / total_chunks
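
# Illustration (not in the original source) of the query coordination above,
# assuming queried_results[0] holds 4-tuples whose first element is the doc_id,
# as the unpacking in _cal_query_coord suggests:
#
#   d_chunk.doc_id = 7, num_chunks_in_doc(7) = 10   (hypothetical)
#   4 of the returned tuples carry doc_id == 7
#   query_coordination = 4 / 10 = 0.4
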
class TFIDFChunkScoreFn(CombinedScoreFn):
    """
    score = relevance * tf(q_chunk) * (idf(q_chunk)**2)

    tf(q_chunk) is calculated based on the relevance of the query results:
        tf(q_chunk) = number of queried chunks where relevance >= threshold
        idf(q_chunk) = log(total_chunks / (tf(q_chunk) + 1))
    """

    def __init__(self, threshold: float = 0.8, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.threshold = threshold

    def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score',
                 q_chunk: 'gnes_pb2.Chunk',
                 d_chunk: 'gnes_pb2.Chunk',
                 queried_results: List[List[Tuple]],
                 *args, **kwargs):
        tf_idf = get_unary_score(value=self._cal_tf_idf(queried_results),
                                 name='query tf-idf')
        return super().__call__(last_score, tf_idf)

    def _cal_tf_idf(self, queried_results: List[List[Tuple]]):
        _, _, _, queried_relevance = zip(*(queried_results[0]))
        tf = len(list(filter(lambda x: x >= self.threshold, queried_relevance)))
        total_chunks = self._context.num_chunks
        idf = np.log10(total_chunks / (tf + 1))
        return tf * (idf ** 2)
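
# Illustration (not in the original source) of _cal_tf_idf above, with
# hypothetical numbers:
#
#   threshold = 0.8, queried relevances = (0.9, 0.85, 0.7, 0.95)  ->  tf = 3
#   num_chunks = 1000  ->  idf = log10(1000 / (3 + 1)) = log10(250) ~= 2.40
#   tf * idf**2 ~= 3 * 5.75 ~= 17.25
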
class BM25ChunkScoreFn(CombinedScoreFn):
    """
    score = relevance * idf(q_chunk) * tf(q_chunk) * (k1 + 1) /
            (tf(q_chunk) + k1 * (1 - b + b * (chunk_in_doc / avg_chunk_in_doc)))

    In the bm25 algorithm:
        idf(q_chunk) = log(1 + (doc_count - f(q_chunk) + 0.5) / (f(q_chunk) + 0.5)),
    where f(q_chunk) is the number of docs that contain q_chunk. In our system,
    this denotes the number of docs appearing in the query results.

    In Elasticsearch, b = 0.75 and k1 = 1.2.
    """

    def __init__(self, threshold: float = 0.8, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.threshold = threshold
        self.k1 = 1.2
        self.b = 0.75

    def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score',
                 q_chunk: 'gnes_pb2.Chunk',
                 d_chunk: 'gnes_pb2.Chunk',
                 queried_results: List[List[Tuple]],
                 *args, **kwargs):
        bm25 = get_unary_score(value=self._cal_bm25(d_chunk, queried_results),
                               name='query bm25')
        return super().__call__(last_score, bm25)

    def _cal_bm25(self, d_chunk: 'gnes_pb2.Chunk', queried_results: List[List[Tuple]]):
        doc_id = d_chunk.doc_id
        _, _, _, queried_relevance = zip(*(queried_results[0]))
        tf = len(list(filter(lambda x: x >= self.threshold, queried_relevance)))
        total_chunks = self._context.num_chunks
        idf = np.log10(1 + (total_chunks - tf + 0.5) / (tf + 0.5))
        return idf * tf * (self.k1 + 1) / (tf + self.k1 * (
            1 - self.b + self.b * (self._context.num_chunks_in_doc(doc_id) *
                                   self._context.num_docs / self._context.num_chunks)))
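
# Illustration (not in the original source) of _cal_bm25 above, with
# hypothetical numbers (k1 = 1.2 and b = 0.75 as set in __init__):
#
#   tf = 3, num_chunks = 1000
#   idf = log10(1 + (1000 - 3 + 0.5) / (3 + 0.5)) = log10(286) ~= 2.46
#
#   num_chunks_in_doc(doc_id) = 10, num_docs = 100
#   chunk_in_doc / avg_chunk_in_doc = 10 * 100 / 1000 = 1.0   (average-length doc)
#
#   bm25 = 2.46 * 3 * (1.2 + 1) / (3 + 1.2 * (1 - 0.75 + 0.75 * 1.0))
#        = 2.46 * 6.6 / 4.2 ~= 3.86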