Source code for gnes.score_fn.chunk

#  Tencent is pleased to support the open source community by making GNES available.
#
#  Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

from .base import get_unary_score, CombinedScoreFn
from typing import List, Tuple
import numpy as np


class WeightedChunkScoreFn(CombinedScoreFn):
    """score = d_chunk.weight * relevance * q_chunk.weight"""

    def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score',
                 q_chunk: 'gnes_pb2.Chunk',
                 d_chunk: 'gnes_pb2.Chunk', *args, **kwargs):
        q_chunk_weight = get_unary_score(value=q_chunk.weight,
                                         name='query chunk weight',
                                         offset=q_chunk.offset)
        d_chunk_weight = get_unary_score(value=d_chunk.weight,
                                         name='document chunk weight',
                                         doc_id=d_chunk.doc_id,
                                         offset=d_chunk.offset)
        return super().__call__(last_score, q_chunk_weight, d_chunk_weight)
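
# The worked example below is not part of the original source; it is a hedged
# sketch of how WeightedChunkScoreFn combines its operands, assuming the
# default multiplicative combination mode of CombinedScoreFn:
#
#   relevance (last_score) = 0.8, q_chunk.weight = 0.5, d_chunk.weight = 0.25
#   combined score         = 0.25 * 0.8 * 0.5 = 0.1
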
class WeightedChunkOffsetScoreFn(CombinedScoreFn):
    """
    score = d_chunk.weight * relevance * offset_divergence * q_chunk.weight

    offset_divergence is calculated based on doc_type:
        TEXT && VIDEO && AUDIO: offset is 1-D
        IMAGE: offset is 2-D
    """

    def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score',
                 q_chunk: 'gnes_pb2.Chunk',
                 d_chunk: 'gnes_pb2.Chunk', *args, **kwargs):
        q_chunk_weight = get_unary_score(value=q_chunk.weight,
                                         name='query chunk weight',
                                         offset=str(q_chunk.offset))
        d_chunk_weight = get_unary_score(value=d_chunk.weight,
                                         name='document chunk weight',
                                         doc_id=d_chunk.doc_id,
                                         offset=str(d_chunk.offset))
        offset_divergence = get_unary_score(value=self._cal_divergence(q_chunk, d_chunk),
                                            name='offset divergence')
        return super().__call__(last_score, q_chunk_weight, d_chunk_weight, offset_divergence)

    @staticmethod
    def _cal_divergence(q_chunk: 'gnes_pb2.Chunk', d_chunk: 'gnes_pb2.Chunk'):
        if q_chunk.offset_nd and d_chunk.offset_nd:
            return 1 / (1 + np.sqrt((q_chunk.offset_nd[0] - d_chunk.offset_nd[0]) ** 2 +
                                    (q_chunk.offset_nd[1] - d_chunk.offset_nd[1]) ** 2))
        else:
            return np.abs(q_chunk.offset - d_chunk.offset)
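
# Illustration (not in the original source) of _cal_divergence above, with
# hypothetical offsets:
#
#   2-D (IMAGE):  q_chunk.offset_nd = (1, 2), d_chunk.offset_nd = (4, 6)
#                 euclidean distance = sqrt(3**2 + 4**2) = 5
#                 divergence = 1 / (1 + 5) ~= 0.167
#
#   1-D (TEXT/VIDEO/AUDIO):  q_chunk.offset = 3, d_chunk.offset = 5
#                            divergence = |3 - 5| = 2
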
class CoordChunkScoreFn(CombinedScoreFn):
    """
    score = relevance * query_coordination

    query_coordination: #chunks returned / #chunks in this doc (the query doc)
    """

    def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score',
                 q_chunk: 'gnes_pb2.Chunk',
                 d_chunk: 'gnes_pb2.Chunk',
                 queried_results: List[List[Tuple]],
                 *args, **kwargs):
        query_coordination = get_unary_score(value=self._cal_query_coord(d_chunk, queried_results),
                                             name='query coordination')
        return super().__call__(last_score, query_coordination)

    def _cal_query_coord(self, d_chunk: 'gnes_pb2.Chunk', queried_results: List[List[Tuple]]):
        doc_id = d_chunk.doc_id
        total_chunks = self._context.num_chunks_in_doc(doc_id)
        queried_doc_id, _, _, _ = zip(*(queried_results[0]))
        recall_chunks = queried_doc_id.count(doc_id)
        return recall_chunks / total_chunks
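
# Illustration (not in the original source) of the query coordination above,
# assuming queried_results[0] holds 4-tuples whose first element is the doc_id,
# as the unpacking in _cal_query_coord suggests:
#
#   d_chunk.doc_id = 7, num_chunks_in_doc(7) = 10   (hypothetical)
#   4 of the returned tuples carry doc_id == 7
#   query_coordination = 4 / 10 = 0.4
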
class TFIDFChunkScoreFn(CombinedScoreFn):
    """
    score = relevance * tf(q_chunk) * (idf(q_chunk)**2)

    tf(q_chunk) is calculated based on the relevance of the query results:
        tf(q_chunk) = number of queried chunks where relevance >= threshold
        idf(q_chunk) = log(total_chunks / (tf(q_chunk) + 1))
    """

    def __init__(self, threshold: float = 0.8, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.threshold = threshold

    def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score',
                 q_chunk: 'gnes_pb2.Chunk',
                 d_chunk: 'gnes_pb2.Chunk',
                 queried_results: List[List[Tuple]],
                 *args, **kwargs):
        tf_idf = get_unary_score(value=self._cal_tf_idf(queried_results),
                                 name='query tf-idf')
        return super().__call__(last_score, tf_idf)

    def _cal_tf_idf(self, queried_results: List[List[Tuple]]):
        _, _, _, queried_relevance = zip(*(queried_results[0]))
        tf = len(list(filter(lambda x: x >= self.threshold, queried_relevance)))
        total_chunks = self._context.num_chunks
        idf = np.log10(total_chunks / (tf + 1))
        return tf * (idf ** 2)
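
# Illustration (not in the original source) of _cal_tf_idf above, with
# hypothetical numbers:
#
#   threshold = 0.8, queried relevances = (0.9, 0.85, 0.7, 0.95)  ->  tf = 3
#   num_chunks = 1000  ->  idf = log10(1000 / (3 + 1)) = log10(250) ~= 2.40
#   tf * idf**2 ~= 3 * 5.75 ~= 17.25
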
class BM25ChunkScoreFn(CombinedScoreFn):
    """
    score = relevance * idf(q_chunk) * tf(q_chunk) * (k1 + 1) /
            (tf(q_chunk) + k1 * (1 - b + b * (chunk_in_doc / avg_chunk_in_doc)))

    In the bm25 algorithm:
        idf(q_chunk) = log(1 + (doc_count - f(q_chunk) + 0.5) / (f(q_chunk) + 0.5)),
    where f(q_chunk) is the number of docs that contain q_chunk. In our system,
    this denotes the number of docs appearing in the query results.

    In Elasticsearch, b = 0.75 and k1 = 1.2.
    """

    def __init__(self, threshold: float = 0.8, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.threshold = threshold
        self.k1 = 1.2
        self.b = 0.75

    def __call__(self, last_score: 'gnes_pb2.Response.QueryResponse.ScoredResult.Score',
                 q_chunk: 'gnes_pb2.Chunk',
                 d_chunk: 'gnes_pb2.Chunk',
                 queried_results: List[List[Tuple]],
                 *args, **kwargs):
        bm25 = get_unary_score(value=self._cal_bm25(d_chunk, queried_results),
                               name='query bm25')
        return super().__call__(last_score, bm25)

    def _cal_bm25(self, d_chunk: 'gnes_pb2.Chunk', queried_results: List[List[Tuple]]):
        doc_id = d_chunk.doc_id
        _, _, _, queried_relevance = zip(*(queried_results[0]))
        tf = len(list(filter(lambda x: x >= self.threshold, queried_relevance)))
        total_chunks = self._context.num_chunks
        idf = np.log10(1 + (total_chunks - tf + 0.5) / (tf + 0.5))
        return idf * tf * (self.k1 + 1) / (tf + self.k1 * (
            1 - self.b + self.b * (self._context.num_chunks_in_doc(doc_id) *
                                   self._context.num_docs / self._context.num_chunks)))
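
# Illustration (not in the original source) of _cal_bm25 above, with
# hypothetical numbers (k1 = 1.2 and b = 0.75 as set in __init__):
#
#   tf = 3, num_chunks = 1000
#   idf = log10(1 + (1000 - 3 + 0.5) / (3 + 0.5)) = log10(286) ~= 2.46
#
#   num_chunks_in_doc(doc_id) = 10, num_docs = 100
#   chunk_in_doc / avg_chunk_in_doc = 10 * 100 / 1000 = 1.0   (average-length doc)
#
#   bm25 = 2.46 * 3 * (1.2 + 1) / (3 + 1.2 * (1 - 0.75 + 0.75 * 1.0))
#        = 2.46 * 6.6 / 4.2 ~= 3.86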