# -*- coding: utf-8 -*-
"""BM25 backend selection with a small local fallback."""

import math
from collections import Counter

import numpy as np


class SimpleBM25Okapi(object):
    """Small BM25Okapi-compatible fallback used when rank_bm25 is unavailable.

    The corpus is a sequence of pre-tokenized documents (each an iterable of
    hashable tokens). Falsy documents such as ``None`` are treated as empty.

    Attributes set by the constructor:
        corpus: list of token lists, one per document.
        k1, b, epsilon: the BM25 parameters passed in.
        corpus_size: number of documents.
        doc_len: token count of each document.
        avgdl: mean document length (``0.0`` for an empty corpus).
        doc_freqs: one ``Counter`` of term frequencies per document.
        idf: mapping of term -> inverse document frequency weight.
    """

    def __init__(self, corpus, k1=1.5, b=0.75, epsilon=0.25):
        """Index ``corpus`` and precompute per-term IDF weights.

        Args:
            corpus: iterable of tokenized documents.
            k1: term-frequency saturation parameter.
            b: document-length normalization strength.
            epsilon: multiplier applied to the average IDF to obtain the
                replacement value for terms whose raw IDF is negative.
        """
        self.corpus = [list(doc or []) for doc in corpus]
        self.k1 = k1
        self.b = b
        self.epsilon = epsilon
        self.corpus_size = len(self.corpus)
        self.doc_len = [len(doc) for doc in self.corpus]
        # float() keeps this a true division on interpreters where ``/`` on
        # two ints truncates; an empty corpus yields 0.0 instead of raising.
        if self.corpus_size:
            self.avgdl = float(sum(self.doc_len)) / self.corpus_size
        else:
            self.avgdl = 0.0
        self.doc_freqs = [Counter(doc) for doc in self.corpus]
        self.idf = self._calc_idf()

    def _calc_idf(self):
        """Return a dict mapping each corpus term to its IDF weight.

        The raw weight is ``log(N - n + 0.5) - log(n + 0.5)`` where ``N`` is
        the corpus size and ``n`` the number of documents containing the
        term. Terms whose raw weight is negative are replaced with
        ``epsilon * average_idf``, the average being taken over the raw
        weights of all terms.
        """
        # Number of documents in which each term occurs at least once.
        containing_docs = {}
        for freqs in self.doc_freqs:
            for word in freqs:
                containing_docs[word] = containing_docs.get(word, 0) + 1

        idf = {}
        negative_words = []
        for word, freq in containing_docs.items():
            value = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)
            idf[word] = value
            if value < 0:
                negative_words.append(word)

        # The average uses the raw (possibly negative) values, so it must be
        # computed before any of them is overwritten below.
        average_idf = sum(idf.values()) / len(idf) if idf else 0.0
        eps = self.epsilon * average_idf
        for word in negative_words:
            idf[word] = eps
        return idf

    def get_scores(self, query):
        """Score every document in the corpus against ``query``.

        Args:
            query: iterable of query tokens. Tokens absent from the corpus
                contribute nothing; repeated tokens contribute once per
                occurrence.

        Returns:
            A numpy array of length ``corpus_size`` holding one score per
            document. It is all zeros when the query is empty, the corpus is
            empty, or every document is empty.
        """
        scores = np.zeros(self.corpus_size)
        # avgdl <= 0 means there is nothing to match and would otherwise
        # cause a division by zero in the length normalization.
        if not query or not self.corpus_size or self.avgdl <= 0:
            return scores

        for token in query:
            token_idf = self.idf.get(token)
            if token_idf is None:
                continue
            for i, freqs in enumerate(self.doc_freqs):
                freq = freqs.get(token, 0)
                if freq == 0:
                    continue
                denominator = freq + self.k1 * (
                    1 - self.b + self.b * self.doc_len[i] / self.avgdl
                )
                scores[i] += token_idf * freq * (self.k1 + 1) / denominator
        return scores


def get_bm25_okapi():
    """Return the BM25Okapi class to use.

    Returns ``rank_bm25.BM25Okapi`` when that package can be imported and
    ``SimpleBM25Okapi`` otherwise.
    """
    try:
        from rank_bm25 import BM25Okapi
        return BM25Okapi
    except ImportError:
        return SimpleBM25Okapi