| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667 |
- # -*- coding: utf-8 -*-
- """BM25 backend selection with a small local fallback."""
- import math
- from collections import Counter
- import numpy as np
class SimpleBM25Okapi(object):
    """Small BM25Okapi-compatible fallback used when rank_bm25 is unavailable.

    The corpus is a sequence of already-tokenized documents (each an iterable
    of hashable tokens). Falsy documents such as ``None`` are treated as
    empty documents.

    Attributes set by the constructor:
        corpus: list of token lists, one per document.
        k1, b, epsilon: the BM25 parameters passed to the constructor.
        corpus_size: number of documents.
        doc_len: token count of each document.
        avgdl: mean document length (0.0 for an empty corpus).
        doc_freqs: per-document ``Counter`` of token frequencies.
        idf: mapping of token -> inverse document frequency.
    """

    def __init__(self, corpus, k1=1.5, b=0.75, epsilon=0.25):
        """Index ``corpus`` and precompute document statistics and IDF."""
        self.corpus = [list(doc or []) for doc in corpus]
        self.k1 = k1
        self.b = b
        self.epsilon = epsilon
        self.corpus_size = len(self.corpus)
        self.doc_len = [len(doc) for doc in self.corpus]
        # float() keeps the division true division on Python 2 as well.
        self.avgdl = float(sum(self.doc_len)) / self.corpus_size if self.corpus_size else 0.0
        self.doc_freqs = [Counter(doc) for doc in self.corpus]
        self.idf = self._calc_idf()

    def _calc_idf(self):
        """Return a dict mapping each corpus token to its IDF value.

        Tokens whose raw IDF is negative (they occur in more than half of
        the documents) are assigned ``epsilon * average_idf`` instead, where
        the average is taken over the raw IDF of every token.
        """
        # Number of documents that contain each token.
        containing_docs = {}
        for freqs in self.doc_freqs:
            for word in freqs:
                containing_docs[word] = containing_docs.get(word, 0) + 1

        idf = {}
        negative_words = []
        for word, doc_count in containing_docs.items():
            value = math.log(self.corpus_size - doc_count + 0.5) - math.log(doc_count + 0.5)
            idf[word] = value
            if value < 0:
                negative_words.append(word)

        # The average must be taken before any negative value is replaced.
        average_idf = sum(idf.values()) / len(idf) if idf else 0.0
        floor = self.epsilon * average_idf
        for word in negative_words:
            idf[word] = floor
        return idf

    def _score_documents(self, query, doc_indices):
        """Return a numpy array of BM25 scores for the given document indices.

        The result has one entry per element of ``doc_indices``, in the same
        order. It is all zeros when the query is empty, no indices are
        given, or the average document length is not positive.
        """
        doc_indices = list(doc_indices)
        scores = np.zeros(len(doc_indices))
        if not query or not doc_indices or self.avgdl <= 0:
            return scores

        gain = self.k1 + 1
        # The length-normalisation term depends only on the document, so it
        # is computed once here rather than once per (token, document) pair.
        length_terms = [
            self.k1 * (1 - self.b + self.b * self.doc_len[doc_index] / self.avgdl)
            for doc_index in doc_indices
        ]

        for token in query:
            token_idf = self.idf.get(token)
            if token_idf is None:
                # Token never seen in the corpus: contributes nothing.
                continue
            for position, doc_index in enumerate(doc_indices):
                freq = self.doc_freqs[doc_index].get(token, 0)
                if freq == 0:
                    continue
                scores[position] += token_idf * freq * gain / (freq + length_terms[position])
        return scores

    def get_scores(self, query):
        """Return a numpy array with the BM25 score of every document.

        ``query`` is an iterable of tokens; tokens absent from the corpus
        are ignored. The array has ``corpus_size`` entries.
        """
        return self._score_documents(query, range(self.corpus_size))

    def get_batch_scores(self, query, doc_ids):
        """Return a list of BM25 scores for the documents at ``doc_ids``.

        Provided for parity with rank_bm25's ``BM25Okapi.get_batch_scores``.
        Raises ``IndexError`` if an id is outside the indexed corpus.
        """
        return self._score_documents(query, doc_ids).tolist()

    def get_top_n(self, query, documents, n=5):
        """Return the ``n`` entries of ``documents`` that score highest.

        Provided for parity with rank_bm25's ``BM25Okapi.get_top_n``.
        ``documents`` must be index-aligned with the corpus given to the
        constructor; a ``ValueError`` is raised when the lengths differ.
        """
        if len(documents) != self.corpus_size:
            raise ValueError("The documents given don't match the index corpus!")
        ranking = np.argsort(self.get_scores(query))[::-1][:n]
        return [documents[i] for i in ranking]
def get_bm25_okapi():
    """Return the BM25Okapi class to use.

    Prefers ``rank_bm25.BM25Okapi``; if importing it raises ``ImportError``,
    the local ``SimpleBM25Okapi`` class is returned instead.
    """
    try:
        from rank_bm25 import BM25Okapi as backend
    except ImportError:
        backend = SimpleBM25Okapi
    return backend
|