# -*- coding: utf-8 -*-
# Repository: luojiehua/BidiRag
"""Tokenization helpers for keyword-based retrieval."""
import re
import unicodedata

try:
    import jieba
except ImportError:  # pragma: no cover - used only in minimal installs.
    jieba = None


# Grammar for one raw token.  A match is either:
#   1. a run of ASCII letters/digits, optionally continued by further such
#      runs each joined by a single "-", "_", "." or "/" (so compound
#      identifiers like "xx-zb-2024-001" and decimals like "1.5" match whole);
#   2. a run of CJK ideographs in the U+4E00-U+9FFF range.
# Characters matched by neither alternative are never part of a match, so
# they behave as separators when the pattern is used with finditer/findall.
# IGNORECASE lets the ASCII alternative accept upper-case letters as well;
# bm25_tokenize lower-cases its input before matching, so the flag matters
# mainly when the pattern is applied to un-normalized text.
_TOKEN_PATTERN = re.compile(
    r"[a-z0-9]+(?:[-_./][a-z0-9]+)*|[\u4e00-\u9fff]+",
    re.IGNORECASE,
)


# Chinese bidding-domain phrases mapped to the English alias tokens that
# bm25_tokenize appends when the phrase occurs as a substring of the
# normalized text.
#
# Notes for maintainers:
#   * The dict is iterated in insertion order, so the order of entries here
#     is the order in which alias groups appear in the tokenizer's output.
#   * Every key is tested independently.  Keys that contain another key
#     (e.g. "预算金额" contains "预算") cause both alias lists to be appended
#     for the same piece of text.
#   * Each alias list is appended at most once per call, no matter how many
#     times its phrase occurs in the text.
_PHRASE_ALIASES = {
    "预算金额": ["project", "budget", "amount"],
    "项目预算": ["project", "budget"],
    "采购预算": ["procurement", "budget"],
    "最高限价": ["price", "ceiling", "budget"],
    "预算": ["budget"],
    "投标保证金": ["bid", "bond", "amount"],
    "履约保证金": ["performance", "bond", "amount"],
    "资格要求": ["qualification", "requirements"],
    "资质要求": ["qualification", "requirements"],
    "评标方法": ["evaluation", "method"],
    "评审方法": ["evaluation", "method"],
    "综合评分": ["comprehensive", "scoring", "method", "evaluation"],
    "综合评估": ["comprehensive", "evaluation", "method"],
    "质保期": ["warranty", "period"],
    "保修期": ["warranty", "period"],
    "三年": ["3", "years"],
    "付款方式": ["payment", "terms"],
    "支付方式": ["payment", "terms"],
    "项目编号": ["project", "code"],
    "采购编号": ["project", "code"],
    "招标编号": ["project", "code"],
    "交货时间": ["delivery", "time"],
    "交付时间": ["delivery", "time"],
    "投标截止": ["bid", "submission", "deadline"],
    "开标时间": ["bid", "opening", "time"],
    "采购人": ["purchaser"],
    "招标代理": ["agency"],
    "代理机构": ["agency"],
    "联系方式": ["contact", "phone"],
}


def bm25_tokenize(text):
    """
    Split mixed Chinese/English/numeric text into a list of BM25 terms.

    The input is converted with ``str()``, NFKC-normalized and lower-cased,
    and commas standing directly between two digits are removed.  Each match
    of ``_TOKEN_PATTERN`` then contributes:

    * ASCII word / number / identifier: the match itself and, when it
      contains ``-``, ``_``, ``.`` or ``/``, every non-empty piece obtained
      by splitting on those characters (``xx-zb-2024-001`` also yields
      ``xx``, ``zb``, ``2024``, ``001``).
    * CJK run: the non-blank pieces from ``jieba.cut`` when the module-level
      ``jieba`` is available, otherwise the whole run; followed in both
      cases by every overlapping two-character window of the run.

    Finally, for every key of ``_PHRASE_ALIASES`` that occurs as a substring
    of the normalized text, its English aliases are appended (once per key,
    in dict order).

    Duplicates are kept; the result is a plain list in emission order.
    Returns ``[]`` when ``text`` is ``None``.
    """
    if text is None:
        return []

    # NFKC folds compatibility forms (e.g. full-width letters and digits)
    # into their standard equivalents before case-folding.
    cleaned = unicodedata.normalize("NFKC", str(text)).lower()
    # "1,000,000" -> "1000000": strip commas that sit between two digits.
    cleaned = re.sub(r"(?<=\d),(?=\d)", "", cleaned)

    terms = []
    chunks = (hit.group(0) for hit in _TOKEN_PATTERN.finditer(cleaned))

    for chunk in filter(None, chunks):
        if re.search(r"[\u4e00-\u9fff]", chunk) is None:
            # ASCII branch: keep the exact token, plus its components when
            # it is a compound joined by -, _, . or /.
            terms.append(chunk)
            if re.search(r"[-_./]", chunk):
                terms += [piece for piece in re.split(r"[-_./]+", chunk) if piece]
            continue

        # CJK branch: word-level terms first ...
        if jieba is None:
            terms.append(chunk)
        else:
            terms.extend(word for word in jieba.cut(chunk) if word.strip())
        # ... then character bigrams (none for a single-character run).
        terms.extend(left + right for left, right in zip(chunk, chunk[1:]))

    # Cross-lingual expansion: a matching Chinese phrase anywhere in the
    # normalized text contributes its English aliases.
    terms.extend(
        alias
        for phrase, aliases in _PHRASE_ALIASES.items()
        if phrase.lower() in cleaned
        for alias in aliases
    )

    return terms