# -*- coding: utf-8 -*-
"""Tokenization helpers for keyword-based retrieval."""

import re
import unicodedata

try:
    import jieba
except ImportError:  # pragma: no cover - used only in minimal installs.
    jieba = None


# One lexical unit: either an ASCII alphanumeric run (optionally compound,
# joined by "-", "_", "." or "/") or a run of CJK unified ideographs.
_TOKEN_PATTERN = re.compile(
    r"[a-z0-9]+(?:[-_./][a-z0-9]+)*|[\u4e00-\u9fff]+",
    re.IGNORECASE,
)

# Detects whether a matched unit is a Chinese run.
_CJK_PATTERN = re.compile(r"[\u4e00-\u9fff]")

# Separators that join the parts of a compound identifier.
_SEPARATOR_PATTERN = re.compile(r"[-_./]+")

# A number written with thousands separators, e.g. "1,000,000". The two
# lookbehinds keep the match from starting in the middle of a longer number
# or in the middle of a comma-separated list; the lookahead rejects a final
# group longer than three digits (so "2023,2024" is left alone).
_GROUPED_NUMBER_PATTERN = re.compile(
    r"(?<!\d)(?<!\d,)\d{1,3}(?:,\d{3})+(?!\d)"
)

_PHRASE_ALIASES = {
    "预算金额": ["project", "budget", "amount"],
    "项目预算": ["project", "budget"],
    "采购预算": ["procurement", "budget"],
    "最高限价": ["price", "ceiling", "budget"],
    "预算": ["budget"],
    "投标保证金": ["bid", "bond", "amount"],
    "履约保证金": ["performance", "bond", "amount"],
    "资格要求": ["qualification", "requirements"],
    "资质要求": ["qualification", "requirements"],
    "评标方法": ["evaluation", "method"],
    "评审方法": ["evaluation", "method"],
    "综合评分": ["comprehensive", "scoring", "method", "evaluation"],
    "综合评估": ["comprehensive", "evaluation", "method"],
    "质保期": ["warranty", "period"],
    "保修期": ["warranty", "period"],
    "三年": ["3", "years"],
    "付款方式": ["payment", "terms"],
    "支付方式": ["payment", "terms"],
    "项目编号": ["project", "code"],
    "采购编号": ["project", "code"],
    "招标编号": ["project", "code"],
    "交货时间": ["delivery", "time"],
    "交付时间": ["delivery", "time"],
    "投标截止": ["bid", "submission", "deadline"],
    "开标时间": ["bid", "opening", "time"],
    "采购人": ["purchaser"],
    "招标代理": ["agency"],
    "代理机构": ["agency"],
    "联系方式": ["contact", "phone"],
}


def _strip_grouping(match):
    """Return the matched grouped number with its commas removed."""
    return match.group(0).replace(",", "")


def bm25_tokenize(text):
    """
    Tokenize mixed Chinese/English/number text for BM25.

    Bidding documents commonly mix Chinese labels, English labels, project
    codes and amounts. The tokenizer keeps exact lexical tokens, splits
    compound identifiers such as ``XX-ZB-2024-001``, adds Chinese bigrams,
    and expands common bidding-domain phrases to English aliases so Chinese
    field queries can recall English or bilingual source text.

    Processing steps:

    1. ``text`` is converted with ``str()``, NFKC-normalized and lowercased.
    2. Thousands separators are removed from grouped numbers
       (``1,000,000`` -> ``1000000``). Commas in plain enumerations such as
       ``1,2,3`` or ``2023,2024`` are kept, so each number stays its own
       token.
    3. Every ASCII unit is emitted as-is; if it contains ``-``, ``_``, ``.``
       or ``/`` its parts are emitted as well.
    4. Every Chinese run is segmented with ``jieba`` when it is installed,
       otherwise the whole run is emitted; in both cases all overlapping
       character bigrams of the run are appended.
    5. For each key of ``_PHRASE_ALIASES`` found in the normalized text, its
       aliases are appended once, in dictionary order.

    :param text: Value to tokenize; ``None`` yields an empty list.
    :return: List of string tokens (duplicates are preserved).
    """
    if text is None:
        return []

    normalized = unicodedata.normalize("NFKC", str(text)).lower()
    # NFKC has already folded the full-width comma to ",", so stripping every
    # digit-comma-digit would glue Chinese enumerations ("1,2,3") together.
    # Only well-formed thousands groupings lose their commas.
    normalized = _GROUPED_NUMBER_PATTERN.sub(_strip_grouping, normalized)

    tokens = []
    for match in _TOKEN_PATTERN.finditer(normalized):
        raw = match.group(0)

        if _CJK_PATTERN.search(raw):
            if jieba is not None:
                tokens.extend(t for t in jieba.cut(raw) if t.strip())
            else:
                tokens.append(raw)
            # Overlapping bigrams give recall when segmentation of the query
            # and the document disagree (empty for a single character).
            tokens.extend(raw[i:i + 2] for i in range(len(raw) - 1))
            continue

        tokens.append(raw)
        parts = _SEPARATOR_PATTERN.split(raw)
        if len(parts) > 1:
            # Compound identifier: index the components too.
            tokens.extend(part for part in parts if part)

    for phrase, aliases in _PHRASE_ALIASES.items():
        if phrase.lower() in normalized:
            tokens.extend(aliases)

    return tokens