| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990 |
- # -*- coding: utf-8 -*-
- """Tokenization helpers for keyword-based retrieval."""
- import re
- import unicodedata
- try:
- import jieba
- except ImportError: # pragma: no cover - used only in minimal installs.
- jieba = None
# Lexical token matcher with two alternatives:
#   1. a run of ASCII letters/digits, optionally continued by further runs
#      joined with a single ``-``, ``_``, ``.`` or ``/`` (compound identifiers);
#   2. a run of characters in the U+4E00..U+9FFF CJK block.
_TOKEN_PATTERN = re.compile(
    r"[a-z0-9]+(?:[-_./][a-z0-9]+)*|[\u4e00-\u9fff]+",
    flags=re.IGNORECASE,
)
# Chinese bidding-domain phrases mapped to the English tokens appended when the
# phrase occurs in the normalized text.  Insertion order is significant: it is
# the order in which bm25_tokenize appends the aliases.
_PHRASE_ALIASES = {
    # Budget and price ceiling
    "预算金额": ["project", "budget", "amount"],
    "项目预算": ["project", "budget"],
    "采购预算": ["procurement", "budget"],
    "最高限价": ["price", "ceiling", "budget"],
    "预算": ["budget"],
    # Bonds
    "投标保证金": ["bid", "bond", "amount"],
    "履约保证金": ["performance", "bond", "amount"],
    # Qualification
    "资格要求": ["qualification", "requirements"],
    "资质要求": ["qualification", "requirements"],
    # Evaluation
    "评标方法": ["evaluation", "method"],
    "评审方法": ["evaluation", "method"],
    "综合评分": ["comprehensive", "scoring", "method", "evaluation"],
    "综合评估": ["comprehensive", "evaluation", "method"],
    # Warranty
    "质保期": ["warranty", "period"],
    "保修期": ["warranty", "period"],
    "三年": ["3", "years"],
    # Payment
    "付款方式": ["payment", "terms"],
    "支付方式": ["payment", "terms"],
    # Project identifiers
    "项目编号": ["project", "code"],
    "采购编号": ["project", "code"],
    "招标编号": ["project", "code"],
    # Schedule
    "交货时间": ["delivery", "time"],
    "交付时间": ["delivery", "time"],
    "投标截止": ["bid", "submission", "deadline"],
    "开标时间": ["bid", "opening", "time"],
    # Parties and contact
    "采购人": ["purchaser"],
    "招标代理": ["agency"],
    "代理机构": ["agency"],
    "联系方式": ["contact", "phone"],
}
def bm25_tokenize(text):
    """Tokenize mixed Chinese/English/number text for BM25.

    Bidding documents commonly mix Chinese labels, English labels, project
    codes and amounts.  The text is NFKC-normalized and lowercased, thousands
    separators inside numbers are dropped, and tokens are then emitted in
    document order:

    * ASCII alphanumeric runs are kept verbatim; compound identifiers such as
      ``XX-ZB-2024-001`` additionally contribute each of their parts.
    * CJK runs are segmented with ``jieba`` when it is importable, otherwise
      kept whole; in both cases every overlapping two-character bigram of the
      run is added as well.

    Finally, for every ``_PHRASE_ALIASES`` phrase contained in the normalized
    text, its English aliases are appended (once per phrase, regardless of how
    often the phrase occurs) so Chinese field queries can recall English or
    bilingual source text.

    Args:
        text: Value to tokenize.  Non-string values are converted with
            ``str()``; ``None`` yields an empty list.

    Returns:
        list: Token strings, duplicates included.
    """
    if text is None:
        return []

    normalized = unicodedata.normalize("NFKC", str(text)).lower()
    # Strip a comma only when it is followed by a complete three-digit group.
    # NFKC has already folded the full-width comma into ",", so removing every
    # digit-comma-digit would fuse enumerations such as "1,2,3" or
    # "2023,2024" into a single bogus number.
    normalized = re.sub(r"(?<=\d),(?=\d{3}(?!\d))", "", normalized)

    tokens = []
    for match in _TOKEN_PATTERN.finditer(normalized):
        raw = match.group(0)

        if re.search(r"[\u4e00-\u9fff]", raw):
            if jieba is not None:
                tokens.extend(t for t in jieba.cut(raw) if t.strip())
            else:
                tokens.append(raw)
            # Overlapping character bigrams; a single-character run yields none.
            tokens.extend(a + b for a, b in zip(raw, raw[1:]))
            continue

        tokens.append(raw)
        if re.search(r"[-_./]", raw):
            # Compound identifier: also index each component on its own.
            tokens.extend(part for part in re.split(r"[-_./]+", raw) if part)

    for phrase, aliases in _PHRASE_ALIASES.items():
        if phrase.lower() in normalized:
            tokens.extend(aliases)
    return tokens
|