# tokenization.py
  1. # -*- coding: utf-8 -*-
  2. """Tokenization helpers for keyword-based retrieval."""
  3. import re
  4. import unicodedata
  5. try:
  6. import jieba
  7. except ImportError: # pragma: no cover - used only in minimal installs.
  8. jieba = None
  9. _TOKEN_PATTERN = re.compile(
  10. r"[a-z0-9]+(?:[-_./][a-z0-9]+)*|[\u4e00-\u9fff]+",
  11. re.IGNORECASE,
  12. )
  13. _PHRASE_ALIASES = {
  14. "预算金额": ["project", "budget", "amount"],
  15. "项目预算": ["project", "budget"],
  16. "采购预算": ["procurement", "budget"],
  17. "最高限价": ["price", "ceiling", "budget"],
  18. "预算": ["budget"],
  19. "投标保证金": ["bid", "bond", "amount"],
  20. "履约保证金": ["performance", "bond", "amount"],
  21. "资格要求": ["qualification", "requirements"],
  22. "资质要求": ["qualification", "requirements"],
  23. "评标方法": ["evaluation", "method"],
  24. "评审方法": ["evaluation", "method"],
  25. "综合评分": ["comprehensive", "scoring", "method", "evaluation"],
  26. "综合评估": ["comprehensive", "evaluation", "method"],
  27. "质保期": ["warranty", "period"],
  28. "保修期": ["warranty", "period"],
  29. "三年": ["3", "years"],
  30. "付款方式": ["payment", "terms"],
  31. "支付方式": ["payment", "terms"],
  32. "项目编号": ["project", "code"],
  33. "采购编号": ["project", "code"],
  34. "招标编号": ["project", "code"],
  35. "交货时间": ["delivery", "time"],
  36. "交付时间": ["delivery", "time"],
  37. "投标截止": ["bid", "submission", "deadline"],
  38. "开标时间": ["bid", "opening", "time"],
  39. "采购人": ["purchaser"],
  40. "招标代理": ["agency"],
  41. "代理机构": ["agency"],
  42. "联系方式": ["contact", "phone"],
  43. }
  44. def bm25_tokenize(text):
  45. """
  46. Tokenize mixed Chinese/English/number text for BM25.
  47. Bidding documents commonly mix Chinese labels, English labels, project
  48. codes and amounts. The tokenizer keeps exact lexical tokens, splits
  49. compound identifiers such as ``XX-ZB-2024-001``, adds Chinese bigrams, and
  50. expands common bidding-domain phrases to English aliases so Chinese field
  51. queries can recall English or bilingual source text.
  52. """
  53. if text is None:
  54. return []
  55. normalized = unicodedata.normalize("NFKC", str(text)).lower()
  56. normalized = re.sub(r"(?<=\d),(?=\d)", "", normalized)
  57. tokens = []
  58. for match in _TOKEN_PATTERN.finditer(normalized):
  59. raw = match.group(0)
  60. if not raw:
  61. continue
  62. if re.search(r"[\u4e00-\u9fff]", raw):
  63. if jieba is not None:
  64. tokens.extend(t for t in jieba.cut(raw) if t.strip())
  65. else:
  66. tokens.append(raw)
  67. if len(raw) > 1:
  68. tokens.extend(raw[i:i + 2] for i in range(len(raw) - 1))
  69. continue
  70. tokens.append(raw)
  71. if re.search(r"[-_./]", raw):
  72. tokens.extend(part for part in re.split(r"[-_./]+", raw) if part)
  73. for phrase, aliases in _PHRASE_ALIASES.items():
  74. if phrase.lower() in normalized:
  75. tokens.extend(aliases)
  76. return tokens