trie.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. #!/usr/bin/env python
  2. # -*-coding:utf-8-*-
  3. import queue
  4. class Match(object):
  5. def __init__(self, start, end, keyword):
  6. self.start = start
  7. self.end = end
  8. self.keyword = keyword
  9. def __str__(self):
  10. return str(self.start) + ":" + str(self.end) + "=" + self.keyword
  11. class State(object):
  12. def __init__(self, word, deepth):
  13. self.success = {}
  14. self.failure = None
  15. self.emits = set()
  16. self.deepth = deepth
  17. def add_word(self, word):
  18. if word in self.success:
  19. return self.success.get(word)
  20. else:
  21. state = State(word, self.deepth + 1)
  22. self.success[word] = state
  23. return state
  24. def add_one_emit(self, keyword):
  25. self.emits.add(keyword)
  26. def add_emits(self, emits):
  27. if not isinstance(emits, set):
  28. raise Exception("keywords need a set")
  29. self.emits = self.emits | emits
  30. def set_failure(self, state):
  31. self.failure = state
  32. def get_transitions(self):
  33. return self.success.keys()
  34. def next_state(self, word):
  35. return self.success.get(word)
  36. class Trie(object):
  37. def __init__(self, words=[]):
  38. self.root = State("", 0)
  39. self.root.set_failure(self.root)
  40. self.is_create_failure = False
  41. if words:
  42. self.create_trie(words)
  43. def create_trie(self, keyword_list):
  44. for keyword in keyword_list:
  45. self.add_keyword(keyword)
  46. return self
  47. def add_keyword(self, keyword):
  48. current_state = self.root
  49. word_list = list(keyword)
  50. for word in word_list:
  51. current_state = current_state.add_word(word)
  52. current_state.add_one_emit(keyword)
  53. def create_failure(self):
  54. root = self.root
  55. state_queue = queue.Queue()
  56. for k, v in self.root.success.items():
  57. state_queue.put(v)
  58. v.set_failure(root)
  59. while (not state_queue.empty()):
  60. current_state = state_queue.get()
  61. transitions = current_state.get_transitions()
  62. for word in transitions:
  63. target_state = current_state.next_state(word)
  64. state_queue.put(target_state)
  65. trace_state = current_state.failure
  66. while (trace_state.next_state(word) is None and trace_state.deepth != 0):
  67. trace_state = trace_state.failure
  68. if trace_state.next_state(word) is not None:
  69. target_state.set_failure(trace_state.next_state(word))
  70. target_state.add_emits(trace_state.next_state(word).emits)
  71. else:
  72. target_state.set_failure(trace_state)
  73. self.is_create_failure = True
  74. def get_state(self, current_state, word):
  75. new_current_state = current_state.next_state(word)
  76. while (new_current_state is None and current_state.deepth != 0):
  77. current_state = current_state.failure
  78. new_current_state = current_state.next_state(word)
  79. return new_current_state
  80. def parse_text(self, text, allow_over_laps=True):
  81. matchs = []
  82. if not self.is_create_failure:
  83. self.create_failure()
  84. position = 0
  85. current_state = self.root
  86. for word in list(text):
  87. position += 1
  88. current_state = self.get_state(current_state, word)
  89. if not current_state:
  90. current_state = self.root
  91. continue
  92. for mw in current_state.emits:
  93. m = Match(position - len(mw), position, mw)
  94. matchs.append(m)
  95. # todo remove over laps
  96. return matchs
  97. def create_trie(words):
  98. return Trie().create_trie(words)