# segmenter.py (2.9 KB)

  1. from iepy.preprocess.pipeline import BasePreProcessStepRunner, PreProcessSteps
  2. from collections import namedtuple
  3. # Representation of Segments that a Segmenter found
  4. RawSegment = namedtuple('RawSegment', 'offset offset_end entity_occurrences')
  5. class SyntacticSegmenterRunner(BasePreProcessStepRunner):
  6. step = PreProcessSteps.segmentation
  7. def __init__(self, override=False, increment=True):
  8. self.override = override
  9. self.increment = increment
  10. def __call__(self, doc):
  11. was_done = doc.was_preprocess_step_done # just a shortcut
  12. if not was_done(PreProcessSteps.ner) or not was_done(PreProcessSteps.sentencer):
  13. # preconditions not met.
  14. return
  15. if self.increment or self.override or not was_done(self.step):
  16. segments = self.build_syntactic_segments(doc)
  17. doc.set_segmentation_result(
  18. segments, override=self.override, increment=self.increment)
  19. doc.save()
  20. def build_syntactic_segments(self, doc):
  21. # Returns a list of RawSegments.
  22. # For sentence in the document with at least 2 Entity Occurrences,
  23. # a RawSegment is created
  24. result = []
  25. entity_occs = list(doc.get_entity_occurrences())
  26. eo_counter = 0
  27. L = len(doc.sentences)
  28. for i, start in enumerate(doc.sentences):
  29. end = doc.sentences[i + 1] if i + 1 < L else len(doc.tokens)
  30. # At this point, tokens[start:end] has a sentence
  31. # We need to check that it has at least 2 entities before
  32. # building a segment
  33. sentence_occurrences = []
  34. for eo_counter in range(eo_counter, len(entity_occs)):
  35. # Skip entities before start of sentence
  36. # If sentences are contiguous, and start at token 0,
  37. # this loop should never advance. But we don't know what the
  38. # sentencer does, so it's better to be careful
  39. if entity_occs[eo_counter].offset >= start:
  40. break
  41. # Since "eo_counter" was over-used when iterating n previous for,
  42. # it will be updated with the last seen Entity Occurrence
  43. for eo_counter in range(eo_counter, len(entity_occs)):
  44. # Count entities inside the sentence
  45. eo = entity_occs[eo_counter]
  46. if eo.offset >= end or eo.offset_end > end:
  47. # occurrence is not completely inside the sentence, then
  48. # better to not consider it inside
  49. break
  50. sentence_occurrences.append(eo)
  51. # Again, when leaving the for-loop, "eo_counter" was increased.
  52. # Given that sentences are already ordered, it's safe to to that
  53. if len(sentence_occurrences) >= 2:
  54. result.append(RawSegment(start, end, sentence_occurrences))
  55. return result