1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162 |
- from iepy.preprocess.pipeline import BasePreProcessStepRunner, PreProcessSteps
- from collections import namedtuple
# Lightweight record for a segment found by a Segmenter.
# Fields, in positional order:
#   offset              -- where the segment begins
#   offset_end          -- where the segment ends
#   entity_occurrences  -- the entity occurrences attached to the segment
RawSegment = namedtuple(
    typename='RawSegment',
    field_names='offset offset_end entity_occurrences',
)
class SyntacticSegmenterRunner(BasePreProcessStepRunner):
    """Segmentation step that turns sentences into segments.

    A sentence yields a ``RawSegment`` only when at least two entity
    occurrences lie completely inside it.

    Args:
        override: passed through to ``doc.set_segmentation_result``; when
            true the step runs even if it was already done.
        increment: passed through to ``doc.set_segmentation_result``; when
            true the step runs even if it was already done.
    """

    step = PreProcessSteps.segmentation

    def __init__(self, override=False, increment=True):
        self.override = override
        self.increment = increment

    def __call__(self, doc):
        """Segment ``doc`` and store the result on it.

        Does nothing unless both the NER and the sentencer steps were
        already done on ``doc``.
        """
        was_done = doc.was_preprocess_step_done  # just a shortcut
        if (not was_done(PreProcessSteps.ner)
                or not was_done(PreProcessSteps.sentencer)):
            # Preconditions not met.
            return
        if self.increment or self.override or not was_done(self.step):
            segments = self.build_syntactic_segments(doc)
            doc.set_segmentation_result(
                segments, override=self.override, increment=self.increment)
            doc.save()

    def build_syntactic_segments(self, doc):
        """Return a list of ``RawSegment``, one per qualifying sentence.

        ``doc.sentences`` is read as the list of sentence start offsets;
        each sentence ends where the next one starts, and the last one
        ends at ``len(doc.tokens)``. A sentence qualifies when it holds at
        least 2 entity occurrences.

        NOTE(review): the single forward pass relies on both the sentence
        offsets and the entity occurrences being ordered by offset --
        confirm against ``doc.get_entity_occurrences``.
        """
        segments = []
        entity_occs = list(doc.get_entity_occurrences())
        total_occs = len(entity_occs)
        sentence_count = len(doc.sentences)
        # Index of the next occurrence not yet skipped or consumed. It is
        # advanced explicitly: reusing it as a ``for ... in range(...)``
        # target would leave it on the last index (not one past it) when
        # the loop is exhausted, so the final occurrence would be examined
        # again for every later sentence.
        cursor = 0
        for i, start in enumerate(doc.sentences):
            if i + 1 < sentence_count:
                end = doc.sentences[i + 1]
            else:
                end = len(doc.tokens)
            # At this point, tokens[start:end] is one sentence.

            # Skip occurrences that begin before the sentence. With
            # contiguous sentences starting at token 0 this should never
            # advance, but we don't know what the sentencer does, so it's
            # better to be careful.
            while cursor < total_occs and entity_occs[cursor].offset < start:
                cursor += 1

            # Collect the occurrences lying inside the sentence.
            sentence_occurrences = []
            while cursor < total_occs:
                eo = entity_occs[cursor]
                if eo.offset >= end or eo.offset_end > end:
                    # Occurrence is not completely inside the sentence, so
                    # it is better not to count it. The cursor stays on it
                    # so the next sentence gets to look at it.
                    break
                sentence_occurrences.append(eo)
                cursor += 1

            if len(sentence_occurrences) >= 2:
                segments.append(RawSegment(start, end, sentence_occurrences))
        return segments
|