123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126 |
- from iepy.preprocess.ner.base import BaseNERRunner
- class CombinedNERRunner(BaseNERRunner):
- """A NER runner that is the combination of different NER runners
- (therefore, different NERs).
- The entities returned by each NER are combined by the method merge_entities
- without any check, possibly leading to duplicate or overlapping entities;
- but subclassing this combiner you may define something different.
- """
- def __init__(self, ners, override=False):
- """The NER runners should be instances of BasePreProcessStepRunner.
- Notes:
- - Each of the sub-ners will be configured to run with override-mode
- "on", no matter what is the global override value.
- The global override, will be used for determining wether to start
- or not the global-combined process.
- - Overriding only some NERs and not others is not allowed.
- """
- super(CombinedNERRunner, self).__init__(override=override)
- if not ners:
- raise ValueError(u'Empty NERs to combine')
- self.ners = ners
- for sub_ner in self.ners:
- sub_ner.override = True
- def merge_entities(self, sub_results):
- # Default merger does nothing but merging & sorting by offset
- all_entities = []
- for ner, sub_entities in sub_results:
- all_entities.extend(sub_entities)
- return sorted(all_entities, key=lambda x: x.offset)
- def run_ner(self, doc):
- sub_results = []
- for sub_ner in self.ners:
- sub_results.append(
- (sub_ner,
- sub_ner.run_ner(doc)
- )
- )
- return self.merge_entities(sub_results)
- class NoOverlapCombinedNERRunner(CombinedNERRunner):
- """
- Similar to the CombinedNERRunner, but when merging results from different
- taggers avoids overlapping by discarding those entities that were provided
- by later subners.
- It's assumed that each sub NER provides non overlapped entities.
- """
- def overlapped_entities(self, e1, e2):
- min1, max1 = e1.offset, e1.offset_end
- min2, max2 = e2.offset, e2.offset_end
- return bool(max(0, min(max1, max2) - max(min1, min2)))
- def merge_entities(self, sub_results):
- result = []
- for ner, sub_res in sub_results:
- if not result:
- # first ner returning something. all in.
- result.extend(sub_res)
- else:
- for ent in sub_res:
- if any(self.overlapped_entities(ent, e_i) for e_i in result):
- continue
- result.append(ent)
- return sorted(result, key=lambda x: x.offset)
- class KindPreferenceCombinedNERRunner(CombinedNERRunner):
- """
- Similar to the CombinedNERRunner, but when merging results from different
- taggers avoids overlapping by discarding those entities whose kind was worst
- ranked on the Combiner creation.
- If a given entity kind is not ranked on Combiner, will be treated worst than
- the worst ranked.
- If conflict remains, following rules apply:
- - shorter occurrences are preferred over larger
- - occurrences of former sub NERs are preferred.
- """
- def __init__(self, ners, override=False, rank=tuple()):
- """
- """
- # the lower the rank, the more important
- if not isinstance(rank, (tuple, list)):
- raise ValueError(u'rank can only be a list or tuple')
- self.kinds_rank = dict((k, i) for i, k in enumerate(rank))
- self.worst_rank = len(self.kinds_rank)
- super(KindPreferenceCombinedNERRunner, self).__init__(ners, override)
- def get_rank(self, found_entity):
- return self.kinds_rank.setdefault(found_entity.kind_name, self.worst_rank)
- def merge_entities(self, sub_results):
- sorted_occurrences = super(KindPreferenceCombinedNERRunner,
- self).merge_entities(sub_results)
- if not sorted_occurrences:
- return sorted_occurrences
- prev = sorted_occurrences[0]
- to_remove = set()
- # given that entities are sorted, cannot be the case that one entity
- # has offset lower than the previous one
- for eo in sorted_occurrences[1:]:
- if eo.offset < prev.offset_end:
- # there's an overlap. One of these 2 must be removed
- prev_criteria = (
- self.get_rank(prev), # kind rank
- -1 * (prev.offset_end - prev.offset) # inversed length
- )
- eo_criteria = (
- self.get_rank(eo),
- -1 * (eo.offset_end - eo.offset)
- )
- if prev_criteria <= eo_criteria:
- to_remove.add(eo)
- else:
- to_remove.add(prev)
- prev = eo
- else:
- prev = eo
- return [eo_i for eo_i in sorted_occurrences if eo_i not in to_remove]
|