- # This module is the nexus between the UI definitions (django models)
- # and the IEPY models. Modify this file with awareness of that dual
- # impact.
- from datetime import datetime
- import itertools
- import logging
- from operator import attrgetter
- from collections import namedtuple, defaultdict
- from django.db import models
- from iepy.utils import unzip
- from iepy.webui.corpus.fields import ListField, ListSyntacticTreeField
- import jsonfield
- CHAR_MAX_LENGHT = 256
- logger = logging.getLogger(__name__)
- RichToken = namedtuple("RichToken", "token lemma pos eo_ids eo_kinds offset")
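- # Illustrative sketch (values assumed, not taken from real data): for the text
- # "John runs", the second token might be carried around as
- #   RichToken(token='runs', lemma='run', pos='VBZ', eo_ids=[], eo_kinds=[], offset=1)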
- class BaseModel(models.Model):
- class Meta:
- abstract = True
- app_label = 'corpus' # Name of the django app.
- class EntityKind(BaseModel):
- # There's a fixture declaring an initial set of Entity Kinds, containing
- # PERSON, LOCATION, and ORGANIZATION.
- name = models.CharField(max_length=CHAR_MAX_LENGHT, unique=True)
- class Meta(BaseModel.Meta):
- ordering = ['name']
- def __str__(self):
- return self.name
- class Payroll(BaseModel):
- user = models.CharField(max_length=CHAR_MAX_LENGHT, db_index=True)
- begin_time = models.CharField(max_length=10, db_index=True)
- end_time = models.CharField(max_length=10, db_index=True)
- doc_count = models.IntegerField()
- t_count = models.IntegerField()
- r_count = models.IntegerField()
- wage = models.FloatField()
- _yield = models.FloatField(default=0)
- account = models.FloatField(default=0)
- class Meta(BaseModel.Meta):
- ordering = ['user']
- class Entity(BaseModel):
- # The "key" IS the "canonical form". Aliases are stored on
- # EntityOccurrences.
- key = models.CharField(max_length=CHAR_MAX_LENGHT)
- kind = models.ForeignKey(EntityKind)
- gazette = models.ForeignKey(
- "GazetteItem", on_delete=models.SET_NULL,
- blank=True, null=True
- )
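- # Hedged example: an Entity with key "Barack Obama" of kind PERSON can be
- # referenced in documents through aliases such as "Obama", each stored on an
- # EntityOccurrence (see EntityOccurrence.alias below).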
- class Meta(BaseModel.Meta):
- ordering = ['kind', 'key']
- unique_together = (('key', 'kind'), )
- def __str__(self):
- return '%s (%s)' % (self.key, self.kind.name)
- class IEDocumentMetadata(BaseModel):
- title = models.CharField(max_length=CHAR_MAX_LENGHT, blank=True)
- url = models.URLField(blank=True)
- items = jsonfield.JSONField(blank=True)
- def __str__(self):
- try:
- doc_id = self.document.id
- except IEDocument.DoesNotExist:
- doc_id = 'None'
- return '<Metadata of IEDocument {0}>'.format(doc_id)
- class LabeledIEDocumentMetadata(BaseModel):
- title = models.CharField(max_length=CHAR_MAX_LENGHT, blank=True)
- url = models.URLField(blank=True)
- items = jsonfield.JSONField(blank=True)
- def __str__(self):
- try:
- doc_id = self.document.id
- except LabeledIEDocument.DoesNotExist:
- doc_id = 'None'
- return '<Metadata of LabeledIEDocument {0}>'.format(doc_id)
- class IEDocument(BaseModel):
- metadata = models.OneToOneField('IEDocumentMetadata', related_name='document',
- on_delete=models.PROTECT)
- human_identifier = models.CharField(
- max_length=CHAR_MAX_LENGHT,
- unique=True,
- db_index=True
- )
- sourcetext = models.TextField(null=True)
- edituser = models.TextField(null=True, db_index=True)
- edittime = models.DateTimeField(null=True, blank=True, db_index=True)
- reedittime = models.DateTimeField(null=True, blank=True, db_index=True)
- brat_done_at = models.DateTimeField(null=True, blank=True, db_index=True)
- text = models.TextField()
- creation_date = models.DateTimeField(auto_now_add=True)
- pre_label = models.TextField(blank=True)
- deleted = models.IntegerField(default=0)
- added = models.IntegerField(default=0)
- # The following 3 lists have 1 item per token
- tokens = ListField(blank=True) # strings
- lemmas = ListField(blank=True) # strings
- postags = ListField(blank=True) # strings
- offsets_to_text = ListField(blank=True) # ints, character offset for tokens, lemmas and postags
- syntactic_sentences = ListSyntacticTreeField(blank=True, editable=False)
- sentences = ListField(blank=True) # ints: a list of sentence-boundary token offsets
- jump_signal = models.IntegerField(default=0)
- # Reversed fields:
- # entity_occurrences = Reversed ForeignKey of EntityOccurrence
- # segments = Reversed ForeignKey of TextSegment
- # Metadata annotations that are computed while the document travels the preprocess pipeline
- tokenization_done_at = models.DateTimeField(null=True, blank=True)
- lemmatization_done_at = models.DateTimeField(null=True, blank=True)
- sentencer_done_at = models.DateTimeField(null=True, blank=True)
- tagging_done_at = models.DateTimeField(null=True, blank=True)
- ner_done_at = models.DateTimeField(null=True, blank=True)
- segmentation_done_at = models.DateTimeField(null=True, blank=True)
- syntactic_parsing_done_at = models.DateTimeField(null=True, blank=True)
- class Meta(BaseModel.Meta):
- ordering = ['id', ]
- def __str__(self):
- return '<IEDocument {0}>'.format(self.human_identifier)
- def get_sentences(self, enriched=False):
- """Iterator over the sentences, each sentence being a list of tokens.
- """
- tokens = self.tokens
- lemmas = self.lemmas
- postags = self.postags
- sentences = self.sentences
- start = 0
- eos = list(self.get_entity_occurrences())
- tkn_offset = 0
- for end in sentences[1:]:
- if enriched:
- rich_tokens = []
- for token, lemma, postag in zip(
- tokens[start:end], lemmas[start:end], postags[start:end]
- ):
- tkn_eos = [eo for eo in eos if eo.offset <= tkn_offset < eo.offset_end]
- rich_tokens.append(RichToken(
- token=token,
- lemma=lemma,
- pos=postag,
- eo_ids=[eo.id for eo in tkn_eos],
- eo_kinds=[eo.entity.kind for eo in tkn_eos],
- offset=tkn_offset,
- ))
- tkn_offset += 1
- yield rich_tokens
- else:
- yield tokens[start:end]
- start = end
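- # Usage sketch (data assumed): with tokens=['John', 'runs', '.', 'He', 'wins']
- # and sentences=[0, 3, 5], get_sentences() yields ['John', 'runs', '.'] and
- # then ['He', 'wins']; with enriched=True it yields lists of RichTokens instead.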
- def get_entity_occurrences(self):
- """Returns an iterable of EntityOccurrences, sorted by offset"""
- return self.entity_occurrences.all().order_by('offset')
- def get_text_segments(self):
- """Returns the iterable of TextSegments, sorted by offset"""
- return self.segments.all().order_by('offset')
- ### Methods used for preprocess ###
- def was_preprocess_step_done(self, step):
- return getattr(self, '%s_done_at' % step.name) is not None
- def set_tokenization_result(self, offsets, tokens):
- """Stores the given offsets and tokens in the corresponding storage format"""
- if not isinstance(offsets, list) or not isinstance(tokens, list):
- raise ValueError("Tokenization results should be two parallel lists: "
- "token offsets on text (ints) and token strings.")
- self.tokens = list(tokens)
- self.offsets_to_text = list(offsets)
- self.tokenization_done_at = datetime.now()
- return self
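- # Hypothetical usage (values for illustration only):
- #   doc.set_tokenization_result([0, 6], ['Hello', 'world']).save()
- # leaves doc.tokens == ['Hello', 'world'], doc.offsets_to_text == [0, 6],
- # and stamps tokenization_done_at.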
- def set_text(self, value):
- self.text = str(value)
- return self
- def set_lemmatization_result(self, value):
- if len(value) != len(self.tokens):
- raise ValueError(
- 'Lemmatization result must have the same cardinality as tokens'
- )
- self.lemmas = list(value)
- self.lemmatization_done_at = datetime.now()
- return self
- def set_sentencer_result(self, value):
- if not isinstance(value, list):
- raise ValueError("Sentencer expected result should be a list.")
- if not all(isinstance(x, int) for x in value):
- raise ValueError('Sentencer result shall only contain ints: %r' % value)
- if sorted(value) != value:
- raise ValueError('Sentencer result shall be ordered.')
- if len(set(value)) < len(value):
- raise ValueError(
- 'Sentencer result shall not contain duplicates.')
- if value[0] != 0:
- raise ValueError(
- 'Sentencer result must start with 0. Actual=%r' % value[0])
- # Note: upstream validated the final boundary against len(self.tokens);
- # here it is checked against the character length of the text.
- if value[-1] != len(self.text):
- raise ValueError(
- 'Sentencer result must end with the text length=%d. Actual=%r' % (
- len(self.text), value[-1]))
- self.sentences = value
- self.sentencer_done_at = datetime.now()
- return self
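- # Sketch of a value passing the checks above (text assumed to be 11 chars):
- #   doc.set_text('John runs..')
- #   doc.set_sentencer_result([0, 11])  # starts at 0, ends at len(text)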
- def set_tagging_result(self, value):
- if len(value) != len(self.tokens):
- raise ValueError(
- 'Tagging result must have the same cardinality as tokens')
- self.postags = value
- self.tagging_done_at = datetime.now()
- return self
- def set_syntactic_parsing_result(self, parsed_sentences):
- if len(parsed_sentences) != len(list(self.get_sentences())):
- raise ValueError(
- 'Syntactic parsing must have the same cardinality as sentences'
- )
- self.syntactic_sentences = parsed_sentences
- self.syntactic_parsing_done_at = datetime.now()
- return self
- def set_ner_result(self, value):
- # Before even doing anything, basic offset validation
- def feo_has_issues(feo):
- return (feo.offset < 0 or feo.offset >= feo.offset_end
- or feo.offset > len(self.tokens))
- invalids = [x for x in value if feo_has_issues(x)]
- if invalids:
- raise ValueError('Invalid FoundEvidences: {}'.format(invalids))
- existents = defaultdict(list)
- eo_clash_key = attrgetter('offset', 'offset_end')
- for eo in self.entity_occurrences.all():
- existents[eo_clash_key(eo)].append(eo)
- # No issue, let's create them
- for found_entity in value:
- key, kind_name, alias, offset, offset_end, from_gazette = found_entity
- if (offset, offset_end) in existents:
- skip = False
- for existent in existents[offset, offset_end]:
- is_from_gazette = existent.entity.gazette is not None
- is_same_kind = existent.entity.kind.name == kind_name
- if is_from_gazette or is_same_kind:
- skip = True
- break
- if skip:
- continue
- kind, _ = EntityKind.objects.get_or_create(name=kind_name)
- if from_gazette:
- gazette_item = GazetteItem.objects.get(text=key, kind=kind)
- entity, created = Entity.objects.get_or_create(
- key=key, kind=kind,
- gazette=gazette_item
- )
- else:
- entity, created = Entity.objects.get_or_create(key=key, kind=kind)
- if len(alias) > CHAR_MAX_LENGHT:
- alias_ = alias[:CHAR_MAX_LENGHT]
- logger.warning('Alias "%s" truncated to "%s"', alias, alias_)
- alias = alias_
- obj, created = EntityOccurrence.objects.get_or_create(
- document=self,
- entity=entity,
- offset=offset,
- offset_end=offset_end,
- alias=alias
- )
- if created:
- existents[eo_clash_key(obj)].append(obj)
- self.ner_done_at = datetime.now()
- return self
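- # Each item in "value" unpacks as a 6-tuple; a hedged illustration:
- #   ('Barack Obama', 'PERSON', 'Obama', 3, 5, False)
- # i.e. (key, kind_name, alias, token offset, token offset_end, from_gazette).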
- def set_segmentation_result(self, value, increment=True, override=False):
- if override:
- self.segments.all().delete()
- logger.info('Previous segments removed')
- get_offsets = attrgetter('offset', 'offset_end')
- value = sorted(value, key=get_offsets)
- logger.info('About to set %s segments for current doc', len(value))
- doc_ent_occurrences = list(self.entity_occurrences.all())
- currents = set(self.segments.all().values_list('offset', 'offset_end'))
- new_segs = []
- for i, raw_segment in enumerate(value):
- if (raw_segment.offset, raw_segment.offset_end) in currents:
- continue
- _segm = TextSegment(
- document=self, offset=raw_segment.offset,
- offset_end=raw_segment.offset_end)
- new_segs.append((_segm, raw_segment))
- if new_segs:
- TextSegment.objects.bulk_create(list(zip(*new_segs))[0])
- logger.info('New %s segments created', len(new_segs))
- # And now, taking care of setting Entity Occurrences
- doc_segments = {get_offsets(s): s for s in self.segments.all()}
- for _segm, raw_segment in new_segs:
- segm = doc_segments[get_offsets(_segm)]
- if raw_segment.entity_occurrences is None:
- # Entity Occurrences were not provided; compute them here
- segm.entity_occurrences = [
- eo for eo in doc_ent_occurrences
- if eo.offset >= segm.offset
- and eo.offset_end <= segm.offset_end
- ]
- else:
- segm.entity_occurrences = raw_segment.entity_occurrences
- self.segmentation_done_at = datetime.now()
- return self
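- # The raw segments above are only assumed to expose offset, offset_end and
- # entity_occurrences; a minimal hypothetical stand-in, for illustration:
- #   RawSegment = namedtuple('RawSegment', 'offset offset_end entity_occurrences')
- #   doc.set_segmentation_result([RawSegment(0, 12, None)])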
- class LabeledIEDocument(BaseModel):
- metadata = models.OneToOneField('LabeledIEDocumentMetadata', related_name='document',
- on_delete=models.PROTECT)
- human_identifier = models.CharField(
- max_length=CHAR_MAX_LENGHT,
- unique=True
- )
- sourcetext = models.TextField(null=True)
- edituser = models.TextField(null=True)
- edittime = models.DateTimeField(null=True, blank=True)
- reedittime = models.DateTimeField(null=True, blank=True)
- brat_done_at = models.DateTimeField(null=True, blank=True)
- text = models.TextField()
- creation_date = models.DateTimeField(auto_now_add=True)
- # The following 3 lists have 1 item per token
- tokens = ListField(blank=True) # strings
- lemmas = ListField(blank=True) # strings
- postags = ListField(blank=True) # strings
- offsets_to_text = ListField(blank=True) # ints, character offset for tokens, lemmas and postags
- syntactic_sentences = ListSyntacticTreeField(blank=True, editable=False)
- sentences = ListField(blank=True) # ints: a list of sentence-boundary token offsets
- jump_signal = models.IntegerField(default=0)
- # Reversed fields:
- # entity_occurrences = Reversed ForeignKey of EntityOccurrence
- # segments = Reversed ForeignKey of TextSegment
- # Metadata annotations that are computed while the document travels the preprocess pipeline
- tokenization_done_at = models.DateTimeField(null=True, blank=True)
- lemmatization_done_at = models.DateTimeField(null=True, blank=True)
- sentencer_done_at = models.DateTimeField(null=True, blank=True)
- tagging_done_at = models.DateTimeField(null=True, blank=True)
- ner_done_at = models.DateTimeField(null=True, blank=True)
- segmentation_done_at = models.DateTimeField(null=True, blank=True)
- syntactic_parsing_done_at = models.DateTimeField(null=True, blank=True)
- class Meta(BaseModel.Meta):
- ordering = ['id', ]
- def __str__(self):
- return '<LabeledIEDocument {0}>'.format(self.human_identifier)
- def get_sentences(self, enriched=False):
- """Iterator over the sentences, each sentence being a list of tokens.
- """
- tokens = self.tokens
- lemmas = self.lemmas
- postags = self.postags
- sentences = self.sentences
- start = 0
- eos = list(self.get_entity_occurrences())
- tkn_offset = 0
- for end in sentences[1:]:
- if enriched:
- rich_tokens = []
- for token, lemma, postag in zip(
- tokens[start:end], lemmas[start:end], postags[start:end]
- ):
- tkn_eos = [eo for eo in eos if eo.offset <= tkn_offset < eo.offset_end]
- rich_tokens.append(RichToken(
- token=token,
- lemma=lemma,
- pos=postag,
- eo_ids=[eo.id for eo in tkn_eos],
- eo_kinds=[eo.entity.kind for eo in tkn_eos],
- offset=tkn_offset,
- ))
- tkn_offset += 1
- yield rich_tokens
- else:
- yield tokens[start:end]
- start = end
- def get_entity_occurrences(self):
- """Returns an iterable of EntityOccurrences, sorted by offset"""
- return self.entity_occurrences.all().order_by('offset')
- def get_text_segments(self):
- """Returns the iterable of TextSegments, sorted by offset"""
- return self.segments.all().order_by('offset')
- ### Methods used for preprocess ###
- def was_preprocess_step_done(self, step):
- return getattr(self, '%s_done_at' % step.name) is not None
- def set_tokenization_result(self, offsets, tokens):
- """Stores the given offsets and tokens in the corresponding storage format"""
- if not isinstance(offsets, list) or not isinstance(tokens, list):
- raise ValueError("Tokenization results should be two parallel lists: "
- "token offsets on text (ints) and token strings.")
- self.tokens = list(tokens)
- self.offsets_to_text = list(offsets)
- self.tokenization_done_at = datetime.now()
- return self
- def set_text(self, value):
- self.text = str(value)
- return self
- def set_lemmatization_result(self, value):
- if len(value) != len(self.tokens):
- raise ValueError(
- 'Lemmatization result must have the same cardinality as tokens'
- )
- self.lemmas = list(value)
- self.lemmatization_done_at = datetime.now()
- return self
- def set_sentencer_result(self, value):
- if not isinstance(value, list):
- raise ValueError("Sentencer expected result should be a list.")
- if not all(isinstance(x, int) for x in value):
- raise ValueError('Sentencer result shall only contain ints: %r' % value)
- if sorted(value) != value:
- raise ValueError('Sentencer result shall be ordered.')
- if len(set(value)) < len(value):
- raise ValueError(
- 'Sentencer result shall not contain duplicates.')
- if value[0] != 0:
- raise ValueError(
- 'Sentencer result must start with 0. Actual=%r' % value[0])
- # Note: upstream validated the final boundary against len(self.tokens);
- # here it is checked against the character length of the text.
- if value[-1] != len(self.text):
- raise ValueError(
- 'Sentencer result must end with the text length=%d. Actual=%r' % (
- len(self.text), value[-1]))
- self.sentences = value
- self.sentencer_done_at = datetime.now()
- return self
- def set_tagging_result(self, value):
- if len(value) != len(self.tokens):
- raise ValueError(
- 'Tagging result must have the same cardinality as tokens')
- self.postags = value
- self.tagging_done_at = datetime.now()
- return self
- def set_syntactic_parsing_result(self, parsed_sentences):
- if len(parsed_sentences) != len(list(self.get_sentences())):
- raise ValueError(
- 'Syntactic parsing must have the same cardinality as sentences'
- )
- self.syntactic_sentences = parsed_sentences
- self.syntactic_parsing_done_at = datetime.now()
- return self
- def set_ner_result(self, value):
- # Before even doing anything, basic offset validation
- def feo_has_issues(feo):
- return (feo.offset < 0 or feo.offset >= feo.offset_end
- or feo.offset > len(self.tokens))
- invalids = [x for x in value if feo_has_issues(x)]
- if invalids:
- raise ValueError('Invalid FoundEvidences: {}'.format(invalids))
- existents = defaultdict(list)
- eo_clash_key = attrgetter('offset', 'offset_end')
- for eo in self.entity_occurrences.all():
- existents[eo_clash_key(eo)].append(eo)
- # No issue, let's create them
- for found_entity in value:
- key, kind_name, alias, offset, offset_end, from_gazette = found_entity
- if (offset, offset_end) in existents:
- skip = False
- for existent in existents[offset, offset_end]:
- is_from_gazette = existent.entity.gazette is not None
- is_same_kind = existent.entity.kind.name == kind_name
- if is_from_gazette or is_same_kind:
- skip = True
- break
- if skip:
- continue
- kind, _ = EntityKind.objects.get_or_create(name=kind_name)
- if from_gazette:
- gazette_item = GazetteItem.objects.get(text=key, kind=kind)
- entity, created = Entity.objects.get_or_create(
- key=key, kind=kind,
- gazette=gazette_item
- )
- else:
- entity, created = Entity.objects.get_or_create(key=key, kind=kind)
- if len(alias) > CHAR_MAX_LENGHT:
- alias_ = alias[:CHAR_MAX_LENGHT]
- logger.warning('Alias "%s" truncated to "%s"', alias, alias_)
- alias = alias_
- obj, created = EntityOccurrence.objects.get_or_create(
- document=self,
- entity=entity,
- offset=offset,
- offset_end=offset_end,
- alias=alias
- )
- if created:
- existents[eo_clash_key(obj)].append(obj)
- self.ner_done_at = datetime.now()
- return self
- def set_segmentation_result(self, value, increment=True, override=False):
- if override:
- self.segments.all().delete()
- logger.info('Previous segments removed')
- get_offsets = attrgetter('offset', 'offset_end')
- value = sorted(value, key=get_offsets)
- logger.info('About to set %s segments for current doc', len(value))
- doc_ent_occurrences = list(self.entity_occurrences.all())
- currents = set(self.segments.all().values_list('offset', 'offset_end'))
- new_segs = []
- for i, raw_segment in enumerate(value):
- if (raw_segment.offset, raw_segment.offset_end) in currents:
- continue
- _segm = TextSegment(
- document=self, offset=raw_segment.offset,
- offset_end=raw_segment.offset_end)
- new_segs.append((_segm, raw_segment))
- if new_segs:
- TextSegment.objects.bulk_create(list(zip(*new_segs))[0])
- logger.info('New %s segments created', len(new_segs))
- # And now, taking care of setting Entity Occurrences
- doc_segments = {get_offsets(s): s for s in self.segments.all()}
- for _segm, raw_segment in new_segs:
- segm = doc_segments[get_offsets(_segm)]
- if raw_segment.entity_occurrences is None:
- # Entity Occurrences were not provided; compute them here
- segm.entity_occurrences = [
- eo for eo in doc_ent_occurrences
- if eo.offset >= segm.offset
- and eo.offset_end <= segm.offset_end
- ]
- else:
- segm.entity_occurrences = raw_segment.entity_occurrences
- self.segmentation_done_at = datetime.now()
- return self
- class EntityOccurrence(BaseModel):
- """Models the occurrence of a particular Entity on a Document"""
- entity = models.ForeignKey('Entity')
- document = models.ForeignKey('IEDocument', related_name='entity_occurrences')
- segments = models.ManyToManyField('TextSegment', related_name='entity_occurrences')
- # Offsets in tokens with respect to the document
- offset = models.IntegerField() # offset of the 1st token included on the occurrence
- offset_end = models.IntegerField() # offset of the 1st token NOT included
- # Hydrated fields: same as the offsets above, but relative to the segment
- # segment_offset = IntegerField
- # segment_offset_end = IntegerField
- # Text of the occurrence; makes it easy to spot when it differs from the
- # canonical form
- alias = models.CharField(max_length=CHAR_MAX_LENGHT)
- anaphora = models.BooleanField(default=False) # Is a Named Entity or an anaphora?
- class Meta(BaseModel.Meta):
- ordering = ['document', 'offset', 'offset_end']
- unique_together = ['entity', 'document', 'offset', 'offset_end']
- def __str__(self):
- return '{0} ({1}, {2})'.format(self.entity.key, self.offset, self.offset_end)
- def hydrate_for_segment(self, segment):
- # Creates some on-memory attributes relative to the given segment
- self.segment_offset = self.offset - segment.offset
- self.segment_offset_end = self.offset_end - segment.offset
- return self
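- # Worked example (numbers assumed): an occurrence spanning tokens [10, 12)
- # inside a segment starting at token 8 hydrates to segment_offset=2 and
- # segment_offset_end=4.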
- class TextSegment(BaseModel):
- document = models.ForeignKey('IEDocument', related_name='segments', db_index=True)
- # Offsets in tokens with respect to the document.
- # They represent:
- # - offset: index of the first token included on the segment
- # - offset_end: index of the first token NOT included on the segment
- offset = models.IntegerField(db_index=True)
- offset_end = models.IntegerField(db_index=True)
- # Reversed fields:
- # entity_occurrences = Reversed ManyToManyField of EntityOccurrence
- class Meta(BaseModel.Meta):
- ordering = ['document', 'offset', 'offset_end']
- unique_together = ['document', 'offset', 'offset_end']
- def __str__(self):
- # return u'{0}'.format(' '.join(self.tokens)) # TODO: no tokens
- return u'({0} {1})'.format(self.offset, self.offset_end)
- def hydrate(self, document_on_ram=None):
- # Using the segment offsets and the data on the document itself, this
- # constructs on-memory attributes for the segment.
- # If "document_on_ram" is provided, it is used instead of querying the DB.
- if getattr(self, '_hydrated', False):
- return self
- if document_on_ram is not None:
- assert document_on_ram.pk == self.document_id
- doc = document_on_ram
- else:
- doc = self.document
- self.tokens = doc.tokens[self.offset: self.offset_end]
- self.lemmas = doc.lemmas[self.offset: self.offset_end]
- self.postags = doc.postags[self.offset: self.offset_end]
- self.offsets_to_text = doc.offsets_to_text[self.offset: self.offset_end]
- if self.offsets_to_text:
- # grab the text except the last token
- self.text = doc.text[self.offsets_to_text[0]:
- doc.offsets_to_text[self.offset_end - 1]]
- # and now append the "pure" last token.
- self.text += self.tokens[-1]
- else:
- self.text = ""
- self.sentences = [i - self.offset for i in doc.sentences
- if i >= self.offset and i < self.offset_end]
- self.syntactic_sentences = [doc.syntactic_sentences[s] for s in self.sentences]
- self._hydrated = True
- return self
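- # Usage sketch: when hydrating many segments of one document, pass the
- # already-fetched document to avoid one extra DB query per segment
- # ("some_segment" assumed for illustration):
- #   doc = some_segment.document
- #   for s in doc.get_text_segments():
- #       s.hydrate(document_on_ram=doc)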
- def get_entity_occurrences(self):
- """Returns an iterable of EntityOccurrences, sorted by offset"""
- eos = getattr(self, '_hydrated_eos', None)
- if eos is None:
- eos = [eo.hydrate_for_segment(self) for eo in
- self.entity_occurrences.all().order_by('offset')]
- self._hydrated_eos = eos
- return eos
- def get_evidences_for_relation(self, relation, existent=None):
- # Gets or creates Labeled Evidences (when creating, label is empty)
- lkind = relation.left_entity_kind
- rkind = relation.right_entity_kind
- # For performance's sake, grab all existing candidates first; any missing
- # ones are created afterwards
- if existent is None:
- existent = EvidenceCandidate.objects.filter(segment=self, labels__relation=relation)
- existent = existent.select_related(
- 'left_entity_occurrence', 'right_entity_occurrence')
- existent = {
- (ec.left_entity_occurrence_id, ec.right_entity_occurrence_id): ec
- for ec in existent
- } # dict of existent evidence-candidates, indexed by left and right EO ids
- for l_eo, r_eo in self.kind_occurrence_pairs(lkind, rkind):
- if (l_eo.pk, r_eo.pk) in existent:
- yield existent[(l_eo.pk, r_eo.pk)]
- continue
- obj, created = EvidenceCandidate.objects.get_or_create(
- left_entity_occurrence=l_eo,
- right_entity_occurrence=r_eo,
- segment=self,
- )
- yield obj
- def entity_occurrence_pairs(self, e1, e2):
- eos = list(self.get_entity_occurrences())
- left = [eo for eo in eos if eo.entity == e1]
- right = [eo for eo in eos if eo.entity == e2]
- return [(l, r) for l, r in itertools.product(left, right) if l != r]
- def kind_occurrence_pairs(self, lkind, rkind):
- eos = list(self.get_entity_occurrences())
- left = [o for o in eos if o.entity.kind == lkind]
- right = [o for o in eos if o.entity.kind == rkind]
- return [(l, r) for l, r in itertools.product(left, right) if l != r]
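- # Sketch (occurrences assumed): with EOs e1 (PERSON), e2 (PERSON) and
- # e3 (LOCATION) in this segment, kind_occurrence_pairs(PERSON, LOCATION)
- # gives [(e1, e3), (e2, e3)], while kind_occurrence_pairs(PERSON, PERSON)
- # gives [(e1, e2), (e2, e1)].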
- def get_enriched_tokens(self):
- translation_dict = {'-LRB-': '(',
- '-RRB-': ')'}
- eos = list(self.get_entity_occurrences())
- for tkn_offset, (tkn, lemma, postag) in enumerate(zip(self.tokens, self.lemmas, self.postags)):
- tkn_eos = [eo for eo in eos
- if eo.segment_offset <= tkn_offset < eo.segment_offset_end]
- yield RichToken(
- token=translation_dict.get(tkn, tkn),
- lemma=lemma,
- pos=postag,
- eo_ids=[eo.id for eo in tkn_eos],
- eo_kinds=[eo.entity.kind for eo in tkn_eos],
- offset=self.offset + tkn_offset,
- )
- @classmethod
- def filter_by_entity_occurrence_kind_pair(cls, kind_a, kind_b):
- """Returns a queryset of TextSegments having at least one Entity
- Occurrence of the left entity kind, and at least one Entity Occurrence
- of the right entity kind. If the left and right kinds are the same, at
- least two occurrences are expected."""
- # This could be implemented as a Manager method, but for simplicity it
- # lives here as a classmethod.
- matching_segms = TextSegment.objects.filter(
- entity_occurrences__entity__kind=kind_a).distinct()
- if kind_a == kind_b:
- # BE CAREFUL! There is a very subtle detail here. After the first filter
- # (before entering this if-branch), the Django ORM gives us <TextSegments>
- # whose "entity_occurrences" are restricted to those matching the criteria
- # above, not all of them. Because of that, the Count annotation below only
- # counts EntityOccurrences of the kind we are interested in.
- matching_segms = matching_segms.annotate(
- kind_count=models.Count('entity_occurrences__entity__kind')).filter(
- kind_count__gte=2
- )
- else:
- matching_segms = matching_segms.filter(
- entity_occurrences__entity__kind=kind_b,
- ).distinct()
- return matching_segms
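- # Hedged usage sketch, assuming the kinds exist under these names:
- #   person = EntityKind.objects.get(name='PERSON')
- #   location = EntityKind.objects.get(name='LOCATION')
- #   segs = TextSegment.filter_by_entity_occurrence_kind_pair(person, location)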
- class Relation(BaseModel):
- name = models.CharField(max_length=CHAR_MAX_LENGHT)
- left_entity_kind = models.ForeignKey('EntityKind', related_name='left_relations')
- right_entity_kind = models.ForeignKey('EntityKind', related_name='right_relations')
- # Reversed fields:
- # evidence_relations = Reversed ForeignKey of EvidenceCandidate
- class Meta(BaseModel.Meta):
- ordering = ['name', 'left_entity_kind', 'right_entity_kind']
- unique_together = ['name', 'left_entity_kind', 'right_entity_kind']
- def __str__(self):
- return '{}({}, {})'.format(self.name, self.left_entity_kind,
- self.right_entity_kind)
- def save(self, *args, **kwargs):
- if self.pk:
- # Object already exists, this is a modification
- original_obj = Relation.objects.get(pk=self.pk)
- for fname in ['left_entity_kind', 'right_entity_kind']:
- if getattr(original_obj, fname) != getattr(self, fname):
- raise ValueError("Relation kinds can't be modified after creation")
- return super(Relation, self).save(*args, **kwargs)
- def _matching_text_segments(self):
- return TextSegment.filter_by_entity_occurrence_kind_pair(
- self.right_entity_kind, self.left_entity_kind)
- def labeled_neighbor(self, obj, judge, back=False):
- """Returns the id of the "closest" labeled object to the one provided.
- Notes:
- - "Closest" means closest by id number.
- - Works both for TextSegment and for IEDocument.
- - If back is True the previous item is picked, otherwise the next one.
- - The provided obj is assumed to already HAVE labeled evidence. If it
- doesn't, what comes "next" cannot be determined; in that case the id of
- the last labeled object is returned.
- - If asking for "next" and obj is currently the last, its own id is returned.
- - If asking for "prev" and obj is currently the first, its own id is returned.
- """
- filters = dict(
- judge__isnull=False,
- label__isnull=False,
- relation=self,
- )
- if judge is not None:
- filters["judge"] = judge
- judge_labels = EvidenceLabel.objects.filter(**filters)
- if isinstance(obj, TextSegment):
- segments = self._matching_text_segments()
- segments = segments.filter(evidence_relations__labels__relation=self)
- candidates_with_label = judge_labels.values_list("evidence_candidate__segment", flat=True)
- segments = segments.filter(id__in=candidates_with_label).distinct()
- ids = list(segments.values_list('id', flat=True).order_by('id'))
- elif isinstance(obj, IEDocument):
- ids = sorted(set(judge_labels.values_list(
- 'evidence_candidate__segment__document_id', flat=True)
- ))
- else:
- ids = []
- if not ids:
- return None
- try:
- base_idx = ids.index(obj.id)
- except ValueError:
- # The base object provided is not on the list; returning the last one
- return ids[-1]
- else:
- if back:
- if base_idx == 0:
- # there is no previous one. Returning same.
- return obj.id
- else:
- return ids[base_idx - 1]
- else:
- if base_idx == len(ids) - 1:
- # there is no next one. Returning same.
- return obj.id
- else:
- return ids[base_idx + 1]
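- # Worked example (ids assumed): with labeled ids [3, 7, 9] and obj.id == 7,
- # back=False returns 9 and back=True returns 3; with obj.id == 9 and
- # back=False, 9 itself is returned.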
- def get_next_segment_to_label(self, judge):
- # We'll pick first those Segments that already have questions created with
- # an empty answer (label=None). After finishing those, we'll look for
- # Segments never considered (i.e., that don't have any question created).
- # Finally, those with answers in place but where some answers were
- # "ask-me-later" (see NEED_RELABEL).
- segments = self._matching_text_segments().order_by('id')
- never_considered_segm = segments.exclude(evidence_relations__labels__relation=self)
- evidences = EvidenceCandidate.objects.filter(
- labels__relation=self
- ).order_by('segment_id')
- never_considered_ev = evidences.filter(labels__isnull=True)
- existent_labels = EvidenceLabel.objects.filter(
- evidence_candidate__in=evidences,
- labeled_by_machine=False
- ).order_by('evidence_candidate__segment_id')
- none_labels = existent_labels.filter(label__isnull=True)
- own_none_labels = none_labels.filter(judge=judge)
- # Requires re-answering if there's no good answer at all (not just from this judge)
- NOT_NEED_RELABEL = [k for k, name in EvidenceLabel.LABEL_CHOICES
- if k not in EvidenceLabel.NEED_RELABEL]
- to_re_answer = evidences.exclude(labels__label__in=NOT_NEED_RELABEL)
- for qset in [own_none_labels, never_considered_ev, never_considered_segm,
- to_re_answer, none_labels]:
- try:
- obj = qset[0]
- except IndexError:
- pass
- else:
- if isinstance(obj, TextSegment):
- return obj
- elif isinstance(obj, EvidenceCandidate):
- return obj.segment
- elif isinstance(obj, EvidenceLabel):
- return obj.evidence_candidate.segment
- else:
- raise ValueError
- return None
- def get_next_document_to_label(self, judge):
- next_segment = self.get_next_segment_to_label(judge)
- if next_segment is None:
- return None
- else:
- return next_segment.document
- class EvidenceCandidate(BaseModel):
- left_entity_occurrence = models.ForeignKey(
- 'EntityOccurrence',
- related_name='left_evidence_relations'
- )
- right_entity_occurrence = models.ForeignKey(
- 'EntityOccurrence',
- related_name='right_evidence_relations'
- )
- segment = models.ForeignKey('TextSegment', related_name='evidence_relations')
- class Meta(BaseModel.Meta):
- ordering = [
- 'left_entity_occurrence', 'right_entity_occurrence',
- 'segment_id',
- ]
- unique_together = [
- 'left_entity_occurrence', 'right_entity_occurrence',
- 'segment',
- ]
- def __str__(self):
- s = "Candidate evidence (id {})"
- return s.format(
- self.pk
- )
- def get_or_create_label_for_judge(self, relation, judge):
- obj, created = EvidenceLabel.objects.get_or_create(
- relation=relation,
- evidence_candidate=self, judge=judge,
- labeled_by_machine=False, defaults={'label': None})
- return obj
- def set_label(self, relation, label, judge, labeled_by_machine=False):
- evidence_label, created = EvidenceLabel.objects.get_or_create(
- relation=relation,
- evidence_candidate=self,
- judge=judge,
- labeled_by_machine=labeled_by_machine
- )
- evidence_label.label = label
- evidence_label.save()
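- # Illustrative call (judge is a plain username string; see EvidenceLabel):
- #   candidate.set_label(relation, EvidenceLabel.YESRELATION, judge='alice')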
- class EvidenceLabel(BaseModel):
- NORELATION = "NO"
- YESRELATION = "YE"
- SKIP = "SK"
- NONSENSE = "NS"
- LABEL_CHOICES = (
- (YESRELATION, "Yes, relation is present"),
- (NORELATION, "No relation present"),
- (NONSENSE, "Evidence is nonsense"),
- (SKIP, "Skipped labeling of this evidence"),
- )
- NEED_RELABEL = (
- # Labels that mean it would be good to ask again. Note the trailing
- # comma: this must be a tuple (not a bare parenthesized string) for the
- # membership checks done elsewhere.
- SKIP,
- )
- evidence_candidate = models.ForeignKey(
- 'EvidenceCandidate',
- related_name='labels'
- )
- label = models.CharField(
- max_length=2, choices=LABEL_CHOICES,
- default=SKIP, null=True, blank=False
- )
- relation = models.ForeignKey('Relation', related_name='relation_labels', null=True, blank=True)
- modification_date = models.DateTimeField(auto_now=True)
- # The judge field is meant to be the username of the person that decides
- # the label of this evidence. It's not modelled as a foreign key to allow
- # easier interaction with non-django code.
- judge = models.CharField(max_length=CHAR_MAX_LENGHT)
- labeled_by_machine = models.BooleanField(default=True)
- class Meta(BaseModel.Meta):
- unique_together = ['evidence_candidate', 'label', 'judge', 'relation']
- def __str__(self):
- s = "'{}' by '{}' in '{}'"
- return s.format(
- self.modification_date,
- self.judge,
- self.evidence_candidate.id,
- )
- class SegmentToTag(BaseModel):
- segment = models.ForeignKey("TextSegment")
- relation = models.ForeignKey("Relation")
- done = models.BooleanField(default=False)
- modification_date = models.DateTimeField(auto_now=True)
- class Meta(BaseModel.Meta):
- unique_together = ['segment', 'relation']
- class GazetteItem(BaseModel):
- kind = models.ForeignKey(EntityKind)
- text = models.CharField(max_length=CHAR_MAX_LENGHT, blank=False, unique=True)
- from_freebase = models.CharField(max_length=CHAR_MAX_LENGHT, blank=False)
- def __str__(self):
- return "'{}' ({})".format(self.text, self.kind.name)
|