# This module is the nexus/connection between the UI definitions (django models)
# and the IEPY models. Modifications of this file should be done with
# awareness of this dual impact.
from datetime import datetime
import itertools
import logging
from operator import attrgetter
from collections import namedtuple, defaultdict

from django.db import models
import jsonfield

from iepy.utils import unzip
from iepy.webui.corpus.fields import ListField, ListSyntacticTreeField

CHAR_MAX_LENGHT = 256

logger = logging.getLogger(__name__)

RichToken = namedtuple("RichToken", "token lemma pos eo_ids eo_kinds offset")
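
# Illustration (hypothetical values): a RichToken bundles per-token data with the
# entity occurrences covering that token, e.g.
#   RichToken(token='Paris', lemma='Paris', pos='NNP',
#             eo_ids=[17], eo_kinds=[<EntityKind: LOCATION>], offset=42)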


class BaseModel(models.Model):
    class Meta:
        abstract = True
        app_label = 'corpus'  # Name of the django app.


class EntityKind(BaseModel):
    # There's a fixture declaring an initial set of Entity Kinds, containing
    # PERSON, LOCATION and ORGANIZATION.
    name = models.CharField(max_length=CHAR_MAX_LENGHT, unique=True)

    class Meta(BaseModel.Meta):
        ordering = ['name']

    def __str__(self):
        return self.name


class Payroll(BaseModel):
    user = models.CharField(max_length=CHAR_MAX_LENGHT, db_index=True)
    begin_time = models.CharField(max_length=10, db_index=True)
    end_time = models.CharField(max_length=10, db_index=True)
    doc_count = models.IntegerField()
    t_count = models.IntegerField()
    r_count = models.IntegerField()
    wage = models.FloatField()
    _yield = models.FloatField(default=0)
    account = models.FloatField(default=0)

    class Meta(BaseModel.Meta):
        ordering = ['user']


class Entity(BaseModel):
    # The "key" IS the "canonical form". Aliases are stored on
    # Entity Occurrences.
    key = models.CharField(max_length=CHAR_MAX_LENGHT)
    kind = models.ForeignKey(EntityKind)
    gazette = models.ForeignKey(
        "GazetteItem", on_delete=models.SET_NULL,
        blank=True, null=True
    )

    class Meta(BaseModel.Meta):
        ordering = ['kind', 'key']
        unique_together = (('key', 'kind'), )

    def __str__(self):
        return '%s (%s)' % (self.key, self.kind.name)


class IEDocumentMetadata(BaseModel):
    title = models.CharField(max_length=CHAR_MAX_LENGHT, blank=True)
    url = models.URLField(blank=True)
    items = jsonfield.JSONField(blank=True)

    def __str__(self):
        try:
            doc_id = self.document.id
        except IEDocument.DoesNotExist:
            doc_id = 'None'
        return '<Metadata of IEDocument {0}>'.format(doc_id)


class LabeledIEDocumentMetadata(BaseModel):
    title = models.CharField(max_length=CHAR_MAX_LENGHT, blank=True)
    url = models.URLField(blank=True)
    items = jsonfield.JSONField(blank=True)

    def __str__(self):
        try:
            doc_id = self.document.id
        except LabeledIEDocument.DoesNotExist:
            doc_id = 'None'
        return '<Metadata of LabeledIEDocument {0}>'.format(doc_id)


class IEDocument(BaseModel):
    metadata = models.OneToOneField('IEDocumentMetadata', related_name='document',
                                    on_delete=models.PROTECT)
    human_identifier = models.CharField(
        max_length=CHAR_MAX_LENGHT,
        unique=True,
        db_index=True
    )
    sourcetext = models.TextField(null=True)
    edituser = models.TextField(null=True, db_index=True)
    edittime = models.DateTimeField(null=True, blank=True, db_index=True)
    reedittime = models.DateTimeField(null=True, blank=True, db_index=True)
    brat_done_at = models.DateTimeField(null=True, blank=True, db_index=True)
    text = models.TextField()
    creation_date = models.DateTimeField(auto_now_add=True)
    pre_label = models.TextField(blank=True)
    deleted = models.IntegerField(default=0)
    added = models.IntegerField(default=0)

    # The following 3 lists have 1 item per token
    tokens = ListField(blank=True)  # strings
    lemmas = ListField(blank=True)  # strings
    postags = ListField(blank=True)  # strings

    offsets_to_text = ListField(blank=True)  # ints, character offset for tokens, lemmas and postags
    syntactic_sentences = ListSyntacticTreeField(blank=True, editable=False)
    sentences = ListField(blank=True)  # ints, it's a list of token-offsets
    jump_signal = models.fields.IntegerField(default=0)

    # Reversed fields:
    # entity_occurrences = Reversed ForeignKey of EntityOccurrence
    # segments = Reversed ForeignKey of TextSegment

    # Metadata annotations that are computed while traveling the pre-process pipeline
    tokenization_done_at = models.DateTimeField(null=True, blank=True)
    lemmatization_done_at = models.DateTimeField(null=True, blank=True)
    sentencer_done_at = models.DateTimeField(null=True, blank=True)
    tagging_done_at = models.DateTimeField(null=True, blank=True)
    ner_done_at = models.DateTimeField(null=True, blank=True)
    segmentation_done_at = models.DateTimeField(null=True, blank=True)
    syntactic_parsing_done_at = models.DateTimeField(null=True, blank=True)

    class Meta(BaseModel.Meta):
        ordering = ['id', ]

    def __str__(self):
        return '<IEDocument {0}>'.format(self.human_identifier)

    def get_sentences(self, enriched=False):
        """Iterates over the sentences, each sentence being a list of tokens
        (or of RichTokens, when `enriched` is True).
        """
        tokens = self.tokens
        lemmas = self.lemmas
        postags = self.postags
        sentences = self.sentences
        start = 0
        eos = list(self.get_entity_occurrences())
        tkn_offset = 0
        for end in sentences[1:]:
            if enriched:
                rich_tokens = []
                for token, lemma, postag in zip(
                    tokens[start:end], lemmas[start:end], postags[start:end]
                ):
                    tkn_eos = [eo for eo in eos if eo.offset <= tkn_offset < eo.offset_end]
                    rich_tokens.append(RichToken(
                        token=token,
                        lemma=lemma,
                        pos=postag,
                        eo_ids=[eo.id for eo in tkn_eos],
                        eo_kinds=[eo.entity.kind for eo in tkn_eos],
                        offset=tkn_offset,
                    ))
                    tkn_offset += 1
                yield rich_tokens
            else:
                yield tokens[start:end]
            start = end
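
    # Usage sketch (hypothetical data, assuming a preprocessed document `doc`):
    #   doc.tokens = ['John', 'lives', 'here', '.']
    #   doc.sentences = [0, 4]
    #   list(doc.get_sentences())  # -> [['John', 'lives', 'here', '.']]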

    def get_entity_occurrences(self):
        """Returns an iterable of EntityOccurrences, sorted by offset."""
        return self.entity_occurrences.all().order_by('offset')

    def get_text_segments(self):
        """Returns an iterable of TextSegments, sorted by offset."""
        return self.segments.all().order_by('offset')

    ### Methods used for preprocess ###

    def was_preprocess_step_done(self, step):
        return getattr(self, '%s_done_at' % step.name) is not None

    def set_tokenization_result(self, offsets, tokens):
        """Stores the tokenization result in the internal storage format."""
        if not isinstance(offsets, list):
            raise ValueError("Tokenization result must be a pair of lists: "
                             "token character-offsets (ints) and token strings.")
        if not isinstance(tokens, list):
            raise ValueError("Tokenization result must be a pair of lists: "
                             "token character-offsets (ints) and token strings.")
        self.tokens = list(tokens)
        self.offsets_to_text = list(offsets)
        self.tokenization_done_at = datetime.now()
        return self
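
    # Call sketch (hypothetical values; each offset is the character position
    # of the corresponding token within `text`):
    #   doc.set_tokenization_result(
    #       offsets=[0, 5, 11, 15],
    #       tokens=['John', 'lives', 'here', '.'],
    #   ).save()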

    def set_text(self, value):
        self.text = str(value)
        return self

    def set_lemmatization_result(self, value):
        if len(value) != len(self.tokens):
            raise ValueError(
                'Lemmatization result must have the same cardinality as tokens'
            )
        self.lemmas = list(value)
        self.lemmatization_done_at = datetime.now()
        return self

    def set_sentencer_result(self, value):
        if not isinstance(value, list):
            raise ValueError("Sentencer result must be a list.")
        if not all(isinstance(x, int) for x in value):
            raise ValueError('Sentencer result shall only contain ints: %r' % value)
        if sorted(value) != value:
            raise ValueError('Sentencer result shall be ordered.')
        if len(set(value)) < len(value):
            raise ValueError(
                'Sentencer result shall not contain duplicates.')
        if value[0] != 0:
            raise ValueError(
                'Sentencer result must start with 0. Actual=%r' % value[0])
        # if value[-1] != len(self.tokens):
        if value[-1] != len(self.text):
            raise ValueError(
                'Sentencer result must end with the text length=%d. Actual=%r' % (
                    len(self.text), value[-1]))
        self.sentences = value
        self.sentencer_done_at = datetime.now()
        return self
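
    # Boundary sketch (hypothetical): for a document whose text length is 48,
    # a valid value would be [0, 27, 48] -- strictly increasing, starting at 0
    # and ending at the final boundary the last check above expects.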

    def set_tagging_result(self, value):
        if len(value) != len(self.tokens):
            raise ValueError(
                'Tagging result must have the same cardinality as tokens')
        self.postags = value
        self.tagging_done_at = datetime.now()
        return self

    def set_syntactic_parsing_result(self, parsed_sentences):
        if len(parsed_sentences) != len(list(self.get_sentences())):
            raise ValueError(
                'Syntactic parsing result must have the same cardinality as sentences'
            )
        self.syntactic_sentences = parsed_sentences
        self.syntactic_parsing_done_at = datetime.now()
        return self

    def set_ner_result(self, value):
        # Before doing anything else, run a basic offset validation
        def feo_has_issues(feo):
            return (feo.offset < 0 or feo.offset >= feo.offset_end
                    or feo.offset > len(self.tokens))
        invalids = [x for x in value if feo_has_issues(x)]
        if invalids:
            raise ValueError('Invalid FoundEvidences: {}'.format(invalids))
        existents = defaultdict(list)
        eo_clash_key = lambda x: (x.offset, x.offset_end)
        for eo in self.entity_occurrences.all():
            existents[eo_clash_key(eo)].append(eo)
        # No issues; let's create them
        for found_entity in value:
            key, kind_name, alias, offset, offset_end, from_gazette = found_entity
            if (offset, offset_end) in existents:
                skip = False
                for existent in existents[offset, offset_end]:
                    is_from_gazette = existent.entity.gazette is not None
                    is_same_kind = existent.entity.kind.name == kind_name
                    if is_from_gazette or is_same_kind:
                        skip = True
                        break
                if skip:
                    continue
            kind, _ = EntityKind.objects.get_or_create(name=kind_name)
            if from_gazette:
                gazette_item = GazetteItem.objects.get(text=key, kind=kind)
                entity, created = Entity.objects.get_or_create(
                    key=key, kind=kind,
                    gazette=gazette_item
                )
            else:
                entity, created = Entity.objects.get_or_create(key=key, kind=kind)
            if len(alias) > CHAR_MAX_LENGHT:
                alias_ = alias[:CHAR_MAX_LENGHT]
                logger.warning('Alias "%s" truncated to "%s"', alias, alias_)
                alias = alias_
            obj, created = EntityOccurrence.objects.get_or_create(
                document=self,
                entity=entity,
                offset=offset,
                offset_end=offset_end,
                alias=alias
            )
            if created:
                existents[eo_clash_key(obj)].append(obj)
        self.ner_done_at = datetime.now()
        return self
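
    # Input sketch (hypothetical): each item of `value` unpacks as a 6-tuple,
    #   ('France', 'LOCATION', 'France', 7, 8, True)
    # i.e. (key, kind_name, alias, token offset, token offset_end, from_gazette).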

    def set_segmentation_result(self, value, increment=True, override=False):
        if override:
            self.segments.all().delete()
            logger.info('Previous segments removed')
        get_offsets = attrgetter('offset', 'offset_end')
        value = sorted(value, key=get_offsets)
        logger.info('About to set %s segments for current doc', len(value))
        doc_ent_occurrences = list(self.entity_occurrences.all())
        currents = set(self.segments.all().values_list('offset', 'offset_end'))
        new_segs = []
        for raw_segment in value:
            if (raw_segment.offset, raw_segment.offset_end) in currents:
                continue
            _segm = TextSegment(
                document=self, offset=raw_segment.offset,
                offset_end=raw_segment.offset_end)
            new_segs.append((_segm, raw_segment))
        if new_segs:
            TextSegment.objects.bulk_create(list(zip(*new_segs))[0])
            logger.info('New %s segments created', len(new_segs))
        # And now, take care of setting Entity Occurrences
        doc_segments = dict((get_offsets(s), s) for s in self.segments.all())
        for _segm, raw_segment in new_segs:
            segm = doc_segments[get_offsets(_segm)]
            if raw_segment.entity_occurrences is None:
                # Entity Occurrences not provided; need to compute them
                segm.entity_occurrences = [
                    eo for eo in doc_ent_occurrences
                    if eo.offset >= segm.offset
                    and eo.offset_end <= segm.offset_end
                ]
            else:
                segm.entity_occurrences = raw_segment.entity_occurrences
        self.segmentation_done_at = datetime.now()
        return self
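
    # Each raw segment in `value` is expected to expose `offset`, `offset_end`
    # and `entity_occurrences` (the latter may be None, in which case the
    # occurrences are computed above from the document's own ones).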


class LabeledIEDocument(BaseModel):
    # Mirrors the structure and behavior of IEDocument (above), but stores
    # labeled documents in a separate table.
    metadata = models.OneToOneField('LabeledIEDocumentMetadata', related_name='document',
                                    on_delete=models.PROTECT)
    human_identifier = models.CharField(
        max_length=CHAR_MAX_LENGHT,
        unique=True
    )
    sourcetext = models.TextField(null=True)
    edituser = models.TextField(null=True)
    edittime = models.DateTimeField(null=True, blank=True)
    reedittime = models.DateTimeField(null=True, blank=True)
    brat_done_at = models.DateTimeField(null=True, blank=True)
    text = models.TextField()
    creation_date = models.DateTimeField(auto_now_add=True)

    # The following 3 lists have 1 item per token
    tokens = ListField(blank=True)  # strings
    lemmas = ListField(blank=True)  # strings
    postags = ListField(blank=True)  # strings

    offsets_to_text = ListField(blank=True)  # ints, character offset for tokens, lemmas and postags
    syntactic_sentences = ListSyntacticTreeField(blank=True, editable=False)
    sentences = ListField(blank=True)  # ints, it's a list of token-offsets
    jump_signal = models.fields.IntegerField(default=0)

    # Reversed fields:
    # entity_occurrences = Reversed ForeignKey of EntityOccurrence
    # segments = Reversed ForeignKey of TextSegment

    # Metadata annotations that are computed while traveling the pre-process pipeline
    tokenization_done_at = models.DateTimeField(null=True, blank=True)
    lemmatization_done_at = models.DateTimeField(null=True, blank=True)
    sentencer_done_at = models.DateTimeField(null=True, blank=True)
    tagging_done_at = models.DateTimeField(null=True, blank=True)
    ner_done_at = models.DateTimeField(null=True, blank=True)
    segmentation_done_at = models.DateTimeField(null=True, blank=True)
    syntactic_parsing_done_at = models.DateTimeField(null=True, blank=True)

    class Meta(BaseModel.Meta):
        ordering = ['id', ]

    def __str__(self):
        return '<LabeledIEDocument {0}>'.format(self.human_identifier)

    def get_sentences(self, enriched=False):
        """Iterates over the sentences, each sentence being a list of tokens
        (or of RichTokens, when `enriched` is True).
        """
        tokens = self.tokens
        lemmas = self.lemmas
        postags = self.postags
        sentences = self.sentences
        start = 0
        eos = list(self.get_entity_occurrences())
        tkn_offset = 0
        for end in sentences[1:]:
            if enriched:
                rich_tokens = []
                for token, lemma, postag in zip(
                    tokens[start:end], lemmas[start:end], postags[start:end]
                ):
                    tkn_eos = [eo for eo in eos if eo.offset <= tkn_offset < eo.offset_end]
                    rich_tokens.append(RichToken(
                        token=token,
                        lemma=lemma,
                        pos=postag,
                        eo_ids=[eo.id for eo in tkn_eos],
                        eo_kinds=[eo.entity.kind for eo in tkn_eos],
                        offset=tkn_offset,
                    ))
                    tkn_offset += 1
                yield rich_tokens
            else:
                yield tokens[start:end]
            start = end

    def get_entity_occurrences(self):
        """Returns an iterable of EntityOccurrences, sorted by offset."""
        return self.entity_occurrences.all().order_by('offset')

    def get_text_segments(self):
        """Returns an iterable of TextSegments, sorted by offset."""
        return self.segments.all().order_by('offset')

    ### Methods used for preprocess ###

    def was_preprocess_step_done(self, step):
        return getattr(self, '%s_done_at' % step.name) is not None

    def set_tokenization_result(self, offsets, tokens):
        """Stores the tokenization result in the internal storage format."""
        if not isinstance(offsets, list):
            raise ValueError("Tokenization result must be a pair of lists: "
                             "token character-offsets (ints) and token strings.")
        if not isinstance(tokens, list):
            raise ValueError("Tokenization result must be a pair of lists: "
                             "token character-offsets (ints) and token strings.")
        self.tokens = list(tokens)
        self.offsets_to_text = list(offsets)
        self.tokenization_done_at = datetime.now()
        return self

    def set_text(self, value):
        self.text = str(value)
        return self

    def set_lemmatization_result(self, value):
        if len(value) != len(self.tokens):
            raise ValueError(
                'Lemmatization result must have the same cardinality as tokens'
            )
        self.lemmas = list(value)
        self.lemmatization_done_at = datetime.now()
        return self

    def set_sentencer_result(self, value):
        if not isinstance(value, list):
            raise ValueError("Sentencer result must be a list.")
        if not all(isinstance(x, int) for x in value):
            raise ValueError('Sentencer result shall only contain ints: %r' % value)
        if sorted(value) != value:
            raise ValueError('Sentencer result shall be ordered.')
        if len(set(value)) < len(value):
            raise ValueError(
                'Sentencer result shall not contain duplicates.')
        if value[0] != 0:
            raise ValueError(
                'Sentencer result must start with 0. Actual=%r' % value[0])
        # if value[-1] != len(self.tokens):
        if value[-1] != len(self.text):
            raise ValueError(
                'Sentencer result must end with the text length=%d. Actual=%r' % (
                    len(self.text), value[-1]))
        self.sentences = value
        self.sentencer_done_at = datetime.now()
        return self

    def set_tagging_result(self, value):
        if len(value) != len(self.tokens):
            raise ValueError(
                'Tagging result must have the same cardinality as tokens')
        self.postags = value
        self.tagging_done_at = datetime.now()
        return self

    def set_syntactic_parsing_result(self, parsed_sentences):
        if len(parsed_sentences) != len(list(self.get_sentences())):
            raise ValueError(
                'Syntactic parsing result must have the same cardinality as sentences'
            )
        self.syntactic_sentences = parsed_sentences
        self.syntactic_parsing_done_at = datetime.now()
        return self

    def set_ner_result(self, value):
        # Before doing anything else, run a basic offset validation
        def feo_has_issues(feo):
            return (feo.offset < 0 or feo.offset >= feo.offset_end
                    or feo.offset > len(self.tokens))
        invalids = [x for x in value if feo_has_issues(x)]
        if invalids:
            raise ValueError('Invalid FoundEvidences: {}'.format(invalids))
        existents = defaultdict(list)
        eo_clash_key = lambda x: (x.offset, x.offset_end)
        for eo in self.entity_occurrences.all():
            existents[eo_clash_key(eo)].append(eo)
        # No issues; let's create them
        for found_entity in value:
            key, kind_name, alias, offset, offset_end, from_gazette = found_entity
            if (offset, offset_end) in existents:
                skip = False
                for existent in existents[offset, offset_end]:
                    is_from_gazette = existent.entity.gazette is not None
                    is_same_kind = existent.entity.kind.name == kind_name
                    if is_from_gazette or is_same_kind:
                        skip = True
                        break
                if skip:
                    continue
            kind, _ = EntityKind.objects.get_or_create(name=kind_name)
            if from_gazette:
                gazette_item = GazetteItem.objects.get(text=key, kind=kind)
                entity, created = Entity.objects.get_or_create(
                    key=key, kind=kind,
                    gazette=gazette_item
                )
            else:
                entity, created = Entity.objects.get_or_create(key=key, kind=kind)
            if len(alias) > CHAR_MAX_LENGHT:
                alias_ = alias[:CHAR_MAX_LENGHT]
                logger.warning('Alias "%s" truncated to "%s"', alias, alias_)
                alias = alias_
            obj, created = EntityOccurrence.objects.get_or_create(
                document=self,
                entity=entity,
                offset=offset,
                offset_end=offset_end,
                alias=alias
            )
            if created:
                existents[eo_clash_key(obj)].append(obj)
        self.ner_done_at = datetime.now()
        return self

    def set_segmentation_result(self, value, increment=True, override=False):
        if override:
            self.segments.all().delete()
            logger.info('Previous segments removed')
        get_offsets = attrgetter('offset', 'offset_end')
        value = sorted(value, key=get_offsets)
        logger.info('About to set %s segments for current doc', len(value))
        doc_ent_occurrences = list(self.entity_occurrences.all())
        currents = set(self.segments.all().values_list('offset', 'offset_end'))
        new_segs = []
        for raw_segment in value:
            if (raw_segment.offset, raw_segment.offset_end) in currents:
                continue
            _segm = TextSegment(
                document=self, offset=raw_segment.offset,
                offset_end=raw_segment.offset_end)
            new_segs.append((_segm, raw_segment))
        if new_segs:
            TextSegment.objects.bulk_create(list(zip(*new_segs))[0])
            logger.info('New %s segments created', len(new_segs))
        # And now, take care of setting Entity Occurrences
        doc_segments = dict((get_offsets(s), s) for s in self.segments.all())
        for _segm, raw_segment in new_segs:
            segm = doc_segments[get_offsets(_segm)]
            if raw_segment.entity_occurrences is None:
                # Entity Occurrences not provided; need to compute them
                segm.entity_occurrences = [
                    eo for eo in doc_ent_occurrences
                    if eo.offset >= segm.offset
                    and eo.offset_end <= segm.offset_end
                ]
            else:
                segm.entity_occurrences = raw_segment.entity_occurrences
        self.segmentation_done_at = datetime.now()
        return self


class EntityOccurrence(BaseModel):
    """Models the occurrence of a particular Entity in a Document."""
    entity = models.ForeignKey('Entity')
    document = models.ForeignKey('IEDocument', related_name='entity_occurrences')
    segments = models.ManyToManyField('TextSegment', related_name='entity_occurrences')

    # Offset in tokens with respect to the document
    offset = models.IntegerField()  # offset of the first token included in the occurrence
    offset_end = models.IntegerField()  # offset of the first token NOT included

    # Hydrated fields: same as the offsets above, but with respect to a segment
    # segment_offset = IntegerField
    # segment_offset_end = IntegerField

    # Text of the occurrence; if it differs from the canonical form, it's easy to see
    alias = models.CharField(max_length=CHAR_MAX_LENGHT)
    anaphora = models.BooleanField(default=False)  # Is it a Named Entity or an anaphora?

    class Meta(BaseModel.Meta):
        ordering = ['document', 'offset', 'offset_end']
        unique_together = ['entity', 'document', 'offset', 'offset_end']

    def __str__(self):
        return '{0} ({1}, {2})'.format(self.entity.key, self.offset, self.offset_end)

    def hydrate_for_segment(self, segment):
        # Creates some in-memory attributes relative to the segment
        self.segment_offset = self.offset - segment.offset
        self.segment_offset_end = self.offset_end - segment.offset
        return self
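
    # Arithmetic sketch (hypothetical): an occurrence spanning document tokens
    # 10..12, hydrated for a segment starting at token 8, ends up with
    # segment_offset == 2 and segment_offset_end == 4.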


class TextSegment(BaseModel):
    document = models.ForeignKey('IEDocument', related_name='segments', db_index=True)

    # Offset in tokens with respect to the document. They represent:
    # - offset: index of the first token included in the segment
    # - offset_end: index of the first token NOT included in the segment
    offset = models.IntegerField(db_index=True)
    offset_end = models.IntegerField(db_index=True)

    # Reversed fields:
    # entity_occurrences = Reversed ManyToManyField of EntityOccurrence

    class Meta(BaseModel.Meta):
        ordering = ['document', 'offset', 'offset_end']
        unique_together = ['document', 'offset', 'offset_end']

    def __str__(self):
        # return u'{0}'.format(' '.join(self.tokens))  # TODO: no tokens
        return u'({0} {1})'.format(self.offset, self.offset_end)

    def hydrate(self, document_on_ram=None):
        # Using the segment offsets and the data of the document itself, construct
        # in-memory attributes for the segment.
        # If "document_on_ram" is provided, it's used instead of querying the DB.
        if getattr(self, '_hydrated', False):
            return self
        if document_on_ram is not None:
            assert document_on_ram.pk == self.document_id
            doc = document_on_ram
        else:
            doc = self.document
        self.tokens = doc.tokens[self.offset: self.offset_end]
        self.lemmas = doc.lemmas[self.offset: self.offset_end]
        self.postags = doc.postags[self.offset: self.offset_end]
        self.offsets_to_text = doc.offsets_to_text[self.offset: self.offset_end]
        if self.offsets_to_text:
            # Grab the text except for the last token
            self.text = doc.text[self.offsets_to_text[0]:
                                 doc.offsets_to_text[self.offset_end - 1]]
            # and now append the "pure" last token.
            self.text += self.tokens[-1]
        else:
            self.text = ""
        self.sentences = [i - self.offset for i in doc.sentences
                          if i >= self.offset and i < self.offset_end]
        self.syntactic_sentences = [doc.syntactic_sentences[s] for s in self.sentences]
        self._hydrated = True
        return self

    def get_entity_occurrences(self):
        """Returns an iterable of EntityOccurrences, sorted by offset."""
        eos = getattr(self, '_hydrated_eos', None)
        if eos is None:
            eos = [eo.hydrate_for_segment(self) for eo in
                   self.entity_occurrences.all().order_by('offset')]
            self._hydrated_eos = eos
        return eos

    def get_evidences_for_relation(self, relation, existent=None):
        # Gets or creates Labeled Evidences (when creating, the label is empty)
        lkind = relation.left_entity_kind
        rkind = relation.right_entity_kind
        # For performance's sake, first grab all existent ones; any missing
        # are created afterwards
        if existent is None:
            existent = EvidenceCandidate.objects.filter(segment=self, labels__relation=relation)
        existent = existent.select_related(
            'left_entity_occurrence', 'right_entity_occurrence')
        existent = {
            (ec.left_entity_occurrence_id, ec.right_entity_occurrence_id): ec
            for ec in existent
        }  # dict of existent evidence-candidates, indexed by left and right EO ids
        for l_eo, r_eo in self.kind_occurrence_pairs(lkind, rkind):
            if (l_eo.pk, r_eo.pk) in existent:
                yield existent[(l_eo.pk, r_eo.pk)]
                continue
            obj, created = EvidenceCandidate.objects.get_or_create(
                left_entity_occurrence=l_eo,
                right_entity_occurrence=r_eo,
                segment=self,
            )
            yield obj

    def entity_occurrence_pairs(self, e1, e2):
        eos = list(self.get_entity_occurrences())
        left = [eo for eo in eos if eo.entity == e1]
        right = [eo for eo in eos if eo.entity == e2]
        return [(l, r) for l, r in itertools.product(left, right) if l != r]

    def kind_occurrence_pairs(self, lkind, rkind):
        eos = list(self.get_entity_occurrences())
        left = [o for o in eos if o.entity.kind == lkind]
        right = [o for o in eos if o.entity.kind == rkind]
        return [(l, r) for l, r in itertools.product(left, right) if l != r]

    def get_enriched_tokens(self):
        translation_dict = {'-LRB-': '(',
                            '-RRB-': ')'}
        eos = list(self.get_entity_occurrences())
        for tkn_offset, (tkn, lemma, postag) in enumerate(
                zip(self.tokens, self.lemmas, self.postags)):
            tkn_eos = [eo for eo in eos
                       if eo.segment_offset <= tkn_offset < eo.segment_offset_end]
            yield RichToken(
                token=translation_dict.get(tkn, tkn),
                lemma=lemma,
                pos=postag,
                eo_ids=[eo.id for eo in tkn_eos],
                eo_kinds=[eo.entity.kind for eo in tkn_eos],
                offset=self.offset + tkn_offset,
            )

    @classmethod
    def filter_by_entity_occurrence_kind_pair(cls, kind_a, kind_b):
        """Returns a queryset of TextSegments having at least one Entity
        Occurrence of the left entity kind, and at least one Entity Occurrence
        of the right entity kind. If the left and right kinds are the same, at
        least two occurrences are expected."""
        # This could be implemented as a Manager method, but for simplicity it
        # is kept here as a classmethod.
        matching_segms = TextSegment.objects.filter(
            entity_occurrences__entity__kind=kind_a).distinct()
        if kind_a == kind_b:
            # BE CAREFUL!!! There is a very subtle detail here. After the first
            # filter (before entering this if-branch), the Django ORM gave us
            # <TextSegments> whose "entity_occurrences" are not all of them, but
            # only those matching the criteria expressed above. Because of that,
            # when annotating with Count below, we trust it is counting the
            # EOccurrences of the kind we are interested in, and not the others.
            matching_segms = matching_segms.annotate(
                kind_count=models.Count('entity_occurrences__entity__kind')
            ).filter(kind_count__gte=2)
        else:
            matching_segms = matching_segms.filter(
                entity_occurrences__entity__kind=kind_b,
            ).distinct()
        return matching_segms
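
    # Usage sketch (hypothetical kinds, taken from the default fixture):
    #   person = EntityKind.objects.get(name='PERSON')
    #   location = EntityKind.objects.get(name='LOCATION')
    #   segments = TextSegment.filter_by_entity_occurrence_kind_pair(person, location)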


class Relation(BaseModel):
    name = models.CharField(max_length=CHAR_MAX_LENGHT)
    left_entity_kind = models.ForeignKey('EntityKind', related_name='left_relations')
    right_entity_kind = models.ForeignKey('EntityKind', related_name='right_relations')

    # Reversed fields:
    # evidence_relations = Reversed ForeignKey of EvidenceCandidate

    class Meta(BaseModel.Meta):
        ordering = ['name', 'left_entity_kind', 'right_entity_kind']
        unique_together = ['name', 'left_entity_kind', 'right_entity_kind']

    def __str__(self):
        return '{}({}, {})'.format(self.name, self.left_entity_kind,
                                   self.right_entity_kind)

    def save(self, *args, **kwargs):
        if self.pk:
            # Object already exists; this is a modification
            original_obj = Relation.objects.get(pk=self.pk)
            for fname in ['left_entity_kind', 'right_entity_kind']:
                if getattr(original_obj, fname) != getattr(self, fname):
                    raise ValueError("Relation kinds can't be modified after creation")
        return super(Relation, self).save(*args, **kwargs)

    def _matching_text_segments(self):
        return TextSegment.filter_by_entity_occurrence_kind_pair(
            self.right_entity_kind, self.left_entity_kind)

    def labeled_neighbor(self, obj, judge, back=False):
        """Returns the id of the "closest" labeled object to the one provided.

        Notes:
        - By "closest" we mean the distance between the id numbers.
        - Works both for TextSegment and for IEDocument.
        - If back is True, the previous item is picked; otherwise, the next one.
        - It's assumed that the provided obj already HAS labeled evidence. If not,
          it's not possible to determine which one is next; in such a case, the id
          of the last labeled object is returned.
        - If asking for "next" and obj is currently the last one, its id is returned.
        - If asking for "prev" and obj is currently the first one, its id is returned.
        """
        filters = dict(
            judge__isnull=False,
            label__isnull=False,
            relation=self,
        )
        if judge is not None:
            filters["judge"] = judge
        judge_labels = EvidenceLabel.objects.filter(**filters)
        if isinstance(obj, TextSegment):
            segments = self._matching_text_segments()
            segments = segments.filter(evidence_relations__labels__relation=self)
            candidates_with_label = judge_labels.values_list(
                "evidence_candidate__segment", flat=True)
            segments = segments.filter(id__in=candidates_with_label).distinct()
            ids = list(segments.values_list('id', flat=True).order_by('id'))
        elif isinstance(obj, IEDocument):
            ids = sorted(set(judge_labels.values_list(
                'evidence_candidate__segment__document_id', flat=True)
            ))
        else:
            ids = []
        if not ids:
            return None
        try:
            base_idx = ids.index(obj.id)
        except ValueError:
            # The provided base-object is not listed; returning the last one in the list
            return ids[-1]
        else:
            if back:
                if base_idx == 0:
                    # There is no previous one. Returning the same.
                    return obj.id
                else:
                    return ids[base_idx - 1]
            else:
                if base_idx == len(ids) - 1:
                    # There is no next one. Returning the same.
                    return obj.id
                else:
                    return ids[base_idx + 1]
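
    # Navigation sketch (hypothetical): with labeled segment ids [3, 7, 9] and
    # a segment whose id is 7, labeled_neighbor(segment, judge) returns 9,
    # while labeled_neighbor(segment, judge, back=True) returns 3.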

    def get_next_segment_to_label(self, judge):
        # We'll first pick those Segments that already have questions created but
        # with an empty answer (label=None). After finishing those, we'll look for
        # Segments never considered (ie, with no questions created at all).
        # Finally, those with answers in place, but where some answers are "ASK-ME-LATER".
        segments = self._matching_text_segments().order_by('id')
        never_considered_segm = segments.exclude(evidence_relations__labels__relation=self)
        evidences = EvidenceCandidate.objects.filter(
            labels__relation=self
        ).order_by('segment_id')
        never_considered_ev = evidences.filter(labels__isnull=True)
        existent_labels = EvidenceLabel.objects.filter(
            evidence_candidate__in=evidences,
            labeled_by_machine=False
        ).order_by('evidence_candidate__segment_id')
        none_labels = existent_labels.filter(label__isnull=True)
        own_none_labels = none_labels.filter(judge=judge)
        # An evidence requires re-answering if there's no good answer at all
        # (not just for this judge)
        NOT_NEED_RELABEL = [k for k, name in EvidenceLabel.LABEL_CHOICES
                            if k not in EvidenceLabel.NEED_RELABEL]
        to_re_answer = evidences.exclude(labels__label__in=NOT_NEED_RELABEL)
        for qset in [own_none_labels, never_considered_ev, never_considered_segm,
                     to_re_answer, none_labels]:
            try:
                obj = qset[0]
            except IndexError:
                pass
            else:
                if isinstance(obj, TextSegment):
                    return obj
                elif isinstance(obj, EvidenceCandidate):
                    return obj.segment
                elif isinstance(obj, EvidenceLabel):
                    return obj.evidence_candidate.segment
                else:
                    raise ValueError
        return None

    def get_next_document_to_label(self, judge):
        next_segment = self.get_next_segment_to_label(judge)
        if next_segment is None:
            return None
        else:
            return next_segment.document


class EvidenceCandidate(BaseModel):
    left_entity_occurrence = models.ForeignKey(
        'EntityOccurrence',
        related_name='left_evidence_relations'
    )
    right_entity_occurrence = models.ForeignKey(
        'EntityOccurrence',
        related_name='right_evidence_relations'
    )
    segment = models.ForeignKey('TextSegment', related_name='evidence_relations')

    class Meta(BaseModel.Meta):
        ordering = [
            'left_entity_occurrence', 'right_entity_occurrence',
            'segment_id',
        ]
        unique_together = [
            'left_entity_occurrence', 'right_entity_occurrence',
            'segment',
        ]

    def __str__(self):
        return "Candidate evidence (id {})".format(self.pk)

    def get_or_create_label_for_judge(self, relation, judge):
        obj, created = EvidenceLabel.objects.get_or_create(
            relation=relation,
            evidence_candidate=self, judge=judge,
            labeled_by_machine=False, defaults={'label': None})
        return obj

    def set_label(self, relation, label, judge, labeled_by_machine=False):
        evidence_label, created = EvidenceLabel.objects.get_or_create(
            relation=relation,
            evidence_candidate=self,
            judge=judge,
            labeled_by_machine=labeled_by_machine
        )
        evidence_label.label = label
        evidence_label.save()
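
    # Usage sketch (hypothetical candidate, relation and judge username):
    #   candidate.set_label(relation, EvidenceLabel.YESRELATION, judge='alice')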


class EvidenceLabel(BaseModel):
    NORELATION = "NO"
    YESRELATION = "YE"
    SKIP = "SK"
    NONSENSE = "NS"
    LABEL_CHOICES = (
        (YESRELATION, "Yes, relation is present"),
        (NORELATION, "No relation present"),
        (NONSENSE, "Evidence is nonsense"),
        (SKIP, "Skipped labeling of this evidence"),
    )
    NEED_RELABEL = (
        # Evidence labels meaning that it would be good to ask again.
        # Note the trailing comma: without it this would be a plain string,
        # and membership tests would silently do substring matching.
        SKIP,
    )
    evidence_candidate = models.ForeignKey(
        'EvidenceCandidate',
        related_name='labels'
    )
    label = models.CharField(
        max_length=2, choices=LABEL_CHOICES,
        default=SKIP, null=True, blank=False
    )
    relation = models.ForeignKey('Relation', related_name='relation_labels', null=True, blank=True)
    modification_date = models.DateTimeField(auto_now=True)

    # The judge field is meant to be the username of the person who decides
    # the label of this evidence. It's not modelled as a foreign key to allow
    # easier interaction with non-django code.
    judge = models.CharField(max_length=CHAR_MAX_LENGHT)
    labeled_by_machine = models.BooleanField(default=True)

    class Meta(BaseModel.Meta):
        unique_together = ['evidence_candidate', 'label', 'judge', 'relation']

    def __str__(self):
        return "'{}' by '{}' in '{}'".format(
            self.modification_date,
            self.judge,
            self.evidence_candidate.id,
        )


class SegmentToTag(BaseModel):
    segment = models.ForeignKey("TextSegment")
    relation = models.ForeignKey("Relation")
    done = models.BooleanField(default=False)
    modification_date = models.DateTimeField(auto_now=True)

    class Meta(BaseModel.Meta):
        unique_together = ['segment', 'relation']


class GazetteItem(BaseModel):
    kind = models.ForeignKey(EntityKind)
    text = models.CharField(max_length=CHAR_MAX_LENGHT, blank=False, unique=True)
    from_freebase = models.CharField(max_length=CHAR_MAX_LENGHT, blank=False)

    def __str__(self):
        return "'{}' ({})".format(self.text, self.kind.name)