# This module is the nexus/connection between the UI definitions (django
# models) and the IEPY models. Modifications of this file should be done with
# awareness of this dual impact.
from datetime import datetime
import itertools
import logging
from collections import namedtuple, defaultdict
from operator import attrgetter

from django.db import models
import jsonfield

from iepy.utils import unzip
from iepy.webui.corpus.fields import ListField, ListSyntacticTreeField

CHAR_MAX_LENGHT = 256

logger = logging.getLogger(__name__)

RichToken = namedtuple("RichToken", "token lemma pos eo_ids eo_kinds offset")


class BaseModel(models.Model):
    class Meta:
        abstract = True
        app_label = 'corpus'  # Name of the django app.


class EntityKind(BaseModel):
    # There's a fixture declaring an initial set of Entity Kinds, containing
    # PERSON, LOCATION, and ORGANIZATION.
    name = models.CharField(max_length=CHAR_MAX_LENGHT, unique=True)

    class Meta(BaseModel.Meta):
        ordering = ['name']

    def __str__(self):
        return self.name


class Payroll(BaseModel):
    user = models.CharField(max_length=CHAR_MAX_LENGHT)
    begin_time = models.CharField(max_length=10)
    end_time = models.CharField(max_length=10)
    doc_count = models.IntegerField()
    t_count = models.IntegerField()
    r_count = models.IntegerField()
    wage = models.FloatField()
    _yield = models.FloatField(default=0)
    account = models.FloatField(default=0)

    class Meta(BaseModel.Meta):
        ordering = ['user']


class Entity(BaseModel):
    # The "key" IS the "canonical form". Aliases are stored on
    # Entity Occurrences.
    key = models.CharField(max_length=CHAR_MAX_LENGHT)
    kind = models.ForeignKey(EntityKind)
    gazette = models.ForeignKey(
        "GazetteItem", on_delete=models.SET_NULL,
        blank=True, null=True
    )

    class Meta(BaseModel.Meta):
        ordering = ['kind', 'key']
        unique_together = (('key', 'kind'), )

    def __str__(self):
        return '%s (%s)' % (self.key, self.kind.name)


class IEDocumentMetadata(BaseModel):
    title = models.CharField(max_length=CHAR_MAX_LENGHT, blank=True)
    url = models.URLField(blank=True)
    items = jsonfield.JSONField(blank=True)

    def __str__(self):
        try:
            doc_id = self.document.id
        except IEDocument.DoesNotExist:
            doc_id = 'None'
        return '<Metadata of IEDocument {0}>'.format(doc_id)


class LabeledIEDocumentMetadata(BaseModel):
    title = models.CharField(max_length=CHAR_MAX_LENGHT, blank=True)
    url = models.URLField(blank=True)
    items = jsonfield.JSONField(blank=True)

    def __str__(self):
        try:
            doc_id = self.document.id
        except LabeledIEDocument.DoesNotExist:
            doc_id = 'None'
        return '<Metadata of LabeledIEDocument {0}>'.format(doc_id)


class IEDocument(BaseModel):
    metadata = models.OneToOneField('IEDocumentMetadata', related_name='document',
                                    on_delete=models.PROTECT)
    human_identifier = models.CharField(
        max_length=CHAR_MAX_LENGHT,
        unique=True
    )
    sourcetext = models.TextField(null=True)
    edituser = models.TextField(null=True)
    edittime = models.DateTimeField(null=True, blank=True)
    reedittime = models.DateTimeField(null=True, blank=True)
    brat_done_at = models.DateTimeField(null=True, blank=True)
    text = models.TextField()
    creation_date = models.DateTimeField(auto_now_add=True)
    # The following 3 lists have 1 item per token.
    tokens = ListField(blank=True)  # strings
    lemmas = ListField(blank=True)  # strings
    postags = ListField(blank=True)  # strings
    offsets_to_text = ListField(blank=True)  # ints, character offset on the text for each token
    syntactic_sentences = ListSyntacticTreeField(blank=True, editable=False)
    sentences = ListField(blank=True)  # ints, token offsets of the sentence boundaries
    jump_signal = models.fields.IntegerField(default=0)
    # Reversed fields:
    # entity_occurrences = Reversed ForeignKey of EntityOccurrence
    # segments = Reversed ForeignKey of TextSegment
    # Metadata annotations computed while traversing the pre-process pipeline:
    tokenization_done_at = models.DateTimeField(null=True, blank=True)
    lemmatization_done_at = models.DateTimeField(null=True, blank=True)
    sentencer_done_at = models.DateTimeField(null=True, blank=True)
    tagging_done_at = models.DateTimeField(null=True, blank=True)
    ner_done_at = models.DateTimeField(null=True, blank=True)
    segmentation_done_at = models.DateTimeField(null=True, blank=True)
    syntactic_parsing_done_at = models.DateTimeField(null=True, blank=True)

    class Meta(BaseModel.Meta):
        ordering = ['id', ]

    def __str__(self):
        return '<IEDocument {0}>'.format(self.human_identifier)

    def get_sentences(self, enriched=False):
        """Iterates over the sentences, each sentence being a list of tokens
        (or of RichTokens, if enriched=True).
        """
        tokens = self.tokens
        lemmas = self.lemmas
        postags = self.postags
        sentences = self.sentences
        start = 0
        eos = list(self.get_entity_occurrences())
        tkn_offset = 0
        for end in sentences[1:]:
            if enriched:
                rich_tokens = []
                for token, lemma, postag in zip(
                        tokens[start:end], lemmas[start:end], postags[start:end]):
                    tkn_eos = [eo for eo in eos if eo.offset <= tkn_offset < eo.offset_end]
                    rich_tokens.append(RichToken(
                        token=token,
                        lemma=lemma,
                        pos=postag,
                        eo_ids=[eo.id for eo in tkn_eos],
                        eo_kinds=[eo.entity.kind for eo in tkn_eos],
                        offset=tkn_offset,
                    ))
                    tkn_offset += 1
                yield rich_tokens
            else:
                yield tokens[start:end]
            start = end

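    # Usage sketch (illustrative; assumes a preprocessed, saved document, and
    # the identifier here is made up):
    #     doc = IEDocument.objects.get(human_identifier='doc-1')
    #     for sentence in doc.get_sentences():
    #         print(' '.join(sentence))
    # With enriched=True each sentence is a list of RichTokens instead,
    # carrying lemma, POS tag and entity-occurrence info per token.
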
    def get_entity_occurrences(self):
        """Returns an iterable of EntityOccurrences, sorted by offset."""
        return self.entity_occurrences.all().order_by('offset')

    def get_text_segments(self):
        """Returns an iterable of TextSegments, sorted by offset."""
        return self.segments.all().order_by('offset')

    ### Methods used for preprocess ###

    def was_preprocess_step_done(self, step):
        return getattr(self, '%s_done_at' % step.name) is not None

    def set_tokenization_result(self, offsets, tokens):
        """Stores the tokenization result in the correspondent storage format."""
        if not isinstance(offsets, list):
            raise ValueError("Tokenization offsets should be a list of ints "
                             "(character offset of each token on the text).")
        if not isinstance(tokens, list):
            raise ValueError("Tokenization tokens should be a list of "
                             "token strings.")
        self.tokens = list(tokens)
        self.offsets_to_text = list(offsets)
        self.tokenization_done_at = datetime.now()
        return self

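    # Illustrative: the preprocess setters return ``self``, so pipeline steps
    # can chain and save. E.g. (offsets and text here are made up):
    #     doc.set_text('John runs').set_tokenization_result(
    #         [0, 5], ['John', 'runs']).save()
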
    def set_text(self, value):
        self.text = str(value)
        return self

    def set_lemmatization_result(self, value):
        if len(value) != len(self.tokens):
            raise ValueError(
                'Lemmatization result must have the same cardinality as tokens'
            )
        self.lemmas = list(value)
        self.lemmatization_done_at = datetime.now()
        return self

    def set_sentencer_result(self, value):
        if not isinstance(value, list):
            raise ValueError("Sentencer expected result should be a list.")
        if not all(isinstance(x, int) for x in value):
            raise ValueError('Sentencer result shall only contain ints: %r' % value)
        if sorted(value) != value:
            raise ValueError('Sentencer result shall be ordered.')
        if len(set(value)) < len(value):
            raise ValueError(
                'Sentencer result shall not contain duplicates.')
        if value[0] != 0:
            raise ValueError(
                'Sentencer result must start with 0. Actual=%r' % value[0])
        # Note: an earlier version checked ``value[-1] != len(self.tokens)`` here.
        if value[-1] != len(self.text):
            raise ValueError(
                'Sentencer result must end with the text length=%d. Actual=%r' % (
                    len(self.text), value[-1]))
        self.sentences = value
        self.sentencer_done_at = datetime.now()
        return self

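    # Illustrative: per the validations above, the boundary list must start at
    # 0 and end at len(self.text); e.g. for a 9-character, two-sentence text,
    # something like [0, 4, 9] would be accepted.
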
    def set_tagging_result(self, value):
        if len(value) != len(self.tokens):
            raise ValueError(
                'Tagging result must have the same cardinality as tokens')
        self.postags = value
        self.tagging_done_at = datetime.now()
        return self

    def set_syntactic_parsing_result(self, parsed_sentences):
        if len(parsed_sentences) != len(list(self.get_sentences())):
            raise ValueError(
                'Syntactic parsing must have the same cardinality as sentences'
            )
        self.syntactic_sentences = parsed_sentences
        self.syntactic_parsing_done_at = datetime.now()
        return self

    def set_ner_result(self, value):
        # Before anything else, do a basic offset validation.
        def feo_has_issues(feo):
            return (feo.offset < 0 or feo.offset >= feo.offset_end
                    or feo.offset > len(self.tokens))
        invalids = [x for x in value if feo_has_issues(x)]
        if invalids:
            raise ValueError('Invalid FoundEvidences: {}'.format(invalids))
        existents = defaultdict(list)
        eo_clash_key = lambda x: (x.offset, x.offset_end)
        for eo in self.entity_occurrences.all():
            existents[eo_clash_key(eo)].append(eo)
        # No issues; let's create them.
        for found_entity in value:
            key, kind_name, alias, offset, offset_end, from_gazette = found_entity
            if (offset, offset_end) in existents:
                # Skip if an occurrence on the same span comes from a gazette
                # or has the same kind.
                skip = False
                for existent in existents[offset, offset_end]:
                    is_from_gazette = existent.entity.gazette is not None
                    is_same_kind = existent.entity.kind.name == kind_name
                    if is_from_gazette or is_same_kind:
                        skip = True
                        break
                if skip:
                    continue
            kind, _ = EntityKind.objects.get_or_create(name=kind_name)
            if from_gazette:
                gazette_item = GazetteItem.objects.get(text=key, kind=kind)
                entity, created = Entity.objects.get_or_create(
                    key=key, kind=kind,
                    gazette=gazette_item
                )
            else:
                entity, created = Entity.objects.get_or_create(key=key, kind=kind)
            if len(alias) > CHAR_MAX_LENGHT:
                alias_ = alias[:CHAR_MAX_LENGHT]
                logger.warning('Alias "%s" reduced to "%s"', alias, alias_)
                alias = alias_
            obj, created = EntityOccurrence.objects.get_or_create(
                document=self,
                entity=entity,
                offset=offset,
                offset_end=offset_end,
                alias=alias
            )
            if created:
                existents[eo_clash_key(obj)].append(obj)
        self.ner_done_at = datetime.now()
        return self

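    # Illustrative: each item of ``value`` is expected to unpack as
    # (key, kind_name, alias, offset, offset_end, from_gazette), e.g.
    #     ('John Smith', 'PERSON', 'John', 10, 11, False)
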
    def set_segmentation_result(self, value, increment=True, override=False):
        if override:
            self.segments.all().delete()
            logger.info('Previous segments removed')
        get_offsets = attrgetter('offset', 'offset_end')
        value = sorted(value, key=get_offsets)
        logger.info('About to set %s segments for current doc', len(value))
        doc_ent_occurrences = list(self.entity_occurrences.all())
        currents = set(self.segments.all().values_list('offset', 'offset_end'))
        new_segs = []
        for raw_segment in value:
            if (raw_segment.offset, raw_segment.offset_end) in currents:
                continue
            _segm = TextSegment(
                document=self, offset=raw_segment.offset,
                offset_end=raw_segment.offset_end)
            new_segs.append((_segm, raw_segment))
        if new_segs:
            TextSegment.objects.bulk_create(list(zip(*new_segs))[0])
            logger.info('New %s segments created', len(new_segs))
        # And now, take care of setting the Entity Occurrences.
        doc_segments = dict((get_offsets(s), s) for s in self.segments.all())
        for _segm, raw_segment in new_segs:
            segm = doc_segments[get_offsets(_segm)]
            if raw_segment.entity_occurrences is None:
                # Entity Occurrences not provided; need to compute them.
                segm.entity_occurrences = [
                    eo for eo in doc_ent_occurrences
                    if eo.offset >= segm.offset
                    and eo.offset_end <= segm.offset_end
                ]
            else:
                segm.entity_occurrences = raw_segment.entity_occurrences
        self.segmentation_done_at = datetime.now()
        return self

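    # Illustrative: each raw segment must expose ``offset``, ``offset_end``
    # and ``entity_occurrences`` attributes; when ``entity_occurrences`` is
    # None, the occurrences fully contained in the span are attached
    # automatically, as computed above.

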
class LabeledIEDocument(BaseModel):
    metadata = models.OneToOneField('LabeledIEDocumentMetadata', related_name='document',
                                    on_delete=models.PROTECT)
    human_identifier = models.CharField(
        max_length=CHAR_MAX_LENGHT,
        unique=True
    )
    sourcetext = models.TextField(null=True)
    edituser = models.TextField(null=True)
    edittime = models.DateTimeField(null=True, blank=True)
    reedittime = models.DateTimeField(null=True, blank=True)
    brat_done_at = models.DateTimeField(null=True, blank=True)
    text = models.TextField()
    creation_date = models.DateTimeField(auto_now_add=True)
    # The following 3 lists have 1 item per token.
    tokens = ListField(blank=True)  # strings
    lemmas = ListField(blank=True)  # strings
    postags = ListField(blank=True)  # strings
    offsets_to_text = ListField(blank=True)  # ints, character offset on the text for each token
    syntactic_sentences = ListSyntacticTreeField(blank=True, editable=False)
    sentences = ListField(blank=True)  # ints, token offsets of the sentence boundaries
    jump_signal = models.fields.IntegerField(default=0)
    # Reversed fields:
    # entity_occurrences = Reversed ForeignKey of EntityOccurrence
    # segments = Reversed ForeignKey of TextSegment
    # Metadata annotations computed while traversing the pre-process pipeline:
    tokenization_done_at = models.DateTimeField(null=True, blank=True)
    lemmatization_done_at = models.DateTimeField(null=True, blank=True)
    sentencer_done_at = models.DateTimeField(null=True, blank=True)
    tagging_done_at = models.DateTimeField(null=True, blank=True)
    ner_done_at = models.DateTimeField(null=True, blank=True)
    segmentation_done_at = models.DateTimeField(null=True, blank=True)
    syntactic_parsing_done_at = models.DateTimeField(null=True, blank=True)

    class Meta(BaseModel.Meta):
        ordering = ['id', ]

    def __str__(self):
        return '<LabeledIEDocument {0}>'.format(self.human_identifier)

    def get_sentences(self, enriched=False):
        """Iterates over the sentences, each sentence being a list of tokens
        (or of RichTokens, if enriched=True).
        """
        tokens = self.tokens
        lemmas = self.lemmas
        postags = self.postags
        sentences = self.sentences
        start = 0
        eos = list(self.get_entity_occurrences())
        tkn_offset = 0
        for end in sentences[1:]:
            if enriched:
                rich_tokens = []
                for token, lemma, postag in zip(
                        tokens[start:end], lemmas[start:end], postags[start:end]):
                    tkn_eos = [eo for eo in eos if eo.offset <= tkn_offset < eo.offset_end]
                    rich_tokens.append(RichToken(
                        token=token,
                        lemma=lemma,
                        pos=postag,
                        eo_ids=[eo.id for eo in tkn_eos],
                        eo_kinds=[eo.entity.kind for eo in tkn_eos],
                        offset=tkn_offset,
                    ))
                    tkn_offset += 1
                yield rich_tokens
            else:
                yield tokens[start:end]
            start = end

    def get_entity_occurrences(self):
        """Returns an iterable of EntityOccurrences, sorted by offset."""
        return self.entity_occurrences.all().order_by('offset')

    def get_text_segments(self):
        """Returns an iterable of TextSegments, sorted by offset."""
        return self.segments.all().order_by('offset')

    ### Methods used for preprocess ###

    def was_preprocess_step_done(self, step):
        return getattr(self, '%s_done_at' % step.name) is not None

    def set_tokenization_result(self, offsets, tokens):
        """Stores the tokenization result in the correspondent storage format."""
        if not isinstance(offsets, list):
            raise ValueError("Tokenization offsets should be a list of ints "
                             "(character offset of each token on the text).")
        if not isinstance(tokens, list):
            raise ValueError("Tokenization tokens should be a list of "
                             "token strings.")
        self.tokens = list(tokens)
        self.offsets_to_text = list(offsets)
        self.tokenization_done_at = datetime.now()
        return self

    def set_text(self, value):
        self.text = str(value)
        return self

    def set_lemmatization_result(self, value):
        if len(value) != len(self.tokens):
            raise ValueError(
                'Lemmatization result must have the same cardinality as tokens'
            )
        self.lemmas = list(value)
        self.lemmatization_done_at = datetime.now()
        return self

    def set_sentencer_result(self, value):
        if not isinstance(value, list):
            raise ValueError("Sentencer expected result should be a list.")
        if not all(isinstance(x, int) for x in value):
            raise ValueError('Sentencer result shall only contain ints: %r' % value)
        if sorted(value) != value:
            raise ValueError('Sentencer result shall be ordered.')
        if len(set(value)) < len(value):
            raise ValueError(
                'Sentencer result shall not contain duplicates.')
        if value[0] != 0:
            raise ValueError(
                'Sentencer result must start with 0. Actual=%r' % value[0])
        # Note: an earlier version checked ``value[-1] != len(self.tokens)`` here.
        if value[-1] != len(self.text):
            raise ValueError(
                'Sentencer result must end with the text length=%d. Actual=%r' % (
                    len(self.text), value[-1]))
        self.sentences = value
        self.sentencer_done_at = datetime.now()
        return self

    def set_tagging_result(self, value):
        if len(value) != len(self.tokens):
            raise ValueError(
                'Tagging result must have the same cardinality as tokens')
        self.postags = value
        self.tagging_done_at = datetime.now()
        return self

    def set_syntactic_parsing_result(self, parsed_sentences):
        if len(parsed_sentences) != len(list(self.get_sentences())):
            raise ValueError(
                'Syntactic parsing must have the same cardinality as sentences'
            )
        self.syntactic_sentences = parsed_sentences
        self.syntactic_parsing_done_at = datetime.now()
        return self

    def set_ner_result(self, value):
        # Before anything else, do a basic offset validation.
        def feo_has_issues(feo):
            return (feo.offset < 0 or feo.offset >= feo.offset_end
                    or feo.offset > len(self.tokens))
        invalids = [x for x in value if feo_has_issues(x)]
        if invalids:
            raise ValueError('Invalid FoundEvidences: {}'.format(invalids))
        existents = defaultdict(list)
        eo_clash_key = lambda x: (x.offset, x.offset_end)
        for eo in self.entity_occurrences.all():
            existents[eo_clash_key(eo)].append(eo)
        # No issues; let's create them.
        for found_entity in value:
            key, kind_name, alias, offset, offset_end, from_gazette = found_entity
            if (offset, offset_end) in existents:
                # Skip if an occurrence on the same span comes from a gazette
                # or has the same kind.
                skip = False
                for existent in existents[offset, offset_end]:
                    is_from_gazette = existent.entity.gazette is not None
                    is_same_kind = existent.entity.kind.name == kind_name
                    if is_from_gazette or is_same_kind:
                        skip = True
                        break
                if skip:
                    continue
            kind, _ = EntityKind.objects.get_or_create(name=kind_name)
            if from_gazette:
                gazette_item = GazetteItem.objects.get(text=key, kind=kind)
                entity, created = Entity.objects.get_or_create(
                    key=key, kind=kind,
                    gazette=gazette_item
                )
            else:
                entity, created = Entity.objects.get_or_create(key=key, kind=kind)
            if len(alias) > CHAR_MAX_LENGHT:
                alias_ = alias[:CHAR_MAX_LENGHT]
                logger.warning('Alias "%s" reduced to "%s"', alias, alias_)
                alias = alias_
            obj, created = EntityOccurrence.objects.get_or_create(
                document=self,
                entity=entity,
                offset=offset,
                offset_end=offset_end,
                alias=alias
            )
            if created:
                existents[eo_clash_key(obj)].append(obj)
        self.ner_done_at = datetime.now()
        return self

    def set_segmentation_result(self, value, increment=True, override=False):
        if override:
            self.segments.all().delete()
            logger.info('Previous segments removed')
        get_offsets = attrgetter('offset', 'offset_end')
        value = sorted(value, key=get_offsets)
        logger.info('About to set %s segments for current doc', len(value))
        doc_ent_occurrences = list(self.entity_occurrences.all())
        currents = set(self.segments.all().values_list('offset', 'offset_end'))
        new_segs = []
        for raw_segment in value:
            if (raw_segment.offset, raw_segment.offset_end) in currents:
                continue
            _segm = TextSegment(
                document=self, offset=raw_segment.offset,
                offset_end=raw_segment.offset_end)
            new_segs.append((_segm, raw_segment))
        if new_segs:
            TextSegment.objects.bulk_create(list(zip(*new_segs))[0])
            logger.info('New %s segments created', len(new_segs))
        # And now, take care of setting the Entity Occurrences.
        doc_segments = dict((get_offsets(s), s) for s in self.segments.all())
        for _segm, raw_segment in new_segs:
            segm = doc_segments[get_offsets(_segm)]
            if raw_segment.entity_occurrences is None:
                # Entity Occurrences not provided; need to compute them.
                segm.entity_occurrences = [
                    eo for eo in doc_ent_occurrences
                    if eo.offset >= segm.offset
                    and eo.offset_end <= segm.offset_end
                ]
            else:
                segm.entity_occurrences = raw_segment.entity_occurrences
        self.segmentation_done_at = datetime.now()
        return self


class EntityOccurrence(BaseModel):
    """Models the occurrence of a particular Entity on a Document."""
    entity = models.ForeignKey('Entity')
    document = models.ForeignKey('IEDocument', related_name='entity_occurrences')
    segments = models.ManyToManyField('TextSegment', related_name='entity_occurrences')
    # Offsets in tokens, relative to the document.
    offset = models.IntegerField()  # offset of the 1st token included on the occurrence
    offset_end = models.IntegerField()  # offset of the 1st token NOT included
    # Hydrated fields: same as the offsets above, but relative to the segment:
    # segment_offset = IntegerField
    # segment_offset_end = IntegerField
    # Text of the occurrence; if it differs from the canonical form, it's easy to see.
    alias = models.CharField(max_length=CHAR_MAX_LENGHT)
    anaphora = models.BooleanField(default=False)  # Is it a Named Entity or an anaphora?

    class Meta(BaseModel.Meta):
        ordering = ['document', 'offset', 'offset_end']
        unique_together = ['entity', 'document', 'offset', 'offset_end']

    def __str__(self):
        return '{0} ({1}, {2})'.format(self.entity.key, self.offset, self.offset_end)

    def hydrate_for_segment(self, segment):
        # Creates some on-memory attributes relative to the segment.
        self.segment_offset = self.offset - segment.offset
        self.segment_offset_end = self.offset_end - segment.offset
        return self

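    # Illustrative: for an occurrence spanning tokens (12, 14), hydrated
    # against a segment spanning tokens (10, 20), the on-memory values become
    # segment_offset == 2 and segment_offset_end == 4.

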
class TextSegment(BaseModel):
    document = models.ForeignKey('IEDocument', related_name='segments', db_index=True)
    # Offsets in tokens, relative to the document. They represent:
    # - offset: index of the first token included on the segment
    # - offset_end: index of the first token NOT included on the segment
    offset = models.IntegerField(db_index=True)
    offset_end = models.IntegerField(db_index=True)
    # Reversed fields:
    # entity_occurrences = Reversed ManyToManyField of EntityOccurrence

    class Meta(BaseModel.Meta):
        ordering = ['document', 'offset', 'offset_end']
        unique_together = ['document', 'offset', 'offset_end']

    def __str__(self):
        # return u'{0}'.format(' '.join(self.tokens))  # TODO: no tokens
        return u'({0} {1})'.format(self.offset, self.offset_end)

    def hydrate(self, document_on_ram=None):
        # Using the segment offsets and the data on the document itself,
        # constructs on-memory attributes for the segment.
        # If "document_on_ram" is provided, it's used instead of querying the DB.
        if getattr(self, '_hydrated', False):
            return self
        if document_on_ram is not None:
            assert document_on_ram.pk == self.document_id
            doc = document_on_ram
        else:
            doc = self.document
        self.tokens = doc.tokens[self.offset: self.offset_end]
        self.lemmas = doc.lemmas[self.offset: self.offset_end]
        self.postags = doc.postags[self.offset: self.offset_end]
        self.offsets_to_text = doc.offsets_to_text[self.offset: self.offset_end]
        if self.offsets_to_text:
            # Grab the text up to (but excluding) the last token...
            self.text = doc.text[self.offsets_to_text[0]:
                                 doc.offsets_to_text[self.offset_end - 1]]
            # ...and now append the "pure" last token.
            self.text += self.tokens[-1]
        else:
            self.text = ""
        self.sentences = [i - self.offset for i in doc.sentences
                          if i >= self.offset and i < self.offset_end]
        self.syntactic_sentences = [doc.syntactic_sentences[s] for s in self.sentences]
        self._hydrated = True
        return self

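    # Usage sketch (illustrative): when hydrating many segments of the same
    # document, pass the already-loaded document to avoid one DB query each:
    #     doc = some_segment.document
    #     for segm in doc.get_text_segments():
    #         segm.hydrate(document_on_ram=doc)
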
    def get_entity_occurrences(self):
        """Returns an iterable of EntityOccurrences, sorted by offset."""
        eos = getattr(self, '_hydrated_eos', None)
        if eos is None:
            eos = [eo.hydrate_for_segment(self) for eo in
                   self.entity_occurrences.all().order_by('offset')]
            self._hydrated_eos = eos
        return eos

    def get_evidences_for_relation(self, relation, existent=None):
        # Gets or creates Labeled Evidences (when creating, the label is empty).
        lkind = relation.left_entity_kind
        rkind = relation.right_entity_kind
        # For performance's sake, first grab all existing candidates; any
        # missing ones are created afterwards.
        if existent is None:
            existent = EvidenceCandidate.objects.filter(segment=self, labels__relation=relation)
        existent = existent.select_related(
            'left_entity_occurrence', 'right_entity_occurrence')
        existent = {
            (ec.left_entity_occurrence_id, ec.right_entity_occurrence_id): ec
            for ec in existent
        }  # dict of existent evidence-candidates, indexed by left and right EO ids
        for l_eo, r_eo in self.kind_occurrence_pairs(lkind, rkind):
            if (l_eo.pk, r_eo.pk) in existent:
                yield existent[(l_eo.pk, r_eo.pk)]
                continue
            obj, created = EvidenceCandidate.objects.get_or_create(
                left_entity_occurrence=l_eo,
                right_entity_occurrence=r_eo,
                segment=self,
            )
            yield obj

    def entity_occurrence_pairs(self, e1, e2):
        eos = list(self.get_entity_occurrences())
        left = [eo for eo in eos if eo.entity == e1]
        right = [eo for eo in eos if eo.entity == e2]
        return [(l, r) for l, r in itertools.product(left, right) if l != r]

    def kind_occurrence_pairs(self, lkind, rkind):
        eos = list(self.get_entity_occurrences())
        left = [o for o in eos if o.entity.kind == lkind]
        right = [o for o in eos if o.entity.kind == rkind]
        return [(l, r) for l, r in itertools.product(left, right) if l != r]

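    # Illustrative: with PERSON occurrences [p1, p2] in this segment,
    # kind_occurrence_pairs(PERSON, PERSON) yields (p1, p2) and (p2, p1);
    # itertools.product pairs them both ways, and ``l != r`` drops the
    # self-pairs.
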
    def get_enriched_tokens(self):
        translation_dict = {'-LRB-': '(',
                            '-RRB-': ')'}
        eos = list(self.get_entity_occurrences())
        for tkn_offset, (tkn, lemma, postag) in enumerate(
                zip(self.tokens, self.lemmas, self.postags)):
            tkn_eos = [eo for eo in eos
                       if eo.segment_offset <= tkn_offset < eo.segment_offset_end]
            yield RichToken(
                token=translation_dict.get(tkn, tkn),
                lemma=lemma,
                pos=postag,
                eo_ids=[eo.id for eo in tkn_eos],
                eo_kinds=[eo.entity.kind for eo in tkn_eos],
                offset=self.offset + tkn_offset,
            )

    @classmethod
    def filter_by_entity_occurrence_kind_pair(cls, kind_a, kind_b):
        """Returns a queryset of TextSegments having at least one Entity
        Occurrence of the left entity kind, and at least one Entity Occurrence
        of the right entity kind. If left and right kinds are the same, at
        least two occurrences are expected."""
        # This may be implemented as a Manager method, but for simplicity it
        # will be put in here as a classmethod.
        matching_segms = TextSegment.objects.filter(
            entity_occurrences__entity__kind=kind_a).distinct()
        if kind_a == kind_b:
            # BE CAREFUL!!! There is a very subtle detail here. After the
            # first filter above, the Django ORM gives us <TextSegments> whose
            # "entity_occurrences" are not all of them, but only those that
            # match the criteria expressed above. Because of that, when
            # annotating a Count over them, we can trust it's counting
            # EOccurrences of the kind we are interested in, and not the others.
            matching_segms = matching_segms.annotate(
                kind_count=models.Count('entity_occurrences__entity__kind')).filter(
                    kind_count__gte=2
            )
        else:
            matching_segms = matching_segms.filter(
                entity_occurrences__entity__kind=kind_b,
            ).distinct()
        return matching_segms

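    # Usage sketch (illustrative; assumes PERSON and LOCATION kinds exist):
    #     person = EntityKind.objects.get(name='PERSON')
    #     location = EntityKind.objects.get(name='LOCATION')
    #     segments = TextSegment.filter_by_entity_occurrence_kind_pair(
    #         person, location)

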
class Relation(BaseModel):
    name = models.CharField(max_length=CHAR_MAX_LENGHT)
    left_entity_kind = models.ForeignKey('EntityKind', related_name='left_relations')
    right_entity_kind = models.ForeignKey('EntityKind', related_name='right_relations')
    # Reversed fields:
    # evidence_relations = Reversed ForeignKey of EvidenceCandidate

    class Meta(BaseModel.Meta):
        ordering = ['name', 'left_entity_kind', 'right_entity_kind']
        unique_together = ['name', 'left_entity_kind', 'right_entity_kind']

    def __str__(self):
        return '{}({}, {})'.format(self.name, self.left_entity_kind,
                                   self.right_entity_kind)

    def save(self, *args, **kwargs):
        if self.pk:
            # Object already exists; this is a modification.
            original_obj = Relation.objects.get(pk=self.pk)
            for fname in ['left_entity_kind', 'right_entity_kind']:
                if getattr(original_obj, fname) != getattr(self, fname):
                    raise ValueError("Relation kinds can't be modified after creation")
        return super(Relation, self).save(*args, **kwargs)

    def _matching_text_segments(self):
        return TextSegment.filter_by_entity_occurrence_kind_pair(
            self.right_entity_kind, self.left_entity_kind)

    def labeled_neighbor(self, obj, judge, back=False):
        """Returns the id of the "closest" labeled object to the one provided.
        Notes:
            - By "closest" we mean closeness of the id numbers.
            - Works both for TextSegment and for IEDocument.
            - If back is True, the previous item is picked; otherwise, the next one.
            - It's assumed that the provided obj HAS labeled evidence already.
              If not, it's not possible to determine what is next; in such a
              case, the id of the last labeled object will be returned.
            - If asking "next" and obj is currently the last, its id will be returned.
            - If asking "prev" and obj is currently the first, its id will be returned.
        """
        filters = dict(
            judge__isnull=False,
            label__isnull=False,
            relation=self,
        )
        if judge is not None:
            filters["judge"] = judge
        judge_labels = EvidenceLabel.objects.filter(**filters)
        if isinstance(obj, TextSegment):
            segments = self._matching_text_segments()
            segments = segments.filter(evidence_relations__labels__relation=self)
            candidates_with_label = judge_labels.values_list("evidence_candidate__segment", flat=True)
            segments = segments.filter(id__in=candidates_with_label).distinct()
            ids = list(segments.values_list('id', flat=True).order_by('id'))
        elif isinstance(obj, IEDocument):
            ids = sorted(set(judge_labels.values_list(
                'evidence_candidate__segment__document_id', flat=True)
            ))
        else:
            ids = []
        if not ids:
            return None
        try:
            base_idx = ids.index(obj.id)
        except ValueError:
            # The provided base-object is not listed; return the last in list.
            return ids[-1]
        else:
            if back:
                if base_idx == 0:
                    # There is no previous one. Returning same.
                    return obj.id
                else:
                    return ids[base_idx - 1]
            else:
                if base_idx == len(ids) - 1:
                    # There is no next one. Returning same.
                    return obj.id
                else:
                    return ids[base_idx + 1]

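    # Illustrative: with labeled ids [3, 7, 9] and obj.id == 7, this returns
    # 9 (next) or, with back=True, 3; at either end it returns obj.id itself.
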
    def get_next_segment_to_label(self, judge):
        # We'll first pick those Segments that already have created questions
        # with an empty answer (label=None). After finishing those, we'll look
        # for Segments never considered (ie, that don't have any question
        # created). Finally, those with answers in place, but where some
        # answers are "ask me later".
        segments = self._matching_text_segments().order_by('id')
        never_considered_segm = segments.exclude(evidence_relations__labels__relation=self)
        evidences = EvidenceCandidate.objects.filter(
            labels__relation=self
        ).order_by('segment_id')
        never_considered_ev = evidences.filter(labels__isnull=True)
        existent_labels = EvidenceLabel.objects.filter(
            evidence_candidate__in=evidences,
            labeled_by_machine=False
        ).order_by('evidence_candidate__segment_id')
        none_labels = existent_labels.filter(label__isnull=True)
        own_none_labels = none_labels.filter(judge=judge)
        # Requires re-answering if there's no good answer at all (not just for this judge).
        NOT_NEED_RELABEL = [k for k, name in EvidenceLabel.LABEL_CHOICES
                            if k not in EvidenceLabel.NEED_RELABEL]
        to_re_answer = evidences.exclude(labels__label__in=NOT_NEED_RELABEL)
        for qset in [own_none_labels, never_considered_ev, never_considered_segm,
                     to_re_answer, none_labels]:
            try:
                obj = qset[0]
            except IndexError:
                pass
            else:
                if isinstance(obj, TextSegment):
                    return obj
                elif isinstance(obj, EvidenceCandidate):
                    return obj.segment
                elif isinstance(obj, EvidenceLabel):
                    return obj.evidence_candidate.segment
                else:
                    raise ValueError
        return None

    def get_next_document_to_label(self, judge):
        next_segment = self.get_next_segment_to_label(judge)
        if next_segment is None:
            return None
        else:
            return next_segment.document


class EvidenceCandidate(BaseModel):
    left_entity_occurrence = models.ForeignKey(
        'EntityOccurrence',
        related_name='left_evidence_relations'
    )
    right_entity_occurrence = models.ForeignKey(
        'EntityOccurrence',
        related_name='right_evidence_relations'
    )
    segment = models.ForeignKey('TextSegment', related_name='evidence_relations')

    class Meta(BaseModel.Meta):
        ordering = [
            'left_entity_occurrence', 'right_entity_occurrence',
            'segment_id',
        ]
        unique_together = [
            'left_entity_occurrence', 'right_entity_occurrence',
            'segment',
        ]

    def __str__(self):
        return "Candidate evidence (id {})".format(self.pk)

    def get_or_create_label_for_judge(self, relation, judge):
        obj, created = EvidenceLabel.objects.get_or_create(
            relation=relation,
            evidence_candidate=self, judge=judge,
            labeled_by_machine=False, defaults={'label': None})
        return obj

    def set_label(self, relation, label, judge, labeled_by_machine=False):
        evidence_label, created = EvidenceLabel.objects.get_or_create(
            relation=relation,
            evidence_candidate=self,
            judge=judge,
            labeled_by_machine=labeled_by_machine
        )
        evidence_label.label = label
        evidence_label.save()

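    # Usage sketch (illustrative; the judge name is made up):
    #     candidate.set_label(relation, EvidenceLabel.YESRELATION, judge='maria')
    # This reuses the EvidenceLabel for that (relation, judge) pair if one
    # exists, or creates it, and then stores the given label.

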
class EvidenceLabel(BaseModel):
    NORELATION = "NO"
    YESRELATION = "YE"
    SKIP = "SK"
    NONSENSE = "NS"
    LABEL_CHOICES = (
        (YESRELATION, "Yes, relation is present"),
        (NORELATION, "No relation present"),
        (NONSENSE, "Evidence is nonsense"),
        (SKIP, "Skipped labeling of this evidence"),
    )
    NEED_RELABEL = (
        # Evidence labels for which it would be good to ask again.
        SKIP,
    )
    evidence_candidate = models.ForeignKey(
        'EvidenceCandidate',
        related_name='labels'
    )
    label = models.CharField(
        max_length=2, choices=LABEL_CHOICES,
        default=SKIP, null=True, blank=False
    )
    relation = models.ForeignKey('Relation', related_name='relation_labels', null=True, blank=True)
    modification_date = models.DateTimeField(auto_now=True)
    # The judge field is meant to be the username of the person that decides
    # the label of this evidence. It's not modelled as a foreign key to allow
    # easier interaction with non-django code.
    judge = models.CharField(max_length=CHAR_MAX_LENGHT)
    labeled_by_machine = models.BooleanField(default=True)

    class Meta(BaseModel.Meta):
        unique_together = ['evidence_candidate', 'label', 'judge', 'relation']

    def __str__(self):
        s = "'{}' by '{}' in '{}'"
        return s.format(
            self.modification_date,
            self.judge,
            self.evidence_candidate.id,
        )


class SegmentToTag(BaseModel):
    segment = models.ForeignKey("TextSegment")
    relation = models.ForeignKey("Relation")
    done = models.BooleanField(default=False)
    modification_date = models.DateTimeField(auto_now=True)

    class Meta(BaseModel.Meta):
        unique_together = ['segment', 'relation']


class GazetteItem(BaseModel):
    kind = models.ForeignKey(EntityKind)
    text = models.CharField(max_length=CHAR_MAX_LENGHT, blank=False, unique=True)
    from_freebase = models.CharField(max_length=CHAR_MAX_LENGHT, blank=False)

    def __str__(self):
        return "'{}' ({})".format(self.text, self.kind.name)