db.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311
  1. """
  2. IEPY DB Abstraction level.
  3. The goal of this module is to provide some thin abstraction between
  4. the chosen database engine and ORM and the IEPY core and tools.
  5. """
  6. from collections import defaultdict, namedtuple
  7. from functools import lru_cache
  8. from random import shuffle
  9. import logging
  10. import iepy
  11. iepy.setup()
  12. from iepy.data.models import (
  13. IEDocument, IEDocumentMetadata,
  14. TextSegment, Relation,
  15. Entity, EntityKind, EntityOccurrence,
  16. EvidenceLabel, EvidenceCandidate
  17. )
  18. from iepy.preprocess import segmenter
  19. from iepy.selfpreprocess.pipeline import PreProcessSteps
# NOTE(review): IEPYDBConnector appears unused within this module —
# presumably consumed by external callers as a (segments, documents)
# pair of managers; confirm before removing.
IEPYDBConnector = namedtuple('IEPYDBConnector', 'segments documents')

# Number of entities that will be cached on get_entity function.
ENTITY_CACHE_SIZE = 20  # reasonable compromise

logger = logging.getLogger(__name__)
  24. class DocumentManager(object):
  25. """Wrapper to the db-access, so it's not that impossible to switch
  26. from current ORM to something else if desired.
  27. """
  28. ### Basic administration and pre-process
  29. def __init__(self, base_queryset=None):
  30. self.base_queryset = base_queryset
  31. def create_document(self, identifier, text, metadata=None, update_mode=False):
  32. """Creates a new Document with text ready to be inserted on the
  33. information extraction pipeline (ie, ready to be tokenized, POS Tagged,
  34. etc).
  35. Identifier must be a unique value that will be used for distinguishing
  36. one document from another.
  37. Metadata is a dictionary where you can put whatever you want to persist
  38. with your document. IEPY will do nothing with it except ensuring that
  39. such information will be preserved.
  40. With update_mode enabled, then if there's an existent document with the
  41. provided identifier, it's updated (be warn that if some preprocess
  42. result exist will be preserved untouched, delegating the responsability
  43. of deciding what to do to the caller of this method).
  44. """
  45. if metadata is None:
  46. metadata = {}
  47. filter_query = IEDocument.objects.filter(human_identifier=identifier)
  48. if not filter_query.exists():
  49. mtd_obj = IEDocumentMetadata.objects.create(items=metadata)
  50. doc = IEDocument.objects.create(human_identifier=identifier, sourcetext=text,
  51. metadata=mtd_obj)
  52. else:
  53. doc = filter_query.get()
  54. if update_mode:
  55. doc.sourcetext = text
  56. doc.metadata.items = metadata
  57. doc.metadata.save()
  58. doc.save()
  59. return doc
  60. def _docs(self):
  61. if self.base_queryset:
  62. docs = self.base_queryset
  63. else:
  64. docs = IEDocument.objects.all()
  65. return docs
  66. def __iter__(self):
  67. return iter(self._docs())
  68. def get_raw_documents(self):
  69. """returns an interator of documents that lack the text field, or it's
  70. empty.
  71. """
  72. return self._docs().filter(text='')
  73. def get_documents_lacking_preprocess(self, step_or_steps):
  74. """Returns an iterator of documents that shall be processed on the given
  75. step."""
  76. from django.db.models import Q
  77. if not isinstance(step_or_steps, (list, tuple)):
  78. steps = [step_or_steps]
  79. else:
  80. steps = step_or_steps
  81. query = None
  82. for step in steps:
  83. if step in PreProcessSteps:
  84. flag_field_name = "%s_done_at" % step.name
  85. q = Q(**{"%s__isnull" % flag_field_name: True})
  86. if query is None:
  87. query = q
  88. else:
  89. query = query | q
  90. if query is not None:
  91. return self._docs().filter(query & Q(jump_signal=0)).order_by('id')
  92. else:
  93. return IEDocument.objects.none()
  94. class TextSegmentManager(object):
  95. @classmethod
  96. def get_segment(cls, document_identifier, offset):
  97. # FIXME: this is still mongo storage dependent
  98. d = IEDocument.objects.get(human_identifier=document_identifier)
  99. return TextSegment.objects.get(document=d, offset=offset)
  100. class EntityManager(object):
  101. @classmethod
  102. def ensure_kinds(cls, kind_names):
  103. for kn in kind_names:
  104. EntityKind.objects.get_or_create(name=kn)
  105. @classmethod
  106. @lru_cache(maxsize=ENTITY_CACHE_SIZE)
  107. def get_entity(cls, kind, literal):
  108. kw = {'key': literal}
  109. if isinstance(kind, int):
  110. kw['kind_id'] = kind
  111. else:
  112. kw['kind__name'] = kind
  113. return Entity.objects.get(**kw)
  114. class EntityOccurrenceManager:
  115. @classmethod
  116. def create_with_entity(cls, kind, document, offset, offset_end):
  117. entity, _ = Entity.objects.get_or_create(
  118. key="{} {} {} {}".format(
  119. document.human_identifier,
  120. kind, offset, offset_end
  121. ),
  122. kind=kind
  123. )
  124. entity_occurrence = EntityOccurrence(
  125. entity=entity,
  126. document=document,
  127. offset=offset,
  128. offset_end=offset_end,
  129. alias=" ".join(document.tokens[offset:offset_end]),
  130. )
  131. entity_occurrence.save()
  132. segmenter_runner = segmenter.SyntacticSegmenterRunner(override=True)
  133. segmenter_runner(document)
  134. class RelationManager(object):
  135. @classmethod
  136. def get_relation(cls, pk):
  137. return Relation.objects.get(pk=pk)
  138. @classmethod
  139. def dict_by_id(cls):
  140. return dict((r.pk, r) for r in Relation.objects.all())
  141. class CandidateEvidenceManager(object):
  142. @classmethod
  143. def hydrate(cls, ev, document=None):
  144. ev.evidence = ev.segment.hydrate(document)
  145. ev.right_entity_occurrence.hydrate_for_segment(ev.segment)
  146. ev.left_entity_occurrence.hydrate_for_segment(ev.segment)
  147. # contains a duplicate of left and right eo. Not big deal
  148. ev.all_eos = ev.segment.get_entity_occurrences()
  149. return ev
  150. @classmethod
  151. def candidates_for_relation(cls, relation, construct_missing_candidates=True,
  152. seg_limit=-1, shuffle_segs=False):
  153. # Wraps the actual database lookup of evidence, hydrating them so
  154. # in theory, no extra db access shall be done
  155. # The idea here is simple, but with some tricks for improving performance
  156. logger.info("Loading candidate evidence from database...")
  157. hydrate = cls.hydrate
  158. segments_per_document = defaultdict(list)
  159. raw_segments = {}
  160. segments = relation._matching_text_segments()
  161. if shuffle_segs:
  162. segments = list(segments)
  163. shuffle(segments)
  164. for i, s in enumerate(segments):
  165. if seg_limit >= 0 and i >= seg_limit:
  166. break
  167. raw_segments[s.id] = s
  168. for s in raw_segments.values():
  169. segments_per_document[s.document_id].append(s)
  170. doc_ids = segments_per_document.keys()
  171. existent_ec = EvidenceCandidate.objects.filter(
  172. left_entity_occurrence__entity__kind=relation.left_entity_kind,
  173. right_entity_occurrence__entity__kind=relation.right_entity_kind,
  174. segment__in=raw_segments.keys()
  175. ).select_related(
  176. 'left_entity_occurrence', 'right_entity_occurrence', 'segment'
  177. )
  178. existent_ec_per_segment = defaultdict(list)
  179. for ec in existent_ec:
  180. existent_ec_per_segment[ec.segment_id].append(ec)
  181. _doc_ids = list(doc_ids)
  182. while _doc_ids:
  183. _id = _doc_ids.pop()
  184. document = IEDocument.objects.get(id=_id)
  185. for segment in segments_per_document[document.id]:
  186. _existent = existent_ec_per_segment[segment.pk]
  187. if construct_missing_candidates:
  188. seg_ecs = segment.get_evidences_for_relation(relation, _existent)
  189. else:
  190. seg_ecs = _existent
  191. for evidence in seg_ecs:
  192. yield hydrate(evidence, document)
  193. @classmethod
  194. def value_labeled_candidates_count_for_relation(cls, relation):
  195. """Returns the count of labels for the given relation that provide actual
  196. information/value: YES or NO"""
  197. labels = EvidenceLabel.objects.filter(
  198. relation=relation,
  199. label__in=[EvidenceLabel.NORELATION, EvidenceLabel.YESRELATION],
  200. labeled_by_machine=False
  201. )
  202. return labels.count()
  203. @classmethod
  204. def labels_for(cls, relation, evidences, conflict_solver=None):
  205. """Returns a dict with the form evidence->[True|False|None]"""
  206. # Given a relation and a sequence of candidate-evidences, compute its
  207. # labels
  208. candidates = {e: None for e in evidences}
  209. logger.info("Getting labels from DB")
  210. labels = EvidenceLabel.objects.filter(
  211. relation=relation,
  212. label__in=[EvidenceLabel.NORELATION, EvidenceLabel.YESRELATION,
  213. EvidenceLabel.NONSENSE],
  214. labeled_by_machine=False
  215. )
  216. logger.info("Sorting labels them by evidence")
  217. labels_per_ev = defaultdict(list)
  218. for l in labels:
  219. labels_per_ev[l.evidence_candidate].append(l)
  220. logger.info("Labels conflict solving")
  221. for e in candidates:
  222. answers = labels_per_ev[e]
  223. if not answers:
  224. continue
  225. if len(answers) == 1:
  226. lbl = answers[0].label
  227. elif len(set([a.label for a in answers])) == 1:
  228. # several answers, all the same. Just pick the first one
  229. lbl = answers[0].label
  230. elif conflict_solver:
  231. preferred = conflict_solver(answers)
  232. if preferred is None:
  233. # unsolvable conflict
  234. continue
  235. lbl = preferred.label
  236. else:
  237. continue
  238. # Ok, we have a choosen answer. Lets see if it's informative
  239. if lbl == EvidenceLabel.NONSENSE:
  240. # too bad, not informative
  241. continue
  242. elif lbl == EvidenceLabel.NORELATION:
  243. candidates[e] = False
  244. elif lbl == EvidenceLabel.YESRELATION:
  245. candidates[e] = True
  246. return candidates
  247. @classmethod
  248. def conflict_resolution_by_judge_name(cls, judges_order):
  249. # Only consider answers for the given judges, prefering those of the judge listed
  250. # first. Returns None if not found.
  251. def solver(ev_labels):
  252. # expects to be called only when len(ev_labels) > 1
  253. ev_labels = [el for el in ev_labels if el.judge in judges_order]
  254. if ev_labels:
  255. ev_labels.sort(key=lambda el: judges_order.index(el.judge))
  256. return ev_labels[0]
  257. return None
  258. return solver
  259. @classmethod
  260. def conflict_resolution_newest_wins(cls, ev_labels):
  261. # expects to be called only when len(ev_labels) > 1
  262. return sorted(ev_labels[:], key=lambda el: el.modification_date, reverse=True)[0]