# literal.py — trivial literal-match Named Entity Recognition for IEPY.
  1. import codecs
  2. from iepy.preprocess.ner.base import BaseNERRunner
  3. class LiteralNER(object):
  4. """Trivial Named Entity Recognizer that tags exact matches.
  5. """
  6. def __init__(self, labels, src_filenames):
  7. """The i-th label is used to tag the occurrences of the terms in the
  8. i-th source file. If a term can have several labels, the last one in
  9. the list is selected.
  10. """
  11. assert len(labels) == len(src_filenames)
  12. self.labels = labels
  13. self.src_filenames = src_filenames
  14. names = set()
  15. names_map = {}
  16. for label, filename in zip(labels, src_filenames):
  17. f = codecs.open(filename, encoding="utf8")
  18. namelist = f.read().strip().split('\n')
  19. names.update(namelist)
  20. for name in namelist:
  21. names_map[name] = label
  22. self.names = frozenset(names)
  23. self.names_map = names_map
  24. # compute prefix closure
  25. prefixes = set()
  26. for name in self.names:
  27. sname = name.split()
  28. prefixes.update([' '.join(sname[:i]) for i in range(1, len(sname) + 1)])
  29. self.prefixes = frozenset(prefixes)
  30. def tag(self, sent):
  31. """Tagger with output a la Stanford (no start/end markers).
  32. """
  33. entities = self.entities(sent)
  34. # dummy entity for nicer code:
  35. entities.append(((len(sent), len(sent)), 'X'))
  36. next_entity = entities.pop(0)
  37. result = []
  38. for i, t in enumerate(sent):
  39. if i >= next_entity[0][1]:
  40. # assert entities
  41. next_entity = entities.pop(0)
  42. if i < next_entity[0][0]:
  43. result.append((t, 'O'))
  44. elif i < next_entity[0][1]:
  45. result.append((t, next_entity[1]))
  46. return result
  47. def entities(self, sent):
  48. """Return entities as a list of pairs ((offset, offset_end), label).
  49. """
  50. result = []
  51. i = 0
  52. while i < len(sent):
  53. j = i + 1
  54. prev_segment = segment = ' '.join(sent[i:j])
  55. while segment in self.prefixes and j <= len(sent):
  56. j += 1
  57. prev_segment = segment
  58. segment = ' '.join(sent[i:j])
  59. if prev_segment in self.names:
  60. label = self.names_map[prev_segment]
  61. result.append(((i, j - 1), label))
  62. i = j - 1
  63. else:
  64. i += 1
  65. return result
  66. class LiteralNERRunner(BaseNERRunner):
  67. def __init__(self, labels, src_filenames, override=False):
  68. super(LiteralNERRunner, self).__init__(override=override)
  69. self.lit_tagger = LiteralNER(labels, src_filenames)
  70. def run_ner(self, doc):
  71. entities = []
  72. sent_offset = 0
  73. for sent in doc.get_sentences():
  74. sent_entities = self.lit_tagger.entities(sent)
  75. for ((i, j), label) in sent_entities:
  76. name = ' '.join(sent[i:j])
  77. kind = label.lower() # XXX: should be in models.ENTITY_KINDS
  78. entities.append(
  79. self.build_occurrence(
  80. key=name,
  81. kind_name=kind,
  82. alias=name,
  83. offset=sent_offset + i,
  84. offset_end=sent_offset + j)
  85. )
  86. sent_offset += len(sent)
  87. return entities
  88. def to_lower_normalizer(name):
  89. """Utility normalizer that converts a name to lowercase unless it is an
  90. acronym. To be used as parameter of download_freebase_type().
  91. """
  92. words = name.split()
  93. result = []
  94. for w in words:
  95. if not w.isupper():
  96. w = w.lower()
  97. result.append(w)
  98. return ' '.join(result)