lexical.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. #!/usr/bin/env python
  2. # -*-coding:utf-8-*-
  3. import sys
  4. import os
  5. import json
  6. from BiddingKG.dl.foolnltk.selffool.predictor import Predictor
  7. from BiddingKG.dl.common.Utils import *
  8. from zipfile import ZipFile
# Sentinel string for out-of-vocabulary tokens (exported for consumers of this
# module; not referenced elsewhere in this file).
OOV_STR = "<OOV>"
# Directory holding the packaged model artifacts (map.zip, *.pb graphs),
# resolved relative to this module's location.
data_path = os.path.dirname(__file__)+"/../data"
  11. def _load_map_file(path, char_map_name, id_map_name):
  12. with ZipFile(path) as myzip:
  13. with myzip.open('all_map.json') as myfile:
  14. content = myfile.readline()
  15. content = content.decode()
  16. data = json.loads(content)
  17. return data.get(char_map_name), data.get(id_map_name)
  18. class LexicalAnalyzer(object):
  19. def __init__(self):
  20. self.initialized = False
  21. self.map = None
  22. self.seg_model = None
  23. self.pos_model = None
  24. self.ner_model = None
  25. self.data_path = data_path
  26. self.map_file_path = os.path.join(self.data_path, "map.zip")
  27. def _load_model(self, model_namel, word_map_name, tag_name,url,authorization):
  28. seg_model_path = os.path.join(self.data_path, model_namel)
  29. char_to_id, id_to_seg = _load_map_file(self.map_file_path, word_map_name, tag_name)
  30. return Predictor(seg_model_path, char_to_id, id_to_seg,url,authorization)
  31. def _load_seg_model(self,url=None,authorization=None):
  32. self.seg_model = self._load_model("seg.pb", "char_map", "seg_map",url,authorization)
  33. def _load_pos_model(self,url=None,authorization=None):
  34. self.pos_model = self._load_model("pos.pb", "word_map", "pos_map",url,authorization)
  35. def _load_ner_model(self,url=None,authorization=None):
  36. self.ner_model = self._load_model("ner.pb", "char_map", "ner_map",url,authorization)
  37. def pos(self, seg_words_list):
  38. if not self.pos_model:
  39. self._load_pos_model()
  40. pos_labels = self.pos_model.predict(seg_words_list)
  41. return pos_labels
  42. def ner_labels(self, text_list,tokens):
  43. if not self.ner_model:
  44. self._load_ner_model()
  45. assert len(text_list)==len(tokens)
  46. ner_labels = self.ner_model.predict(text_list)
  47. out = []
  48. for index in range(len(ner_labels)):
  49. ner = ner_labels[index]
  50. token = tokens[index]
  51. out_item = []
  52. i = -1
  53. for item in token:
  54. i += len(item)
  55. if ner[i]=="O":
  56. out_item.append("O")
  57. else:
  58. out_item.append(ner[i].split("_")[1])
  59. out.append(out_item)
  60. return out
  61. def ner(self, text_list):
  62. if not self.ner_model:
  63. self._load_ner_model()
  64. ner_labels = self.ner_model.predict(text_list)
  65. #print(ner_labels)
  66. all_entitys = []
  67. for ti, text in enumerate(text_list):
  68. ens = []
  69. entity = ""
  70. i = 0
  71. ner_label = ner_labels[ti]
  72. chars = list(text)
  73. for label, word in zip(ner_label, chars):
  74. i += 1
  75. if label == "O":
  76. continue
  77. lt = label.split("_")[1]
  78. lb = label.split("_")[0]
  79. if lb == "S":
  80. ens.append((i-1, i, lt, word))
  81. elif lb == "B":
  82. entity = ""
  83. entity += word
  84. elif lb == "M":
  85. entity += word
  86. elif lb == "E":
  87. entity += word
  88. ens.append((i - len(entity), i, lt, entity))
  89. entity = ""
  90. if entity:
  91. ens.append((i - len(entity), i, lt, entity))
  92. all_entitys.append(ens)
  93. return all_entitys
  94. def cut(self, text_list):
  95. if not self.seg_model:
  96. self._load_seg_model(url=selffool_seg_url,authorization=selffool_seg_authorization)
  97. all_labels = self.seg_model.predict(text_list)
  98. sent_words = []
  99. for ti, text in enumerate(text_list):
  100. words = []
  101. N = len(text)
  102. seg_labels = all_labels[ti]
  103. tmp_word = ""
  104. for i in range(N):
  105. label = seg_labels[i]
  106. w = text[i]
  107. if label == "B":
  108. tmp_word += w
  109. elif label == "M":
  110. tmp_word += w
  111. elif label == "E":
  112. tmp_word += w
  113. words.append(tmp_word)
  114. tmp_word = ""
  115. else:
  116. tmp_word = ""
  117. words.append(w)
  118. if tmp_word:
  119. words.append(tmp_word)
  120. sent_words.append(words)
  121. return sent_words
  122. def analysis(self, text_list):
  123. words = self.cut(text_list)
  124. pos_labels = self.pos(words)
  125. ners = self.ner(text_list)
  126. word_inf = [list(zip(ws, ps)) for ws, ps in zip(words, pos_labels)]
  127. return word_inf, ners