nerUtils.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. from BiddingKG.dl.foolnltk import selffool
  2. def getTokensAndNers(sentences,MAXAREA = 10000,useselffool=False):
  3. '''
  4. @param: sentences:句子数
  5. @return 限流执行后的分词和实体识别list
  6. '''
  7. def getData(tokens,ners,process_data):
  8. process_sentences = [item[1] for item in process_data]
  9. token_ = selffool.cut(process_sentences)
  10. if useselffool:
  11. ner_ = selffool.self_ner(process_sentences)
  12. else:
  13. ner_ = selffool.ner(process_sentences)
  14. for i in range(len(token_)):
  15. the_index = process_data[i][0]
  16. tokens[the_index] = token_[i]
  17. ners[the_index] = ner_[i]
  18. sents = []
  19. for i in range(len(sentences)):
  20. sents.append([i,sentences[i]])
  21. sents.sort(key=lambda x:len(x[1]),reverse=True)
  22. index_ = 0
  23. tokens = [[]for i in range(len(sentences))]
  24. ners = [[]for i in range(len(sentences))]
  25. while(True):
  26. width = len(sents[index_][1])
  27. height = MAXAREA//width+1
  28. if height>len(sents)-index_:
  29. height = len(sents)-index_
  30. process_data = sents[index_:index_+height]
  31. getData(tokens, ners, process_data)
  32. index_ += height
  33. if index_>=len(sents):
  34. break
  35. return tokens,ners
  36. def getTokens(sentences,MAXAREA = 10000,useselffool=True):
  37. '''
  38. @param: sentences:句子数
  39. @return 限流执行后的分词list
  40. '''
  41. def getData(tokens,process_data):
  42. process_sentences = [item[1] for item in process_data]
  43. token_ = selffool.cut(process_sentences)
  44. for i in range(len(token_)):
  45. the_index = process_data[i][0]
  46. tokens[the_index] = token_[i]
  47. sents = []
  48. for i in range(len(sentences)):
  49. sents.append([i,sentences[i]])
  50. sents.sort(key=lambda x:len(x[1]),reverse=True)
  51. index_ = 0
  52. tokens = [[]for i in range(len(sentences))]
  53. while(True):
  54. width = len(sents[index_][1])
  55. height = MAXAREA//width+1
  56. if height>len(sents)-index_:
  57. height = len(sents)-index_
  58. process_data = sents[index_:index_+height]
  59. getData(tokens, process_data)
  60. index_ += height
  61. if index_>=len(sents):
  62. break
  63. return tokens
  64. def getNers(sentences,MAXAREA = 10000,useselffool=False):
  65. '''
  66. @param: sentences:句子数
  67. @return 限流执行后的实体识别list
  68. '''
  69. def getData(ners,process_data):
  70. process_sentences = [item[1] for item in process_data]
  71. if useselffool:
  72. ner_ = selffool.self_ner(process_sentences)
  73. else:
  74. ner_ = selffool.ner(process_sentences)
  75. for i in range(len(ner_)):
  76. the_index = process_data[i][0]
  77. ners[the_index] = ner_[i]
  78. sents = []
  79. for i in range(len(sentences)):
  80. sents.append([i,sentences[i]])
  81. sents.sort(key=lambda x:len(x[1]),reverse=True)
  82. index_ = 0
  83. ners = [[]for i in range(len(sentences))]
  84. while(True):
  85. width = len(sents[index_][1])
  86. height = MAXAREA//width+1
  87. if height>len(sents)-index_:
  88. height = len(sents)-index_
  89. process_data = sents[index_:index_+height]
  90. getData( ners, process_data)
  91. index_ += height
  92. if index_>=len(sents):
  93. break
  94. return ners