from BiddingKG.dl.foolnltk import selffool


def _batch_process(sentences, MAXAREA, handler):
    """Run *handler* over *sentences* in area-limited batches (rate limiting).

    Sentences are processed longest-first so that each batch of ``height``
    sentences, padded to the width of its longest member, stays at roughly
    ``MAXAREA`` characters (``width * height``).  *handler* receives a list of
    ``(original_index, sentence)`` pairs and must scatter its results back by
    the original index.

    @param sentences: list of sentence strings (may be empty)
    @param MAXAREA: soft cap on characters processed per batch
    @param handler: callable taking one batch of (index, sentence) pairs
    """
    # Pair every sentence with its original position so results can be
    # written back in input order after the length-descending sort.
    sents = sorted(enumerate(sentences), key=lambda pair: len(pair[1]), reverse=True)
    index_ = 0
    # Original code used `while True` + break and crashed with IndexError on
    # an empty input list; bounding the loop handles [] cleanly.
    while index_ < len(sents):
        # max(..., 1) guards MAXAREA // width against zero-length sentences.
        width = max(len(sents[index_][1]), 1)
        height = MAXAREA // width + 1
        if height > len(sents) - index_:
            height = len(sents) - index_
        handler(sents[index_:index_ + height])
        index_ += height


def getTokensAndNers(sentences, MAXAREA=10000, useselffool=False):
    """Tokenize and run NER on *sentences* with batch rate limiting.

    @param sentences: list of sentence strings
    @param MAXAREA: soft cap on characters per batch
    @param useselffool: if True use selffool.self_ner, else selffool.ner
    @return: (tokens, ners) — two lists parallel to *sentences*
    """
    tokens = [[] for _ in range(len(sentences))]
    ners = [[] for _ in range(len(sentences))]

    def handler(process_data):
        process_sentences = [item[1] for item in process_data]
        token_ = selffool.cut(process_sentences)
        if useselffool:
            ner_ = selffool.self_ner(process_sentences)
        else:
            ner_ = selffool.ner(process_sentences)
        # Scatter batch results back to each sentence's original slot.
        for i in range(len(token_)):
            the_index = process_data[i][0]
            tokens[the_index] = token_[i]
            ners[the_index] = ner_[i]

    _batch_process(sentences, MAXAREA, handler)
    return tokens, ners


def getTokens(sentences, MAXAREA=10000, useselffool=True):
    """Tokenize *sentences* with batch rate limiting.

    @param sentences: list of sentence strings
    @param MAXAREA: soft cap on characters per batch
    @param useselffool: unused; kept for signature compatibility
    @return: list of token lists parallel to *sentences*
    """
    tokens = [[] for _ in range(len(sentences))]

    def handler(process_data):
        process_sentences = [item[1] for item in process_data]
        token_ = selffool.cut(process_sentences)
        for i in range(len(token_)):
            the_index = process_data[i][0]
            tokens[the_index] = token_[i]

    _batch_process(sentences, MAXAREA, handler)
    return tokens


def getNers(sentences, MAXAREA=10000, useselffool=False):
    """Run NER on *sentences* with batch rate limiting.

    @param sentences: list of sentence strings
    @param MAXAREA: soft cap on characters per batch
    @param useselffool: if True use selffool.self_ner, else selffool.ner
    @return: list of NER results parallel to *sentences*
    """
    ners = [[] for _ in range(len(sentences))]

    def handler(process_data):
        process_sentences = [item[1] for item in process_data]
        if useselffool:
            ner_ = selffool.self_ner(process_sentences)
        else:
            ner_ = selffool.ner(process_sentences)
        for i in range(len(ner_)):
            the_index = process_data[i][0]
            ners[the_index] = ner_[i]

    _batch_process(sentences, MAXAREA, handler)
    return ners