123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104 |
- from BiddingKG.dl.foolnltk import selffool
def getTokensAndNers(sentences, MAXAREA=10000, useselffool=False):
    '''
    Run word segmentation and named-entity recognition over *sentences*,
    throttled into batches so no single backend call processes too much text.

    @param sentences: list of sentence strings
    @param MAXAREA: rough per-batch character budget; batch height is chosen
        so (longest sentence in batch) * height stays near this value
    @param useselffool: when True call selffool.self_ner, otherwise selffool.ner
    @return: (tokens, ners) — two lists aligned index-for-index with *sentences*
    '''
    def getData(tokens, ners, process_data):
        # process_data items are [original_index, sentence]
        process_sentences = [item[1] for item in process_data]
        token_ = selffool.cut(process_sentences)
        if useselffool:
            ner_ = selffool.self_ner(process_sentences)
        else:
            ner_ = selffool.ner(process_sentences)
        # scatter the batch results back to their original positions
        for i in range(len(token_)):
            the_index = process_data[i][0]
            tokens[the_index] = token_[i]
            ners[the_index] = ner_[i]

    tokens = [[] for _ in sentences]
    ners = [[] for _ in sentences]
    if not sentences:
        # FIX: the original entered `while True` and crashed with
        # IndexError on an empty input list.
        return tokens, ners
    # Sort longest-first so each batch's width is bounded by its first sentence.
    sents = [[i, s] for i, s in enumerate(sentences)]
    sents.sort(key=lambda x: len(x[1]), reverse=True)
    index_ = 0
    while index_ < len(sents):
        # FIX: max(..., 1) guards against empty sentence strings, which made
        # the original raise ZeroDivisionError on MAXAREA // 0.
        width = max(len(sents[index_][1]), 1)
        height = MAXAREA // width + 1
        if height > len(sents) - index_:
            height = len(sents) - index_
        process_data = sents[index_:index_ + height]
        getData(tokens, ners, process_data)
        index_ += height
    return tokens, ners
def getTokens(sentences, MAXAREA=10000, useselffool=True):
    '''
    Run word segmentation over *sentences*, throttled into batches so no
    single backend call processes too much text.

    @param sentences: list of sentence strings
    @param MAXAREA: rough per-batch character budget; batch height is chosen
        so (longest sentence in batch) * height stays near this value
    @param useselffool: accepted for interface symmetry with the sibling
        functions; segmentation always goes through selffool.cut here
    @return: list of token lists aligned index-for-index with *sentences*
    '''
    def getData(tokens, process_data):
        # process_data items are [original_index, sentence]
        process_sentences = [item[1] for item in process_data]
        token_ = selffool.cut(process_sentences)
        # scatter the batch results back to their original positions
        for i in range(len(token_)):
            the_index = process_data[i][0]
            tokens[the_index] = token_[i]

    tokens = [[] for _ in sentences]
    if not sentences:
        # FIX: the original entered `while True` and crashed with
        # IndexError on an empty input list.
        return tokens
    # Sort longest-first so each batch's width is bounded by its first sentence.
    sents = [[i, s] for i, s in enumerate(sentences)]
    sents.sort(key=lambda x: len(x[1]), reverse=True)
    index_ = 0
    while index_ < len(sents):
        # FIX: max(..., 1) guards against empty sentence strings, which made
        # the original raise ZeroDivisionError on MAXAREA // 0.
        width = max(len(sents[index_][1]), 1)
        height = MAXAREA // width + 1
        if height > len(sents) - index_:
            height = len(sents) - index_
        process_data = sents[index_:index_ + height]
        getData(tokens, process_data)
        index_ += height
    return tokens
def getNers(sentences, MAXAREA=10000, useselffool=False):
    '''
    Run named-entity recognition over *sentences*, throttled into batches so
    no single backend call processes too much text.

    @param sentences: list of sentence strings
    @param MAXAREA: rough per-batch character budget; batch height is chosen
        so (longest sentence in batch) * height stays near this value
    @param useselffool: when True call selffool.self_ner, otherwise selffool.ner
    @return: list of NER results aligned index-for-index with *sentences*
    '''
    def getData(ners, process_data):
        # process_data items are [original_index, sentence]
        process_sentences = [item[1] for item in process_data]
        if useselffool:
            ner_ = selffool.self_ner(process_sentences)
        else:
            ner_ = selffool.ner(process_sentences)
        # scatter the batch results back to their original positions
        for i in range(len(ner_)):
            the_index = process_data[i][0]
            ners[the_index] = ner_[i]

    ners = [[] for _ in sentences]
    if not sentences:
        # FIX: the original entered `while True` and crashed with
        # IndexError on an empty input list.
        return ners
    # Sort longest-first so each batch's width is bounded by its first sentence.
    sents = [[i, s] for i, s in enumerate(sentences)]
    sents.sort(key=lambda x: len(x[1]), reverse=True)
    index_ = 0
    while index_ < len(sents):
        # FIX: max(..., 1) guards against empty sentence strings, which made
        # the original raise ZeroDivisionError on MAXAREA // 0.
        width = max(len(sents[index_][1]), 1)
        height = MAXAREA // width + 1
        if height > len(sents) - index_:
            height = len(sents) - index_
        process_data = sents[index_:index_ + height]
        getData(ners, process_data)
        index_ += height
    return ners
|