|
@@ -248,7 +248,7 @@ class CodeNamePredict():
|
|
_LEN = MAX_AREA//MAX_LEN
|
|
_LEN = MAX_AREA//MAX_LEN
|
|
#预测
|
|
#预测
|
|
|
|
|
|
- # x = [[self.word2index.get(word,index_unk)for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
|
|
|
|
|
|
+ # x = [[self.word2index.get(word,index_pad)for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
|
|
x = [[getIndexOfWord(word) for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
|
|
x = [[getIndexOfWord(word) for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
|
|
x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x]
|
|
x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x]
|
|
x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")
|
|
x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")
|
|
@@ -370,27 +370,28 @@ class CodeNamePredict():
|
|
list_name_freq_score = []
|
|
list_name_freq_score = []
|
|
|
|
|
|
# 2020/11/23 大网站规则调整
|
|
# 2020/11/23 大网站规则调整
|
|
- name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]'
|
|
|
|
- for sentence in list_sentence:
|
|
|
|
- # pad_sentence = sentence.sentence_text
|
|
|
|
- othername = re.search(name_re1, sentence.sentence_text)
|
|
|
|
- if othername != None:
|
|
|
|
- project_name = othername.group(3)
|
|
|
|
- beg = find_index([project_name], sentence.sentence_text)[0]
|
|
|
|
- end = beg + len(project_name)
|
|
|
|
- _name = self.fitDataByRule(sentence.sentence_text[beg:end])
|
|
|
|
- # add name to entitys
|
|
|
|
- _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
|
|
|
|
- sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name,
|
|
|
|
- entity_type="name", sentence_index=sentence.sentence_index, begin_index=0,
|
|
|
|
- end_index=0, wordOffset_begin=beg, wordOffset_end=end)
|
|
|
|
- list_entity.append(_entity)
|
|
|
|
- w = 1
|
|
|
|
- if _name not in dict_name_freq_score:
|
|
|
|
- # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
|
|
|
|
- dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w]
|
|
|
|
- else:
|
|
|
|
- dict_name_freq_score[_name][0] += 1
|
|
|
|
|
|
+ if len(dict_name_freq_score) == 0:
|
|
|
|
+ name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]'
|
|
|
|
+ for sentence in list_sentence:
|
|
|
|
+ # pad_sentence = sentence.sentence_text
|
|
|
|
+ othername = re.search(name_re1, sentence.sentence_text)
|
|
|
|
+ if othername != None:
|
|
|
|
+ project_name = othername.group(3)
|
|
|
|
+ beg = find_index([project_name], sentence.sentence_text)[0]
|
|
|
|
+ end = beg + len(project_name)
|
|
|
|
+ _name = self.fitDataByRule(sentence.sentence_text[beg:end])
|
|
|
|
+ # add name to entitys
|
|
|
|
+ _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
|
|
|
|
+ sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name,
|
|
|
|
+ entity_type="name", sentence_index=sentence.sentence_index, begin_index=0,
|
|
|
|
+ end_index=0, wordOffset_begin=beg, wordOffset_end=end)
|
|
|
|
+ list_entity.append(_entity)
|
|
|
|
+ w = 1
|
|
|
|
+ if _name not in dict_name_freq_score:
|
|
|
|
+ # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
|
|
|
|
+ dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w]
|
|
|
|
+ else:
|
|
|
|
+ dict_name_freq_score[_name][0] += 1
|
|
# othername = re.search(name_re1, sentence.sentence_text)
|
|
# othername = re.search(name_re1, sentence.sentence_text)
|
|
# if othername != None:
|
|
# if othername != None:
|
|
# _name = othername.group(3)
|
|
# _name = othername.group(3)
|