|
@@ -2110,59 +2110,59 @@ class TendereeRuleRecall():
|
|
|
self.get_tenderee = True
|
|
|
|
|
|
# Regex-based recall of unrecognized entities (正则召回未识别实体)
|
|
|
- def unrecognized_entity_rule(self,pattern,list_sentences,list_entitys,on_value=0.5):
|
|
|
- list_sentence = list_sentences[0]
|
|
|
- for in_attachment in [False,True]:
|
|
|
- for sentence in [sentence for sentence in list_sentence if sentence.in_attachment==in_attachment]:
|
|
|
- sentence_text = sentence.sentence_text
|
|
|
- tokens = sentence.tokens
|
|
|
- doc_id = sentence.doc_id
|
|
|
- in_attachment = sentence.in_attachment
|
|
|
- list_tokenbegin = []
|
|
|
- begin = 0
|
|
|
- for i in range(0, len(tokens)):
|
|
|
- list_tokenbegin.append(begin)
|
|
|
- begin += len(str(tokens[i]))
|
|
|
- list_tokenbegin.append(begin + 1)
|
|
|
- for _match in re.finditer(pattern,sentence_text):
|
|
|
- _groupdict = _match.groupdict()
|
|
|
- _match_text = _match.group()
|
|
|
- _unrecognized_text = _groupdict["unrecognized"]
|
|
|
- _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text)
|
|
|
- if not _unrecognized:
|
|
|
- _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text)
|
|
|
- if _unrecognized:
|
|
|
- _unrecognized = _unrecognized.group()
|
|
|
- else:
|
|
|
- continue
|
|
|
- # print(_unrecognized)
|
|
|
- if re.search("某|乙方|代理",_unrecognized) or len(_unrecognized)>15:
|
|
|
- continue
|
|
|
- begin_index_temp = _match.start()+len(_groupdict['tenderee_left'])
|
|
|
- for j in range(len(list_tokenbegin)):
|
|
|
- if list_tokenbegin[j] == begin_index_temp:
|
|
|
- begin_index = j
|
|
|
- break
|
|
|
- elif list_tokenbegin[j] > begin_index_temp:
|
|
|
- begin_index = j - 1
|
|
|
- break
|
|
|
- index = begin_index_temp + len(_unrecognized)
|
|
|
- end_index_temp = index
|
|
|
- for j in range(begin_index, len(list_tokenbegin)):
|
|
|
- if list_tokenbegin[j] >= index:
|
|
|
- end_index = j - 1
|
|
|
- break
|
|
|
- entity_id = "%s_%d_%d_%d" % (doc_id, sentence.sentence_index, begin_index, end_index)
|
|
|
- entity_text = _unrecognized
|
|
|
- new_entity = Entity(doc_id, entity_id, entity_text, 'company', sentence.sentence_index, begin_index, end_index,
|
|
|
- begin_index_temp, end_index_temp, in_attachment=in_attachment)
|
|
|
- new_entity.label = 0
|
|
|
- new_entity.values = [on_value,0,0,0,0,0]
|
|
|
- list_entitys[0].append(new_entity)
|
|
|
- self.get_tenderee = True
|
|
|
- if self.get_tenderee:
|
|
|
- list_entitys[0] = sorted(list_entitys[0], key=lambda x: (x.sentence_index, x.begin_index))
|
|
|
- break
|
|
|
+ # def unrecognized_entity_rule(self,pattern,list_sentences,list_entitys,on_value=0.5):
|
|
|
+ # list_sentence = list_sentences[0]
|
|
|
+ # for in_attachment in [False,True]:
|
|
|
+ # for sentence in [sentence for sentence in list_sentence if sentence.in_attachment==in_attachment]:
|
|
|
+ # sentence_text = sentence.sentence_text
|
|
|
+ # tokens = sentence.tokens
|
|
|
+ # doc_id = sentence.doc_id
|
|
|
+ # in_attachment = sentence.in_attachment
|
|
|
+ # list_tokenbegin = []
|
|
|
+ # begin = 0
|
|
|
+ # for i in range(0, len(tokens)):
|
|
|
+ # list_tokenbegin.append(begin)
|
|
|
+ # begin += len(str(tokens[i]))
|
|
|
+ # list_tokenbegin.append(begin + 1)
|
|
|
+ # for _match in re.finditer(pattern,sentence_text):
|
|
|
+ # _groupdict = _match.groupdict()
|
|
|
+ # _match_text = _match.group()
|
|
|
+ # _unrecognized_text = _groupdict["unrecognized"]
|
|
|
+ # _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text)
|
|
|
+ # if not _unrecognized:
|
|
|
+ # _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text)
|
|
|
+ # if _unrecognized:
|
|
|
+ # _unrecognized = _unrecognized.group()
|
|
|
+ # else:
|
|
|
+ # continue
|
|
|
+ # # print(_unrecognized)
|
|
|
+ # if re.search("某|乙方|代理",_unrecognized) or len(_unrecognized)>15:
|
|
|
+ # continue
|
|
|
+ # begin_index_temp = _match.start()+len(_groupdict['tenderee_left'])
|
|
|
+ # for j in range(len(list_tokenbegin)):
|
|
|
+ # if list_tokenbegin[j] == begin_index_temp:
|
|
|
+ # begin_index = j
|
|
|
+ # break
|
|
|
+ # elif list_tokenbegin[j] > begin_index_temp:
|
|
|
+ # begin_index = j - 1
|
|
|
+ # break
|
|
|
+ # index = begin_index_temp + len(_unrecognized)
|
|
|
+ # end_index_temp = index
|
|
|
+ # for j in range(begin_index, len(list_tokenbegin)):
|
|
|
+ # if list_tokenbegin[j] >= index:
|
|
|
+ # end_index = j - 1
|
|
|
+ # break
|
|
|
+ # entity_id = "%s_%d_%d_%d" % (doc_id, sentence.sentence_index, begin_index, end_index)
|
|
|
+ # entity_text = _unrecognized
|
|
|
+ # new_entity = Entity(doc_id, entity_id, entity_text, 'company', sentence.sentence_index, begin_index, end_index,
|
|
|
+ # begin_index_temp, end_index_temp, in_attachment=in_attachment)
|
|
|
+ # new_entity.label = 0
|
|
|
+ # new_entity.values = [on_value,0,0,0,0,0]
|
|
|
+ # list_entitys[0].append(new_entity)
|
|
|
+ # self.get_tenderee = True
|
|
|
+ # if self.get_tenderee:
|
|
|
+ # list_entitys[0] = sorted(list_entitys[0], key=lambda x: (x.sentence_index, x.begin_index))
|
|
|
+ # break
|
|
|
|
|
|
class RoleGrade():
|
|
|
def __init__(self):
|