Просмотр исходного кода

下线正则召回未识别实体规则

znj 1 год назад
Родитель
Коммит
5a868556a0
1 изменённый файл: 53 добавлено и 53 удалено
  1. 53 53
      BiddingKG/dl/interface/predictor.py

+ 53 - 53
BiddingKG/dl/interface/predictor.py

@@ -2110,59 +2110,59 @@ class TendereeRuleRecall():
                         self.get_tenderee = True
 
     # 正则召回未识别实体
-    def unrecognized_entity_rule(self,pattern,list_sentences,list_entitys,on_value=0.5):
-        list_sentence = list_sentences[0]
-        for in_attachment in [False,True]:
-            for sentence in [sentence for sentence in list_sentence if sentence.in_attachment==in_attachment]:
-                sentence_text = sentence.sentence_text
-                tokens = sentence.tokens
-                doc_id = sentence.doc_id
-                in_attachment = sentence.in_attachment
-                list_tokenbegin = []
-                begin = 0
-                for i in range(0, len(tokens)):
-                    list_tokenbegin.append(begin)
-                    begin += len(str(tokens[i]))
-                list_tokenbegin.append(begin + 1)
-                for _match in re.finditer(pattern,sentence_text):
-                    _groupdict = _match.groupdict()
-                    _match_text = _match.group()
-                    _unrecognized_text = _groupdict["unrecognized"]
-                    _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text)
-                    if not _unrecognized:
-                        _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text)
-                    if _unrecognized:
-                        _unrecognized = _unrecognized.group()
-                    else:
-                        continue
-                    # print(_unrecognized)
-                    if re.search("某|乙方|代理",_unrecognized) or len(_unrecognized)>15:
-                        continue
-                    begin_index_temp = _match.start()+len(_groupdict['tenderee_left'])
-                    for j in range(len(list_tokenbegin)):
-                        if list_tokenbegin[j] == begin_index_temp:
-                            begin_index = j
-                            break
-                        elif list_tokenbegin[j] > begin_index_temp:
-                            begin_index = j - 1
-                            break
-                    index = begin_index_temp + len(_unrecognized)
-                    end_index_temp = index
-                    for j in range(begin_index, len(list_tokenbegin)):
-                        if list_tokenbegin[j] >= index:
-                            end_index = j - 1
-                            break
-                    entity_id = "%s_%d_%d_%d" % (doc_id, sentence.sentence_index, begin_index, end_index)
-                    entity_text = _unrecognized
-                    new_entity = Entity(doc_id, entity_id, entity_text, 'company', sentence.sentence_index, begin_index, end_index,
-                               begin_index_temp, end_index_temp, in_attachment=in_attachment)
-                    new_entity.label = 0
-                    new_entity.values = [on_value,0,0,0,0,0]
-                    list_entitys[0].append(new_entity)
-                    self.get_tenderee = True
-            if self.get_tenderee:
-                list_entitys[0] = sorted(list_entitys[0], key=lambda x: (x.sentence_index, x.begin_index))
-                break
+    # def unrecognized_entity_rule(self,pattern,list_sentences,list_entitys,on_value=0.5):
+    #     list_sentence = list_sentences[0]
+    #     for in_attachment in [False,True]:
+    #         for sentence in [sentence for sentence in list_sentence if sentence.in_attachment==in_attachment]:
+    #             sentence_text = sentence.sentence_text
+    #             tokens = sentence.tokens
+    #             doc_id = sentence.doc_id
+    #             in_attachment = sentence.in_attachment
+    #             list_tokenbegin = []
+    #             begin = 0
+    #             for i in range(0, len(tokens)):
+    #                 list_tokenbegin.append(begin)
+    #                 begin += len(str(tokens[i]))
+    #             list_tokenbegin.append(begin + 1)
+    #             for _match in re.finditer(pattern,sentence_text):
+    #                 _groupdict = _match.groupdict()
+    #                 _match_text = _match.group()
+    #                 _unrecognized_text = _groupdict["unrecognized"]
+    #                 _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text)
+    #                 if not _unrecognized:
+    #                     _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text)
+    #                 if _unrecognized:
+    #                     _unrecognized = _unrecognized.group()
+    #                 else:
+    #                     continue
+    #                 # print(_unrecognized)
+    #                 if re.search("某|乙方|代理",_unrecognized) or len(_unrecognized)>15:
+    #                     continue
+    #                 begin_index_temp = _match.start()+len(_groupdict['tenderee_left'])
+    #                 for j in range(len(list_tokenbegin)):
+    #                     if list_tokenbegin[j] == begin_index_temp:
+    #                         begin_index = j
+    #                         break
+    #                     elif list_tokenbegin[j] > begin_index_temp:
+    #                         begin_index = j - 1
+    #                         break
+    #                 index = begin_index_temp + len(_unrecognized)
+    #                 end_index_temp = index
+    #                 for j in range(begin_index, len(list_tokenbegin)):
+    #                     if list_tokenbegin[j] >= index:
+    #                         end_index = j - 1
+    #                         break
+    #                 entity_id = "%s_%d_%d_%d" % (doc_id, sentence.sentence_index, begin_index, end_index)
+    #                 entity_text = _unrecognized
+    #                 new_entity = Entity(doc_id, entity_id, entity_text, 'company', sentence.sentence_index, begin_index, end_index,
+    #                            begin_index_temp, end_index_temp, in_attachment=in_attachment)
+    #                 new_entity.label = 0
+    #                 new_entity.values = [on_value,0,0,0,0,0]
+    #                 list_entitys[0].append(new_entity)
+    #                 self.get_tenderee = True
+    #         if self.get_tenderee:
+    #             list_entitys[0] = sorted(list_entitys[0], key=lambda x: (x.sentence_index, x.begin_index))
+    #             break
 
 class RoleGrade():
     def __init__(self):