1 year ago · 94ad249a0b
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -1919,9 +1919,9 @@ class TendereeRuleRecall():
 
				                                 "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[，：]公司名称：|权属人|甲方当事人|询价书企业|比选发起人|项目单位[，：]单位名称|结算单位)"\
			
 
				                                 "[）)]?(信息[，：])?((公司|单位)?名称)?([(（](全称|盖章)[）)])?(是|：|:)+)(?P<unrecognized>[^，。：:；]+)[，。；：:]")
			
 
				         # 未识别实体尾部判断
			
 
				-        self.unrecognized_end1 = re.compile(
			
 
				-            "^[\u4e00-\u9fa5]{2,}?(?:公司|医院|学校|学院|大学|中学|小学|幼儿园|政府|指挥部|办公室|项目部|业主大会|监狱|教育局|委员会|研究所|招标办|采购部|办事处|水利局|公墓|中心|联合社|合作社)")
			
 
				-        self.unrecognized_end2 = re.compile("^[\u4e00-\u9fa5]{4,}(?:署|局|厅|处|室|科|部|站|所|股|行|园)")
			
 
				+        # self.unrecognized_end1 = re.compile(
			
 
				+        #     "^[\u4e00-\u9fa5]{2,}?(?:公司|医院|学校|学院|大学|中学|小学|幼儿园|政府|指挥部|办公室|项目部|业主大会|监狱|教育局|委员会|研究所|招标办|采购部|办事处|水利局|公墓|中心|联合社|合作社)")
			
 
				+        # self.unrecognized_end2 = re.compile("^[\u4e00-\u9fa5]{4,}(?:署|局|厅|处|室|科|部|站|所|股|行|园)")
			
 
				 
			
 
				     def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
			
 
				 
			
@@ -1946,10 +1946,10 @@ class TendereeRuleRecall():
 
				             self.entity_context_rule(ents,list_name,list_sentences,list(agency_set))
			
 
				         if not self.get_tenderee:
			
 
				             self.subject_rule(ents,list_articles,list_sentences)
			
 
				-        if not self.get_tenderee:
			
 
				-            self.unrecognized_entity_rule(self.unrecognized1,list_sentences,list_entitys,0.55)
			
 
				-        if not self.get_tenderee:
			
 
				-            self.unrecognized_entity_rule(self.unrecognized2,list_sentences,list_entitys,0.5)
			
 
				+        # if not self.get_tenderee:
			
 
				+        #     self.unrecognized_entity_rule(self.unrecognized1,list_sentences,list_entitys,0.55)
			
 
				+        # if not self.get_tenderee:
			
 
				+        #     self.unrecognized_entity_rule(self.unrecognized2,list_sentences,list_entitys,0.5)
			
 
				 
			
 
				     #entity上下文正则判断
			
 
				     def entity_context_rule(self,entitys,list_name,list_sentences,list_agency):
			
@@ -2110,59 +2110,59 @@ class TendereeRuleRecall():
 
				                         self.get_tenderee = True
			
 
				 
			
 
				     # 正则召回未识别实体
			
 
				-    def unrecognized_entity_rule(self,pattern,list_sentences,list_entitys,on_value=0.5):
			
 
				-        list_sentence = list_sentences[0]
			
 
				-        for in_attachment in [False,True]:
			
 
				-            for sentence in [sentence for sentence in list_sentence if sentence.in_attachment==in_attachment]:
			
 
				-                sentence_text = sentence.sentence_text
			
 
				-                tokens = sentence.tokens
			
 
				-                doc_id = sentence.doc_id
			
 
				-                in_attachment = sentence.in_attachment
			
 
				-                list_tokenbegin = []
			
 
				-                begin = 0
			
 
				-                for i in range(0, len(tokens)):
			
 
				-                    list_tokenbegin.append(begin)
			
 
				-                    begin += len(str(tokens[i]))
			
 
				-                list_tokenbegin.append(begin + 1)
			
 
				-                for _match in re.finditer(pattern,sentence_text):
			
 
				-                    _groupdict = _match.groupdict()
			
 
				-                    _match_text = _match.group()
			
 
				-                    _unrecognized_text = _groupdict["unrecognized"]
			
 
				-                    _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text)
			
 
				-                    if not _unrecognized:
			
 
				-                        _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text)
			
 
				-                    if _unrecognized:
			
 
				-                        _unrecognized = _unrecognized.group()
			
 
				-                    else:
			
 
				-                        continue
			
 
				-                    # print(_unrecognized)
			
 
				-                    if re.search("某|乙方|代理",_unrecognized) or len(_unrecognized)>15:
			
 
				-                        continue
			
 
				-                    begin_index_temp = _match.start()+len(_groupdict['tenderee_left'])
			
 
				-                    for j in range(len(list_tokenbegin)):
			
 
				-                        if list_tokenbegin[j] == begin_index_temp:
			
 
				-                            begin_index = j
			
 
				-                            break
			
 
				-                        elif list_tokenbegin[j] > begin_index_temp:
			
 
				-                            begin_index = j - 1
			
 
				-                            break
			
 
				-                    index = begin_index_temp + len(_unrecognized)
			
 
				-                    end_index_temp = index
			
 
				-                    for j in range(begin_index, len(list_tokenbegin)):
			
 
				-                        if list_tokenbegin[j] >= index:
			
 
				-                            end_index = j - 1
			
 
				-                            break
			
 
				-                    entity_id = "%s_%d_%d_%d" % (doc_id, sentence.sentence_index, begin_index, end_index)
			
 
				-                    entity_text = _unrecognized
			
 
				-                    new_entity = Entity(doc_id, entity_id, entity_text, 'company', sentence.sentence_index, begin_index, end_index,
			
 
				-                               begin_index_temp, end_index_temp, in_attachment=in_attachment)
			
 
				-                    new_entity.label = 0
			
 
				-                    new_entity.values = [on_value,0,0,0,0,0]
			
 
				-                    list_entitys[0].append(new_entity)
			
 
				-                    self.get_tenderee = True
			
 
				-            if self.get_tenderee:
			
 
				-                list_entitys[0] = sorted(list_entitys[0], key=lambda x: (x.sentence_index, x.begin_index))
			
 
				-                break
			
 
				+    # def unrecognized_entity_rule(self,pattern,list_sentences,list_entitys,on_value=0.5):
			
 
				+    #     list_sentence = list_sentences[0]
			
 
				+    #     for in_attachment in [False,True]:
			
 
				+    #         for sentence in [sentence for sentence in list_sentence if sentence.in_attachment==in_attachment]:
			
 
				+    #             sentence_text = sentence.sentence_text
			
 
				+    #             tokens = sentence.tokens
			
 
				+    #             doc_id = sentence.doc_id
			
 
				+    #             in_attachment = sentence.in_attachment
			
 
				+    #             list_tokenbegin = []
			
 
				+    #             begin = 0
			
 
				+    #             for i in range(0, len(tokens)):
			
 
				+    #                 list_tokenbegin.append(begin)
			
 
				+    #                 begin += len(str(tokens[i]))
			
 
				+    #             list_tokenbegin.append(begin + 1)
			
 
				+    #             for _match in re.finditer(pattern,sentence_text):
			
 
				+    #                 _groupdict = _match.groupdict()
			
 
				+    #                 _match_text = _match.group()
			
 
				+    #                 _unrecognized_text = _groupdict["unrecognized"]
			
 
				+    #                 _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text)
			
 
				+    #                 if not _unrecognized:
			
 
				+    #                     _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text)
			
 
				+    #                 if _unrecognized:
			
 
				+    #                     _unrecognized = _unrecognized.group()
			
 
				+    #                 else:
			
 
				+    #                     continue
			
 
				+    #                 # print(_unrecognized)
			
 
				+    #                 if re.search("某|乙方|代理",_unrecognized) or len(_unrecognized)>15:
			
 
				+    #                     continue
			
 
				+    #                 begin_index_temp = _match.start()+len(_groupdict['tenderee_left'])
			
 
				+    #                 for j in range(len(list_tokenbegin)):
			
 
				+    #                     if list_tokenbegin[j] == begin_index_temp:
			
 
				+    #                         begin_index = j
			
 
				+    #                         break
			
 
				+    #                     elif list_tokenbegin[j] > begin_index_temp:
			
 
				+    #                         begin_index = j - 1
			
 
				+    #                         break
			
 
				+    #                 index = begin_index_temp + len(_unrecognized)
			
 
				+    #                 end_index_temp = index
			
 
				+    #                 for j in range(begin_index, len(list_tokenbegin)):
			
 
				+    #                     if list_tokenbegin[j] >= index:
			
 
				+    #                         end_index = j - 1
			
 
				+    #                         break
			
 
				+    #                 entity_id = "%s_%d_%d_%d" % (doc_id, sentence.sentence_index, begin_index, end_index)
			
 
				+    #                 entity_text = _unrecognized
			
 
				+    #                 new_entity = Entity(doc_id, entity_id, entity_text, 'company', sentence.sentence_index, begin_index, end_index,
			
 
				+    #                            begin_index_temp, end_index_temp, in_attachment=in_attachment)
			
 
				+    #                 new_entity.label = 0
			
 
				+    #                 new_entity.values = [on_value,0,0,0,0,0]
			
 
				+    #                 list_entitys[0].append(new_entity)
			
 
				+    #                 self.get_tenderee = True
			
 
				+    #         if self.get_tenderee:
			
 
				+    #             list_entitys[0] = sorted(list_entitys[0], key=lambda x: (x.sentence_index, x.begin_index))
			
 
				+    #             break
			
 
				 
			
 
				 class RoleGrade():
			
 
				     def __init__(self):