Sfoglia il codice sorgente

修正RoleRulePredictor()中括号替换问题

znj 3 anni fa
parent
commit
590341692e

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -146,7 +146,7 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
 
     # for _article in list_articles:
     #     log(_article.content)
-    #
+
     # for list_entity in list_entitys:
     #     for _entity in list_entity:
     #         log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%

+ 4 - 4
BiddingKG/dl/interface/predictor.py

@@ -1168,7 +1168,7 @@ class RoleRulePredictor():
         for article,list_entity,list_sentence,list_codename in zip(list_articles,list_entitys,list_sentences,list_codenames):
             list_sentence.sort(key=lambda x: x.sentence_index)  # 2022/1/5 按句子顺序排序
             # list_name = list_codename["name"]
-            list_name = []  # 20212/1/5  改为实体列表内所有项目名称
+            list_name = []  # 2022/1/5  改为实体列表内所有项目名称
             for entity in list_entity:
                 if entity.entity_type == 'name':
                     list_name.append(entity.entity_text)
@@ -1182,7 +1182,7 @@ class RoleRulePredictor():
                         find_flag = False
                         for _sentence in list_sentence:
                             if _sentence.sentence_index==p_entity.sentence_index:
-                                _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,text=p_entity.entity_text)
+                                _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,use_text=True,text=re.sub("）",")",re.sub("（","(",p_entity.entity_text)))
                                 for _name in list_name:
                                     if _name!="" and str(_span[1]+_span[2][:len(str(_name))]).find(_name)>=0:
                                         find_flag = True
@@ -1201,12 +1201,12 @@ class RoleRulePredictor():
                         _list_name = self._check_input(list_name,ignore=True)
                         find_flag = False
                         for _name in _list_name: #2022/1/5修正只要项目名称出现过的角色,所有位置都标注为招标人
-                            if str(_name).find(p_entity.entity_text) >= 0 and p_entity.sentence_index<4:
+                            if str(_name).find(re.sub("）",")",re.sub("（","(",p_entity.entity_text))) >= 0 and p_entity.sentence_index<4:
                                 for _sentence in list_sentence:
                                     if _sentence.sentence_index == p_entity.sentence_index:
                                         _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
                                                            end_index=p_entity.end_index, size=20, center_include=True,
-                                                           word_flag=True, text=p_entity.entity_text)
+                                                           word_flag=True, use_text=True,text=re.sub("）",")",re.sub("（","(",p_entity.entity_text)))
                                         if str(_span[1] + _span[2][:len(str(_name))]).find(
                                             _name) >= 0:
                                             find_flag = True

+ 2 - 2
BiddingKG/dl/test/test4.py

@@ -46,7 +46,7 @@ def test(name,content):
 if __name__=="__main__":
     # filename = "比地_52_79929693.html"
     #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
-    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
+    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\test12354.txt","r",encoding="utf8").read()
     content = str(BeautifulSoup(text).find("div",id="pcontent"))
     # df_a = {"html":[]}
     # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
@@ -75,7 +75,7 @@ if __name__=="__main__":
     # '''
     # print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
     # print(predict("12", content,"打印机"))
-    print(predict("12", content,"打印机"))
+    print(predict("12", text,"打印机"))
     # test(12,content)
     # test(12,text)
     print("takes",time.time()-_time1)

+ 5 - 3
BiddingKG/dl/test/测试整个要素提取流程.py

@@ -122,7 +122,7 @@ def predict(doc_id,text):
             # print('**********实体信息****************')
             if entity.entity_type=='person':
                 print("联系人-电话:",end=' ')
-                print(entity.entity_text,[i.entity_text for i in entity.person_phone] if entity.person_phone else None,entity.label,entity.values)
+                print(entity.entity_text,[i.entity_text for i in entity.person_phone] if entity.person_phone else [],entity.label,entity.values)
                 if entity.pointer_email:
                     print("联系人-邮箱:",entity.entity_text,entity.pointer_email.entity_text)
                 # print(entity.begin_index, entity.end_index)
@@ -141,11 +141,11 @@ def predict(doc_id,text):
                     print("公司->联系人1:",end=' ')
                     print(entity.entity_text,[i.entity_text for i in entity.pointer_person],entity.label,entity.values)
                     # print(entity.entity_text,entity.label,entity.values)
-                    print(_sentence.sentence_text,_sentence.tokens[entity.begin_index:entity.end_index+1])
+                    # print(_sentence.sentence_text,_sentence.tokens[entity.begin_index:entity.end_index+1])
                 else:
                     print("公司->联系人2:", end=' ')
                     print(entity.entity_text, entity.pointer_person,entity.label,entity.values)
-                    print(_sentence.sentence_text,_sentence.tokens[entity.begin_index:entity.end_index+1])
+                    # print(_sentence.sentence_text,_sentence.tokens[entity.begin_index:entity.end_index+1])
                     pass
                 if entity.label in [2,3,4]:
                     if entity.pointer_money:
@@ -164,6 +164,8 @@ def predict(doc_id,text):
             #         print('pointer_pack_name:',entity.pointer_pack.entity_text)
             # elif entity.entity_type =='money':
             #     print('money',entity.entity_text,entity.label)
+            # elif entity.entity_type =='name':
+            #     print('pj_name',entity.entity_text,entity.sentence_index,entity.begin_index)
             # elif entity.entity_type in ['package']:
             #     print('pack_entity:',entity.entity_text)
             # print(entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.wordOffset_begin,entity.wordOffset_end)