Explorar o código

预处理新增正则召回招标角色实体;正则角色召回优化;补充文章末尾固定格式招标代理召回;

lishimin hai 3 anos
pai
achega
64dd6f866c

+ 14 - 3
BiddingKG/dl/interface/Preprocessing.py

@@ -1033,7 +1033,7 @@ def segment(soup,final=True):
             text = soup.get_text()
             # 2020/11/24 大网站规则添加
             if 'title' in soup.attrs:
-                if '...' in soup.get_text() and (soup.get_text()[:-3]).strip() in soup.attrs['title']:
+                if '...' in soup.get_text() and soup.get_text().strip()[:-3] in soup.attrs['title']:
                     text = soup.attrs['title']
 
             _list = []
@@ -1675,7 +1675,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
             '''正则识别角色实体  经营部|经销部|电脑部|服务部|复印部|印刷部|彩印部|装饰部|修理部|汽修部|修理店|零售店|设计店|服务店|家具店|专卖店|分店|文具行|商行|印刷厂|修理厂|维修中心|修配中心|养护中心|服务中心|会馆|文化馆|超市|门市|商场|家具城|印刷社|经销处'''
             for it in re.finditer(
-                    '(?P<text_key_word>[^,。、;《]{,5}(单一来源|中标|中选|中价|成交)?(供应商|供货商|服务商|候选人|单位|人)(名称)?为?[::]+)(?P<text>([^,。、;《]{5,20})(厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处))[,。]',
+                    '(?P<text_key_word>[^,。、;《]{,5}(单一来源|中标|中选|中价|成交)?(供应商|供货商|服务商|候选人|单位|人)(名称)?[为::]+)(?P<text>([^,。、;《]{5,20})(厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处))[,。]',
                     sentence_text):
                 for k, v in it.groupdict().items():
                     if k == 'text_key_word':
@@ -1686,8 +1686,19 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 e = it.end() - 1
                 if (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
                     ner_entitys.append((b, e, 'company', entity))
-                    # print('正则新增 :',(b, e, 'company', entity))
 
+            for it in re.finditer(
+                    '(?P<text_key_word>((建设|招租|招标|采购)(单位|人)|业主)(名称)?[为::]+)(?P<text>\w{2,4}[省市县区镇]([^,。、;《]{2,20})(管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园))[,。]',
+                    sentence_text):
+                for k, v in it.groupdict().items():
+                    if k == 'text_key_word':
+                        keyword = v
+                    if k == 'text':
+                        entity = v
+                b = it.start() + len(keyword)
+                e = it.end() - 1
+                if (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
+                    ner_entitys.append((b, e, 'org', entity))
 
             #识别package
 

+ 8 - 4
BiddingKG/dl/interface/extract.py

@@ -80,6 +80,11 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
     predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName)
     cost_time["rule"] = round(time.time()-start_time,2)
 
+    '''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
+    start_time = time.time() #正则角色提取
+    predictor.getPredictor("roleRuleFinal").predict(list_articles, list_entitys)
+    cost_time["roleRuleFinal"] = round(time.time()-start_time,2)
+
     start_time = time.time() #联系人模型提取
     predictor.getPredictor("epc").predict(list_sentences,list_entitys)
     log("get epc done of doc_id%s"%(doc_id))
@@ -133,7 +138,6 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
 
     # for _article in list_articles:
     #     log(_article.content)
-    #
     # for list_entity in list_entitys:
     #     for _entity in list_entity:
     #         log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
@@ -141,7 +145,7 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
     #                str(_entity.begin_index),str(_entity.end_index)))
 
     return json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
-
+    # return json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False), list_articles[0].content, list_entitys[0]
 
 def test(name,content):
     user = {
@@ -170,12 +174,12 @@ if __name__=="__main__":
     #     print(rs['product_attrs'])
     # print(rs)
 
-    with open('D:/html/2.html', 'r', encoding='utf-8') as f:
+    with open('../test/data/2.html', 'r', encoding='utf-8') as f: #D:/html
         text = f.read()
         t1 = time.time()
         print(predict('', text, title))
         t2 = time.time()
-        print(predict('', text, title))
+        # print(predict('', text, title))
         t3 = time.time()
         print('第一次耗时:%.4f, 第二次耗时:%.4f'%(t2-t1, t3-t2))
     # print(predict('',text,title))

+ 52 - 5
BiddingKG/dl/interface/predictor.py

@@ -30,6 +30,7 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
               "prem":{"predictor":None,"Lock":RLock()},
               "epc":{"predictor":None,"Lock":RLock()},
               "roleRule":{"predictor":None,"Lock":RLock()},
+              "roleRuleFinal":{"predictor":None,"Lock":RLock()},
                   "form":{"predictor":None,"Lock":RLock()},
                   "time":{"predictor":None,"Lock":RLock()},
                   "punish":{"predictor":None,"Lock":RLock()},
@@ -51,6 +52,8 @@ def getPredictor(_type):
                     dict_predictor[_type]["predictor"] = EPCPredict()
                 if _type=="roleRule":
                     dict_predictor[_type]["predictor"] = RoleRulePredictor()
+                if _type == "roleRuleFinal":
+                    dict_predictor[_type]["predictor"] = RoleRuleFinalAdd()
                 if _type=="form":
                     dict_predictor[_type]["predictor"] = FormPredictor()
                 if _type=="time":
@@ -658,6 +661,8 @@ class PREMPredict():
                 elif re.search('尊敬的供应商:.{,25}我公司', text):
                     label = 0
                     values[label] = 0.801
+            if label == 1 and re.search('委托(单位|人|方)[是为:]+', text[:10]) and re.search('受委托(单位|人|方)[是为:]+', text[:10])==None:
+                label = 0
             entity.set_Role(label, values)
 
     def predict_money(self,list_sentences,list_entitys):
@@ -1083,12 +1088,13 @@ class FormPredictor():
 class RoleRulePredictor():
     
     def __init__(self):
-        self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|采购|招标|项目|竞价|议价|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|比选|委托|询价)(?:人|公司|单位|组织|用户|业主|方|部门)|文章来源|需方)(名称)?(是|为|信息|:|:|\s*)$)"
+        # self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|采购|招标|项目|竞价|议价|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|比选|委托|询价)(?:人|公司|单位|组织|用户|业主|方|部门)|文章来源|需方)(名称)?(是|为|信息|:|:|\s*)$)"
+        self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|采购|招标|项目|竞价|议价|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|比选|委托|询价|评选|挂牌|出租|出让|谈判|邀标|邀请|洽谈|约谈|买受|选取|抽取|抽选|出售|标卖|比价)(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需方|买方|业主|权属人|甲方当事人)[))]?(名称|信息)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)"
         self.pattern_tenderee_center = "(?P<tenderee_center>(受.{,20}委托))"
-        self.pattern_tenderee_right = "(?P<tenderee_right>^(\((以下简称)?[\"”]?(招标|采购)(人|单位|机构)\)?))"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
-        
-        self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|集采机构|招标机构)(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{,20}委托))"
-        self.pattern_agency_right = "(?P<agency_right>^(\((以下简称)?[\"”]?(代理)(人|单位|机构)\))|受.{,15}委托)"
+        self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)\)?))|^委托"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
+
+        self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|集采机构|[议))]+标机构)(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{,20}委托))"
+        self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)\))|受.{,15}委托)|^受托"
         # 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
         self.pattern_winTenderer_left = "(?P<winTenderer_left>((中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|各?供应商|方|公司|厂商|商)[::是为]+$|(选定单位|指定的中介服务机构))[::是为,]+$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))[::是为]+$|((评审结果|名次|排名)[::]第?[一1]名?)$|单一来源(采购)?方式向$|((中标|成交)(结果|信息))(是|为|:|:)$|(单一来源采购(供应商|供货商|服务商))$|[^候选]((分包|标包){,5}供应商|供货商|服务商|供应商名称|服务机构|供方)[::]$)"
         # self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
@@ -1371,6 +1377,47 @@ class RoleRulePredictor():
                 if p_entity.entity_text in self.SET_NOT_TENDERER:
                     p_entity.label=5
 
+'''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
+class RoleRuleFinalAdd():
+    """Last-resort role rule driven by the closing signature of an article.
+
+    Only the final 30 characters of the first article are inspected.  When
+    they look like ``<punctuation><name>,<date>`` (name: 5-20 CJK characters
+    or parentheses; date: ``..年..月..日``), the name is treated as a signing
+    organisation and may be promoted to tenderee (label 0) or agency
+    (label 1) if that role has not been assigned to any org/company entity.
+    """
+
+    # Signature at the end of the text: separator, organisation name, comma,
+    # optional whitespace, then a year/month/day date.
+    _SIGNATURE_RE = re.compile(r'[,。]([\u4e00-\u9fa5()()]{5,20}),\s*.{2,4}年.{1,2}月.{1,2}日')
+    # Keywords that make the signing organisation count as an agency
+    # candidate rather than a tenderee candidate.
+    _AGENCY_HINT_RE = re.compile('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)')
+    # Only this many trailing label-5 entities are compared with the name.
+    _MAX_CANDIDATES = 3
+
+    def predict(self, list_articles, list_entitys):
+        """Relabel the signing organisation in place when a role is missing.
+
+        :param list_articles: sequence of articles; only ``[0].content`` is read.
+        :param list_entitys: sequence of entity lists; only ``[0]`` is scanned.
+            Entities are expected to expose ``entity_type``, ``label``,
+            ``entity_text`` and an indexable ``values``.
+        :return: None.  At most one entity gets ``label`` and
+            ``values[label] = 0.5`` updated.
+        """
+        # Nothing to inspect: return quietly instead of raising IndexError.
+        if not list_articles or not list_entitys:
+            return
+
+        text_end = list_articles[0].content[-30:]
+        sear_ent = self._SIGNATURE_RE.search(text_end)
+        if sear_ent is None:
+            return
+        ent_re = sear_ent.group(1)
+
+        tenderee_notfound = True
+        agency_notfound = True
+        ents = []  # org/company entities currently carrying label 5
+        for ent in list_entitys[0]:
+            if ent.entity_type not in ('org', 'company'):
+                continue
+            if ent.label == 0:
+                tenderee_notfound = False
+            elif ent.label == 1:
+                agency_notfound = False
+            elif ent.label == 5:
+                ents.append(ent)
+
+        agency_like = self._AGENCY_HINT_RE.search(ent_re) is not None
+        if agency_notfound and agency_like:
+            new_label = 1
+        elif tenderee_notfound and not agency_like:
+            new_label = 0
+        else:
+            # The role the name hints at is already filled; leave labels alone.
+            return
+
+        # Walk the last few candidates from the end and update the first one
+        # whose text equals the signing name.
+        for ent in reversed(ents[-self._MAX_CANDIDATES:]):
+            if ent.entity_text == ent_re:
+                ent.label = new_label
+                ent.values[new_label] = 0.5
+                break
+
+
 # 时间类别
 class TimePredictor():
     def __init__(self):