Переглянути джерело

优化评审专家;优化表格要素提取

lsm 2 роки тому
батько
коміт
7719718af9

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -257,7 +257,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2023-03-09'}
+    version_date = {'version_date': '2023-03-10'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
     data_res["doctitle_refine"] = doctitle_refine
     data_res["nlp_enterprise"] = nlp_enterprise

BIN
BiddingKG/dl/interface/header_set.pkl


+ 15 - 6
BiddingKG/dl/interface/predictor.py

@@ -867,6 +867,7 @@ class EPCPredict():
 
         data_x = []
         points_entitys = []
+        pre_texts = []
         for list_entity,list_sentence in zip(list_entitys,list_sentences):
             
             p_entitys = 0
@@ -883,6 +884,7 @@ class EPCPredict():
                     item_x = self.model_person.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index)
                     data_x.append(item_x)
                     points_entitys.append(entity)
+                    pre_texts.append(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=20))
 
                 p_entitys += 1
 
@@ -890,13 +892,14 @@ class EPCPredict():
             return None
         
         # return [data_x,points_entitys,dianhua]
-        return [data_x,points_entitys]
+        return [data_x,points_entitys, pre_texts]
 
     def predict_person(self,list_sentences, list_entitys):
         datas = self.search_person_data(list_sentences, list_entitys)
         if datas is None:
             return
         points_entitys = datas[1]
+        pre_texts = datas[2]
         # phone = datas[2]
         if USE_PAI_EAS:
             _data = datas[0]
@@ -922,6 +925,11 @@ class EPCPredict():
         for i in range(len(predict_y)):
             entity = points_entitys[i]
             label = np.argmax(predict_y[i])
+            pre_text = ''.join(pre_texts[i][0])
+            # print('pre_text', pre_text)
+            if label==0 and re.search('(谈判|磋商|询价|资格审查|评审专家|(评选|议标|评标|评审)委员会?|专家|评委)(小?组|小?组成员)?(成员|名单)[:,](\w{2,4}((组长)|(成员))?[、,,])*$', pre_text):
+                # print(entity.entity_text, re.search('(谈判|磋商|询价|资格审查|评审专家|(评选|议标|评标|评审)委员会?|专家|评委)(小?组|小?组成员)?(成员|名单)[:,](\w{2,4}((组长)|(成员))?[、,,])*$', pre_text).group(0))
+                label = 4
             values = []
             for item in predict_y[i]:
                 values.append(item)
@@ -4821,9 +4829,9 @@ class TablePremExtractor(object):
     def __init__(self):
         '''各要素表头规则'''
         self.head_rule_dic = {
-            'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|分[包标])(编号|编码)",
+            'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)",
             'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
-            "project_name": "(包[段组件]|标[段包的]|分[包标]|采购|项目|工程|货物|商品|主要标的)(名称?|内容)",
+            "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|主要标的)(名称?|内容)",
             "win_sort": "是否(中标|成交)|排名|排序|名次|未(中标|成交)原因",
             "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
             "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
@@ -4878,7 +4886,7 @@ class TablePremExtractor(object):
                 return flag, contain_header, header_dic
             elif ('tenderer' in header_dic) and ('bid_amount' in header_dic): # 包含中标人及中标金额的进行提取
                 return flag,contain_header, header_dic
-        elif len(set(td_list) & self.headerset) >= 2 or (len(set(td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
+        elif len(set(fix_td_list) & self.headerset) >= 2 or (len(set(fix_td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
             contain_header = True
         return flag, contain_header, dict()
 
@@ -5039,8 +5047,9 @@ class TablePremExtractor(object):
                         "role_text": tenderer,
                         "serviceTime": ""
                 })
-            if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的 丢弃
+            if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的 丢弃 并不再继续往下匹配
                 prem_dic.pop(package)
+                break
             if multi_same_package:
                 for k, v in package_fix2raw.items():
                     if k in prem_dic:
@@ -5164,7 +5173,7 @@ class CandidateExtractor(object):
                     return flag, contain_header, dict()
             if 'candidate' in header_dic or ('win_tenderer' in header_dic and 'second_tenderer' in header_dic):
                 return flag, contain_header, header_dic
-        elif len(set(td_list) & self.headerset) >= 2  or (len(set(td_list)) == 2 and len(set(td_list) & self.headerset) >= 1):  # 如果包含两个表头以上或 只有两列且包含一个表头
+        elif len(set(fix_td_list) & self.headerset) >= 2  or (len(set(fix_td_list)) == 2 and len(set(fix_td_list) & self.headerset) >= 1):  # 如果包含两个表头以上或 只有两列且包含一个表头
             contain_header = True
         return flag, contain_header, dict()