瀏覽代碼

新增审批项目提取

lsm 1 年之前
父節點
當前提交
3c834576aa
共有 2 個文件被更改,包括 158 次插入和 21 次删除
  1. 20 11
      BiddingKG/dl/interface/extract.py
  2. 138 10
      BiddingKG/dl/interface/predictor.py

+ 20 - 11
BiddingKG/dl/interface/extract.py

@@ -206,9 +206,6 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     log("get prem done of doc_id%s"%(doc_id))
     cost_time["prem"] = round(time.time()-start_time,2)
 
-    # roles_l = get_role_context(doc_id, list_sentences, list_entitys)
-    # return roles_l
-
     # start_time = time.time() # 产品名称及废标原因提取  此处作废 换到后面预测 2022/4/29
     # fail = channel_dic['docchannel']['docchannel'] == "废标公告"
     # fail_reason = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail) #只返回失败原因,产品已加入到Entity类
@@ -276,12 +273,13 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     log("get attributes done of doc_id%s"%(doc_id))
     cost_time["attrs"] = round(time.time()-start_time,2)
 
-    '''表格要素提取'''
-    table_prem, in_attachment = predictor.getPredictor("tableprem").predict(text, nlp_enterprise, web_source_name)
-    # print('表格提取中标人:', table_prem)
-    # print('原提取角色:', prem[0]['prem'])
-    if table_prem:
-        getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=table_prem, in_attachment=in_attachment)
+    if original_docchannel != 302:  # 审批项目不做下面提取
+        '''表格要素提取'''
+        table_prem, in_attachment = predictor.getPredictor("tableprem").predict(text, nlp_enterprise, web_source_name)
+        # print('表格提取中标人:', table_prem)
+        # print('原提取角色:', prem[0]['prem'])
+        if table_prem:
+            getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=table_prem, in_attachment=in_attachment)
 
     '''候选人提取'''
     candidate_top3_prem, candidate_dic, in_attachment = predictor.getPredictor("candidate").predict(text, list_sentences, list_entitys, nlp_enterprise)
@@ -304,7 +302,14 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     start_time = time.time()
     # content = list_articles[0].content
     # channel_dic = predictor.getPredictor("channel").predict_rule(title, content, channel_dic, prem_dic=prem[0]['prem'])
-    channel_dic, msc = predictor.getPredictor("channel").predict_merge(title,list_sentences[0], text, list_articles[0].bidway, prem[0], original_docchannel)
+    if original_docchannel == 302:
+        channel_dic = {"docchannel":
+             { "docchannel": "审批项目", "doctype": "审批项目", "life_docchannel": "审批项目" }
+        }
+        prem[0]['prem'] = {}  # 审批项目不要这项
+
+    else:
+        channel_dic, msc = predictor.getPredictor("channel").predict_merge(title,list_sentences[0], text, list_articles[0].bidway, prem[0], original_docchannel)
     # print('msc', msc)
     cost_time["rule_channel"] = round(time.time()-start_time,2)
 
@@ -359,9 +364,13 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-05-28'}
+    version_date = {'version_date': '2024-06-05'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
+    if original_docchannel == 302:
+        approval = predictor.getPredictor("approval").predict(list_sentences, list_entitys)
+        data_res['approval'] = approval
+
     '''最终检查修正招标、中标金额'''
     getAttributes.limit_maximum_amount(data_res, list_entitys[0])
 

+ 138 - 10
BiddingKG/dl/interface/predictor.py

@@ -28,6 +28,7 @@ import calendar
 import datetime
 from BiddingKG.dl.entityLink.entityLink import get_business_data
 from BiddingKG.dl.proposed_building.pb_extract import PBPredictor
+from BiddingKG.dl.interface.getAttributes import turnMoneySource
 # import fool   # 统一用 selffool ,阿里云上只有selffool 包
 
 cpu_num = int(os.environ.get("CPU_NUM",0))
@@ -70,7 +71,8 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
                   'candidate': {"predictor": None, "Lock": RLock()},
                   'websource_tenderee': {"predictor": None, "Lock": RLock()},
                   'project_label': {"predictor": None, "Lock": RLock()},
-                  'pb_extract': {"predictor": None, "Lock": RLock()}
+                  'pb_extract': {"predictor": None, "Lock": RLock()},
+                  'approval': {"predictor": None, "Lock": RLock()}  # 审批项目预测
                   }
 
 
@@ -124,6 +126,8 @@ def getPredictor(_type):
                     dict_predictor[_type]['predictor'] = ProjectLabel()
                 if _type == 'pb_extract':
                     dict_predictor[_type]['predictor'] = PBPredictor()
+                if _type == 'approval':
+                    dict_predictor[_type]['predictor'] = ApprovalPredictor()
             return dict_predictor[_type]["predictor"]
     raise NameError("no this type of predictor")
 
@@ -1596,15 +1600,12 @@ class RoleRulePredictor():
                         _list_name = self._check_input(list_name, ignore=True)
                         find_flag = False
                         for _name in _list_name:  # 2022/1/5修正只要项目名称出现过的角色,所有位置都标注为招标人
-                            if str(_name).find(re.sub(")", ")", re.sub("(", "(",
-                                                                       p_entity.entity_text))) >= 0 and p_entity.sentence_index < 4:
+                            if str(_name).find(p_entity.entity_text) >= 0 and p_entity.sentence_index < 4:
                                 for _sentence in list_sentence:
                                     if _sentence.sentence_index == p_entity.sentence_index:
                                         _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
                                                            end_index=p_entity.end_index, size=20, center_include=True,
-                                                           word_flag=True, use_text=True, text=re.sub(")", ")",
-                                                                                                      re.sub("(", "(",
-                                                                                                             p_entity.entity_text)))
+                                                           word_flag=True, use_text=True, text=p_entity.entity_text)
                                         if _span[2].startswith(":"): # 实体后面为冒号的不作为招标人,避免项目名称出错中标变招标  368122675 陇西兴恒建建筑有限责任公司:线路安全保护区内环境治理专项整改(第二标段)项目
                                             break
                                         if str(_span[0][-len(str(_name)):]+_span[1] + _span[2][:len(str(_name))]).find(
@@ -6102,7 +6103,7 @@ class TablePremExtractor(object):
             'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$|^品目$",
             "project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
             "win_sort": "是否(中标|成交|中选)|排名|排序|名次|未(中标|成交)原因|推荐顺序",
-            "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
+            "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请|拟推荐(入选|入围)?)?供应商(名称)?$",
             "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
             "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
             "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格",
@@ -6141,7 +6142,6 @@ class TablePremExtractor(object):
                             continue
                         elif k in header_dic:
                             if k in ['budget', 'bid_amount'] and re.search('总(价|金?额)', text):  # 总价替换单价
-                                print('总价替换单价')
                                 header_dic[k] = (i, text)
                                 num += 1
                             elif k == 'project_code' and text != header_dic[k][1] and 'package_code' not in header_dic\
@@ -6151,7 +6151,7 @@ class TablePremExtractor(object):
                         header_dic[k] = (i, text)
                         num += 1
                 if num>1:
-                    print('表头错误,一个td匹配到两个表头:', header_dic)
+                    # print('表头错误,一个td匹配到两个表头:', header_dic)
                     return flag, contain_header, dict()
             if re.search(';金额((万?元))?;', ';'.join(td_list)):  # 召回某些表格只写 金额 作为表头,不能识别为招标或中标金额
                 if 'tenderer' in header_dic and 'bid_amount' not in header_dic:
@@ -6272,7 +6272,7 @@ class TablePremExtractor(object):
                 continue
             if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
                 continue
-            if win_sort == "" and "tenderer" in headers and re.search('候选|入围', headers['tenderer'][1]) and re.search('推荐的?(中标|成交)候选人', headers['tenderer'][1])==None:
+            if win_sort == "" and "tenderer" in headers and re.search('候选|入围|入选', headers['tenderer'][1]) and re.search('推荐的?((中标|成交|中选)候选人|(候选|入围|入选)供应商)', headers['tenderer'][1])==None:
                 tenderer = ""
 
             if tenderer in ['采购失败', '废标']: # 避免类似 353867205 这篇只提取到一个
@@ -6958,6 +6958,134 @@ class WebsourceTenderee():
                                               ]}
         return prem
 
+class ApprovalPredictor():
+    """Rule-based field extractor for approval-project (审批项目) documents.
+
+    Values are collected from two sources:
+
+    * ``other_part``: regexes applied directly to each sentence's text; the
+      value is whatever the named group ``main`` captures.
+    * already-recognised entities: an entity is assigned to a field when the
+      ``span`` characters immediately in front of it match one of the keyword
+      regexes registered for the entity's type.
+    """
+
+    # Keys copied from the district lookup result into each returned record.
+    _DISTRICT_KEYS = ('area', 'province', 'city', 'district')
+
+    def __init__(self):
+        # Raw strings keep sequences such as ``\w`` / ``\d`` / ``\[`` from being
+        # treated as (invalid) string escapes; the pattern text is unchanged.
+        self.other_part = {
+            "project_name": r"(项目|工程|采购|招标)名称:(?P<main>[^:。]{5,50})[,。](\w{2,10}:|$)?",
+            "project_code": r"(立案号|项目(统一)?代码|(项目|工程|采购|招标)编[号码]):(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-]{5,30}号?)(\w{2,10}:|$)?",
+            "doc_num": r"((审[批查]|批[复准]|立项|[定知]书|[公发批]文|用地|决定|备案)文号|综合受理号):(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-]{5,30}号?)[,。]?(\w{2,10}:|$)?",
+            "pro_type": r"(申报类型|项目所属行业):(?P<main>[^:。]{2,30})[,。](\w{2,10}:|$)?",
+            "year_limit": r"((建设|工程|服务)年限):(?P<main>[\d个年月日.-]{2,20})[,。](\w{2,10}:|$)?",
+            "construction_scale": r"(建设内容[及和](建设)?规模|建设规模[及和](主要)?(建设)?内容|建设规模(如下)?):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?",
+            "approval_items": r"((审[批查]|批[复准])事项|事项名称):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?",
+            "properties": r"((建设|工程)性质):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?",
+            "approval_result": r"((审[批查]|批[复准])(结果|决定)):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?",
+            # Alternatives are tried left to right: mobile numbers first, then
+            # the landline forms, with a bare 7-8 digit number as last resort.
+            "phone": r"联系电话:(?P<main>1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|"
+                     r'\+86.?1[3-9]\d{9}|'
+                     r'0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?[-—-―]\d{1,4}|'
+                     r'0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=1[3-9]\d{9})|'
+                     r'0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?)|'
+                     r'0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=[2-9]\d{6,7})|'
+                     r'0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?|'
+                     r'[\(|\(]0[1-9]\d{1,2}[\)|\)]-?[2-9]\d{6}\d?-?\d{,4}|'
+                     r'400\d{7}转\d{1,4}|'
+                     r'[2-9]\d{6,7})[,。](\w{2,10}:|$)?'
+        }
+
+        self.role_type = {
+            "declare_company": "申报(部门|机关|单位|企业|公司)",  # declaring organisation
+            # NOTE(review): this pattern previously read "(业主|建设|用地))?(...)",
+            # whose unbalanced ")" makes re raise re.error on first use.
+            # Presumably an optional *literal* closing bracket was intended
+            # (e.g. "建设(业主)单位"); both bracket widths are accepted here.
+            # Confirm against the repository version.
+            "construct_company": "(业主|建设|用地)[))]?(部门|机关|单位|企业|公司)|主送机关|法人单位",  # construction / owner organisation
+            "approver": "(审批|许可|批准|发证|批复|管理)(部门|机关|单位|企业|公司)",  # approving authority
+            "evaluation_agency": "环境影响评价机构|环评机构|评价机构|环评单位"  # environmental impact assessment agency
+        }
+        self.person_type = {
+            "legal_person": "项目法人|法定代表人"  # project legal person
+        }
+        self.date_type = {
+            "time_declare": "申报时间",
+            "time_commencement": "开工时间",
+            "time_completion": "竣工时间"
+        }
+
+        self.addr_type = {
+            "project_addr": "(建设|工程|项目)(地址|地点|位置)"
+        }
+
+        self.money_type = {
+            "total_tendereeMoney": "项目金额|项目投资|总投资|投资总额|总预算|总概算|投资规模|批复概算|投资额",
+        }
+
+    def _blank_record(self):
+        """Return a dict holding every extractable field, each set to ""."""
+        fields = (self.other_part.keys() | self.role_type.keys() | self.person_type.keys()
+                  | self.date_type.keys() | self.addr_type.keys() | self.money_type.keys())
+        record = {field: "" for field in fields}
+        record['moneysource'] = ""
+        return record
+
+    def _finalize(self, record, district_text):
+        """Attach the district lookup for ``district_text`` and drop empty fields."""
+        district = getPredictor('district').get_area(district_text, '')['district']
+        for key in self._DISTRICT_KEYS:
+            record[key] = district[key]
+        return {k: v for k, v in record.items() if v != ''}
+
+    def predict(self, list_sentences, list_entitys, span=12):
+        """Extract approval-project fields from one document.
+
+        :param list_sentences: per-document sentence lists; only ``[0]`` is read.
+            Each sentence must expose ``sentence_text`` and ``sentence_index``.
+        :param list_entitys: per-document entity lists; only ``[0]`` is read.
+            Each entity must expose ``sentence_index``, ``entity_type``,
+            ``entity_text`` and ``wordOffset_begin``.
+        :param span: number of characters before an entity that are searched
+            for the field keyword.
+        :return: a list of dicts with empty fields removed. When more than one
+            distinct project (by code + name) is found, one dict per project;
+            otherwise a single dict merged over the whole document, or ``[]``
+            when nothing matched.
+        """
+        merged = self._blank_record()  # last value seen per field, document-wide
+        sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
+
+        # Group by the entity's own sentence_index rather than by list position,
+        # so gaps in the numbering can neither raise IndexError nor pair an
+        # entity with the wrong sentence text.
+        entities_by_sentence = {}
+        for entity in list_entitys[0]:
+            entities_by_sentence.setdefault(entity.sentence_index, []).append(entity)
+
+        rules_by_type = {
+            'org': self.role_type,
+            'company': self.role_type,
+            'person': self.person_type,
+            'time': self.date_type,
+            'location': self.addr_type,
+            'money': self.money_type,
+        }
+
+        projects = []
+        seen_projects = set()  # project code + name combinations already emitted
+        found_key = False
+
+        for sentence in sentences:
+            text = sentence.sentence_text
+            current = self._blank_record()  # fields found in this sentence only
+
+            for field, pattern in self.other_part.items():
+                matched = re.search(pattern, text)
+                if matched:
+                    merged[field] = current[field] = matched.group('main')
+                    found_key = True
+
+            for entity in entities_by_sentence.get(sentence.sentence_index, ()):
+                if entity.entity_type == 'moneysource':
+                    # Needs no keyword context, and on its own does not count
+                    # as a hit for ``found_key``.
+                    merged['moneysource'] = current['moneysource'] = turnMoneySource(entity.entity_text)
+                    continue
+                rules = rules_by_type.get(entity.entity_type)
+                if rules is None:
+                    continue
+                begin = entity.wordOffset_begin
+                context = text[max(0, begin - span):begin]
+                for field, pattern in rules.items():
+                    if re.search(pattern, context):
+                        merged[field] = current[field] = entity.entity_text
+                        found_key = True
+
+            project_key = current['project_code'] + current['project_name']
+            if project_key and project_key not in seen_projects:
+                seen_projects.add(project_key)
+                projects.append(
+                    self._finalize(current, current['project_name'] + current['project_addr']))
+
+        if len(projects) > 1:
+            return projects
+        if found_key:
+            # Zero or one project: report the document-wide merge instead, so
+            # fields found in sentences without a project code/name are kept.
+            return [self._finalize(
+                merged,
+                merged['construct_company'] + merged['project_name'] + merged['project_addr'])]
+        return []
 
 def getSavedModel():
     #predictor = FormPredictor()