Browse Source

优化地区匹配;优化审批提取、日期归一化;新增dict_enterprise字典返回公司工商编号

lsm 11 months ago
parent
commit
aba971b509

+ 18 - 2
BiddingKG/dl/common/Utils.py

@@ -636,9 +636,23 @@ def isValidDate(year, month, day):
     else:
         return True
 
-time_format_pattern = re.compile("((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
+time_format_pattern = re.compile("((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]?\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3})?)")
 from BiddingKG.dl.ratio.re_ratio import getUnifyNum
-def timeFormat(_time):
+import calendar
+
+def get_maxday(year, month):
+    # calendar.monthrange(year, month) returns a tuple: the first element is the weekday of the month's first day
+    # (0-6 for Monday-Sunday), the second is the number of days in that month; only the day count is returned.
+    _, last_day = calendar.monthrange(year, month)
+    return last_day
+
+def timeFormat(_time, default_first_day=True):
+    '''
+    日期格式化:年-月-日
+    :param _time:
+    :param default_first_day: True取当月第一天,否则取最后一天
+    :return:
+    '''
     current_year = time.strftime("%Y",time.localtime())
     all_match = re.finditer(time_format_pattern,_time)
     for _match in all_match:
@@ -682,6 +696,8 @@ def timeFormat(_time):
                         legal = False
             else:
                 legal = False
+            if day == None:
+                day = "01" if (default_first_day or legal == False) else str(get_maxday(int(year), int(month)))
             if day!="":
                 if re.search("^\d+$", day):
                     if int(day)>31:

+ 29 - 4
BiddingKG/dl/entityLink/entityLink.py

@@ -14,6 +14,8 @@ from BiddingKG.dl.interface.Entitys import *
 import json
 from BiddingKG.dl.common.constDict import ConstDict
 
+business_dic = {}
+
 def edit_distance(source,target):
     dp = [["" for i in range(len(source)+1)] for j in range(len(target)+1)]
     for i in range(len(dp)):
@@ -167,7 +169,11 @@ def link_entitys(list_entitys,on_value=1):#on_value=0.81
                 if _entity.entity_text in bus_dic:
                     have_bus = True
                 else:
-                    have_bus, dic = get_business_data(_entity.entity_text)
+                    if _entity.entity_text not in business_dic:
+                        have_bus, dic = get_business_data(_entity.entity_text)
+                        business_dic[_entity.entity_text] = (have_bus, dic)
+                    else:
+                        have_bus, dic = business_dic.get(_entity.entity_text)  # 20240708 字典保存查询过的工商数据,避免重复查询redis
                     if re.search('^\w{,5}[分支](行|公司)$|^\w{1,3}公司$|^\w{2,5}段$', _entity.entity_text):
                         have_bus = False
                     if have_bus:
@@ -288,18 +294,33 @@ def doctitle_refine(doctitle):
 def get_nlp_enterprise(list_entity):  # Collect org/company entity names (body vs. attachment) plus a per-name info dict.
     nlp_enterprise = []
     nlp_enterprise_attachment = []
+    dict_enterprise = {}
     max_num = 100
     list_entity = sorted(list_entity,key=lambda x:(x.sentence_index,x.begin_index))  # process entities ordered by sentence, then begin offset
     for entity in list_entity:
         if entity.entity_type in ['org','company']:
+            if entity.entity_text not in dict_enterprise:
+                if entity.entity_text not in business_dic:
+                    have_bus, dic = get_business_data(entity.entity_text)
+                    business_dic[entity.entity_text] = (have_bus, dic)
+                else:
+                    have_bus, dic = business_dic.get(entity.entity_text)  # 20240708: reuse business-registration data already looked up, to avoid querying redis again
+                credit_code = dic.get('credit_code', '')  # assumes get_business_data always returns a dict as its second value — TODO confirm
+                in_text = 0 if entity.in_attachment else 1  # in_text: 1 = seen in body text, 0 = seen in attachment
+                if entity.label in [0,1,2,3,4] or len(dict_enterprise)<=max_num:  # NOTE(review): '<=' lets the dict reach max_num+1 entries before the cap applies — confirm intended
+                    dict_enterprise[entity.entity_text] = {'credit_code': credit_code, 'in_text': in_text}
+            else:
+                in_text = 0 if entity.in_attachment else 1
+                if in_text != dict_enterprise[entity.entity_text]['in_text']:
+                    dict_enterprise[entity.entity_text]['in_text'] = 2  # 2 = name seen with a different in_text value than the one stored (body and attachment)
+
             if not entity.in_attachment:
                 if entity.entity_text not in nlp_enterprise:
                     nlp_enterprise.append(entity.entity_text)
             else:
                 if entity.entity_text not in nlp_enterprise_attachment:
                     nlp_enterprise_attachment.append(entity.entity_text)
-
-    return nlp_enterprise[:max_num],nlp_enterprise_attachment[:max_num]
+    return nlp_enterprise[:max_num],nlp_enterprise_attachment[:max_num], dict_enterprise  # both lists are cut to max_num; dict_enterprise is returned as built
 
 ENTERPRISE_HUGE = None
 
@@ -528,7 +549,11 @@ def match_enterprise_max_first(sentence):
                         if len(enter_name)<4: # 20240521 短于4个字的不要
                             break
                         if enter_tail in SET_TAIL_ENTERPRISE or re.search('(中心|中学|小学|医院|学院|大学|学校|监狱|大队|支队|林场|海关|分局|商行)$', enter_tail):
-                            have_bus, dic = get_business_data(enter_name) # 20210124 改为有工商数据的实体才添加
+                            if enter_name not in business_dic:
+                                have_bus, dic = get_business_data(enter_name) # 20210124 改为有工商数据的实体才添加
+                                business_dic[enter_name] = (have_bus, dic)
+                            else:
+                                have_bus, dic = business_dic.get(enter_name) # 20240708 字典保存查询过的工商数据,避免重复查询redis
                             if have_bus:
                             # if is_enterprise_exist(enter_name):
                                 match_item = {"entity_text":"%s"%(enter_name),"begin_index":begin_index,"end_index":begin_index+len(enter_name)}

+ 5 - 4
BiddingKG/dl/interface/Preprocessing.py

@@ -2153,9 +2153,10 @@ def segment(soup,final=True):
                 text = re.sub(punc_del," ",text) # 多个空字符替换为一个空格(防止时间类连接),后面还有对空格处理
 
     #将连续的中文句号替换为一个
-    text_split = text.split("。")
-    text_split = [x for x in text_split if len(x)>0]
-    text = "。".join(text_split)
+    # text_split = text.split("。")
+    # text_split = [x for x in text_split if len(x)>0]
+    # text = "。".join(text_split)
+    text = re.sub('。+', '。', text).lstrip('。') # 20240703 修复上面的方法造成文末句号丢失问题。
 
     # #删除标签中的所有空格
     # for subs in subspaceList:
@@ -3235,7 +3236,7 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
     # 使用正则识别金额
     entity_type = "money"
     list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                          "key_word": "((?P<text_key_word>(?:[¥¥]+,?|(中标|成交|合同|承租|投资|服务))?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价)(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[(\(]?(?P<filter_>[%%‰折])*\s*,?((金额)?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
+                          "key_word": "((?P<text_key_word>(?:[¥¥]+,?|(中标|成交|合同|承租|投资|服务))?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资)(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[(\(]?(?P<filter_>[%%‰折])*\s*,?((金额)?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
                           "front_m": "((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z金额价格]{,2}?))(?P<money_front_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
                           "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_behind_m>(E-?\d+))?(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
     # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。  20240415 调整front_m 修复 详见合同元,合同金额:378.8万元 提取

+ 4 - 2
BiddingKG/dl/interface/extract.py

@@ -268,7 +268,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     start_time = time.time()  # 实体链接
     entityLink.link_entitys(list_entitys)
     doctitle_refine = entityLink.doctitle_refine(title)
-    nlp_enterprise,nlp_enterprise_attachment = entityLink.get_nlp_enterprise(list_entitys[0])
+    nlp_enterprise,nlp_enterprise_attachment, dict_enterprise = entityLink.get_nlp_enterprise(list_entitys[0])
     prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time)
     log("get attributes done of doc_id%s"%(doc_id))
     cost_time["attrs"] = round(time.time()-start_time,2)
@@ -359,7 +359,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-06-27'}
+    version_date = {'version_date': '2024-07-08'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:
@@ -380,6 +380,8 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     data_res["doctitle_refine"] = doctitle_refine
     data_res["nlp_enterprise"] = nlp_enterprise
     data_res["nlp_enterprise_attachment"] = nlp_enterprise_attachment
+    data_res["dict_enterprise"] = dict_enterprise
+
     # 要素的个数
     data_res['extract_count'] = extractCount(data_res)
     # 是否有表格

+ 95 - 23
BiddingKG/dl/interface/predictor.py

@@ -846,7 +846,10 @@ class PREMPredict():
                 elif re.search('第一候补|第一后备|备选', front):
                     label = 3
                     values[label] = 0.6
-                elif re.search('放弃中标资格$|是否中标:否|^(中标|成交)(公示|公告)', behind):
+                elif re.search('^放弃中标资格|是否中标:否|^(中标|成交)(公示|公告)', behind):
+                    values[2] = 0.5
+                    label = 5
+                elif re.search('^,?(投标报价|(资格性审查:|符合性审查:)?(不通过|不符合))', behind) and re.search('中标|成交|中选|排名|排序|名次|第[一1]名', front)==None:
                     values[2] = 0.5
                     label = 5
                 elif re.search('(承包权人|帐户名称|债务人|推荐预审合格投标人名单):$|确定为标的的受让方,$|[主次出]入口?,?$|确定(项目|\w{,2})成交供应商,$', front):  # 234501112 民币元,序号:1,债务人: 东营市海宁工贸有限责任公司 ,债权本金: 262414286 八、中标后签约单位,合同签约单位:
@@ -954,7 +957,7 @@ class PREMPredict():
                     values[label] = 0.49
                 elif re.search('^(以[上下])?按[\d.%]+收取|^以[上下]|^[()]?[+×*-][\d.%]+', behind):
                     values[label] = 0.49
-                elif re.search('(含|在|包括|[大小等高低]于)$|[\d.%]+[+×*-]$', front):
+                elif re.search('(含|在|包括|[大小等高低]于|达到)$|[\d.%]+[+×*-]$', front):
                     values[label] = 0.49
                 elif entity.notes == '单价' and float(entity.entity_text)<5000:
                     label = 2
@@ -1515,6 +1518,8 @@ class RoleRulePredictor():
             _label = 5
         elif _label == 2 and re.search('评委|未中标', after[:5]): # 397194341 过滤掉错误召回中标人
             _label = 5
+        elif _label == 2 and re.search('^,?(投标报价|(资格性审查:|符合性审查:)?(不通过|不符合))', after) and re.search('中标|成交|中选|排名|排序|名次|第[一1]名', before[-10:])==None: #20240705 处理类似 493939047 错误
+            _label = 5
         if _label == 5:
             _label, _prob, keyword = self.ser_role(self.pattern_whole, before + center + after, entity_text)  # 前后文匹配
             keyword = 'whole_'+ keyword[:keyword.find(entity_text)] if keyword!="" else keyword
@@ -4453,7 +4458,8 @@ class DocChannel():
           11、预测预告,原始为意向、招标且标题无预告关键词,返回原始类别
           '''
           if result['docchannel']['docchannel'] in ['中标信息', '合同公告'] and origin_dic.get(
-                  original_docchannel, '') in ['招标公告', '采购意向', '招标预告', '公告变更'] and is_contain_winner(prem_json)==False:
+                  original_docchannel, '') in ['招标公告', '采购意向', '招标预告', '公告变更'] and is_contain_winner(
+              prem_json)==False and re.search(self.title_life_dic['中标信息'], title)==None:
               result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
               msc += '最终规则修改:中标公告、合同公告无中标人且原始为非中标,返回原类型'
           elif result['docchannel']['docchannel'] == '废标公告' and is_contain_winner(prem_json) and re.search(
@@ -5888,15 +5894,54 @@ class DistrictPredictor():
                     name, b, e = it
                     area_list.append((name, (e - b + e) / max_len / 2))
             return area_list
+
+        def find_whole_areas(text):
+            '''
+            Extract addresses from a string by regex matching.
+            The pattern is built here from p_pro / p_city / p_dis, which are defined outside this function.
+            :param text: text to search
+            :return: (province_l, city_l, district_l), each a list of (name, start, end) tuples
+            '''
+            pettern = "((?P<prov>%s)(?P<city>%s)?(?P<dist>%s)?)|((?P<city1>%s)(?P<dist1>%s)?)|(?P<dist2>%s)" % (
+            p_pro, p_city, p_dis, p_city, p_dis, p_dis)
+            province_l, city_l, district_l = [], [], []
+            for it in re.finditer(pettern, text):
+                if re.search('[省市区县旗盟]$', it.group(0)) == None and re.search(
+                        '^([东南西北中一二三四五六七八九十大小]?(村|镇|街|路|道|社区)|酒店|宾馆)', text[it.end():]):
+                    continue  # match has no administrative suffix and is directly followed by a village/town/street/road/community or hotel word
+                if it.group(0) == '站前':  # 20240314: fix cases like '中铁二局新建沪苏湖铁路工程站前VI标项目' misrecognized as province 辽宁, city 营口, district 站前
+                    continue
+                for k, v in it.groupdict().items():
+                    if v != None:
+                        if k in ['prov']:
+                            province_l.append((it.group(k), it.start(k), it.end(k)))
+                        elif k in ['city', 'city1']:
+                            if re.search('^(经济开发区|开发区|新区)', text[it.end(k):]):  # do not take a city name that is part of a zone name; fixes '滨州北海经济开发区' / '北海新区' being extracted as 北海
+                                continue
+                            city_l.append((it.group(k), it.start(k), it.end(k)))
+                            if re.search('^([分支](公司|局|行|校|院|干?线)|\w{,3}段|地铁|(火车|高铁)?站|\w{,3}项目)', text[it.end(k):]):
+                                city_l.append((it.group(k), it.start(k), it.end(k)))  # NOTE(review): appends the same city tuple a second time — confirm the duplicate is intentional
+                        elif k in ['dist', 'dist1', 'dist2']:
+                            if it.group(k)=='昌江' and '景德镇' not in it.group(0):  # bare '昌江' is recorded as '昌江黎族' unless the whole match contains '景德镇'
+                                district_l.append(('昌江黎族', it.start(k), it.end(k)))
+                            else:
+                                district_l.append((it.group(k), it.start(k), it.end(k)))
+            return province_l, city_l, district_l
+
         def get_pro_city_dis_score(text, text_weight=1):
             text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾', ' ', text)
             text = re.sub('珠海城市', '珠海', text)  # 修复 426624023 珠海城市 预测为海城市
             text = re.sub('怒江州', '怒江傈僳族自治州', text)  # 修复 423589589  所属地域:怒江州 识别为广西 - 崇左 - 江州
             text = re.sub('茂名滨海新区', '茂名市', text)
             text = re.sub('中山([东南西][部区环]|黄圃|南头|东凤|小榄|石岐|翠亨|南朗)', '中山市', text)
-            province_l = find_areas(p_pro, text)
-            city_l = find_areas(p_city, text)
-            district_l = find_areas(p_dis, text)
+            ser = re.search('海南(昌江|白沙|乐东|陵水|保亭|琼中)(黎族)?', text)
+            if ser and '黎族' not in ser.group(0):
+                text = text.replace(ser.group(0), ser.group(0)+'黎族')
+            # province_l = find_areas(p_pro, text)
+            # city_l = find_areas(p_city, text)
+            # district_l = find_areas(p_dis, text)
+
+            province_l, city_l, district_l = find_whole_areas(text) # 20240703 优化地址提取,解决类似 海南昌江 得到 海南 南昌 结果
 
             if len(province_l) == len(city_l) == 0:
                 district_l = [it for it in district_l if
@@ -6076,8 +6121,11 @@ class DistrictPredictor():
 
         def get_project_addr(text):  # Return the 'addr' group of the first pattern (p1, then p2) found in text, or '' if neither matches.
             p1 = '(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)(位于)?:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'  # labelled address: "<keyword><address/location word>:" followed by the address text
+            p2 = '项目位于(?P<addr>\w{2}市\w{2,4}区)'  # fallback: '项目位于' followed by "<2 chars>市<2-4 chars>区"
             if re.search(p1, text):
                 return re.search(p1, text).group('addr')
+            elif re.search(p2, text):
+                return re.search(p2, text).group('addr')
             else:
                 return ''
 
@@ -7163,15 +7211,15 @@ class ApprovalPredictor():
         项目(法人)单位
         '''
         self.other_part = {
-            "project_name": "(项目|工程|采购|招标|计划)名称?:(?P<main>[^:。]{5,50})[,。](\w{2,10}:|$)?", # 项目名称
-            "project_code": "(立案号|项目(统一)?代码|(项目|工程|采购|招标|计划|任务|备案)(编[号码]|号)):(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-]{5,30}号?)(\w{2,10}:|$)?", # 项目编号
-            "doc_num": "((审[批查核]|批[复准]|立项|[定知]书|[公发批]文|用地|决定|备案|核准|许可|确认)[文编]?号|综合受理号|文书号):(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-.]{5,30}号?)[,。]?(\w{2,10}:|$)?", # 文号
-            "pro_type": "(申[报请](类型|种类)|项目所属行业|行业(分类|归属)|产业领域|项目行业|项目类型|立项类型):(?P<main>[^:。]{2,30})[,。](\w{2,10}:|$)?", # 项目类型
-            "year_limit": "((建设|工程|服务|项目)(年限|期限|时长)):(?P<main>[\d个年月日.-]{2,20})[,。](\w{2,10}:|$)?", # 建设年限
-            "construction_scale": "(建设内容[及和](建设)?规模|建设规模[及和](主要)?(建设)?内容|(建设|工程|项目)规模(如下)?):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?", # 建设规模
-            "approval_items": "((审[批查核]|批[复准]申请)(事项|内容)|事项名称|事项审批):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?", # 审批事项
+            "project_name": "((项目|工程|采购|招标|计划|建设|规划)名称?|生产建设项目|申请项目):(?P<main>[^:。]{5,50})[,。](\w{2,10}:|$)?", # 项目名称
+            "project_code": "(立案号|项目(统一)?代码|(项目|工程|采购|招标|计划|任务|备案|索引)(编[号码]|号)):?(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-]{5,30}号?)(\w{2,10}:|$)?", # 项目编号
+            "doc_num": "((审[批查核]|批[复准]|立项|[定知]书|[公发批]文|用地|决定|备案|核准|许可|确认|受理|申请报告|文件|意见书|办件)[文编]?号|综合受理号|文书?号|合格书号):?(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-.]{5,30}号?)[,。]?(\w{2,10}:|$)?", # 文号
+            "pro_type": "((申[报请]|审核备|项目|立项)(类型|种类)|项目所属行业|行业(分类|归属)|产业领域|项目行业):(?P<main>[^:。]{2,30})[,。](\w{2,10}:|$)?", # 项目类型
+            "year_limit": "((建设|工程|服务|项目)(起止|\w{,2})?(年限|期限|时长|工期)):(约|超过|大概|建设工期|共计|合计)?(?P<main>[\d一二三四五六七八九十]+个月|\d{1,3}(日?历?天|小时)|20\d{2}[年/-](\d{1,2}[月/-]?)?(\d{1,2}日?)?([至—-]+20\d{2}[年/-](\d{1,2}[月/-]?)?(\d{1,2}日?)?)?)[(,。](\w{2,10}:|$)?", # 建设年限
+            "construction_scale": "(建设内容[及和](建设)?规模|建设规模[及和](主要)?(建设)?内容|(建设|招标|采购))?内容|(建设|工程|项目)(主要)?(规模|内容|概况|面积)([及和](主要)?(规模|内容|概况|面积))?(如下)?):(?P<main>[^:。]{2,250})[,。](\w{2,10}:|$)?", # 建设规模
+            "approval_items": "((审[批查核]|批[复准]|申请|监管)(事项|内容|名称)|事项名称|事项审批):(?P<main>[^:。]{2,70})[,。](\w{2,10}:|$)?", # 审批事项
             "properties": "((建设|工程|项目)性质):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?", # 建设性质
-            "approval_result": "((审[批查核]|批[复准])(结果|决定|结论|状态|回复)|(办理|,)(状态|意见|结果)):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?", # 审批结果
+            "approval_result": "((审[批查核]|批[复准]|核[发准]|许可|抽查|备案)(结果|决定|结论|状态|回复|意见)|(办[理件]|,)(状态|意见|结果)|项目(当前|目前)?状态):(?P<main>[^:。]{2,20})[,。](\w{2,10}:|$)?", # 审批结果
             "phone": "(联系)?电话:(?P<main>1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|" # 联系电话
                      '\+86.?1[3-9]\d{9}|'
                      '0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?[-—-―]\d{1,4}|'
@@ -7185,22 +7233,26 @@ class ApprovalPredictor():
         }
 
         self.role_type = {
-            "declare_company": "(申[请报]|填报|呈报)(部门|机关|单位|企业|公司|机构|组织)",  # 申报单位
-            "construct_company": "(业主|建设|用地|委托|发包|产权|项目))?(部门|机关|单位|企业|公司|方)|主送机关|法人单位|甲方",  # 建设单位
-            "approver": "(审[批查核]|许可|批准|发证|批复|管理)(部门|机关|单位|企业|公司|机构)",  # 审批部门
-            "evaluation_agency": "(环境|环保)?(影响)?(环评|评价|评估)(机构|单位|公司)"  # 环评机构
+            "declare_company": "(申[请报]|填报|呈报)(人|部门|机关|单位|企业|公司|机构|组织)",  # 申报单位
+            "construct_company": "(业主|建设|用地|委托|发包|产权|项目))?(部门|机关|单位|企业|公司|方|业主)|主送机关|法人单位|甲方",  # 建设单位
+            "approver": "(审[批查核议图]|许可|批[复准](用地)?|发证|管理|办理|受理|核[发准]|备案|承办)(部门|机关|单位|企业|公司|机构)|实施主体",  # 审批部门
+            "evaluation_agency": "(环境|环保)?(影响)?(环评|评价|评估)(机构|单位|公司)" , # 环评机构
+            "compilation_unit": "编制单位", # 编制单位 20240701加
+            "publisher": "(发布|发文|公示|公告)(人|部门|机关|单位|企业|公司|机构|组织)" # 发布机构 20240703加
         }
         self.person_type = {
             "legal_person": "项目法人|法定代表人|企业法人"  # 项目法人
         }
         self.date_type = {
             "time_declare": "(申[请报]|填报|呈报)(时间|日期)", # 申报时间
-            "time_commencement": "(开工|动工|施工开始)(时间|日期)", # 开工时间
-            "time_completion": "(竣工|完工|验收|(项目|建设|工程)(完成|结束))(备案)?(时间|日期)" # 竣工时间
+            "time_commencement": "(开工|动工|(项目|建设|工程|施工)开始)(时间|日期)", # 开工时间
+            "time_completion": "(竣工|完工|验收|(项目|建设|工程|施工)(完成|结束))(备案)?(时间|日期)", # 竣工时间
+            "time_approval": "(审[批查核查议]|许可|批[复准](用地)?|发证|管理|办理|受理|核[发准]|备案|决定)(时间|日期)", # 审批时间 20240701加
+            "time_release": "(发布|发文|公告|生成|成文)(时间|日期)" # 发布时间
         }
 
         self.addr_type = {
-            "project_addr": "(建设|工程|项目|施工)(地址|地点|位置|所在地)|[宗土]地坐落|用地位置" # 建设地址
+            "project_addr": "(建设|工程|项目|施工|地块|用地)\w{,2}(地址|地点|位置|所在地)|[宗土]地坐落" # 建设地址
         }
 
         self.money_type = {
@@ -7216,6 +7268,7 @@ class ApprovalPredictor():
         rs_l = []
         found_key = 0
         code_name_set = set() # 项目编号、名称集合
+        org_set = set() # 保存可能为审批部门的角色
         for entity in list_entitys[0]:
             entities[entity.sentence_index].append(entity)
 
@@ -7227,12 +7280,16 @@ class ApprovalPredictor():
             for entity in entities[i]:
                 b, e = entity.wordOffset_begin, entity.wordOffset_end
                 if entity.entity_type in ['org', 'company']:
+                    flag = 1
                     for k, v in self.role_type.items():
                         if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
                             if rs_dic[k] == '':
                                 rs_dic[k] = entity.entity_text
                             multi_project[k] = entity.entity_text
                             found_key = 1
+                            flag = 0
+                    if flag and entity.entity_type == "org" and re.search('(局|委员会|委|厅)$', entity.entity_text):
+                        org_set.add(entity.entity_text)
                 elif entity.entity_type in ['person']:
                     for k, v in self.person_type.items():
                         if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
@@ -7244,9 +7301,12 @@ class ApprovalPredictor():
                 elif entity.entity_type in ['time']:
                     for k, v in self.date_type.items():
                         if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
+                            time = timeFormat(entity.entity_text, default_first_day=False) if k in ['time_completion'] else timeFormat(entity.entity_text)
+                            if time == "":
+                                continue
                             if rs_dic[k] == '':
-                                rs_dic[k] = entity.entity_text
-                            multi_project[k] = entity.entity_text
+                                rs_dic[k] = time
+                            multi_project[k] = time
                             found_key = 1
                 elif entity.entity_type in ['location']:
                     for k, v in self.addr_type.items():
@@ -7288,6 +7348,16 @@ class ApprovalPredictor():
                     multi_project[k] = iter.group('main')
                     found_key = 1
                     break
+            for k, v in self.date_type.items():
+                for iter in re.finditer(v+':?(?P<main>20\d{2}-\d{1,2}(-\d{1,2})?|20\d{2}/\d{1,2}(/\d{1,2})?|20\d{2}\.\d{1,2}(\.\d{1,2})?|20\d{2}(0[1-9]|1[0-2])(0[1-9]|[1-2][0-9]|3[0-1])?)', text): # 规则补充实体识别不到的日期时间
+                    time = timeFormat(iter.group('main'), default_first_day=False) if k in ['time_completion'] else timeFormat(iter.group('main'))
+                    if time == "":
+                        continue
+                    if rs_dic[k] == '':
+                        rs_dic[k] = time
+                    multi_project[k] = time
+                    found_key = 1
+                    break
             if (multi_project['project_code'] != "" or multi_project['project_name'] != "") and multi_project['project_code']+multi_project['project_name'] not in code_name_set:
                 code_name_set.add(multi_project['project_code']+multi_project['project_name'])
                 district = getPredictor('district').get_area(
@@ -7309,6 +7379,8 @@ class ApprovalPredictor():
                 rs_dic['province'] = district['district']['province']
                 rs_dic['city'] = district['district']['city']
                 rs_dic['district'] = district['district']['district']
+            if len(org_set) == 1 and rs_dic['approver'] == "":  # no approver was set above and exactly one candidate org was collected
+                rs_dic['approver'] = org_set.pop()  # assign the candidate; the original '==' only compared and discarded the result
             rs_dic = {k: v for k, v in rs_dic.items() if v != ''}
             return [rs_dic]
         return []