11 months ago · aba971b509
--- a/BiddingKG/dl/common/Utils.py
+++ b/BiddingKG/dl/common/Utils.py
@@ -636,9 +636,23 @@ def isValidDate(year, month, day):
 
				     else:
			
 
				         return True
			
 
				 
			
 
				-time_format_pattern = re.compile("((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
			
 
				+time_format_pattern = re.compile("((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]?\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3})?)")
			
 
				 from BiddingKG.dl.ratio.re_ratio import getUnifyNum
			
 
				-def timeFormat(_time):
			
 
				+import calendar
			
 
				+
			
 
				+def get_maxday(year, month):
			
 
				+    # calendar.monthrange(year, month) returns a tuple whose first element is the weekday of the first day of that month (0-6 for Monday-Sunday),
			
 
				+    # and whose second element is the number of days in that month.
			
 
				+    _, last_day = calendar.monthrange(year, month)
			
 
				+    return last_day  # number of days in the month, i.e. its last day-of-month
			
 
				+
			
 
				+def timeFormat(_time, default_first_day=True):
			
 
				+    '''
			
 
				+    日期格式化：年-月-日
			
 
				+    :param _time:
			
 
				+    :param default_first_day: True取当月第一天，否则取最后一天
			
 
				+    :return:
			
 
				+    '''
			
 
				     current_year = time.strftime("%Y",time.localtime())
			
 
				     all_match = re.finditer(time_format_pattern,_time)
			
 
				     for _match in all_match:
			
@@ -682,6 +696,8 @@ def timeFormat(_time):
 
				                         legal = False
			
 
				             else:
			
 
				                 legal = False
			
 
				+            if day == None:
			
 
				+                day = "01" if (default_first_day or legal == False) else str(get_maxday(int(year), int(month)))
			
 
				             if day!="":
			
 
				                 if re.search("^\d+$", day):
			
 
				                     if int(day)>31:
			
--- a/BiddingKG/dl/entityLink/entityLink.py
+++ b/BiddingKG/dl/entityLink/entityLink.py
@@ -14,6 +14,8 @@ from BiddingKG.dl.interface.Entitys import *
 
				 import json
			
 
				 from BiddingKG.dl.common.constDict import ConstDict
			
 
				 
			
 
				+business_dic = {}
			
 
				+
			
 
				 def edit_distance(source,target):
			
 
				     dp = [["" for i in range(len(source)+1)] for j in range(len(target)+1)]
			
 
				     for i in range(len(dp)):
			
@@ -167,7 +169,11 @@ def link_entitys(list_entitys,on_value=1):#on_value=0.81
 
				                 if _entity.entity_text in bus_dic:
			
 
				                     have_bus = True
			
 
				                 else:
			
 
				-                    have_bus, dic = get_business_data(_entity.entity_text)
			
 
				+                    if _entity.entity_text not in business_dic:
			
 
				+                        have_bus, dic = get_business_data(_entity.entity_text)
			
 
				+                        business_dic[_entity.entity_text] = (have_bus, dic)
			
 
				+                    else:
			
 
				+                        have_bus, dic = business_dic.get(_entity.entity_text)  # 20240708 字典保存查询过的工商数据，避免重复查询redis
			
 
				                     if re.search('^\w{,5}[分支](行|公司)$|^\w{1,3}公司$|^\w{2,5}段$', _entity.entity_text):
			
 
				                         have_bus = False
			
 
				                     if have_bus:
			
@@ -288,18 +294,33 @@ def doctitle_refine(doctitle):
 
				 def get_nlp_enterprise(list_entity):
			
 
				     nlp_enterprise = []
			
 
				     nlp_enterprise_attachment = []
			
 
				+    dict_enterprise = {}
			
 
				     max_num = 100
			
 
				     list_entity = sorted(list_entity,key=lambda x:(x.sentence_index,x.begin_index))
			
 
				     for entity in list_entity:
			
 
				         if entity.entity_type in ['org','company']:
			
 
				+            if entity.entity_text not in dict_enterprise:
			
 
				+                if entity.entity_text not in business_dic:
			
 
				+                    have_bus, dic = get_business_data(entity.entity_text)
			
 
				+                    business_dic[entity.entity_text] = (have_bus, dic)
			
 
				+                else:
			
 
				+                    have_bus, dic = business_dic.get(entity.entity_text)  # 20240708 字典保存查询过的工商数据，避免重复查询redis
			
 
				+                credit_code = dic.get('credit_code', '')
			
 
				+                in_text = 0 if entity.in_attachment else 1
			
 
				+                if entity.label in [0,1,2,3,4] or len(dict_enterprise)<=max_num:
			
 
				+                    dict_enterprise[entity.entity_text] = {'credit_code': credit_code, 'in_text': in_text}
			
 
				+            else:
			
 
				+                in_text = 0 if entity.in_attachment else 1
			
 
				+                if in_text != dict_enterprise[entity.entity_text]['in_text']:
			
 
				+                    dict_enterprise[entity.entity_text]['in_text'] = 2
			
 
				+
			
 
				             if not entity.in_attachment:
			
 
				                 if entity.entity_text not in nlp_enterprise:
			
 
				                     nlp_enterprise.append(entity.entity_text)
			
 
				             else:
			
 
				                 if entity.entity_text not in nlp_enterprise_attachment:
			
 
				                     nlp_enterprise_attachment.append(entity.entity_text)
			
 
				-
			
 
				-    return nlp_enterprise[:max_num],nlp_enterprise_attachment[:max_num]
			
 
				+    return nlp_enterprise[:max_num],nlp_enterprise_attachment[:max_num], dict_enterprise
			
 
				 
			
 
				 ENTERPRISE_HUGE = None
			
 
				 
			
@@ -528,7 +549,11 @@ def match_enterprise_max_first(sentence):
 
				                         if len(enter_name)<4: # 20240521 短于4个字的不要
			
 
				                             break
			
 
				                         if enter_tail in SET_TAIL_ENTERPRISE or re.search('(中心|中学|小学|医院|学院|大学|学校|监狱|大队|支队|林场|海关|分局|商行)$', enter_tail):
			
 
				-                            have_bus, dic = get_business_data(enter_name) # 20210124 改为有工商数据的实体才添加
			
 
				+                            if enter_name not in business_dic:
			
 
				+                                have_bus, dic = get_business_data(enter_name) # 20210124 改为有工商数据的实体才添加
			
 
				+                                business_dic[enter_name] = (have_bus, dic)
			
 
				+                            else:
			
 
				+                                have_bus, dic = business_dic.get(enter_name) # 20240708 字典保存查询过的工商数据，避免重复查询redis
			
 
				                             if have_bus:
			
 
				                             # if is_enterprise_exist(enter_name):
			
 
				                                 match_item = {"entity_text":"%s"%(enter_name),"begin_index":begin_index,"end_index":begin_index+len(enter_name)}
			
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -2153,9 +2153,10 @@ def segment(soup,final=True):
 
				                 text = re.sub(punc_del," ",text) # 多个空字符替换为一个空格（防止时间类连接），后面还有对空格处理
			
 
				 
			
 
				     #将连续的中文句号替换为一个
			
 
				-    text_split = text.split("。")
			
 
				-    text_split = [x for x in text_split if len(x)>0]
			
 
				-    text = "。".join(text_split)
			
 
				+    # text_split = text.split("。")
			
 
				+    # text_split = [x for x in text_split if len(x)>0]
			
 
				+    # text = "。".join(text_split)
			
 
				+    text = re.sub('。+', '。', text).lstrip('。') # 20240703 修复上面的方法造成文末句号丢失问题。
			
 
				 
			
 
				     # #删除标签中的所有空格
			
 
				     # for subs in subspaceList:
			
@@ -3235,7 +3236,7 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
 
				     # 使用正则识别金额
			
 
				     entity_type = "money"
			
 
				     list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
			
 
				-                          "key_word": "((?P<text_key_word>(?:[￥¥]+，?|(中标|成交|合同|承租|投资|服务)）?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价)(?:[,，\[（\(]*\s*(人民币|单位：)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\]）\)]?)\s*[，,:：]*(RMB|USD|EUR|JPY|CNY)?[:：]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[：:])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[（\(]?(?P<filter_>[%％‰折])*\s*，?((金额)?单位[:：])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[）\)]?))",
			
 
				+                          "key_word": "((?P<text_key_word>(?:[￥¥]+，?|(中标|成交|合同|承租|投资|服务)）?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资)(?:[,，\[（\(]*\s*(人民币|单位：)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\]）\)]?)\s*[，,:：]*(RMB|USD|EUR|JPY|CNY)?[:：]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[：:])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[（\(]?(?P<filter_>[%％‰折])*\s*，?((金额)?单位[:：])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[）\)]?))",
			
 
				                           "front_m": "((?P<text_front_m>(?:[（\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[）\)]?)\s*[,，:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z金额价格]{,2}?))(?P<money_front_m>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:，?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
			
 
				                           "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:，?)[百千]*)(?P<science_behind_m>(E-?\d+))?(人民币)?[\(（]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\)）]?)"}
			
 
				     # 2021/7/19 调整金额，单位提取正则，修复部分金额因为单位提取失败被过滤问题。  20240415 调整front_m 修复 详见合同元，合同金额：378.8万元 提取
			
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -268,7 +268,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     start_time = time.time()  # 实体链接
			
 
				     entityLink.link_entitys(list_entitys)
			
 
				     doctitle_refine = entityLink.doctitle_refine(title)
			
 
				-    nlp_enterprise,nlp_enterprise_attachment = entityLink.get_nlp_enterprise(list_entitys[0])
			
 
				+    nlp_enterprise,nlp_enterprise_attachment, dict_enterprise = entityLink.get_nlp_enterprise(list_entitys[0])
			
 
				     prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time)
			
 
				     log("get attributes done of doc_id%s"%(doc_id))
			
 
				     cost_time["attrs"] = round(time.time()-start_time,2)
			
@@ -359,7 +359,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				 
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
			
 
				-    version_date = {'version_date': '2024-06-27'}
			
 
				+    version_date = {'version_date': '2024-07-08'}
			
 
				     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
			
 
				 
			
 
				     if original_docchannel == 302:
			
@@ -380,6 +380,8 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     data_res["doctitle_refine"] = doctitle_refine
			
 
				     data_res["nlp_enterprise"] = nlp_enterprise
			
 
				     data_res["nlp_enterprise_attachment"] = nlp_enterprise_attachment
			
 
				+    data_res["dict_enterprise"] = dict_enterprise
			
 
				+
			
 
				     # 要素的个数
			
 
				     data_res['extract_count'] = extractCount(data_res)
			
 
				     # 是否有表格
			
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -846,7 +846,10 @@ class PREMPredict():
 
				                 elif re.search('第一候补|第一后备|备选', front):
			
 
				                     label = 3
			
 
				                     values[label] = 0.6
			
 
				-                elif re.search('放弃中标资格$|是否中标：否|^(中标|成交)(公示|公告)', behind):
			
 
				+                elif re.search('^放弃中标资格|是否中标：否|^(中标|成交)(公示|公告)', behind):
			
 
				+                    values[2] = 0.5
			
 
				+                    label = 5
			
 
				+                elif re.search('^，?(投标报价|(资格性审查：|符合性审查：)?(不通过|不符合))', behind) and re.search('中标|成交|中选|排名|排序|名次|第[一1]名', front)==None:
			
 
				                     values[2] = 0.5
			
 
				                     label = 5
			
 
				                 elif re.search('(承包权人|帐户名称|债务人|推荐预审合格投标人名单)：$|确定为标的的受让方，$|[主次出]入口?，?$|确定(项目|\w{,2})成交供应商，$', front):  # 234501112 民币元，序号：1，债务人： 东营市海宁工贸有限责任公司 ，债权本金： 262414286 八、中标后签约单位，合同签约单位：
			
@@ -954,7 +957,7 @@ class PREMPredict():
 
				                     values[label] = 0.49
			
 
				                 elif re.search('^(以[上下])?按[\d.%]+收取|^以[上下]|^[（）]?[+×*-][\d.%]+', behind):
			
 
				                     values[label] = 0.49
			
 
				-                elif re.search('(含|在|包括|[大小等高低]于)$|[\d.%]+[+×*-]$', front):
			
 
				+                elif re.search('(含|在|包括|[大小等高低]于|达到)$|[\d.%]+[+×*-]$', front):
			
 
				                     values[label] = 0.49
			
 
				                 elif entity.notes == '单价' and float(entity.entity_text)<5000:
			
 
				                     label = 2
			
@@ -1515,6 +1518,8 @@ class RoleRulePredictor():
 
				             _label = 5
			
 
				         elif _label == 2 and re.search('评委|未中标', after[:5]): # 397194341 过滤掉错误召回中标人
			
 
				             _label = 5
			
 
				+        elif _label == 2 and re.search('^，?(投标报价|(资格性审查：|符合性审查：)?(不通过|不符合))', after) and re.search('中标|成交|中选|排名|排序|名次|第[一1]名', before[-10:])==None: #20240705 处理类似 493939047 错误
			
 
				+            _label = 5
			
 
				         if _label == 5:
			
 
				             _label, _prob, keyword = self.ser_role(self.pattern_whole, before + center + after, entity_text)  # 前后文匹配
			
 
				             keyword = 'whole_'+ keyword[:keyword.find(entity_text)] if keyword!="" else keyword
			
@@ -4453,7 +4458,8 @@ class DocChannel():
 
				           11、预测预告，原始为意向、招标且标题无预告关键词，返回原始类别
			
 
				           '''
			
 
				           if result['docchannel']['docchannel'] in ['中标信息', '合同公告'] and origin_dic.get(
			
 
				-                  original_docchannel, '') in ['招标公告', '采购意向', '招标预告', '公告变更'] and is_contain_winner(prem_json)==False:
			
 
				+                  original_docchannel, '') in ['招标公告', '采购意向', '招标预告', '公告变更'] and is_contain_winner(
			
 
				+              prem_json)==False and re.search(self.title_life_dic['中标信息'], title)==None:
			
 
				               result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
			
 
				               msc += '最终规则修改：中标公告、合同公告无中标人且原始为非中标，返回原类型'
			
 
				           elif result['docchannel']['docchannel'] == '废标公告' and is_contain_winner(prem_json) and re.search(
			
@@ -5888,15 +5894,54 @@ class DistrictPredictor():
 
				                     name, b, e = it
			
 
				                     area_list.append((name, (e - b + e) / max_len / 2))
			
 
				             return area_list
			
 
				+
			
 
				+        def find_whole_areas(text):
			
 
				+            '''
			
 
				+            Find province / city / district mentions in a string with one combined regex and return them.
			
 
				+            The regex (local name pettern) is built from the enclosing p_pro, p_city, p_dis alternations, e.g. 广东省|广西省|...
			
 
				+            :param text: the text to search
			
 
				+            :return: (province_l, city_l, district_l), each a list of (name, start, end) tuples
			
 
				+            '''
			
 
				+            pettern = "((?P<prov>%s)(?P<city>%s)?(?P<dist>%s)?)|((?P<city1>%s)(?P<dist1>%s)?)|(?P<dist2>%s)" % (
			
 
				+            p_pro, p_city, p_dis, p_city, p_dis, p_dis)
			
 
				+            province_l, city_l, district_l = [], [], []
			
 
				+            for it in re.finditer(pettern, text):
			
 
				+                if re.search('[省市区县旗盟]$', it.group(0)) == None and re.search(  # match has no administrative suffix ...
			
 
				+                        '^([东南西北中一二三四五六七八九十大小]?(村|镇|街|路|道|社区)|酒店|宾馆)', text[it.end():]):  # ... and is directly followed by a village/town/street/road/community/hotel word
			
 
				+                    continue
			
 
				+                if it.group(0) == '站前':  # 20240314: skip a bare '站前' match; fixes e.g. 中铁二局新建沪苏湖铁路工程站前VI标项目 being wrongly recognized as province 辽宁, city 营口, district 站前
			
 
				+                    continue
			
 
				+                for k, v in it.groupdict().items():
			
 
				+                    if v != None:
			
 
				+                        if k in ['prov']:
			
 
				+                            province_l.append((it.group(k), it.start(k), it.end(k)))
			
 
				+                        elif k in ['city', 'city1']:
			
 
				+                            if re.search('^(经济开发区|开发区|新区)', text[it.end(k):]):  # do not take a city name that is really part of a development-zone / new-area name; fixes 滨州北海经济开发区, 北海新区 etc. being extracted as 北海
			
 
				+                                continue
			
 
				+                            city_l.append((it.group(k), it.start(k), it.end(k)))
			
 
				+                            if re.search('^([分支](公司|局|行|校|院|干?线)|\w{,3}段|地铁|(火车|高铁)?站|\w{,3}项目)', text[it.end(k):]):  # city directly followed by a branch/section/metro/station/project word
			
 
				+                                city_l.append((it.group(k), it.start(k), it.end(k)))  # appended a second time; presumably to give this city extra weight downstream - TODO confirm
			
 
				+                        elif k in ['dist', 'dist1', 'dist2']:
			
 
				+                            if it.group(k)=='昌江' and '景德镇' not in it.group(0):  # a 昌江 match whose whole match does not contain 景德镇 is recorded as 昌江黎族
			
 
				+                                district_l.append(('昌江黎族', it.start(k), it.end(k)))
			
 
				+                            else:
			
 
				+                                district_l.append((it.group(k), it.start(k), it.end(k)))
			
 
				+            return province_l, city_l, district_l
			
 
				+
			
 
				         def get_pro_city_dis_score(text, text_weight=1):
			
 
				             text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾', ' ', text)
			
 
				             text = re.sub('珠海城市', '珠海', text)  # 修复 426624023 珠海城市 预测为海城市
			
 
				             text = re.sub('怒江州', '怒江傈僳族自治州', text)  # 修复 423589589  所属地域：怒江州 识别为广西 - 崇左 - 江州
			
 
				             text = re.sub('茂名滨海新区', '茂名市', text)
			
 
				             text = re.sub('中山([东南西][部区环]|黄圃|南头|东凤|小榄|石岐|翠亨|南朗)', '中山市', text)
			
 
				-            province_l = find_areas(p_pro, text)
			
 
				-            city_l = find_areas(p_city, text)
			
 
				-            district_l = find_areas(p_dis, text)
			
 
				+            ser = re.search('海南(昌江|白沙|乐东|陵水|保亭|琼中)(黎族)?', text)
			
 
				+            if ser and '黎族' not in ser.group(0):
			
 
				+                text = text.replace(ser.group(0), ser.group(0)+'黎族')
			
 
				+            # province_l = find_areas(p_pro, text)
			
 
				+            # city_l = find_areas(p_city, text)
			
 
				+            # district_l = find_areas(p_dis, text)
			
 
				+
			
 
				+            province_l, city_l, district_l = find_whole_areas(text) # 20240703 优化地址提取，解决类似 海南昌江 得到 海南 南昌 结果
			
 
				 
			
 
				             if len(province_l) == len(city_l) == 0:
			
 
				                 district_l = [it for it in district_l if
			
@@ -6076,8 +6121,11 @@ class DistrictPredictor():
 
				 
			
 
				         def get_project_addr(text):
			
 
				             p1 = '(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)(位于)?：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
			
 
				+            p2 = '项目位于(?P<addr>\w{2}市\w{2,4}区)'  # free-text form: "项目位于" + 2-char city ending in 市 + 2-4-char district ending in 区
			
 
				             if re.search(p1, text):
			
 
				                 return re.search(p1, text).group('addr')
			
 
				+            elif re.search(p2, text):  # only tried when the labelled-field pattern p1 finds nothing
			
 
				+                return re.search(p2, text).group('addr')
			
 
				             else:
			
 
				                 return ''
			
 
				 
			
@@ -7163,15 +7211,15 @@ class ApprovalPredictor():
 
				         项目（法人）单位
			
 
				         '''
			
 
				         self.other_part = {
			
 
				-            "project_name": "(项目|工程|采购|招标|计划)名称?：(?P<main>[^：。]{5,50})[，。](\w{2,10}：|$)?", # 项目名称
			
 
				-            "project_code": "(立案号|项目(统一)?代码|(项目|工程|采购|招标|计划|任务|备案)(编[号码]|号))：(?P<main>(\w{2,8})?[（）〔〕【】\[\]a-zA-Z0-9-]{5,30}号?)(\w{2,10}：|$)?", # 项目编号
			
 
				-            "doc_num": "((审[批查核]|批[复准]|立项|[定知]书|[公发批]文|用地|决定|备案|核准|许可|确认)[文编]?号|综合受理号|文书号)：(?P<main>(\w{2,8})?[（）〔〕【】\[\]a-zA-Z0-9-.]{5,30}号?)[，。]?(\w{2,10}：|$)?", # 文号
			
 
				-            "pro_type": "(申[报请](类型|种类)|项目所属行业|行业(分类|归属)|产业领域|项目行业|项目类型|立项类型)：(?P<main>[^：。]{2,30})[，。](\w{2,10}：|$)?", # 项目类型
			
 
				-            "year_limit": "((建设|工程|服务|项目)(年限|期限|时长))：(?P<main>[\d个年月日.-]{2,20})[，。](\w{2,10}：|$)?", # 建设年限
			
 
				-            "construction_scale": "(建设内容[及和](建设)?规模|建设规模[及和](主要)?(建设)?内容|(建设|工程|项目)规模(如下)?)：(?P<main>[^：。]{2,50})[，。](\w{2,10}：|$)?", # 建设规模
			
 
				-            "approval_items": "((审[批查核]|批[复准]申请)(事项|内容)|事项名称|事项审批)：(?P<main>[^：。]{2,50})[，。](\w{2,10}：|$)?", # 审批事项
			
 
				+            "project_name": "((项目|工程|采购|招标|计划|建设|规划)名称?|生产建设项目|申请项目)：(?P<main>[^：。]{5,50})[，。](\w{2,10}：|$)?", # 项目名称
			
 
				+            "project_code": "(立案号|项目(统一)?代码|(项目|工程|采购|招标|计划|任务|备案|索引)(编[号码]|号))：?(?P<main>(\w{2,8})?[（）〔〕【】\[\]a-zA-Z0-9-]{5,30}号?)(\w{2,10}：|$)?", # 项目编号
			
 
				+            "doc_num": "((审[批查核]|批[复准]|立项|[定知]书|[公发批]文|用地|决定|备案|核准|许可|确认|受理|申请报告|文件|意见书|办件)[文编]?号|综合受理号|文书?号|合格书号)：?(?P<main>(\w{2,8})?[（）〔〕【】\[\]a-zA-Z0-9-.]{5,30}号?)[，。]?(\w{2,10}：|$)?", # 文号
			
 
				+            "pro_type": "((申[报请]|审核备|项目|立项)(类型|种类)|项目所属行业|行业(分类|归属)|产业领域|项目行业)：(?P<main>[^：。]{2,30})[，。](\w{2,10}：|$)?", # 项目类型
			
 
				+            "year_limit": "((建设|工程|服务|项目)(起止|\w{,2})?(年限|期限|时长|工期))：(约|超过|大概|建设工期|共计|合计)?(?P<main>[\d一二三四五六七八九十]+个月|\d{1,3}(日?历?天|小时)|20\d{2}[年/-](\d{1,2}[月/-]?)?(\d{1,2}日?)?([至—-]+20\d{2}[年/-](\d{1,2}[月/-]?)?(\d{1,2}日?)?)?)[（，。](\w{2,10}：|$)?", # 建设年限
			
 
				+            "construction_scale": "(建设内容[及和](建设)?规模|建设规模[及和](主要)?(建设)?内容|(建设|招标|采购)）?内容|(建设|工程|项目)(主要)?(规模|内容|概况|面积)([及和](主要)?(规模|内容|概况|面积))?(如下)?)：(?P<main>[^：。]{2,250})[，。](\w{2,10}：|$)?", # 建设规模
			
 
				+            "approval_items": "((审[批查核]|批[复准]|申请|监管)(事项|内容|名称)|事项名称|事项审批)：(?P<main>[^：。]{2,70})[，。](\w{2,10}：|$)?", # 审批事项
			
 
				             "properties": "((建设|工程|项目)性质)：(?P<main>[^：。]{2,50})[，。](\w{2,10}：|$)?", # 建设性质
			
 
				-            "approval_result": "((审[批查核]|批[复准])(结果|决定|结论|状态|回复)|(办理|，)(状态|意见|结果))：(?P<main>[^：。]{2,50})[，。](\w{2,10}：|$)?", # 审批结果
			
 
				+            "approval_result": "((审[批查核]|批[复准]|核[发准]|许可|抽查|备案)(结果|决定|结论|状态|回复|意见)|(办[理件]|，)(状态|意见|结果)|项目(当前|目前)?状态)：(?P<main>[^：。]{2,20})[，。](\w{2,10}：|$)?", # 审批结果
			
 
				             "phone": "(联系)?电话：(?P<main>1[3-9][0-9][-—－―]?\d{4}[-—－―]?\d{4}|" # 联系电话
			
 
				                      '\+86.?1[3-9]\d{9}|'
			
 
				                      '0[1-9]\d{1,2}[-—－―][2-9]\d{6}\d?[-—－―]\d{1,4}|'
			
@@ -7185,22 +7233,26 @@ class ApprovalPredictor():
 
				         }
			
 
				 
			
 
				         self.role_type = {
			
 
				-            "declare_company": "(申[请报]|填报|呈报)(部门|机关|单位|企业|公司|机构|组织)",  # 申报单位
			
 
				-            "construct_company": "(业主|建设|用地|委托|发包|产权|项目)）?(部门|机关|单位|企业|公司|方)|主送机关|法人单位|甲方",  # 建设单位
			
 
				-            "approver": "(审[批查核]|许可|批准|发证|批复|管理)(部门|机关|单位|企业|公司|机构)",  # 审批部门
			
 
				-            "evaluation_agency": "(环境|环保)?(影响)?(环评|评价|评估)(机构|单位|公司)"  # 环评机构
			
 
				+            "declare_company": "(申[请报]|填报|呈报)(人|部门|机关|单位|企业|公司|机构|组织)",  # 申报单位
			
 
				+            "construct_company": "(业主|建设|用地|委托|发包|产权|项目)）?(部门|机关|单位|企业|公司|方|业主)|主送机关|法人单位|甲方",  # 建设单位
			
 
				+            "approver": "(审[批查核议图]|许可|批[复准](用地)?|发证|管理|办理|受理|核[发准]|备案|承办)(部门|机关|单位|企业|公司|机构)|实施主体",  # 审批部门
			
 
				+            "evaluation_agency": "(环境|环保)?(影响)?(环评|评价|评估)(机构|单位|公司)" , # 环评机构
			
 
				+            "compilation_unit": "编制单位", # 编制单位 20240701加
			
 
				+            "publisher": "(发布|发文|公示|公告)(人|部门|机关|单位|企业|公司|机构|组织)" # 发布机构 20240703加
			
 
				         }
			
 
				         self.person_type = {
			
 
				             "legal_person": "项目法人|法定代表人|企业法人"  # 项目法人
			
 
				         }
			
 
				         self.date_type = {
			
 
				             "time_declare": "(申[请报]|填报|呈报)(时间|日期)", # 申报时间
			
 
				-            "time_commencement": "(开工|动工|施工开始)(时间|日期)", # 开工时间
			
 
				-            "time_completion": "(竣工|完工|验收|(项目|建设|工程)(完成|结束))(备案)?(时间|日期)" # 竣工时间
			
 
				+            "time_commencement": "(开工|动工|(项目|建设|工程|施工)开始)(时间|日期)", # 开工时间
			
 
				+            "time_completion": "(竣工|完工|验收|(项目|建设|工程|施工)(完成|结束))(备案)?(时间|日期)", # 竣工时间
			
 
				+            "time_approval": "(审[批查核查议]|许可|批[复准](用地)?|发证|管理|办理|受理|核[发准]|备案|决定)(时间|日期)", # 审批时间 20240701加
			
 
				+            "time_release": "(发布|发文|公告|生成|成文)(时间|日期)" # 发布时间
			
 
				         }
			
 
				 
			
 
				         self.addr_type = {
			
 
				-            "project_addr": "(建设|工程|项目|施工)(地址|地点|位置|所在地)|[宗土]地坐落|用地位置" # 建设地址
			
 
				+            "project_addr": "(建设|工程|项目|施工|地块|用地)\w{,2}(地址|地点|位置|所在地)|[宗土]地坐落" # 建设地址
			
 
				         }
			
 
				 
			
 
				         self.money_type = {
			
@@ -7216,6 +7268,7 @@ class ApprovalPredictor():
 
				         rs_l = []
			
 
				         found_key = 0
			
 
				         code_name_set = set() # 项目编号、名称集合
			
 
				+        org_set = set() # 保存可能为审批部门的角色
			
 
				         for entity in list_entitys[0]:
			
 
				             entities[entity.sentence_index].append(entity)
			
 
				 
			
@@ -7227,12 +7280,16 @@ class ApprovalPredictor():
 
				             for entity in entities[i]:
			
 
				                 b, e = entity.wordOffset_begin, entity.wordOffset_end
			
 
				                 if entity.entity_type in ['org', 'company']:
			
 
				+                    flag = 1
			
 
				                     for k, v in self.role_type.items():
			
 
				                         if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
			
 
				                             if rs_dic[k] == '':
			
 
				                                 rs_dic[k] = entity.entity_text
			
 
				                             multi_project[k] = entity.entity_text
			
 
				                             found_key = 1
			
 
				+                            flag = 0
			
 
				+                    if flag and entity.entity_type == "org" and re.search('(局|委员会|委|厅)$', entity.entity_text):
			
 
				+                        org_set.add(entity.entity_text)
			
 
				                 elif entity.entity_type in ['person']:
			
 
				                     for k, v in self.person_type.items():
			
 
				                         if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
			
@@ -7244,9 +7301,12 @@ class ApprovalPredictor():
 
				                 elif entity.entity_type in ['time']:
			
 
				                     for k, v in self.date_type.items():
			
 
				                         if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
			
 
				+                            time = timeFormat(entity.entity_text, default_first_day=False) if k in ['time_completion'] else timeFormat(entity.entity_text)
			
 
				+                            if time == "":
			
 
				+                                continue
			
 
				                             if rs_dic[k] == '':
			
 
				-                                rs_dic[k] = entity.entity_text
			
 
				-                            multi_project[k] = entity.entity_text
			
 
				+                                rs_dic[k] = time
			
 
				+                            multi_project[k] = time
			
 
				                             found_key = 1
			
 
				                 elif entity.entity_type in ['location']:
			
 
				                     for k, v in self.addr_type.items():
			
@@ -7288,6 +7348,16 @@ class ApprovalPredictor():
 
				                     multi_project[k] = iter.group('main')
			
 
				                     found_key = 1
			
 
				                     break
			
 
				+            for k, v in self.date_type.items():
			
 
				+                for iter in re.finditer(v+'：?(?P<main>20\d{2}-\d{1,2}(-\d{1,2})?|20\d{2}/\d{1,2}(/\d{1,2})?|20\d{2}\.\d{1,2}(\.\d{1,2})?|20\d{2}(0[1-9]|1[0-2])(0[1-9]|[1-2][0-9]|3[0-1])?)', text): # 规则补充实体识别不到的日期时间
			
 
				+                    time = timeFormat(iter.group('main'), default_first_day=False) if k in ['time_completion'] else timeFormat(iter.group('main'))
			
 
				+                    if time == "":
			
 
				+                        continue
			
 
				+                    if rs_dic[k] == '':
			
 
				+                        rs_dic[k] = time
			
 
				+                    multi_project[k] = time
			
 
				+                    found_key = 1
			
 
				+                    break
			
 
				             if (multi_project['project_code'] != "" or multi_project['project_name'] != "") and multi_project['project_code']+multi_project['project_name'] not in code_name_set:
			
 
				                 code_name_set.add(multi_project['project_code']+multi_project['project_name'])
			
 
				                 district = getPredictor('district').get_area(
			
@@ -7309,6 +7379,8 @@ class ApprovalPredictor():
 
				                 rs_dic['province'] = district['district']['province']
			
 
				                 rs_dic['city'] = district['district']['city']
			
 
				                 rs_dic['district'] = district['district']['district']
			
 
				+            if len(org_set) == 1 and rs_dic['approver'] == "":  # no labelled approver found and exactly one candidate org was collected
			
 
				+                rs_dic['approver'] = org_set.pop()  # assign the candidate; the previous `==` only compared and discarded the result
			
 
				             rs_dic = {k: v for k, v in rs_dic.items() if v != ''}
			
 
				             return [rs_dic]
			
 
				         return []