Kaynağa Gözat

Merge remote-tracking branch 'origin/master'

lsm 2 yıl önce
ebeveyn
işleme
ac47ae521c

+ 46 - 9
BiddingKG/dl/common/Utils.py

@@ -606,7 +606,18 @@ def fitDataByRule(data):
     result = re.sub("[。]","",result)
     return  result
 
-time_format_pattern = re.compile("((?P<year>\d{4}|\d{2})\s*[-\/年\.]\s*(?P<month>\d{1,2})\s*[-\/月\.]\s*(?P<day>\d{1,2}))")
+from datetime import date
+# Date validity check
+def isValidDate(year, month, day):
+    """Return True when (year, month, day) is a real calendar date.
+
+    The check is delegated to ``datetime.date``, so impossible dates such
+    as 30 February or month 13 are rejected.
+
+    :param year: year number, as accepted by ``datetime.date``
+    :param month: month number
+    :param day: day-of-month number
+    :return: True if ``date(year, month, day)`` can be built, else False
+    """
+    try:
+        date(year, month, day)
+    # Only the errors date() raises for bad input are treated as "invalid";
+    # a bare except would also hide KeyboardInterrupt and SystemExit.
+    except (TypeError, ValueError, OverflowError):
+        return False
+    return True
+
+# Date matcher with named groups year / month / day.
+#   year : "20" plus two digits, any two digits, or Chinese numerals starting with 二
+#   month, day : one or two digits, or one to three Chinese numerals (一..十)
+# The parts are separated by one of - / . 年 (after the year) or - / . 月 (after the
+# month), with optional whitespace on either side of the separator.
+time_format_pattern = re.compile("((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
+from BiddingKG.dl.ratio.re_ratio import getUnifyNum
 def timeFormat(_time):
     current_year = time.strftime("%Y",time.localtime())
     all_match = re.finditer(time_format_pattern,_time)
@@ -624,22 +635,48 @@ def timeFormat(_time):
                 if k=="day":
                     day = v
             if year!="":
-                if len(year)==2:
-                    year = "20"+year
-                if int(year)>int(current_year):
-                    legal = False
+                if re.search("^\d+$",year):
+                    if len(year)==2:
+                        year = "20"+year
+                    if int(year)>int(current_year):
+                        legal = False
+                else:
+                    _year = ""
+                    for word in year:
+                        if word == '0':
+                            _year += word
+                        else:
+                            _year += str(getDigitsDic(word))
+                    year = _year
             else:
                 legal = False
             if month!="":
-                if int(month)>12:
-                    legal = False
+                if re.search("^\d+$", month):
+                    if int(month)>12:
+                        legal = False
+                else:
+                    month = int(getUnifyNum(month))
+                    if month>=1 and month<=12:
+                        month = str(month)
+                    else:
+                        legal = False
             else:
                 legal = False
             if day!="":
-                if int(day)>31:
-                    legal = False
+                if re.search("^\d+$", day):
+                    if int(day)>31:
+                        legal = False
+                else:
+                    day = int(getUnifyNum(day))
+                    if day >= 1 and day <= 31:
+                        day = str(day)
+                    else:
+                        legal = False
             else:
                 legal = False
+            # print(year,month,day)
+            if not isValidDate(int(year),int(month),int(day)):
+                legal = False
             if legal:
                 return "%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0"))
     return ""

+ 50 - 6
BiddingKG/dl/interface/Preprocessing.py

@@ -1392,7 +1392,6 @@ def segment(soup,final=True):
                     text = re.sub(punc_del,punc_del.strip()[0],text)   #2021/12/09 修正由于某些标签后插入符号把原来符号替换
             else:
                 text = re.sub(punc_del,"",text)
-        
 
     #将连续的中文句号替换为一个
     text_split = text.split("。")
@@ -1419,7 +1418,7 @@ def segment(soup,final=True):
 
     if len(text)<10000000:
         while(LOOP_BEGIN<len(text)):
-            _text += re.sub(")",")",re.sub("(","(",re.sub("\s+","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
+            _text += re.sub(")",")",re.sub("(","(",re.sub("\s{2,}","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
             LOOP_BEGIN += LOOP_LEN
         text = _text
     # 附件标识前修改为句号,避免正文和附件内容混合在一起
@@ -2191,7 +2190,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = segment(article_processed)
 
         article_processed = article_processed.replace('(', '(').replace(')', ')')  #2022/8/10 统一为中文括号
-        article_processed = article_processed.replace(':', ':')  #2023/1/5 统一为中文冒号
+        # article_processed = article_processed.replace(':', ':')  #2023/1/5 统一为中文冒号
+        article_processed = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])", ":", article_processed)
         article_processed = article_processed.replace('.','.').replace('-', '-') # 2021/12/01 修正OCR识别PDF小数点错误问题
         article_processed = article_processed.replace('报价限价', '招标限价') #2021/12/17 由于报价限价预测为中投标金额所以修改
         article_processed = article_processed.replace('成交工程价款', '成交工程价')  # 2021/12/21 修正为中标价
@@ -2347,7 +2347,7 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
                             else:
                                 outline_list.append(Outline(doc_id,outline_index,'',sentence_begin_index,sentence_end_index,wordOffset_begin,wordOffset_end))
                             outline_index += 1
-                        sentence_text = re.sub("##split##", "", sentence_text,count=1)
+                        sentence_text = re.sub("##split##,?", "", sentence_text,count=1)
                         last_sentence_index = (sentence_index,_match.start())
                     temp_sentences.append(sentence_text)
                 if attachment_begin_index>-1 and last_sentence_index[0]<attachment_begin_index:
@@ -2701,6 +2701,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                     company_index[sentence_index].add((ner_entity[0],ner_entity[1]))
             #识别package
 
+            ner_time_list = []
             #识别实体
             for ner_entity in ner_entitys:
                 begin_index_temp = ner_entity[0]
@@ -2708,6 +2709,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 entity_type = ner_entity[2]
                 entity_text = ner_entity[3]
 
+                if entity_type=='time':
+                    ner_time_list.append((begin_index_temp,end_index_temp))
                 if entity_type in ["org","company"] and not isLegalEnterprise(entity_text):
                     continue
                 # 实体长度限制
@@ -2733,7 +2736,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
 
                 #去掉标点符号
-                entity_text = re.sub("[,,。:!&@$\*]","",entity_text)
+                if entity_type!='time':
+                    entity_text = re.sub("[,,。:!&@$\*]","",entity_text)
                 entity_text = entity_text.replace("(","(").replace(")",")") if isinstance(entity_text,str) else entity_text
                 # 组织机构实体名称补充
                 if entity_type in ["org", "company"]:
@@ -2858,6 +2862,46 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                     Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
                            begin_index_temp, end_index_temp,in_attachment=in_attachment))
 
+            # 时间实体格式补充
+            re_time_new = re.compile("20\d{2}-\d{1,2}-\d{1,2}|20\d{2}/\d{1,2}/\d{1,2}|20\d{2}\.\d{1,2}\.\d{1,2}|20\d{2}(?:0[1-9]|1[0-2])(?:0[1-9]|[1-2][0-9]|3[0-1])")
+            entity_type = "time"
+            for _time in re.finditer(re_time_new,sentence_text):
+                entity_text = _time.group()
+                begin_index_temp = _time.start()
+                end_index_temp = _time.end()
+                is_same = False
+                for t_index in ner_time_list:
+                    if begin_index_temp>=t_index[0] and end_index_temp<=t_index[1]:
+                        is_same = True
+                        break
+                if is_same:
+                    continue
+                if _time.start()!=0 and re.search("\d",sentence_text[_time.start()-1:_time.start()]):
+                    continue
+                # 纯数字格式,例:20190509
+                if re.search("^\d{8}$",entity_text):
+                    if _time.end()!=len(sentence_text) and re.search("[\da-zA-z]",sentence_text[_time.end():_time.end()+1]):
+                        continue
+                    entity_text = entity_text[:4] + "-" + entity_text[4:6] + "-" + entity_text[6:8]
+                if not timeFormat(entity_text):
+                    continue
+
+                for j in range(len(list_tokenbegin)):
+                    if list_tokenbegin[j] == begin_index_temp:
+                        begin_index = j
+                        break
+                    elif list_tokenbegin[j] > begin_index_temp:
+                        begin_index = j - 1
+                        break
+                for j in range(begin_index, len(list_tokenbegin)):
+                    if list_tokenbegin[j] >= end_index_temp:
+                        end_index = j - 1
+                        break
+                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
+                list_sentence_entitys.append(
+                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
+                           begin_index_temp, end_index_temp, in_attachment=in_attachment))
+
             # 资金来源提取  2020/12/30 新增
             list_moneySource = extract_moneySource(sentence_text)
             entity_type = "moneysource"
@@ -2882,7 +2926,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                 list_sentence_entitys.append(
                     Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
-                           begin_index_temp, end_index_temp,in_attachment=in_attachment))
+                           begin_index_temp, end_index_temp,in_attachment=in_attachment,prob=moneySource['prob']))
 
             # 电子邮箱提取 2021/11/04 新增
             list_email = extract_email(sentence_text)

+ 20 - 0
BiddingKG/dl/interface/extract.py

@@ -13,6 +13,7 @@ import os
 import codecs
 import requests
 import time
+from unicodedata import normalize
 
 _time1 = time.time()
 sys.path.append(os.path.abspath("../.."))
@@ -106,11 +107,30 @@ def extractCount(extract_dict):
         extract_count += 1
     return extract_count
 
+# Character encoding normalization
+def str_normalize(text):
+    """Apply Unicode NFKD normalization to ``text``, skipping listed punctuation.
+
+    Runs of the characters in ``cn_punctuation`` are copied through unchanged;
+    everything between those runs is passed through
+    ``unicodedata.normalize('NFKD', ...)``.
+
+    :param text: input string
+    :return: the normalized string
+    """
+    cn_punctuation = "¥,。:;{}!?()"
+    # Capture the WHOLE punctuation run. A repeated group such as "([...])+"
+    # only remembers its last character, so re.split would drop the others.
+    punctuation_run = re.compile("([{}]+)".format(cn_punctuation))
+    pieces = punctuation_run.split(text)
+    # With exactly one capture group, split() alternates
+    # text, separator, text, ... so odd positions are the punctuation runs.
+    return "".join(
+        piece if index % 2 else normalize('NFKD', piece)
+        for index, piece in enumerate(pieces)
+    )
+
 def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',**kwargs):
     cost_time = dict()
 
     start_time = time.time()
     log("start process doc %s"%(str(doc_id)))
+    # 字符编码标准化
+    text = str_normalize(text)
     list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time, web_source_no]],useselffool=True)
     log("get preprocessed done of doc_id%s"%(doc_id))
     cost_time["preprocess"] = round(time.time()-start_time,2)

+ 143 - 13
BiddingKG/dl/interface/getAttributes.py

@@ -1,6 +1,6 @@
 
 
-from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date,API_URL,uniform_package_name,money_process
+from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date,API_URL,uniform_package_name,money_process,getDigitsDic,isValidDate
 from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
 from decimal import Decimal
 import re
@@ -2835,7 +2835,43 @@ def turnBidWay(bidway):
     else:
         return "其他"
 
-my_time_format_pattern = re.compile("((?P<year>\d{4}|\d{2})\s*[-\/年\.]\s*(?P<month>\d{1,2})\s*[-\/月\.]\s*(?P<day>\d{1,2}))")
+def turnMoneySource(moneysource):
+    """Map a free-text funding-source description to standard category labels.
+
+    Each rule is a regex searched anywhere in ``moneysource``; a rule may also
+    carry a veto regex that cancels it. All matching labels are collected and
+    sorted.
+
+    :param moneysource: extracted funding-source text
+    :return: the matching labels joined by "," when between one and four
+        rules match, otherwise "其他资金"
+    """
+    # (pattern, veto pattern or None, label)
+    rules = (
+        ("自筹|业主筹集|筹资|自有", None, "自筹"),
+        ("财政", "非财政", "财政资金"),
+        ("拨款|补助|划拨|拨付|国拨|上级资金", None, "上级拨款"),
+        ("社会资本|社会资金", None, "社会资本"),
+        ("贷款|借款|借贷", None, "贷款资金"),
+        ("债券|债|国债", None, "债券资金"),
+        ("专项|项目资金", None, "项目专项资金"),
+        ("配套", None, "配套资金"),
+        ("外资", None, "外资"),
+        ("国有资金|国企资金|国资|国家投资", None, "国有资金"),
+        ("投资|融资", None, "投资资金"),
+        # Lookahead, not lookbehind: "预算(?<!外)" tests the character 算 and so
+        # always succeeds, which labelled "预算外" text as in-budget as well.
+        ("预算(?!外)", None, "预算内资金"),
+        ("预算外", None, "预算外资金"),
+    )
+    result_list = sorted(
+        label
+        for pattern, veto, label in rules
+        if re.search(pattern, moneysource)
+        and not (veto and re.search(veto, moneysource))
+    )
+    # Nothing recognised, or too many categories to be informative.
+    if 0 < len(result_list) < 5:
+        return ",".join(result_list)
+    return "其他资金"
+
+# Date matcher with named groups year / month / day.
+#   year : "20" plus two digits, any two digits, or Chinese numerals starting with 二
+#   month, day : one or two digits, or one to three Chinese numerals (一..十)
+# The parts are separated by one of - / . 年 (after the year) or - / . 月 (after the
+# month), with optional whitespace on either side of the separator.
+my_time_format_pattern = re.compile("((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
+from BiddingKG.dl.ratio.re_ratio import getUnifyNum
 import time
 def my_timeFormat(_time):
     current_year = time.strftime("%Y",time.localtime())
@@ -2855,24 +2891,52 @@ def my_timeFormat(_time):
                 if k=="day":
                     day = v
             if year!="":
-                if len(year)==2:
-                    year = "20"+year
-                if int(year)>int(current_year):
-                    legal = False
+                if re.search("^\d+$", year):
+                    if len(year) == 2:
+                        year = "20" + year
+                    if int(year) > int(current_year):
+                        legal = False
+                else:
+                    _year = ""
+                    for word in year:
+                        if word == '0':
+                            _year += word
+                        else:
+                            _year += str(getDigitsDic(word))
+                    year = _year
             else:
                 legal = False
             if month!="":
-                if int(month)>12:
-                    legal = False
+                if re.search("^\d+$", month):
+                    if int(month) > 12:
+                        legal = False
+                else:
+                    month = int(getUnifyNum(month))
+                    if month >= 1 and month <= 12:
+                        month = str(month)
+                    else:
+                        legal = False
             else:
                 legal = False
             if day!="":
-                if int(day)>31:
-                    legal = False
+                if re.search("^\d+$", day):
+                    if int(day) > 31:
+                        legal = False
+                else:
+                    day = int(getUnifyNum(day))
+                    if day >= 1 and day <= 31:
+                        day = str(day)
+                    else:
+                        legal = False
             else:
                 legal = False
+            if not isValidDate(int(year),int(month),int(day)):
+                legal = False
             if legal:
-                # return "%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0"))
+                # 数字字符格式化
+                year = str(int(year))
+                month = str(int(month))
+                day = str(int(day))
                 time_list.append("%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0")))
     return time_list
 
@@ -2914,12 +2978,67 @@ def getTimeAttributes(list_entity,list_sentence):
         sentence_text = list_sentence[entity.sentence_index].sentence_text
         entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin]
         entity_left2 = sentence_text[max(0, entity.wordOffset_begin - 10):entity.wordOffset_begin]
+        entity_left3 = sentence_text[max(0, entity.wordOffset_begin - 20):entity.wordOffset_begin]
         entity_right = sentence_text[entity.wordOffset_end:entity.wordOffset_end + 3]
         label_prob = entity.values[entity.label]
         entity_text = entity.entity_text
         in_attachment = entity.in_attachment
         extract_time = my_timeFormat(entity_text)
+        # definite_time = "00:00:00"
+        # if extract_time:
+        #     t = re.compile("(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[::时点](?P<half_hour>半)?(?P<minute>\d{2})?[::分]?(?P<second>\d{2})?秒?")
+        #     t_in_word = re.search(t,entity_text)
+        #     t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,sentence_text[entity.wordOffset_end:])
+        #     if t_in_word:
+        #         print('t_in_word',entity_text,t_in_word.groupdict())
+        #         day = t_in_word.groupdict().get('day',"")
+        #         hour = t_in_word.groupdict().get('hour',"")
+        #         half_hour = t_in_word.groupdict().get('half_hour',"")
+        #         minute = t_in_word.groupdict().get('minute',"")
+        #         second = t_in_word.groupdict().get('second',"")
+        #         if hour:
+        #             if day=='下午' and int(hour)<12:
+        #                 hour = str(int(hour)+12)
+        #             if int(hour)>24:
+        #                 continue
+        #         else:
+        #             hour = "00"
+        #         if not minute:
+        #             if half_hour:
+        #                 minute = "30"
+        #             else:
+        #                 minute = "00"
+        #         if int(minute)>60:
+        #             continue
+        #         if not second:
+        #             second = "00"
+        #         if int(second)>60:
+        #             continue
+        #         # 数字字符格式化
+        #         # hour = str(int(hour))
+        #         # minute = str(int(minute))
+        #         # second = str(int(second))
+        #         definite_time = "%s:%s:%s"%(hour.rjust(2,"0"),minute.rjust(2,"0"),second.rjust(2,"0"))
+        #         print(definite_time)
+        #
+        #     elif t_out_of_word:
+        #         print('t_out_of_word', entity_text+sentence_text[entity.wordOffset_end:], t_out_of_word.groupdict())
+
+
+
         if extract_time:
+            # 优化多个并列的时间,如:开标时间和截标时间,截标时间和报名结束时间
+            if entity.label in [2,3,9]:
+                if entity.label==2 and re.search("截标|投标.{,2}截止|递交(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
+                    dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
+                if entity.label==3 and re.search("开标|评审.{,2}(?:开始)?时间|选取.{,2}时间",entity_left3):
+                    dict_time['time_bidopen'].append((extract_time[0], 0.5, in_attachment))
+                if entity.label==3 and re.search("报名",entity_left3):
+                    dict_time['time_registrationEnd'].append((extract_time[0], 0.5, in_attachment))
+                if entity.label==9 and re.search("截标|投标.{,2}截止|递交(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
+                    dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
+
+
             # 2022/12/12 新增挂牌时间正则
             if re.search("挂牌.{,4}(?:时间|日期)",entity_left2):
                 if re.search("挂牌.{,4}(?:时间|日期)",entity_left2).end()>len(entity_left2)/2:
@@ -3105,12 +3224,22 @@ def getOtherAttributes(list_entity):
                   "total_tendereeMoney":0,
                   "total_tendereeMoneyUnit":''}
     list_serviceTime = []
+    last_moneysource_prob = 0
     for entity in list_entity:
         if entity.entity_type == 'bidway':
             dict_other["bidway"] = turnBidWay(entity.entity_text)
         elif entity.entity_type=='moneysource':
-            dict_other["moneysource"] = entity.entity_text
+            if dict_other["moneysource"] and entity.in_attachment:
+                continue
+            if not dict_other["moneysource"]:
+                dict_other["moneysource"] = entity.entity_text
+                last_moneysource_prob = entity.prob
+            elif entity.prob>last_moneysource_prob:
+                dict_other["moneysource"] = entity.entity_text
+                last_moneysource_prob = entity.prob
         elif entity.entity_type=='serviceTime':
+            if list_serviceTime and entity.in_attachment:
+                continue
             if re.search("[^之]日|天|年|月|周|星期", entity.entity_text) or re.search("\d{4}[\-\./]\d{1,2}", entity.entity_text):
                 list_serviceTime.append(entity)
         elif entity.entity_type=="person" and entity.label ==4:
@@ -3126,7 +3255,8 @@ def getOtherAttributes(list_entity):
         max_prob_serviceTime = [ent for ent in list_serviceTime if ent.prob==max_prob]
         max_prob_serviceTime.sort(key=lambda x:(x.sentence_index,x.begin_index))
         dict_other["serviceTime"] = max_prob_serviceTime[0].entity_text
-
+    if dict_other['moneysource']:
+        dict_other['moneysource'] = turnMoneySource(dict_other['moneysource'])
     # dict_other["product"] = list(set(dict_other["product"])) # 已在添加时 顺序去重保留
     return dict_other
 

+ 11 - 7
BiddingKG/dl/money/moneySource/ruleExtra.py

@@ -10,8 +10,8 @@ def re_rule():
     data = pd.read_csv("C:\\Users\\admin\\Desktop\\alldata_error.csv", index_col=0)
 
     rule = re.compile("(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(于|是|为|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)" 
-              "(?P<moneySource>([^,,。;;已]*(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|" 
-              "大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]*(资本[金]|资金|自筹|贷款|补助|拨款|"
+              "(?P<moneySource>([^,,。;;已]{,20}(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|" 
+              "大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]{,20}(资本[金]|资金|自筹|贷款|补助|拨款|"
                "财政|其[他它]|自行支付|成本|筹[集措]|大修|拨付|国有|集体|工程款|自有|投资|国资|外资)[::.、\d]*%[,,;;]?)*)")
     num = 0
     moneySourceList = []
@@ -76,13 +76,13 @@ def re_rule():
 
 def extract_moneySource(text):
     rule = re.compile("(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(于|是|为|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)"
-                      "(?P<moneySource>([^,,。;;已]*(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|"
-                      "大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]*(资本[金]|资金|自筹|贷款|补助|拨款|"
+                      "(?P<moneySource>([^,,。;;已]{,30}(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|"
+                      "大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]{,30}(资本[金]|资金|自筹|贷款|补助|拨款|"
                       "财政|其[他它]|自行支付|成本|筹[集措]|大修|拨付|国有|集体|工程款|自有|投资|国资|外资)[::.、\d]*%[,,;;]?)*)")
 
     re1 = re.compile("(资[金佥]来[源自][^已]|建设资[金佥][^已]|项目资[金佥][^已,,。.;;]|资[金佥]性质)")
     re2 = re.compile(r"(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(是|为|于|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)"
-                     r"(?P<moneySource>[^,,。;;已]{2,}?)[,。;,]")
+                     r"(?P<moneySource>[^,,。;;已]{4,}?)[,。;,]")
     re_error = re.compile(r"核查|合法|紧张|null|分配|承诺|执行|已落实|已到位|批准|审计|调整|证明|监管|报告|完成|说明|使用|/|签订|规定|总价|未支付|主体|分析|是否")
 
     sub = re.compile("[::。]|^[.,,、\)]|[,,;;]$|及采购预算|[及和]?其?落实情况|预算金额及服务期限|[1-9]、|及?招标控制价|"
@@ -115,6 +115,7 @@ def extract_moneySource(text):
                 # print(groupdict1)
                 if source1:
                     groupdict1["index"] = word_index
+                    groupdict1["prob"] = 0.9
                     # print(groupdict1['index'])
                     results.append(groupdict1)
             word_index += len(item)
@@ -127,8 +128,9 @@ def extract_moneySource(text):
                     groupdict2 = res.groupdict()
                     source2 = groupdict2['moneySource']
                     # print("source2==>",source2)
-                    if source2 and not re_error.search(source2):
+                    if source2 and not re_error.search(res.group()):
                         groupdict2["index"] = copy_index
+                        groupdict2["prob"] = 0.8
                         results.append(groupdict2)
                 copy_index += len(item)
     first = []
@@ -148,7 +150,7 @@ def extract_moneySource(text):
     for result in first:
         entity_text = sub.sub("",result['moneySource'])
         # wordOffset_begin = result['index'] + re.search(entity_text,result['start']+result['moneySource']).start()
-        if entity_text is None:
+        if entity_text is None or len(entity_text)>40:
             continue
         else:
             wordOffset_begin = result['index'] + (result['start']+result['moneySource']).find(entity_text)
@@ -158,6 +160,7 @@ def extract_moneySource(text):
             _moneySource['body'] = entity_text
             _moneySource['begin_index'] = wordOffset_begin
             _moneySource['end_index'] = wordOffset_end
+            _moneySource['prob'] = result['prob']
             # print(_moneySource)
             list_moneySource.append(_moneySource)
     return list_moneySource
@@ -168,6 +171,7 @@ if __name__ == '__main__':
     # re_rule()
     test ="a建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,as,建设资金来自呜呜呜。" \
           "1、采购内容及资金来源:采购内容为汉上实验学校采购24台3匹柜机空调。资金来源为财政资金。"
+    # test = ",资金来源是否都是要具体到每条来源明细,"
     # 11,23 35,37
     print(extract_moneySource(test))
     pass

+ 1 - 1
BiddingKG/dl/ratio/re_ratio.py

@@ -182,7 +182,7 @@ def getUnifyNum(money):
                         result += Decimal(getDigitsDic(subMoneys[0])) * (getMultipleFactor(factorUnit))
                 # subMoneys[0]中无金额单位,不可再拆分
                 elif subMoneys[0] == "":
-                    result += 0
+                    result += getMultipleFactor(factorUnit)
                 elif re.search(re.compile("[%s]" % ("".join(chnFactorUnits))), subMoneys[0]) is None:
                     # print(subMoneys)
                     # subMoneys[0] = subMoneys[0][0]