2 yıl önce · ac47ae521c
--- a/BiddingKG/dl/common/Utils.py
+++ b/BiddingKG/dl/common/Utils.py
@@ -606,7 +606,18 @@ def fitDataByRule(data):
 
				     result = re.sub("[。]","",result)
			
 
				     return  result
			
 
				 
			
 
				-time_format_pattern = re.compile("((?P<year>\d{4}|\d{2})\s*[-\/年\.]\s*(?P<month>\d{1,2})\s*[-\/月\.]\s*(?P<day>\d{1,2}))")
			
 
				+from datetime import date
			
 
				+# 时间合法性判断
			
 
				def isValidDate(year, month, day):
				    """Return True when (year, month, day) forms a real calendar date.

				    Validation is delegated to ``datetime.date``: the triple is valid
				    exactly when ``date(year, month, day)`` can be constructed.

				    Args:
				        year: Year number (int).
				        month: Month number (int).
				        day: Day of month (int).

				    Returns:
				        bool: True if the date exists, otherwise False. Non-integer or
				        out-of-range arguments yield False instead of raising.
				    """
				    try:
				        date(year, month, day)
				    # Only the errors date() raises for bad input are treated as "invalid";
				    # a bare except would also hide KeyboardInterrupt / SystemExit.
				    except (TypeError, ValueError, OverflowError):
				        return False
				    return True
			
 
				+
			
 
				# Date pattern with named groups "year", "month" and "day".
				# Year: 20xx, a two-digit year, or a Chinese-numeral year starting with "二".
				# Month/day: 1-2 digits or 1-3 Chinese numerals.
				# Separators: "-", "/", ".", or the markers 年 / 月.
				# Written as a raw string so \d and \s reach the regex engine unchanged.
				time_format_pattern = re.compile(r"((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
			
 
				+from BiddingKG.dl.ratio.re_ratio import getUnifyNum
			
 
				 def timeFormat(_time):
			
 
				     current_year = time.strftime("%Y",time.localtime())
			
 
				     all_match = re.finditer(time_format_pattern,_time)
			
@@ -624,22 +635,48 @@ def timeFormat(_time):
 
				                 if k=="day":
			
 
				                     day = v
			
 
				             if year!="":
			
 
				-                if len(year)==2:
			
 
				-                    year = "20"+year
			
 
				-                if int(year)>int(current_year):
			
 
				-                    legal = False
			
 
				+                if re.search("^\d+$",year):
			
 
				+                    if len(year)==2:
			
 
				+                        year = "20"+year
			
 
				+                    if int(year)>int(current_year):
			
 
				+                        legal = False
			
 
				+                else:
			
 
				+                    _year = ""
			
 
				+                    for word in year:
			
 
				+                        if word == '0':
			
 
				+                            _year += word
			
 
				+                        else:
			
 
				+                            _year += str(getDigitsDic(word))
			
 
				+                    year = _year
			
 
				             else:
			
 
				                 legal = False
			
 
				             if month!="":
			
 
				-                if int(month)>12:
			
 
				-                    legal = False
			
 
				+                if re.search("^\d+$", month):
			
 
				+                    if int(month)>12:
			
 
				+                        legal = False
			
 
				+                else:
			
 
				+                    month = int(getUnifyNum(month))
			
 
				+                    if month>=1 and month<=12:
			
 
				+                        month = str(month)
			
 
				+                    else:
			
 
				+                        legal = False
			
 
				             else:
			
 
				                 legal = False
			
 
				             if day!="":
			
 
				-                if int(day)>31:
			
 
				-                    legal = False
			
 
				+                if re.search("^\d+$", day):
			
 
				+                    if int(day)>31:
			
 
				+                        legal = False
			
 
				+                else:
			
 
				+                    day = int(getUnifyNum(day))
			
 
				+                    if day >= 1 and day <= 31:
			
 
				+                        day = str(day)
			
 
				+                    else:
			
 
				+                        legal = False
			
 
				             else:
			
 
				                 legal = False
			
 
				+            # print(year,month,day)
			
 
				+            if not isValidDate(int(year),int(month),int(day)):
			
 
				+                legal = False
			
 
				             if legal:
			
 
				                 return "%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0"))
			
 
				     return ""
			
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -1392,7 +1392,6 @@ def segment(soup,final=True):
 
				                     text = re.sub(punc_del,punc_del.strip()[0],text)   #2021/12/09 修正由于某些标签后插入符号把原来符号替换
			
 
				             else:
			
 
				                 text = re.sub(punc_del,"",text)
			
 
				-        
			
 
				 
			
 
				     #将连续的中文句号替换为一个
			
 
				     text_split = text.split("。")
			
@@ -1419,7 +1418,7 @@ def segment(soup,final=True):
 
				 
			
 
				     if len(text)<10000000:
			
 
				         while(LOOP_BEGIN<len(text)):
			
 
				-            _text += re.sub("）",")",re.sub("（","(",re.sub("\s+","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
			
 
				+            _text += re.sub("）",")",re.sub("（","(",re.sub("\s{2,}","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
			
 
				             LOOP_BEGIN += LOOP_LEN
			
 
				         text = _text
			
 
				     # 附件标识前修改为句号，避免正文和附件内容混合在一起
			
@@ -2191,7 +2190,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
				         article_processed = segment(article_processed)
			
 
				 
			
 
				         article_processed = article_processed.replace('(', '（').replace(')', '）')  #2022/8/10 统一为中文括号
			
 
				-        article_processed = article_processed.replace(':', '：')  #2023/1/5 统一为中文冒号
			
 
				+        # article_processed = article_processed.replace(':', '：')  #2023/1/5 统一为中文冒号
			
 
				+        article_processed = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])", "：", article_processed)
			
 
				         article_processed = article_processed.replace('．','.').replace('－', '-') # 2021/12/01 修正OCR识别PDF小数点错误问题
			
 
				         article_processed = article_processed.replace('报价限价', '招标限价') #2021/12/17 由于报价限价预测为中投标金额所以修改
			
 
				         article_processed = article_processed.replace('成交工程价款', '成交工程价')  # 2021/12/21 修正为中标价
			
@@ -2347,7 +2347,7 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
 
				                             else:
			
 
				                                 outline_list.append(Outline(doc_id,outline_index,'',sentence_begin_index,sentence_end_index,wordOffset_begin,wordOffset_end))
			
 
				                             outline_index += 1
			
 
				-                        sentence_text = re.sub("##split##", "", sentence_text,count=1)
			
 
				+                        sentence_text = re.sub("##split##，?", "", sentence_text,count=1)
			
 
				                         last_sentence_index = (sentence_index,_match.start())
			
 
				                     temp_sentences.append(sentence_text)
			
 
				                 if attachment_begin_index>-1 and last_sentence_index[0]<attachment_begin_index:
			
@@ -2701,6 +2701,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				                     company_index[sentence_index].add((ner_entity[0],ner_entity[1]))
			
 
				             #识别package
			
 
				 
			
 
				+            ner_time_list = []
			
 
				             #识别实体
			
 
				             for ner_entity in ner_entitys:
			
 
				                 begin_index_temp = ner_entity[0]
			
@@ -2708,6 +2709,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				                 entity_type = ner_entity[2]
			
 
				                 entity_text = ner_entity[3]
			
 
				 
			
 
				+                if entity_type=='time':
			
 
				+                    ner_time_list.append((begin_index_temp,end_index_temp))
			
 
				                 if entity_type in ["org","company"] and not isLegalEnterprise(entity_text):
			
 
				                     continue
			
 
				                 # 实体长度限制
			
@@ -2733,7 +2736,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				                 entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
			
 
				 
			
 
				                 #去掉标点符号
			
 
				-                entity_text = re.sub("[,，。：!&@$\*]","",entity_text)
			
 
				+                if entity_type!='time':
			
 
				+                    entity_text = re.sub("[,，。：!&@$\*]","",entity_text)
			
 
				                 entity_text = entity_text.replace("(","（").replace(")","）") if isinstance(entity_text,str) else entity_text
			
 
				                 # 组织机构实体名称补充
			
 
				                 if entity_type in ["org", "company"]:
			
@@ -2858,6 +2862,46 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				                     Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
			
 
				                            begin_index_temp, end_index_temp,in_attachment=in_attachment))
			
 
				 
			
 
				+            # 时间实体格式补充
			
 
				+            re_time_new = re.compile("20\d{2}-\d{1,2}-\d{1,2}|20\d{2}/\d{1,2}/\d{1,2}|20\d{2}\.\d{1,2}\.\d{1,2}|20\d{2}(?:0[1-9]|1[0-2])(?:0[1-9]|[1-2][0-9]|3[0-1])")
			
 
				+            entity_type = "time"
			
 
				+            for _time in re.finditer(re_time_new,sentence_text):
			
 
				+                entity_text = _time.group()
			
 
				+                begin_index_temp = _time.start()
			
 
				+                end_index_temp = _time.end()
			
 
				+                is_same = False
			
 
				+                for t_index in ner_time_list:
			
 
				+                    if begin_index_temp>=t_index[0] and end_index_temp<=t_index[1]:
			
 
				+                        is_same = True
			
 
				+                        break
			
 
				+                if is_same:
			
 
				+                    continue
			
 
				+                if _time.start()!=0 and re.search("\d",sentence_text[_time.start()-1:_time.start()]):
			
 
				+                    continue
			
 
				+                # 纯数字格式，例：20190509
			
 
				+                if re.search("^\d{8}$",entity_text):
			
 
				+                    if _time.end()!=len(sentence_text) and re.search("[\da-zA-z]",sentence_text[_time.end():_time.end()+1]):
			
 
				+                        continue
			
 
				+                    entity_text = entity_text[:4] + "-" + entity_text[4:6] + "-" + entity_text[6:8]
			
 
				+                if not timeFormat(entity_text):
			
 
				+                    continue
			
 
				+
			
 
				+                for j in range(len(list_tokenbegin)):
			
 
				+                    if list_tokenbegin[j] == begin_index_temp:
			
 
				+                        begin_index = j
			
 
				+                        break
			
 
				+                    elif list_tokenbegin[j] > begin_index_temp:
			
 
				+                        begin_index = j - 1
			
 
				+                        break
			
 
				+                for j in range(begin_index, len(list_tokenbegin)):
			
 
				+                    if list_tokenbegin[j] >= end_index_temp:
			
 
				+                        end_index = j - 1
			
 
				+                        break
			
 
				+                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
			
 
				+                list_sentence_entitys.append(
			
 
				+                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
			
 
				+                           begin_index_temp, end_index_temp, in_attachment=in_attachment))
			
 
				+
			
 
				             # 资金来源提取  2020/12/30 新增
			
 
				             list_moneySource = extract_moneySource(sentence_text)
			
 
				             entity_type = "moneysource"
			
@@ -2882,7 +2926,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				                 entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
			
 
				                 list_sentence_entitys.append(
			
 
				                     Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
			
 
				-                           begin_index_temp, end_index_temp,in_attachment=in_attachment))
			
 
				+                           begin_index_temp, end_index_temp,in_attachment=in_attachment,prob=moneySource['prob']))
			
 
				 
			
 
				             # 电子邮箱提取 2021/11/04 新增
			
 
				             list_email = extract_email(sentence_text)
			
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -13,6 +13,7 @@ import os
 
				 import codecs
			
 
				 import requests
			
 
				 import time
			
 
				+from unicodedata import normalize
			
 
				 
			
 
				 _time1 = time.time()
			
 
				 sys.path.append(os.path.abspath("../.."))
			
@@ -106,11 +107,30 @@ def extractCount(extract_dict):
 
				         extract_count += 1
			
 
				     return extract_count
			
 
				 
			
 
				+# 字符编码标准化
			
 
				def str_normalize(text):
				    """Apply Unicode NFKD normalization to ``text`` but keep selected punctuation.

				    The characters listed in ``cn_punctuation`` are copied through
				    unchanged; every other stretch of text is passed through
				    ``unicodedata.normalize('NFKD', ...)``.

				    Args:
				        text: Input string.

				    Returns:
				        str: The normalized string with protected punctuation preserved.
				    """
				    cn_punctuation = "￥，｡：；｛｝！？（）"
				    # The "+" must sit inside the capturing group: with "([...])+" re.split
				    # returns only the last character of a punctuation run and silently
				    # drops the others (e.g. "）！？" would come back as "？").
				    pieces = re.split("([{}]+)".format(cn_punctuation), text)
				    # With a single capturing group, re.split alternates plain text (even
				    # indexes) and captured punctuation runs (odd indexes).
				    return "".join(
				        piece if index % 2 else normalize('NFKD', piece)
				        for index, piece in enumerate(pieces)
				    )
			
 
				+
			
 
				 def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',**kwargs):
			
 
				     cost_time = dict()
			
 
				 
			
 
				     start_time = time.time()
			
 
				     log("start process doc %s"%(str(doc_id)))
			
 
				+    # 字符编码标准化
			
 
				+    text = str_normalize(text)
			
 
				     list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time, web_source_no]],useselffool=True)
			
 
				     log("get preprocessed done of doc_id%s"%(doc_id))
			
 
				     cost_time["preprocess"] = round(time.time()-start_time,2)
			
--- a/BiddingKG/dl/interface/getAttributes.py
+++ b/BiddingKG/dl/interface/getAttributes.py
@@ -1,6 +1,6 @@
 
				 
			
 
				 
			
 
				-from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date,API_URL,uniform_package_name,money_process
			
 
				+from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date,API_URL,uniform_package_name,money_process,getDigitsDic,isValidDate
			
 
				 from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
			
 
				 from decimal import Decimal
			
 
				 import re
			
@@ -2835,7 +2835,43 @@ def turnBidWay(bidway):
 
				     else:
			
 
				         return "其他"
			
 
				 
			
 
				-my_time_format_pattern = re.compile("((?P<year>\d{4}|\d{2})\s*[-\/年\.]\s*(?P<month>\d{1,2})\s*[-\/月\.]\s*(?P<day>\d{1,2}))")
			
 
				def turnMoneySource(moneysource):
				    """Map a free-text funding-source description to normalized category labels.

				    Each rule below is a regex searched in ``moneysource``; every rule that
				    matches contributes its label. The labels are sorted and joined with ",".

				    Args:
				        moneysource: Extracted funding-source text.

				    Returns:
				        str: Comma-joined sorted labels when 1 to 4 categories match,
				        otherwise "其他资金" (nothing matched, or 5+ categories matched).
				    """
				    rules = (
				        ("自筹|业主筹集|筹资|自有", "自筹"),
				        ("拨款|补助|划拨|拨付|国拨|上级资金", "上级拨款"),
				        ("社会资本|社会资金", "社会资本"),
				        ("贷款|借款|借贷", "贷款资金"),
				        # "债" alone already covers "债券" and "国债".
				        ("债", "债券资金"),
				        ("专项|项目资金", "项目专项资金"),
				        ("配套", "配套资金"),
				        # Do not read the "外资" inside "预算外资金" as foreign capital.
				        ("(?<!预算)外资", "外资"),
				        ("国有资金|国企资金|国资|国家投资", "国有资金"),
				        ("投资|融资", "投资资金"),
				        # Look-ahead, not look-behind: "预算" counts as in-budget unless it is
				        # followed by "外" (the old "预算(?<!外)" was always true).
				        ("预算(?!外)", "预算内资金"),
				        ("预算外", "预算外资金"),
				    )
				    labels = [label for pattern, label in rules if re.search(pattern, moneysource)]
				    # Fiscal funds only count when the text never says "非财政".
				    if "财政" in moneysource and "非财政" not in moneysource:
				        labels.append("财政资金")
				    labels.sort()
				    if 0 < len(labels) < 5:
				        return ",".join(labels)
				    return "其他资金"
			
 
				+
			
 
				# Date pattern with named groups "year", "month" and "day".
				# Year: 20xx, a two-digit year, or a Chinese-numeral year starting with "二".
				# Month/day: 1-2 digits or 1-3 Chinese numerals.
				# Separators: "-", "/", ".", or the markers 年 / 月.
				# Written as a raw string so \d and \s reach the regex engine unchanged.
				my_time_format_pattern = re.compile(r"((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
			
 
				+from BiddingKG.dl.ratio.re_ratio import getUnifyNum
			
 
				 import time
			
 
				 def my_timeFormat(_time):
			
 
				     current_year = time.strftime("%Y",time.localtime())
			
@@ -2855,24 +2891,52 @@ def my_timeFormat(_time):
 
				                 if k=="day":
			
 
				                     day = v
			
 
				             if year!="":
			
 
				-                if len(year)==2:
			
 
				-                    year = "20"+year
			
 
				-                if int(year)>int(current_year):
			
 
				-                    legal = False
			
 
				+                if re.search("^\d+$", year):
			
 
				+                    if len(year) == 2:
			
 
				+                        year = "20" + year
			
 
				+                    if int(year) > int(current_year):
			
 
				+                        legal = False
			
 
				+                else:
			
 
				+                    _year = ""
			
 
				+                    for word in year:
			
 
				+                        if word == '0':
			
 
				+                            _year += word
			
 
				+                        else:
			
 
				+                            _year += str(getDigitsDic(word))
			
 
				+                    year = _year
			
 
				             else:
			
 
				                 legal = False
			
 
				             if month!="":
			
 
				-                if int(month)>12:
			
 
				-                    legal = False
			
 
				+                if re.search("^\d+$", month):
			
 
				+                    if int(month) > 12:
			
 
				+                        legal = False
			
 
				+                else:
			
 
				+                    month = int(getUnifyNum(month))
			
 
				+                    if month >= 1 and month <= 12:
			
 
				+                        month = str(month)
			
 
				+                    else:
			
 
				+                        legal = False
			
 
				             else:
			
 
				                 legal = False
			
 
				             if day!="":
			
 
				-                if int(day)>31:
			
 
				-                    legal = False
			
 
				+                if re.search("^\d+$", day):
			
 
				+                    if int(day) > 31:
			
 
				+                        legal = False
			
 
				+                else:
			
 
				+                    day = int(getUnifyNum(day))
			
 
				+                    if day >= 1 and day <= 31:
			
 
				+                        day = str(day)
			
 
				+                    else:
			
 
				+                        legal = False
			
 
				             else:
			
 
				                 legal = False
			
 
				+            if not isValidDate(int(year),int(month),int(day)):
			
 
				+                legal = False
			
 
				             if legal:
			
 
				-                # return "%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0"))
			
 
				+                # 数字字符格式化
			
 
				+                year = str(int(year))
			
 
				+                month = str(int(month))
			
 
				+                day = str(int(day))
			
 
				                 time_list.append("%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0")))
			
 
				     return time_list
			
 
				 
			
@@ -2914,12 +2978,67 @@ def getTimeAttributes(list_entity,list_sentence):
 
				         sentence_text = list_sentence[entity.sentence_index].sentence_text
			
 
				         entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin]
			
 
				         entity_left2 = sentence_text[max(0, entity.wordOffset_begin - 10):entity.wordOffset_begin]
			
 
				+        entity_left3 = sentence_text[max(0, entity.wordOffset_begin - 20):entity.wordOffset_begin]
			
 
				         entity_right = sentence_text[entity.wordOffset_end:entity.wordOffset_end + 3]
			
 
				         label_prob = entity.values[entity.label]
			
 
				         entity_text = entity.entity_text
			
 
				         in_attachment = entity.in_attachment
			
 
				         extract_time = my_timeFormat(entity_text)
			
 
				+        # definite_time = "00:00:00"
			
 
				+        # if extract_time:
			
 
				+        #     t = re.compile("(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[:：时点](?P<half_hour>半)?(?P<minute>\d{2})?[:：分]?(?P<second>\d{2})?秒?")
			
 
				+        #     t_in_word = re.search(t,entity_text)
			
 
				+        #     t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,sentence_text[entity.wordOffset_end:])
			
 
				+        #     if t_in_word:
			
 
				+        #         print('t_in_word',entity_text,t_in_word.groupdict())
			
 
				+        #         day = t_in_word.groupdict().get('day',"")
			
 
				+        #         hour = t_in_word.groupdict().get('hour',"")
			
 
				+        #         half_hour = t_in_word.groupdict().get('half_hour',"")
			
 
				+        #         minute = t_in_word.groupdict().get('minute',"")
			
 
				+        #         second = t_in_word.groupdict().get('second',"")
			
 
				+        #         if hour:
			
 
				+        #             if day=='下午' and int(hour)<12:
			
 
				+        #                 hour = str(int(hour)+12)
			
 
				+        #             if int(hour)>24:
			
 
				+        #                 continue
			
 
				+        #         else:
			
 
				+        #             hour = "00"
			
 
				+        #         if not minute:
			
 
				+        #             if half_hour:
			
 
				+        #                 minute = "30"
			
 
				+        #             else:
			
 
				+        #                 minute = "00"
			
 
				+        #         if int(minute)>60:
			
 
				+        #             continue
			
 
				+        #         if not second:
			
 
				+        #             second = "00"
			
 
				+        #         if int(second)>60:
			
 
				+        #             continue
			
 
				+        #         # 数字字符格式化
			
 
				+        #         # hour = str(int(hour))
			
 
				+        #         # minute = str(int(minute))
			
 
				+        #         # second = str(int(second))
			
 
				+        #         definite_time = "%s:%s:%s"%(hour.rjust(2,"0"),minute.rjust(2,"0"),second.rjust(2,"0"))
			
 
				+        #         print(definite_time)
			
 
				+        #
			
 
				+        #     elif t_out_of_word:
			
 
				+        #         print('t_out_of_word', entity_text+sentence_text[entity.wordOffset_end:], t_out_of_word.groupdict())
			
 
				+
			
 
				+
			
 
				+
			
 
				         if extract_time:
			
 
				+            # 优化多个并列的时间，如：开标时间和截标时间，截标时间和报名结束时间
			
 
				+            if entity.label in [2,3,9]:
			
 
				+                if entity.label==2 and re.search("截标|投标.{,2}截止|递交(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
			
 
				+                    dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
			
 
				+                if entity.label==3 and re.search("开标|评审.{,2}(?:开始)?时间|选取.{,2}时间",entity_left3):
			
 
				+                    dict_time['time_bidopen'].append((extract_time[0], 0.5, in_attachment))
			
 
				+                if entity.label==3 and re.search("报名",entity_left3):
			
 
				+                    dict_time['time_registrationEnd'].append((extract_time[0], 0.5, in_attachment))
			
 
				+                if entity.label==9 and re.search("截标|投标.{,2}截止|递交(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
			
 
				+                    dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
			
 
				+
			
 
				+
			
 
				             # 2022/12/12 新增挂牌时间正则
			
 
				             if re.search("挂牌.{,4}(?:时间|日期)",entity_left2):
			
 
				                 if re.search("挂牌.{,4}(?:时间|日期)",entity_left2).end()>len(entity_left2)/2:
			
@@ -3105,12 +3224,22 @@ def getOtherAttributes(list_entity):
 
				                   "total_tendereeMoney":0,
			
 
				                   "total_tendereeMoneyUnit":''}
			
 
				     list_serviceTime = []
			
 
				+    last_moneysource_prob = 0
			
 
				     for entity in list_entity:
			
 
				         if entity.entity_type == 'bidway':
			
 
				             dict_other["bidway"] = turnBidWay(entity.entity_text)
			
 
				         elif entity.entity_type=='moneysource':
			
 
				-            dict_other["moneysource"] = entity.entity_text
			
 
				+            if dict_other["moneysource"] and entity.in_attachment:
			
 
				+                continue
			
 
				+            if not dict_other["moneysource"]:
			
 
				+                dict_other["moneysource"] = entity.entity_text
			
 
				+                last_moneysource_prob = entity.prob
			
 
				+            elif entity.prob>last_moneysource_prob:
			
 
				+                dict_other["moneysource"] = entity.entity_text
			
 
				+                last_moneysource_prob = entity.prob
			
 
				         elif entity.entity_type=='serviceTime':
			
 
				+            if list_serviceTime and entity.in_attachment:
			
 
				+                continue
			
 
				             if re.search("[^之]日|天|年|月|周|星期", entity.entity_text) or re.search("\d{4}[\-\./]\d{1,2}", entity.entity_text):
			
 
				                 list_serviceTime.append(entity)
			
 
				         elif entity.entity_type=="person" and entity.label ==4:
			
@@ -3126,7 +3255,8 @@ def getOtherAttributes(list_entity):
 
				         max_prob_serviceTime = [ent for ent in list_serviceTime if ent.prob==max_prob]
			
 
				         max_prob_serviceTime.sort(key=lambda x:(x.sentence_index,x.begin_index))
			
 
				         dict_other["serviceTime"] = max_prob_serviceTime[0].entity_text
			
 
				-
			
 
				+    if dict_other['moneysource']:
			
 
				+        dict_other['moneysource'] = turnMoneySource(dict_other['moneysource'])
			
 
				     # dict_other["product"] = list(set(dict_other["product"])) # 已在添加时 顺序去重保留
			
 
				     return dict_other
			
 
				 
			
--- a/BiddingKG/dl/money/moneySource/ruleExtra.py
+++ b/BiddingKG/dl/money/moneySource/ruleExtra.py
@@ -10,8 +10,8 @@ def re_rule():
 
				     data = pd.read_csv("C:\\Users\\admin\\Desktop\\alldata_error.csv", index_col=0)
			
 
				 
			
 
				     rule = re.compile("(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(于|是|为|来，?源[为于]?|来自于?)?(，|,|；+|：+)?)" 
			
 
				-              "(?P<moneySource>([^,，。;；已]*(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|" 
			
 
				-              "大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([:：.、\d]+%)?[，,;；]?)?([^,，.。;；已]*(资本[金]|资金|自筹|贷款|补助|拨款|"
			
 
				+              "(?P<moneySource>([^,，。;；已]{,20}(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|" 
			
 
				+              "大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([:：.、\d]+%)?[，,;；]?)?([^,，.。;；已]{,20}(资本[金]|资金|自筹|贷款|补助|拨款|"
			
 
				                "财政|其[他它]|自行支付|成本|筹[集措]|大修|拨付|国有|集体|工程款|自有|投资|国资|外资)[:：.、\d]*%[，,;；]?)*)")
			
 
				     num = 0
			
 
				     moneySourceList = []
			
@@ -76,13 +76,13 @@ def re_rule():
 
				 
			
 
				 def extract_moneySource(text):
			
 
				     rule = re.compile("(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(于|是|为|来，?源[为于]?|来自于?)?(，|,|；+|：+)?)"
			
 
				-                      "(?P<moneySource>([^,，。;；已]*(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|"
			
 
				-                      "大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([:：.、\d]+%)?[，,;；]?)?([^,，.。;；已]*(资本[金]|资金|自筹|贷款|补助|拨款|"
			
 
				+                      "(?P<moneySource>([^,，。;；已]{,30}(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|"
			
 
				+                      "大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([:：.、\d]+%)?[，,;；]?)?([^,，.。;；已]{,30}(资本[金]|资金|自筹|贷款|补助|拨款|"
			
 
				                       "财政|其[他它]|自行支付|成本|筹[集措]|大修|拨付|国有|集体|工程款|自有|投资|国资|外资)[:：.、\d]*%[，,;；]?)*)")
			
 
				 
			
 
				     re1 = re.compile("(资[金佥]来[源自][^已]|建设资[金佥][^已]|项目资[金佥][^已，,。.;；]|资[金佥]性质)")
			
 
				     re2 = re.compile(r"(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(是|为|于|来，?源[为于]?|来自于?)?(，|,|；+|：+)?)"
			
 
				-                     r"(?P<moneySource>[^,，。;；已]{2,}?)[，。；,]")
			
 
				+                     r"(?P<moneySource>[^,，。;；已]{4,}?)[，。；,]")
			
 
				     re_error = re.compile(r"核查|合法|紧张|null|分配|承诺|执行|已落实|已到位|批准|审计|调整|证明|监管|报告|完成|说明|使用|/|签订|规定|总价|未支付|主体|分析|是否")
			
 
				 
			
 
				     sub = re.compile("[：:。]|^[.,，、\)]|[,，;；]$|及采购预算|[及和]?其?落实情况|预算金额及服务期限|[1-9]、|及?招标控制价|"
			
@@ -115,6 +115,7 @@ def extract_moneySource(text):
 
				                 # print(groupdict1)
			
 
				                 if source1:
			
 
				                     groupdict1["index"] = word_index
			
 
				+                    groupdict1["prob"] = 0.9
			
 
				                     # print(groupdict1['index'])
			
 
				                     results.append(groupdict1)
			
 
				             word_index += len(item)
			
@@ -127,8 +128,9 @@ def extract_moneySource(text):
 
				                     groupdict2 = res.groupdict()
			
 
				                     source2 = groupdict2['moneySource']
			
 
				                     # print("source2==>",source2)
			
 
				-                    if source2 and not re_error.search(source2):
			
 
				+                    if source2 and not re_error.search(res.group()):
			
 
				                         groupdict2["index"] = copy_index
			
 
				+                        groupdict2["prob"] = 0.8
			
 
				                         results.append(groupdict2)
			
 
				                 copy_index += len(item)
			
 
				     first = []
			
@@ -148,7 +150,7 @@ def extract_moneySource(text):
 
				     for result in first:
			
 
				         entity_text = sub.sub("",result['moneySource'])
			
 
				         # wordOffset_begin = result['index'] + re.search(entity_text,result['start']+result['moneySource']).start()
			
 
				-        if entity_text is None:
			
 
				+        if entity_text is None or len(entity_text)>40:
			
 
				             continue
			
 
				         else:
			
 
				             wordOffset_begin = result['index'] + (result['start']+result['moneySource']).find(entity_text)
			
@@ -158,6 +160,7 @@ def extract_moneySource(text):
 
				             _moneySource['body'] = entity_text
			
 
				             _moneySource['begin_index'] = wordOffset_begin
			
 
				             _moneySource['end_index'] = wordOffset_end
			
 
				+            _moneySource['prob'] = result['prob']
			
 
				             # print(_moneySource)
			
 
				             list_moneySource.append(_moneySource)
			
 
				     return list_moneySource
			
@@ -168,6 +171,7 @@ if __name__ == '__main__':
 
				     # re_rule()
			
 
				     test ="a建设资金来源及性质：资本金40%，自筹60%，,xx.=建设资金来源自筹，项目出资比例为100%，as，建设资金来自呜呜呜。" \
			
 
				           "1、采购内容及资金来源：采购内容为汉上实验学校采购24台3匹柜机空调。资金来源为财政资金。"
			
 
				+    # test = "，资金来源是否都是要具体到每条来源明细，"
			
 
				     # 11,23 35,37
			
 
				     print(extract_moneySource(test))
			
 
				     pass
			
--- a/BiddingKG/dl/ratio/re_ratio.py
+++ b/BiddingKG/dl/ratio/re_ratio.py
@@ -182,7 +182,7 @@ def getUnifyNum(money):
 
				                         result += Decimal(getDigitsDic(subMoneys[0])) * (getMultipleFactor(factorUnit))
			
 
				                 # subMoneys[0]中无金额单位，不可再拆分
			
 
				                 elif subMoneys[0] == "":
			
 
				-                    result += 0
			
 
				+                    result += getMultipleFactor(factorUnit)
			
 
				                 elif re.search(re.compile("[%s]" % ("".join(chnFactorUnits))), subMoneys[0]) is None:
			
 
				                     # print(subMoneys)
			
 
				                     # subMoneys[0] = subMoneys[0][0]