Преглед изворни кода

截标时间优化,资金来源统一,公告字符标准化

znj пре 2 година
родитељ
комит
ec1c9a2b13

+ 46 - 9
BiddingKG/dl/common/Utils.py

@@ -604,7 +604,18 @@ def fitDataByRule(data):
     result = re.sub("[。]","",result)
     return  result
 
-time_format_pattern = re.compile("((?P<year>\d{4}|\d{2})\s*[-\/年\.]\s*(?P<month>\d{1,2})\s*[-\/月\.]\s*(?P<day>\d{1,2}))")
+from datetime import date
+# 时间合法性判断
+def isValidDate(year, month, day):
+    try:
+        date(year, month, day)
+    except:
+        return False
+    else:
+        return True
+
+time_format_pattern = re.compile("((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
+from BiddingKG.dl.ratio.re_ratio import getUnifyNum
 def timeFormat(_time):
     current_year = time.strftime("%Y",time.localtime())
     all_match = re.finditer(time_format_pattern,_time)
@@ -622,22 +633,48 @@ def timeFormat(_time):
                 if k=="day":
                     day = v
             if year!="":
-                if len(year)==2:
-                    year = "20"+year
-                if int(year)>int(current_year):
-                    legal = False
+                if re.search("^\d+$",year):
+                    if len(year)==2:
+                        year = "20"+year
+                    if int(year)>int(current_year):
+                        legal = False
+                else:
+                    _year = ""
+                    for word in year:
+                        if word == '0':
+                            _year += word
+                        else:
+                            _year += str(getDigitsDic(word))
+                    year = _year
             else:
                 legal = False
             if month!="":
-                if int(month)>12:
-                    legal = False
+                if re.search("^\d+$", month):
+                    if int(month)>12:
+                        legal = False
+                else:
+                    month = int(getUnifyNum(month))
+                    if month>=1 and month<=12:
+                        month = str(month)
+                    else:
+                        legal = False
             else:
                 legal = False
             if day!="":
-                if int(day)>31:
-                    legal = False
+                if re.search("^\d+$", day):
+                    if int(day)>31:
+                        legal = False
+                else:
+                    day = int(getUnifyNum(day))
+                    if day >= 1 and day <= 31:
+                        day = str(day)
+                    else:
+                        legal = False
             else:
                 legal = False
+            # print(year,month,day)
+            if not isValidDate(int(year),int(month),int(day)):
+                legal = False
             if legal:
                 return "%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0"))
     return ""

Разлика између датотеке није приказан због своје велике величине
+ 0 - 1
BiddingKG/dl/interface/Preprocessing.py


+ 20 - 0
BiddingKG/dl/interface/extract.py

@@ -13,6 +13,7 @@ import os
 import codecs
 import requests
 import time
+from unicodedata import normalize
 
 _time1 = time.time()
 sys.path.append(os.path.abspath("../.."))
@@ -106,11 +107,30 @@ def extractCount(extract_dict):
         extract_count += 1
     return extract_count
 
+# 字符编码标准化
+def str_normalize(text):
+    # time1 = time.time()
+    cn_punctuation = "¥,。:;{}!?()"
+    text_split = re.split("([{}])+".format(cn_punctuation),text)
+    # print(text_split)
+    new_text = ""
+    for s in text_split:
+        if re.search("^[{}]+$".format(cn_punctuation),s):
+            new_text += s
+        else:
+            new_text += normalize('NFKD', s)
+    # print("str_normalize cost time %s"%str(time.time()-time1))
+    # print(new_text)
+
+    return new_text
+
 def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',**kwargs):
     cost_time = dict()
 
     start_time = time.time()
     log("start process doc %s"%(str(doc_id)))
+    # 字符编码标准化
+    text = str_normalize(text)
     list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time, web_source_no]],useselffool=True)
     log("get preprocessed done of doc_id%s"%(doc_id))
     cost_time["preprocess"] = round(time.time()-start_time,2)

+ 143 - 13
BiddingKG/dl/interface/getAttributes.py

@@ -1,6 +1,6 @@
 
 
-from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date,API_URL,uniform_package_name,money_process
+from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date,API_URL,uniform_package_name,money_process,getDigitsDic,isValidDate
 from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
 from decimal import Decimal
 import re
@@ -3040,7 +3040,43 @@ def turnBidWay(bidway):
     else:
         return "其他"
 
-my_time_format_pattern = re.compile("((?P<year>\d{4}|\d{2})\s*[-\/年\.]\s*(?P<month>\d{1,2})\s*[-\/月\.]\s*(?P<day>\d{1,2}))")
+def turnMoneySource(moneysource):
+    result_list = []
+    if re.search("自筹|业主筹集|筹资|自有",moneysource):
+        result_list.append("自筹")
+    if re.search("财政",moneysource) and not re.search("非财政",moneysource):
+        result_list.append("财政资金")
+    if re.search("拨款|补助|划拨|拨付|国拨|上级资金",moneysource):
+        result_list.append("上级拨款")
+    if re.search("社会资本|社会资金",moneysource):
+        result_list.append("社会资本")
+    if re.search("贷款|借款|借贷",moneysource):
+        result_list.append("贷款资金")
+    if re.search("债券|债|国债",moneysource):
+        result_list.append("债券资金")
+    if re.search("专项|项目资金",moneysource):
+        result_list.append("项目专项资金")
+    if re.search("配套",moneysource):
+        result_list.append("配套资金")
+    if re.search("外资",moneysource):
+        result_list.append("外资")
+    if re.search("国有资金|国企资金|国资|国家投资",moneysource):
+        result_list.append("国有资金")
+    if re.search("投资|融资",moneysource):
+        result_list.append("投资资金")
+    if re.search("预算(?<!外)|预算内",moneysource):
+        result_list.append("预算内资金")
+    if re.search("预算外",moneysource):
+        result_list.append("预算外资金")
+
+    result_list = sorted(result_list,key = lambda x:x)
+    if len(result_list)>0 and len(result_list)<5:
+        return ",".join(result_list)
+    else:
+        return "其他资金"
+
+my_time_format_pattern = re.compile("((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
+from BiddingKG.dl.ratio.re_ratio import getUnifyNum
 import time
 def my_timeFormat(_time):
     current_year = time.strftime("%Y",time.localtime())
@@ -3060,24 +3096,52 @@ def my_timeFormat(_time):
                 if k=="day":
                     day = v
             if year!="":
-                if len(year)==2:
-                    year = "20"+year
-                if int(year)>int(current_year):
-                    legal = False
+                if re.search("^\d+$", year):
+                    if len(year) == 2:
+                        year = "20" + year
+                    if int(year) > int(current_year):
+                        legal = False
+                else:
+                    _year = ""
+                    for word in year:
+                        if word == '0':
+                            _year += word
+                        else:
+                            _year += str(getDigitsDic(word))
+                    year = _year
             else:
                 legal = False
             if month!="":
-                if int(month)>12:
-                    legal = False
+                if re.search("^\d+$", month):
+                    if int(month) > 12:
+                        legal = False
+                else:
+                    month = int(getUnifyNum(month))
+                    if month >= 1 and month <= 12:
+                        month = str(month)
+                    else:
+                        legal = False
             else:
                 legal = False
             if day!="":
-                if int(day)>31:
-                    legal = False
+                if re.search("^\d+$", day):
+                    if int(day) > 31:
+                        legal = False
+                else:
+                    day = int(getUnifyNum(day))
+                    if day >= 1 and day <= 31:
+                        day = str(day)
+                    else:
+                        legal = False
             else:
                 legal = False
+            if not isValidDate(int(year),int(month),int(day)):
+                legal = False
             if legal:
-                # return "%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0"))
+                # 数字字符格式化
+                year = str(int(year))
+                month = str(int(month))
+                day = str(int(day))
                 time_list.append("%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0")))
     return time_list
 
@@ -3119,12 +3183,67 @@ def getTimeAttributes(list_entity,list_sentence):
         sentence_text = list_sentence[entity.sentence_index].sentence_text
         entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin]
         entity_left2 = sentence_text[max(0, entity.wordOffset_begin - 10):entity.wordOffset_begin]
+        entity_left3 = sentence_text[max(0, entity.wordOffset_begin - 20):entity.wordOffset_begin]
         entity_right = sentence_text[entity.wordOffset_end:entity.wordOffset_end + 3]
         label_prob = entity.values[entity.label]
         entity_text = entity.entity_text
         in_attachment = entity.in_attachment
         extract_time = my_timeFormat(entity_text)
+        # definite_time = "00:00:00"
+        # if extract_time:
+        #     t = re.compile("(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[::时点](?P<half_hour>半)?(?P<minute>\d{2})?[::分]?(?P<second>\d{2})?秒?")
+        #     t_in_word = re.search(t,entity_text)
+        #     t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,sentence_text[entity.wordOffset_end:])
+        #     if t_in_word:
+        #         print('t_in_word',entity_text,t_in_word.groupdict())
+        #         day = t_in_word.groupdict().get('day',"")
+        #         hour = t_in_word.groupdict().get('hour',"")
+        #         half_hour = t_in_word.groupdict().get('half_hour',"")
+        #         minute = t_in_word.groupdict().get('minute',"")
+        #         second = t_in_word.groupdict().get('second',"")
+        #         if hour:
+        #             if day=='下午' and int(hour)<12:
+        #                 hour = str(int(hour)+12)
+        #             if int(hour)>24:
+        #                 continue
+        #         else:
+        #             hour = "00"
+        #         if not minute:
+        #             if half_hour:
+        #                 minute = "30"
+        #             else:
+        #                 minute = "00"
+        #         if int(minute)>60:
+        #             continue
+        #         if not second:
+        #             second = "00"
+        #         if int(second)>60:
+        #             continue
+        #         # 数字字符格式化
+        #         # hour = str(int(hour))
+        #         # minute = str(int(minute))
+        #         # second = str(int(second))
+        #         definite_time = "%s:%s:%s"%(hour.rjust(2,"0"),minute.rjust(2,"0"),second.rjust(2,"0"))
+        #         print(definite_time)
+        #
+        #     elif t_out_of_word:
+        #         print('t_out_of_word', entity_text+sentence_text[entity.wordOffset_end:], t_out_of_word.groupdict())
+
+
+
         if extract_time:
+            # 优化多个并列的时间,如:开标时间和截标时间,截标时间和报名结束时间
+            if entity.label in [2,3,9]:
+                if entity.label==2 and re.search("截标|投标.{,2}截止|递交(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
+                    dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
+                if entity.label==3 and re.search("开标|评审.{,2}(?:开始)?时间|选取.{,2}时间",entity_left3):
+                    dict_time['time_bidopen'].append((extract_time[0], 0.5, in_attachment))
+                if entity.label==3 and re.search("报名",entity_left3):
+                    dict_time['time_registrationEnd'].append((extract_time[0], 0.5, in_attachment))
+                if entity.label==9 and re.search("截标|投标.{,2}截止|递交(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
+                    dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
+
+
             # 2022/12/12 新增挂牌时间正则
             if re.search("挂牌.{,4}(?:时间|日期)",entity_left2):
                 if re.search("挂牌.{,4}(?:时间|日期)",entity_left2).end()>len(entity_left2)/2:
@@ -3310,12 +3429,22 @@ def getOtherAttributes(list_entity):
                   "total_tendereeMoney":0,
                   "total_tendereeMoneyUnit":''}
     list_serviceTime = []
+    last_moneysource_prob = 0
     for entity in list_entity:
         if entity.entity_type == 'bidway':
             dict_other["bidway"] = turnBidWay(entity.entity_text)
         elif entity.entity_type=='moneysource':
-            dict_other["moneysource"] = entity.entity_text
+            if dict_other["moneysource"] and entity.in_attachment:
+                continue
+            if not dict_other["moneysource"]:
+                dict_other["moneysource"] = entity.entity_text
+                last_moneysource_prob = entity.prob
+            elif entity.prob>last_moneysource_prob:
+                dict_other["moneysource"] = entity.entity_text
+                last_moneysource_prob = entity.prob
         elif entity.entity_type=='serviceTime':
+            if list_serviceTime and entity.in_attachment:
+                continue
             if re.search("[^之]日|天|年|月|周|星期", entity.entity_text) or re.search("\d{4}[\-\./]\d{1,2}", entity.entity_text):
                 list_serviceTime.append(entity)
         elif entity.entity_type=="person" and entity.label ==4:
@@ -3331,7 +3460,8 @@ def getOtherAttributes(list_entity):
         max_prob_serviceTime = [ent for ent in list_serviceTime if ent.prob==max_prob]
         max_prob_serviceTime.sort(key=lambda x:(x.sentence_index,x.begin_index))
         dict_other["serviceTime"] = max_prob_serviceTime[0].entity_text
-
+    if dict_other['moneysource']:
+        dict_other['moneysource'] = turnMoneySource(dict_other['moneysource'])
     # dict_other["product"] = list(set(dict_other["product"])) # 已在添加时 顺序去重保留
     return dict_other
 

+ 11 - 7
BiddingKG/dl/money/moneySource/ruleExtra.py

@@ -10,8 +10,8 @@ def re_rule():
     data = pd.read_csv("C:\\Users\\admin\\Desktop\\alldata_error.csv", index_col=0)
 
     rule = re.compile("(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(于|是|为|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)" 
-              "(?P<moneySource>([^,,。;;已]*(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|" 
-              "大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]*(资本[金]|资金|自筹|贷款|补助|拨款|"
+              "(?P<moneySource>([^,,。;;已]{,20}(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|" 
+              "大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]{,20}(资本[金]|资金|自筹|贷款|补助|拨款|"
                "财政|其[他它]|自行支付|成本|筹[集措]|大修|拨付|国有|集体|工程款|自有|投资|国资|外资)[::.、\d]*%[,,;;]?)*)")
     num = 0
     moneySourceList = []
@@ -76,13 +76,13 @@ def re_rule():
 
 def extract_moneySource(text):
     rule = re.compile("(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(于|是|为|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)"
-                      "(?P<moneySource>([^,,。;;已]*(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|"
-                      "大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]*(资本[金]|资金|自筹|贷款|补助|拨款|"
+                      "(?P<moneySource>([^,,。;;已]{,30}(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|"
+                      "大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]{,30}(资本[金]|资金|自筹|贷款|补助|拨款|"
                       "财政|其[他它]|自行支付|成本|筹[集措]|大修|拨付|国有|集体|工程款|自有|投资|国资|外资)[::.、\d]*%[,,;;]?)*)")
 
     re1 = re.compile("(资[金佥]来[源自][^已]|建设资[金佥][^已]|项目资[金佥][^已,,。.;;]|资[金佥]性质)")
     re2 = re.compile(r"(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(是|为|于|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)"
-                     r"(?P<moneySource>[^,,。;;已]{2,}?)[,。;,]")
+                     r"(?P<moneySource>[^,,。;;已]{4,}?)[,。;,]")
     re_error = re.compile(r"核查|合法|紧张|null|分配|承诺|执行|已落实|已到位|批准|审计|调整|证明|监管|报告|完成|说明|使用|/|签订|规定|总价|未支付|主体|分析|是否")
 
     sub = re.compile("[::。]|^[.,,、\)]|[,,;;]$|及采购预算|[及和]?其?落实情况|预算金额及服务期限|[1-9]、|及?招标控制价|"
@@ -115,6 +115,7 @@ def extract_moneySource(text):
                 # print(groupdict1)
                 if source1:
                     groupdict1["index"] = word_index
+                    groupdict1["prob"] = 0.9
                     # print(groupdict1['index'])
                     results.append(groupdict1)
             word_index += len(item)
@@ -127,8 +128,9 @@ def extract_moneySource(text):
                     groupdict2 = res.groupdict()
                     source2 = groupdict2['moneySource']
                     # print("source2==>",source2)
-                    if source2 and not re_error.search(source2):
+                    if source2 and not re_error.search(res.group()):
                         groupdict2["index"] = copy_index
+                        groupdict2["prob"] = 0.8
                         results.append(groupdict2)
                 copy_index += len(item)
     first = []
@@ -148,7 +150,7 @@ def extract_moneySource(text):
     for result in first:
         entity_text = sub.sub("",result['moneySource'])
         # wordOffset_begin = result['index'] + re.search(entity_text,result['start']+result['moneySource']).start()
-        if entity_text is None:
+        if entity_text is None or len(entity_text)>40:
             continue
         else:
             wordOffset_begin = result['index'] + (result['start']+result['moneySource']).find(entity_text)
@@ -158,6 +160,7 @@ def extract_moneySource(text):
             _moneySource['body'] = entity_text
             _moneySource['begin_index'] = wordOffset_begin
             _moneySource['end_index'] = wordOffset_end
+            _moneySource['prob'] = result['prob']
             # print(_moneySource)
             list_moneySource.append(_moneySource)
     return list_moneySource
@@ -168,6 +171,7 @@ if __name__ == '__main__':
     # re_rule()
     test ="a建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,as,建设资金来自呜呜呜。" \
           "1、采购内容及资金来源:采购内容为汉上实验学校采购24台3匹柜机空调。资金来源为财政资金。"
+    # test = ",资金来源是否都是要具体到每条来源明细,"
     # 11,23 35,37
     print(extract_moneySource(test))
     pass

+ 1 - 1
BiddingKG/dl/ratio/re_ratio.py

@@ -182,7 +182,7 @@ def getUnifyNum(money):
                         result += Decimal(getDigitsDic(subMoneys[0])) * (getMultipleFactor(factorUnit))
                 # subMoneys[0]中无金额单位,不可再拆分
                 elif subMoneys[0] == "":
-                    result += 0
+                    result += getMultipleFactor(factorUnit)
                 elif re.search(re.compile("[%s]" % ("".join(chnFactorUnits))), subMoneys[0]) is None:
                     # print(subMoneys)
                     # subMoneys[0] = subMoneys[0][0]

Неке датотеке нису приказане због велике количине промена