Forráskód Böngészése

调整金额正则,新增返回金额单位,缩小channel输入字数。

bidi 3 éve
szülő
commit
e13e72c1f6

+ 4 - 1
BiddingKG/dl/interface/Entitys.py

@@ -167,6 +167,7 @@ class Entity():
         self.is_tail = False
         self.person_phone = person_phone
         self.notes = ''  # 2021/7/20 新增,保存金额大小写,单位等备注
+        self.money_unit = '' #2021/8/17 新增,保存金额单位 元、万元 、亿元
 
     def set_Role(self,role_label,role_values):
         self.label = int(role_label)
@@ -255,10 +256,12 @@ class Role():
         self.money = money
         self.money_prob = money_prob
         self.linklist = linklist
+        self.money_unit = '' # 2021/8/17 新增 保存金额单位
         
     def getString(self):
         self.linklist = [item for item in set(self.linklist)]
-        result = [self.role_name,fitDataByRule(self.entity_text),self.money,self.linklist]
+        # result = [self.role_name,fitDataByRule(self.entity_text),self.money,self.linklist]
+        result = [self.role_name,fitDataByRule(self.entity_text),self.money,self.linklist, self.money_unit]
         return result
 
 # 用于KM算法的组合配对

+ 7 - 4
BiddingKG/dl/interface/Preprocessing.py

@@ -1688,9 +1688,9 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
             #                       "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万元]+)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())",
             #                       "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只]*))[\))]?)"}
             list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                                  "key_word":"((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|标的基本情况|CNY|成交结果:)(?:[,(\(]*\s*(?P<unit_key_word_before>[万元]*(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号]{,8}?))(第[123一二三]名[::])?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万元]*(?P<filter_unit1>[台个只吨斤棵株页亩方条米]*))\s*[)\)]?))",
-                                  "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万元]+)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())",
-                                  "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
+                                  "key_word":"((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|标的基本情况|CNY|成交结果:)(?:[,(\(]*\s*(?P<unit_key_word_before>[万亿]?元?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号]{,8}?))(第[123一二三]名[::])?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万亿]?元?(?P<filter_unit1>[台个只吨斤棵株页亩方条米]*))\s*[)\)]?))",
+                                  "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?元)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)())",
+                                  "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千]*)[\((]?(?P<unit_behind_m>[万亿]?元(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
             # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。
 
             pattern_money = re.compile("%s|%s|%s|%s"%(list_money_pattern["cn"],list_money_pattern["key_word"],list_money_pattern["behind_m"],list_money_pattern["front_m"]))
@@ -1855,8 +1855,9 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                         # print("补充备注:单价 ",sentence_text[_match.span()[0]-2:_match.span()[1]])
                     if len(unit)>0:
                         if unit.find('万')>=0 and len(entity_text.split('.')[0])>=8: # 2021/7/19 修正万元金额过大的情况
-                            # print('修正单位万元金额过大的情况 金额:', entity_text, '单位:', unit)
+                            print('修正单位万元金额过大的情况 金额:', entity_text, '单位:', unit)
                             entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(unit[0])/10000)
+                            unit = '元' # 修正金额后单位 重置为元
                         else:
                             # print('str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0])):')
                             entity_text = str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0]))
@@ -1885,6 +1886,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                         if float(entity_text)>1:
                             list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,begin_index_temp,end_index_temp))
                             list_sentence_entitys[-1].notes = notes  # 2021/7/20 新增金额备注
+                            list_sentence_entitys[-1].money_unit = unit  # 2021/8/17 新增金额单位
+                            # print('预处理中的 金额:%s, 单位:%s'%(entity_text,unit))
                 else:
                     index += 1
 

+ 17 - 13
BiddingKG/dl/interface/extract.py

@@ -130,17 +130,21 @@ def test(name,content):
 
 if __name__=="__main__":
     import pandas as pd
-
-    df = pd.read_excel('G:\公告金额/170角色金额原模型预测错误数据_new3为新预测中标金额_predict0812.xlsx')
-    new_prem = []
-    for i in range(len(df)):
-    # i = 246
-        doc_id = df.loc[i, 'docid']
-        text = df.loc[i, 'dochtmlcon']
-        title = df.loc[i, 'doctitle']
-        rs = predict(doc_id,text,title)
-        # print(rs)
-        new_prem.append(rs)
-    df['new_prem'] = pd.Series(new_prem)
-    df.to_excel('G:\公告金额/170角色金额原模型预测错误数据_new3为新预测中标金额_predict0813.xlsx')
+    t1 = time.time()
+    text = '中标人:广州中医药有限公司,招标人:广州市第一人民医院, 代理机构:希达招标代理有限公司。招标金额:100万元, 手续费:100元,总投资:1亿元。中标金额:50000元。合同金额:50000万元。'
+    title = '合同公告'
+    print(predict('',text,title))
+    # df = pd.read_excel('G:\公告金额/170角色金额原模型预测错误数据_new3为新预测中标金额_predict0812.xlsx')[:20]
+    # new_prem = []
+    # for i in range(len(df)):
+    # # i = 246
+    #     doc_id = df.loc[i, 'docid']
+    #     text = df.loc[i, 'dochtmlcon']
+    #     title = df.loc[i, 'doctitle']
+    #     rs = predict(doc_id,text,title)
+    #     # print(rs)
+    #     new_prem.append(rs)
+    # df['new_prem'] = pd.Series(new_prem)
+    # print('耗时:', time.time()-t1)
+    # df.to_excel('G:\公告金额/170角色金额原模型预测错误数据_new3为新预测中标金额_predict0813.xlsx')
     pass

+ 17 - 4
BiddingKG/dl/interface/getAttributes.py

@@ -795,11 +795,14 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                 if packDict[packageName]["roleList"][i].money_prob==0 :  # 2021/7/20第一次更新金额
                     packDict[packageName]["roleList"][i].money = money.entity_text
                     packDict[packageName]["roleList"][i].money_prob = money_prob
+                    packDict[packageName]["roleList"][i].money_unit = money.money_unit
                 elif money_prob>packDict[packageName]["roleList"][i].money_prob+0.2 or money.notes in ['大写']: # 2021/7/20改为优先选择大写金额,
                     # print('已连接金额概率:money_prob:',packDict[packageName]["roleList"][i].money_prob)
                     # print('链接金额备注 ',money.notes, money.entity_text, money.values)
                     packDict[packageName]["roleList"][i].money = money.entity_text
                     packDict[packageName]["roleList"][i].money_prob = money_prob
+                    packDict[packageName]["roleList"][i].money_unit = money.money_unit
+                # print('链接中的金额:{0}, 单位:{1}'.format(money.entity_text, money.money_unit))
         return packDict
     
     #根据实体名称得到角色
@@ -1826,6 +1829,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
     set_tenderer_money = set()
     list_tenderer_money = []  #2021/7/16 新增列表,倒序保存所有中标金额
+    unit_list = [] #2021/8/17 新增,保存金额单位
 
     #遍历所有实体
     while(p_entity>=0):
@@ -1835,6 +1839,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                 if str(entity.label)=="1":
                     set_tenderer_money.add(float(entity.entity_text))
                     list_tenderer_money.append(float(entity.entity_text))  # 2021/7/16 新增列表,倒序保存所有中标金额
+                    unit_list.append(entity.money_unit)
                 # if str(entity.label)=="0":
                 if str(entity.label)=="0" and entity.notes!='总投资':
                     '''
@@ -1855,8 +1860,10 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                         #     PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
                         if entity.values[entity.label]>on_value:
                             PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
+                            PackDict["Project"]["tendereeMoneyUnit"] = entity.money_unit
                     else:
                         PackDict[packageName]["tendereeMoney"] = float(entity.entity_text)
+                        PackDict[packageName]["tendereeMoneyUnit"] = entity.money_unit
                         #add pointer_tendereeMoney
                         packagePointer.pointer_tendereeMoney = entity
         p_entity -= 1            
@@ -1888,6 +1895,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
     #只找到一个中标人和中标金额
     if len(set_tenderer_money)==1 and len(set_tenderer_role)==1:
         list(set_tenderer_role)[0].money = list(set_tenderer_money)[0]
+        list(set_tenderer_role)[0].money_unit = unit_list[0]
         # print('一个中标人一个金额:', list(set_tenderer_money)[0])
     #找到一个中标人和多个招标金额
     if len(set_tenderer_money)>1 and len(set_tenderer_role)==1:
@@ -1904,9 +1912,11 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
             # list(set_tenderer_role)[0].money = _maxMoney
             if min(list_tenderer_money)>200000 and list_tenderer_money[-1]/min(list_tenderer_money)>9000:
                 list(set_tenderer_role)[0].money = min(list_tenderer_money)
+                list(set_tenderer_role)[0].money_unit = unit_list[list_tenderer_money.index(min(list_tenderer_money))]
                 # print('一人多金额 且最小的大于20万第一个金额比最小金额大几千倍的最小中标金额:', min(list_tenderer_money))
             else:
                 list(set_tenderer_role)[0].money = list_tenderer_money[-1]  # 2021/7/16 修改 不是单价合计方式取第一个中标金额
+                list(set_tenderer_role)[0].money_unit = unit_list[-1] # 金额单位
                 # print('一人多金额 取第一个中标金额:', list_tenderer_money[-1])
     #每个包都只找到一个金额
     _flag_pack_money = True
@@ -1955,13 +1965,14 @@ def initPackageAttr(RoleList,PackageSet):
     @summary: 根据拿到的roleList和packageSet初始化接口返回的数据
     '''   
     packDict = dict()
-    packDict["Project"] = {"code":"","tendereeMoney":0,"roleList":[]}
+    packDict["Project"] = {"code":"","tendereeMoney":0,"roleList":[], 'tendereeMoneyUnit':''}
     for item in list(PackageSet):
-        packDict[item] = {"code":"","tendereeMoney":0,"roleList":[]}
+        packDict[item] = {"code":"","tendereeMoney":0,"roleList":[], 'tendereeMoneyUnit':''}
     for item in RoleList:
         if packDict[item.packageName]["code"] =="":
             packDict[item.packageName]["code"] = item.packageCode
-        packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[]))
+        # packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[]))
+        packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[])) #Role(角色名称,实体名称,角色阈值,金额,金额阈值,连接列表);金额单位 money_unit 不作为参数传入,初始为空字符串,在链接金额时再赋值
     return packDict
                 
 def getPackageRoleMoney(list_sentence,list_entity):
@@ -2016,7 +2027,8 @@ def getOtherAttributes(list_entity):
                   "time_bidclose":"",
                   "serviceTime":"",
                   "product":[],
-                  "total_tendereeMoney":0}
+                  "total_tendereeMoney":0,
+                  "total_tendereeMoneyUnit":''}
     for entity in list_entity:
         if entity.entity_type == 'bidway':
             dict_other["bidway"] = turnBidWay(entity.entity_text)
@@ -2036,6 +2048,7 @@ def getOtherAttributes(list_entity):
             dict_other["product"].append(entity.entity_text)
         elif entity.entity_type=='money' and entity.notes=='总投资' and dict_other["total_tendereeMoney"]<float(entity.entity_text):
             dict_other["total_tendereeMoney"] = float(entity.entity_text)
+            dict_other["total_tendereeMoneyUnit"] = entity.money_unit
     dict_other["product"] = list(set(dict_other["product"]))
     return dict_other
 

+ 1 - 1
BiddingKG/dl/interface/predictor.py

@@ -1629,7 +1629,7 @@ class DocChannel():
     if isinstance(content, list):
       token_l = [it.tokens for it in content]
       tokens = [it for l in token_l for it in l]
-      content = ' '.join(tokens)
+      content = ' '.join(tokens[:500])
 
     data_content, data_title = self.predict_process(docid='', doctitle=title[:50], dochtmlcon=content) # 标题最多取50字
     text_len = len(data_content[0]) if len(data_content[0])<self.sequen_len else self.sequen_len