Parcourir la source

Merge remote-tracking branch 'origin/master'

lsm il y a 2 ans
Parent
commit
39b0f82c23

+ 18 - 16
BiddingKG/dl/entityLink/entityLink.py

@@ -191,26 +191,28 @@ SET_TAIL_ENTERPRISE_HUGE_FILE = "SET_TAIL_ENTERPRISE_HUGE.pk"
 def getDict_enterprise():
     global DICT_ENTERPRISE_DONE,SET_ENTERPRISE,SET_PREFIX_ENTERPRISE,SET_TAIL_ENTERPRISE
     real_path,is_huge = getEnterprisePath()
+    _ok = False
     if is_huge:
         if os.path.exists(SET_PREFIX_ENTERPRISE_HUGE_FILE) and os.path.exists(SET_TAIL_ENTERPRISE_HUGE_FILE):
             SET_PREFIX_ENTERPRISE = load(SET_PREFIX_ENTERPRISE_HUGE_FILE)
             SET_TAIL_ENTERPRISE = load(SET_TAIL_ENTERPRISE_HUGE_FILE)
-        else:
-            with open(real_path,"r",encoding="UTF8") as f:
-                for _e in f:
-                    if not _e:
-                        continue
-                    _e = _e.strip()
-                    if len(_e)>=4:
-                        key_enter = _e[:ENTERPRISE_KEY_LEN]
-                        SET_PREFIX_ENTERPRISE.add(key_enter)
-                        SET_TAIL_ENTERPRISE.add(_e[-ENTERPRISE_TAIL_LEN:])
-                        if not is_huge:
-                            SET_ENTERPRISE.add(_e)
-            #仅在大文件情况下才使用缓存加载
-            if is_huge:
-                save(SET_PREFIX_ENTERPRISE,SET_PREFIX_ENTERPRISE_HUGE_FILE)
-                save(SET_TAIL_ENTERPRISE,SET_TAIL_ENTERPRISE_HUGE_FILE)
+            _ok = True
+    if not _ok:
+        with open(real_path,"r",encoding="UTF8") as f:
+            for _e in f:
+                if not _e:
+                    continue
+                _e = _e.strip()
+                if len(_e)>=4:
+                    key_enter = _e[:ENTERPRISE_KEY_LEN]
+                    SET_PREFIX_ENTERPRISE.add(key_enter)
+                    SET_TAIL_ENTERPRISE.add(_e[-ENTERPRISE_TAIL_LEN:])
+                    if not is_huge:
+                        SET_ENTERPRISE.add(_e)
+        #仅在大文件情况下才使用缓存加载
+        if is_huge:
+            save(SET_PREFIX_ENTERPRISE,SET_PREFIX_ENTERPRISE_HUGE_FILE)
+            save(SET_TAIL_ENTERPRISE,SET_TAIL_ENTERPRISE_HUGE_FILE)
 
 
     log("SET_PREFIX_ENTERPRISE takes memory:%.2fM size:%d"%(sys.getsizeof(SET_PREFIX_ENTERPRISE)/1024/1024,len(SET_PREFIX_ENTERPRISE)))

+ 31 - 12
BiddingKG/dl/interface/Entitys.py

@@ -204,6 +204,7 @@ class Entity():
         self.origin_entity_text = ''  # 2022/1/5 新增,记录字典替换的原来的实体名
         self.in_attachment = in_attachment  # 2022/02/10添加,实体是否在附件中
         self.prob = prob  # 2022/06/20添加,实体的概率
+        self.ratio_value = None # 2022/10/18 新增费率处理数据,(value,ratio_type) 费率数值,类型
 
     def set_Role(self,role_label,role_values):
         self.label = int(role_label)
@@ -294,7 +295,7 @@ class Role():
         self.linklist = linklist
         self.money_unit = '' # 2021/8/17 新增 保存金额单位
         # 中投标人属性
-        self.ratio = "" #2022/01/06 新增 保存中投标金额相关费率
+        self.ratio = None #2022/01/06 新增 保存中投标金额相关费率 (ratio_value,ratio_type)
         self.serviceTime = "" #2021/01/06 新增 保存服务期限(工期)
         self.address = ""  #2022/08/08 新增 角色地址
 
@@ -307,17 +308,35 @@ class Role():
         downward_floating_ratio = "" # 下浮率
         discount_ratio = "" # 折扣率/费率
         if self.ratio:
-            num_value = re.search("[\d\.]+",self.ratio).group()
-            num_value = float(num_value)
-            if re.search("%|百分之",self.ratio):
-                num_value = num_value / 100
-            num_value = str('%.4f'%(num_value))
-            if re.search("上浮",self.ratio):
-                floating_ratio = num_value
-            elif re.search("下浮",self.ratio):
-                downward_floating_ratio = num_value
-            else:
-                discount_ratio = num_value
+            # num_value = re.search("\d+(?:\.\d+)?",self.ratio).group()
+            # num_value = float(num_value)
+            # _decimal = str(num_value).split('.')[1]
+            # if _decimal=='0':
+            #     round_len = 0
+            # else:
+            #     round_len = len(_decimal)
+            # if re.search("%|百分之",self.ratio):
+            #     num_value = num_value * 0.01
+            #     round_len += 2
+            # elif re.search("‰|千分之",self.ratio):
+            #     num_value = num_value * 0.001
+            #     round_len += 3
+            # num_value = str(round(num_value,round_len))
+            #
+            # if re.search("上浮",self.ratio):
+            #     floating_ratio = num_value
+            # elif re.search("下浮",self.ratio):
+            #     downward_floating_ratio = num_value
+            # else:
+            #     discount_ratio = num_value
+            ratio_type = self.ratio[1]
+            ratio_value = str(self.ratio[0])
+            if ratio_type=='floating_ratio':
+                floating_ratio = ratio_value
+            elif ratio_type=='downward_floating_ratio':
+                downward_floating_ratio = ratio_value
+            elif ratio_type=='discount_ratio':
+                discount_ratio = ratio_value
         result = {'role_name':self.role_name,'role_text':fitDataByRule(self.entity_text),
                   'role_money': {'money':self.money,'money_unit':self.money_unit,'floating_ratio':floating_ratio,'downward_floating_ratio':downward_floating_ratio,'discount_ratio':discount_ratio},
                   'linklist': self.linklist,'serviceTime':self.serviceTime,'address':self.address}

+ 5 - 3
BiddingKG/dl/interface/Preprocessing.py

@@ -2928,9 +2928,11 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                         break
                 entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                 entity_text = ratio['body']
-                list_sentence_entitys.append(
-                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
-                           begin_index_temp, end_index_temp,in_attachment=in_attachment))
+                ratio_value = (ratio['value'],ratio['type'])
+                _entity = Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
+                           begin_index_temp, end_index_temp,in_attachment=in_attachment)
+                _entity.ratio_value = ratio_value
+                list_sentence_entitys.append(_entity)
 
             list_sentence_entitys.sort(key=lambda x:x.begin_index)
             list_entitys_temp = list_entitys_temp+list_sentence_entitys

+ 84 - 49
BiddingKG/dl/interface/getAttributes.py

@@ -31,6 +31,12 @@ dict_role_id = {"0":"tenderee",
                 "3":"second_tenderer",
                 "4":"third_tenderer"}
 
+role2id_dict = {"tenderee":0,  # role name -> integer role id (dict_role_id above maps string ids to these names)
+                "agency":1,
+                "win_tenderer":2,
+                "second_tenderer":3,
+                "third_tenderer":4}  # the ids are used further down to index an entity's `values` when filling last_role_prob
+
 def getPackage(packageList,sentence_index,begin_index,roleid,MAX_DIS=None,DIRECT=None):
     '''
     @param:
@@ -1097,7 +1103,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     def addRatioByEntity(packDict,packageName,entity,ratio):
         for i in range(len(packDict[packageName]["roleList"])):
             if packDict[packageName]["roleList"][i].entity_text==entity:
-                packDict[packageName]["roleList"][i].ratio = ratio.entity_text
+                packDict[packageName]["roleList"][i].ratio = ratio.ratio_value
     def addServiceTimeByEntity(packDict,packageName,entity,serviceTime):
         for i in range(len(packDict[packageName]["roleList"])):
             if packDict[packageName]["roleList"][i].entity_text==entity:
@@ -1499,8 +1505,9 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                        '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
                        '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|'
                        '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|'
+                       '400\d{7}转\d{1,4}|'
                        '[2-9]\d{6,7}')
-    url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$\-_@.&+=\?:/]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
+    url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[#$\-_@.&+=\?:/]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
     email_pattern = re.compile("[a-zA-Z0-9][a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*@"
                             "[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*(?:\.[a-zA-Z]{2,})")
     phone_entitys = []
@@ -1554,7 +1561,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                 pass
             else:
                 # 排除“传真号”和其它错误项
-                if re.search("传,?真|信,?箱|邮,?[箱件]|QQ|qq", phone_left):
+                if re.search("传,?真|信,?箱|邮,?[箱件]|QQ|qq", phone_left):
                     if not re.search("电,?话", phone_left):
                         error_numStr_index.append(numStr_index)
                         last_phone_mask = False
@@ -1596,6 +1603,20 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                             error_numStr_index.append(numStr_index)
                             last_phone_mask = False
                             continue
+                left_context = re.search("[\da-zA-Z\-—-―]+$",sentence_text[:item[1]])
+                if left_context:
+                    if len(left_context.group()) != len("".join(re.findall(phone, left_context.group()))):
+                    # if not re.search("(" + phone.pattern + ")$", left_context.group()):
+                        error_numStr_index.append(numStr_index)
+                        last_phone_mask = False
+                        continue
+                right_context = re.search("^[\da-zA-Z\-—-―]+", sentence_text[item[2]:])
+                if right_context:
+                    if len(right_context.group()) != len("".join(re.findall(phone, right_context.group()))):
+                    # if not re.search("^(" + phone.pattern + ")", right_context.group()):
+                        error_numStr_index.append(numStr_index)
+                        last_phone_mask = False
+                        continue
                 # if:上一个phone实体不符合条件
                 if not last_phone_mask:
                     item_start = item[1]
@@ -1771,52 +1792,58 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 break
                 # print(3,combo[0].entity_text,combo[1].entity_text)
 
-        # "公司——地址" 链接规则补充
-        company_lacation_EntityList = [ent for ent in pre_entity if ent.entity_type in ['company', 'org', 'location']]
-        company_lacation_EntityList = sorted(company_lacation_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
-        t_match_list = []
-        for ent_idx in range(len(company_lacation_EntityList)):
-            entity = company_lacation_EntityList[ent_idx]
-            if entity.entity_type in ['company', 'org']:
-                match_nums = 0
-                company_nums = 0  # 经过其他公司的数量
-                location_nums = 0  # 经过电话的数量
-                for after_index in range(ent_idx + 1, min(len(company_lacation_EntityList), ent_idx + 5)):
-                    after_entity = company_lacation_EntityList[after_index]
-                    if after_entity.entity_type == "location":
-                        distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
-                                tokens_num_dict[entity.sentence_index] + entity.end_index)
-                        location_nums += 1
-                        if distance > 100 or location_nums >= 3:
-                            break
-                        sentence_distance = after_entity.sentence_index - entity.sentence_index
-                        value = (-1 / 2 * (distance ** 2)) / 10000
-                        if sentence_distance == 0:
-                            if distance < 80:
-                                t_match_list.append(Match(entity, after_entity, value))
-                                match_nums += 1
-                                if company_nums:
-                                    break
-                        else:
-                            if distance < 50:
-                                t_match_list.append(Match(entity, after_entity, value))
-                                match_nums += 1
-                                if company_nums:
-                                    break
+    # "公司——地址" 链接规则补充
+    company_lacation_EntityList = [ent for ent in pre_entity if ent.entity_type in ['company', 'org', 'location']]
+    # company_lacation_EntityList = [ent for ent in pre_entity if (ent.entity_type in ['company', 'org'] and ent.label!=5) or ent.entity_type=="location"]
+    company_lacation_EntityList = sorted(company_lacation_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
+    t_match_list = []
+    for ent_idx in range(len(company_lacation_EntityList)):
+        entity = company_lacation_EntityList[ent_idx]
+        if entity.entity_type in ['company', 'org'] and entity.label!=5:
+            match_nums = 0
+            company_nums = 0  # 经过其他公司的数量
+            location_nums = 0  # 经过电话的数量
+            for after_index in range(ent_idx + 1, min(len(company_lacation_EntityList), ent_idx + 5)):
+                after_entity = company_lacation_EntityList[after_index]
+                if after_entity.entity_type == "location":
+                    distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
+                            tokens_num_dict[entity.sentence_index] + entity.end_index)
+                    location_nums += 1
+                    if distance > 100 or location_nums >= 3:
+                        break
+                    sentence_distance = after_entity.sentence_index - entity.sentence_index
+                    value = (-1 / 2 * (distance ** 2)) / 10000
+                    if sentence_distance == 0:
+                        if distance < 80:
+                            t_match_list.append(Match(entity, after_entity, value))
+                            match_nums += 1
+                            if company_nums:
+                                break
                     else:
-                        # type:company/org
-                        company_nums += 1
-                        if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
-                            break
+                        if distance < 50:
+                            t_match_list.append(Match(entity, after_entity, value))
+                            match_nums += 1
+                            if company_nums:
+                                break
+                else:
+                    # type:company/org
+                    company_nums += 1
+                    if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
+                        break
+                    if entity.label in [0, 1] and after_entity.label in [2, 3, 4]:
+                        break
 
-        # km算法分配求解
-        relate_location_result = dispatch(t_match_list)
-        relate_location_result = sorted(relate_location_result, key=lambda x: (x[0].sentence_index, x[0].begin_index))
-        for match in relate_location_result:
-            _company = match[0]
-            _relation = match[1]
-            if not _company.pointer_address:
-                _company.pointer_address = _relation
+    # km算法分配求解
+    # for item in t_match_list:
+    #     print("loc_rela",item.main_role.entity_text,item.attribute.entity_text)
+    relate_location_result = dispatch(t_match_list)
+    relate_location_result = sorted(relate_location_result, key=lambda x: (x[0].sentence_index, x[0].begin_index))
+    for match in relate_location_result:
+        _company = match[0]
+        _relation = match[1]
+        # print("loc_rela2", _company.entity_text, _relation.entity_text, )
+        if not _company.pointer_address:
+            _company.pointer_address = _relation
     # "联系人——联系电话" 链接规则补充
     person_phone_EntityList = [ent for ent in pre_entity+ phone_entitys if ent.entity_type not in ['company','org','location']]
     person_phone_EntityList = sorted(person_phone_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
@@ -2428,6 +2455,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                 PackDict[k]["roleList"][i].linklist.remove(_item)
 
     # PackDict更新company/org地址
+    last_role_prob = {}
     for ent in pre_entity:
         if ent.entity_type in ['company','org']:
             if ent.pointer_address:
@@ -2436,9 +2464,16 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                         if PackDict[k]["roleList"][i].entity_text == ent.entity_text:
                             if not PackDict[k]["roleList"][i].address:
                                 PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
+                                last_role_prob[PackDict[k]["roleList"][i].role_name] = ent.values[role2id_dict[PackDict[k]["roleList"][i].role_name]]
                             else:
-                                if len(ent.pointer_address.entity_text) > len(PackDict[k]["roleList"][i].address):
-                                    PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
+                                if PackDict[k]["roleList"][i].role_name in ['tenderee','agency']:
+                                    # 角色为招标/代理人时,取其实体概率高的链接地址作为角色address
+                                    if ent.values[role2id_dict[PackDict[k]["roleList"][i].role_name]] > last_role_prob[PackDict[k]["roleList"][i].role_name]:
+                                        PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
+                                        last_role_prob[PackDict[k]["roleList"][i].role_name] = ent.values[role2id_dict[PackDict[k]["roleList"][i].role_name]]
+                                else:
+                                    if len(ent.pointer_address.entity_text) > len(PackDict[k]["roleList"][i].address):
+                                        PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
 
     # 联系人——电子邮箱链接
     temporary_list3 = [entity for entity in list_entity if entity.entity_type=='email' or (entity.entity_type=='person' and entity.label in [1,2,3])]

+ 22 - 0
BiddingKG/dl/interface/test/3.py

@@ -0,0 +1,22 @@
+
+
+import json
+
+
+# Custom JSON encoder: converts numpy arrays/floats and bytes into JSON-serializable values.
+class MyEncoder(json.JSONEncoder):
+    def default(self, obj):  # fallback hook: json calls this only for objects it cannot serialize natively
+        if isinstance(obj, np.ndarray):  # NOTE(review): `np` is never imported in this new file (only `json` is) - NameError if this line runs
+            return obj.tolist()  # array -> nested Python lists
+        elif isinstance(obj, bytes):
+            return str(obj, encoding='utf-8')  # bytes are decoded as UTF-8 text
+        elif isinstance(obj, (np.float_, np.float16, np.float32,  # NOTE(review): the np.float_ alias was removed in NumPy 2.0 - confirm the pinned version
+                              np.float64)):
+            return float(obj)
+        elif isinstance(obj,str):  # NOTE(review): str is serialized natively, so this branch looks unreachable via json.dumps
+            return obj
+        return json.JSONEncoder.default(self, obj)  # base implementation raises TypeError for unsupported types
+
+a = ['1231"23']  # sample payload: one string containing a double quote
+
+print(json.dumps(a,cls=MyEncoder))  # list and str are serialized natively, so MyEncoder.default is not invoked for this input

+ 181 - 12
BiddingKG/dl/ratio/re_ratio.py

@@ -1,14 +1,16 @@
 import re
-
+from decimal import Decimal
 # ratio = '([((]?(上浮|下浮)(率|)(报价|)([((]?%[))]?|)[))]?[:: ,]{0,3}[0-9]+.?[0-9]*[((]?%?[))]?)'
 # ratio = '(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率)([((]?%[))]?|)[))]?[为:: ,]{0,3}[0-9]+\.?[0-9]{0,3}[((]?%?[))]?)'
-ratio = re.compile('(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率)([((]?%[))]?|)[))]?[为:: ,]{0,3}[0-9]+\.?[0-9]{0,3}[((]?%?[))]?'
-                   '|[0-9]+\.?[0-9]{0,3}[((]?%?[))]?[((]?(费率|折扣率|(上浮|下浮)费?率)[))]?)')
-ratio = ratio.pattern
 
+ratio = re.compile('(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率|优惠率)([((]?[%‰][))]?|)(报价|取值|)([((].{1,20}[))])?[))]?[为是:: ,]{0,3}'
+                   '([0-9]{1,2}(?:\.[0-9]+)?[((]?[%‰]?[))]?|[百千]分之[零壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]+(?:点[零壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]+)?)'
+                   '|[0-9]{1,2}(?:\.[0-9]+)?[((]?[%‰][))]?[((]?[\u4e00-\u9fa5]{,2}(?:费率|折扣率|优惠率|(上浮|下浮)费?率)[))]?)')
+ratio = ratio.pattern
+# print(ratio)
 
-# 基准利率上浮率):大写:百分之叁拾点零零,小写:30.00%,
-# 基准利率上浮率:百分之三十(30%)
+# 基准利率上浮率):大写:百分之叁拾点零零,小写:30.00%, X
+# 基准利率上浮率:百分之三十(30%) X
 # 租金上浮率
 # 上浮率活期20%
 # 上浮率:活期20%、一年定期35%
@@ -25,7 +27,12 @@ def re_standard_ratio(_str):
             m_span = m.span()
             keyword_index = [m_span[0], m_span[1]]
             keyword = m_dict.get("value")
-            ratio_list.append([keyword, keyword_index])
+            left = _str[max(0,m_span[0]-15):m_span[0]]
+            right = _str[m_span[1]:m_span[1]+10]
+            context = left + keyword + right
+            # print(1,keyword)
+            if not re.search("利率|保险",context) and not re.search("^[万元]",right):
+                ratio_list.append([keyword, keyword_index])
 
     return ratio_list
 
@@ -39,20 +46,182 @@ def re_ratio(text):
 def extract_ratio(text):
     result_list = []
     total_money_list = re_ratio(text)
+    # Each hit becomes {"body","begin_index","end_index","value","type"}; debug aid: print(total_money_list)
     if total_money_list:
         for word, text_index in total_money_list:
-            d = {"body": word, "begin_index": text_index[0],
-                 "end_index": text_index[1]}
-            result_list.append(d)
+            num_value = re.search("\d+(?:\.\d+)?[((]?[%‰]?|[零壹贰叁肆伍陆柒捌玖拾佰百一二三四五六七八九十]+(?:点[零壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]+)?(?!分之)", word).group()  # numeric token: Arabic digits (optional %/‰) or Chinese numerals; NOTE(review): .group() raises AttributeError if nothing matches
+            if re.search("[零壹贰叁肆伍陆柒捌玖拾佰一二三四五六七八九十]",num_value):  # Chinese-numeral branch
+                if '点' in num_value:  # '点' separates integer and fractional parts
+                    num_split = num_value.split("点")
+                    round_len = len(num_split[1])  # number of decimal places to keep when rounding
+                    num_integer = num_split[0]
+                    if re.search("^[十拾佰百]",num_integer):
+                        num_integer = "壹" + num_integer  # prefix an explicit "one" when the integer part starts with a unit character
+                    num_value = getUnifyNum(num_integer)
+                    for index,num_word in enumerate(list(num_split[1])):
+                        num_value = float(num_value) + getDigitsDic(num_word) * 0.1**(index+1)  # add each fractional digit; NOTE(review): getDigitsDic has no entry for 拾/十 (allowed by the regex above) -> None * float raises TypeError
+                else:
+                    round_len = 0
+                    num_value = getUnifyNum(num_value)
+                    num_value = float(num_value)
+                if re.search("%|百分之", word):  # percent: divide by 100 and keep two more decimals
+                    num_value = num_value / 100
+                    round_len += 2
+                elif re.search("‰|千分之", word):  # per-mille: divide by 1000 and keep three more decimals
+                    num_value = num_value / 1000
+                    round_len += 3
+            else:  # Arabic-digit branch
+                # if not re.search("[%‰]",word):
+                #     continue
+                match_text = num_value  # keep the raw token to check later whether it carried its own %/‰ sign
+                num_value = round(Decimal(re.sub('[((]|[%‰]','',num_value)),10)  # strip bracket and %/‰ characters, parse as Decimal, round to 10 places
+                # print(num_value)
+                # _num = str(num_value).split('.')[0]
+                if len(str(num_value).split('.'))<2:  # skip values whose string form contains no '.'
+                    continue
+                _decimal = str(num_value).split('.')[1]
+                _decimal = re.sub("0+$","",_decimal)  # drop trailing zeros to find the significant decimal places
+                # print(_decimal)
+                if _decimal=="":
+                    _decimal = "0"
+                # num_value = float(_num+"."+_decimal)
+                # print(num_value)
+                if _decimal == '0':
+                    round_len = 0
+                else:
+                    round_len = len(_decimal)
+                if num_value<1 and not re.search('[%‰]',match_text):  # a bare value below 1 is left unscaled (treated as already a fraction)
+                    pass
+
+                else:
+                    if re.search("%|百分之",word):
+                        num_value = num_value / 100
+                        round_len += 2
+                    elif re.search("‰|千分之",word):
+                        num_value = num_value / 1000
+                        round_len += 3
+                    else:  # no percent/per-mille marker anywhere in the match: scaled as a percentage
+                        num_value = num_value / 100
+                        round_len += 2
+
+            num_value = round(num_value, round_len)
+            # print(word,num_value)
+            if re.search("上浮",word):  # classify the ratio by keywords in the matched text
+                ratio_type = 'floating_ratio'
+            elif re.search("下浮|优惠",word):
+                ratio_type = 'downward_floating_ratio'
+            elif re.search("折扣",word):
+                if num_value>0.5:  # "折扣" above 0.5 stays a discount ratio; otherwise it is typed as a downward floating ratio
+                    ratio_type = 'discount_ratio'
+                else:
+                    ratio_type = 'downward_floating_ratio'
+            else:
+                ratio_type = 'discount_ratio'
+            if num_value<=1:  # only ratios not exceeding 1 are emitted
+                d = {"body": word, "begin_index": text_index[0],
+                     "end_index": text_index[1],"value":num_value,"type":ratio_type}
+                result_list.append(d)
     return result_list
 
 
+def getDigitsDic(unit):
+    '''
+    @summary: Return the integer for a single Chinese numeral character (None when it is not in the table)
+    '''
+    DigitsDic = {"零": 0, "壹": 1, "贰": 2, "叁": 3, "肆": 4, "伍": 5, "陆": 6, "柒": 7, "捌": 8, "玖": 9,  # formal (capital) numerals
+                 "〇": 0, "一": 1, "二": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}  # everyday numerals; unit characters such as 拾/十 are not included
+    return DigitsDic.get(unit)
+
+
+def getMultipleFactor(unit):
+    '''
+    @summary: Return the Decimal multiplier for a Chinese unit character (None when it is not in the table)
+    '''
+    MultipleFactor = {"兆": Decimal(1000000000000), "亿": Decimal(100000000), "万": Decimal(10000), "仟": Decimal(1000),
+                      "千": Decimal(1000), "佰": Decimal(100), "百": Decimal(100), "拾": Decimal(10), "十": Decimal(10),
+                      "元": Decimal(1), "圆": Decimal(1), "角": round(Decimal(0.1), 1), "分": round(Decimal(0.01), 2)}  # 角/分 are rounded because Decimal(0.1)/Decimal(0.01) are built from binary floats
+    return MultipleFactor.get(unit)
+
+
+def getUnifyNum(money):
+    '''
+    @summary: Convert a Chinese amount string into a numeric amount
+    @param:
+        money: Chinese amount string
+    @return: decimal, the numeric amount
+    '''
+
+    MAX_MONEY = 1000000000000  # upper bound used below to reject oversized "number + unit" amounts
+    MAX_NUM = 12  # NOTE(review): not referenced anywhere in this function
+    # Remove commas
+    money = re.sub("[,,]", "", money)
+    money = re.sub("[^0-9.一二三四五六七八九零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", money)  # drop every character outside this digit/unit whitelist
+    result = Decimal(0)
+    chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖","一","二","三","四","五","六","七","八","九"]
+    chnFactorUnits = ["圆", "元", "兆", "亿", "万", "仟", "佰", "拾", "角", "分", '十', '百', '千']  # tried in this order by the loop below
+
+    LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$")  # plain Arabic number
+    BigMoneypattern = re.compile("^零?(?P<BigMoney>[%s])$" % ("".join(chnDigits)))  # a single Chinese digit, optionally preceded by 零
+    try:
+        if re.search(LowMoneypattern, money) is not None:
+            return Decimal(money)
+        elif re.search(BigMoneypattern, money) is not None:
+            return getDigitsDic(re.search(BigMoneypattern, money).group("BigMoney"))  # NOTE(review): this path returns a plain int, not a Decimal
+        for factorUnit in chnFactorUnits:  # only the first unit found in the string is processed (see the break below)
+            if re.search(re.compile(".*%s.*" % (factorUnit)), money) is not None:
+                subMoneys = re.split(re.compile("%s(?!.*%s.*)" % (factorUnit, factorUnit)), money)  # split at the last occurrence of the unit
+                if re.search(re.compile("^(\d+)(\.\d+)?$"), subMoneys[0]) is not None:
+                    if MAX_MONEY / getMultipleFactor(factorUnit) < Decimal(subMoneys[0]):  # amount would exceed MAX_MONEY: give up with 0
+                        return Decimal(0)
+                    result += Decimal(subMoneys[0]) * (getMultipleFactor(factorUnit))
+                elif len(subMoneys[0]) == 1:
+                    if re.search(re.compile("^[%s]$" % ("".join(chnDigits))), subMoneys[0]) is not None:
+                        result += Decimal(getDigitsDic(subMoneys[0])) * (getMultipleFactor(factorUnit))
+                # subMoneys[0] contains no amount unit and cannot be split any further
+                elif subMoneys[0] == "":
+                    result += 0
+                elif re.search(re.compile("[%s]" % ("".join(chnFactorUnits))), subMoneys[0]) is None:
+                    # print(subMoneys)
+                    # subMoneys[0] = subMoneys[0][0]
+                    result += Decimal(getUnifyNum(subMoneys[0])) * (getMultipleFactor(factorUnit))
+                else:
+                    result += Decimal(getUnifyNum(subMoneys[0])) * (getMultipleFactor(factorUnit))  # prefix still holds smaller units: convert it recursively
+                if len(subMoneys) > 1:
+                    if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"), subMoneys[1]) is not None:
+                        result += Decimal(subMoneys[1])  # NOTE(review): the pattern also admits unit suffixes that Decimal() cannot parse - presumably caught by the except below
+                    elif len(subMoneys[1]) == 1:
+                        if re.search(re.compile("^[%s]$" % ("".join(chnDigits))), subMoneys[1]) is not None:
+                            result += Decimal(getDigitsDic(subMoneys[1]))
+                    else:
+                        result += Decimal(getUnifyNum(subMoneys[1]))  # remainder after the unit is converted recursively
+                break
+    except Exception as e:  # any failure during conversion is reported as Decimal(0)
+        # traceback.print_exc()
+        return Decimal(0)
+    return result
+
+
 def test_str():
     s = '政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区'
     s = '年利率较基准利率的上浮率(%): 30 活期存款下浮率:0.455% 协定存的下浮率,(1-下浮率)' \
         ' 上浮率....  上浮率30(%)  (下浮率%):43  下浮率报价0.5%'
-    s = '费率或单价等:报价:94.00%, 幕墙工程费率为25.08%, 投标成本警戒费率为90%, 下浮率3.15%'
-
+    s = '''费率%)61.20万
+费率(精确到小数点后两位)60.00%
+下浮率取值13%
+下浮率报价13%
+下浮率 百分之十点零陆(10.00%
+下浮率 大写:无 下浮率百分之贰拾陆 无 小写: 下浮26%
+下浮率% 30
+成交优惠率% 5.00
+下浮率 0.25
+下浮率 0.25%
+中标金额:57.75%(商业优惠率)
+费率):1800
+费率):12
+折扣率(%):99.2063
+投标报价:96.00%(折扣率
+'''
+    # s = '下浮率 百分之十点零陆(10.00%'
     print(extract_ratio(s))
 
 

Fichier diff supprimé car celui-ci est trop grand
+ 1 - 4
BiddingKG/dl/table_head/predict.py


+ 1 - 1
BiddingKG/dl_dev/test/test4.py

@@ -105,7 +105,7 @@ def run_one():
     # '''
     print("start")
     _time1 = time.time()
-    print(predict("12", text,"打印机"))
+    print(predict("12", text,""))
     # test(12,content)
     # test(12,text)
     print("takes",time.time()-a)

Certains fichiers n'ont pas été affichés car il y a eu trop de fichiers modifiés dans ce diff