فهرست منبع

角色地址链接优化,电话提取优化等

znj 2 سال پیش
والد
کامیت
bba127d991

+ 31 - 12
BiddingKG/dl/interface/Entitys.py

@@ -204,6 +204,7 @@ class Entity():
         self.origin_entity_text = ''  # 2022/1/5 新增,记录字典替换的原来的实体名
         self.in_attachment = in_attachment  # 2022/02/10添加,实体是否在附件中
         self.prob = prob  # 2022/06/20添加,实体的概率
+        self.ratio_value = None # 2022/10/18 新增费率处理数据,(value,ratio_type) 费率数值,类型
 
     def set_Role(self,role_label,role_values):
         self.label = int(role_label)
@@ -294,7 +295,7 @@ class Role():
         self.linklist = linklist
         self.money_unit = '' # 2021/8/17 新增 保存金额单位
         # 中投标人属性
-        self.ratio = "" #2022/01/06 新增 保存中投标金额相关费率
+        self.ratio = None #2022/01/06 新增 保存中投标金额相关费率 (ratio_value,ratio_type)
         self.serviceTime = "" #2021/01/06 新增 保存服务期限(工期)
         self.address = ""  #2022/08/08 新增 角色地址
 
@@ -307,17 +308,35 @@ class Role():
         downward_floating_ratio = "" # 下浮率
         discount_ratio = "" # 折扣率/费率
         if self.ratio:
-            num_value = re.search("[\d\.]+",self.ratio).group()
-            num_value = float(num_value)
-            if re.search("%|百分之",self.ratio):
-                num_value = num_value / 100
-            num_value = str('%.4f'%(num_value))
-            if re.search("上浮",self.ratio):
-                floating_ratio = num_value
-            elif re.search("下浮",self.ratio):
-                downward_floating_ratio = num_value
-            else:
-                discount_ratio = num_value
+            # num_value = re.search("\d+(?:\.\d+)?",self.ratio).group()
+            # num_value = float(num_value)
+            # _decimal = str(num_value).split('.')[1]
+            # if _decimal=='0':
+            #     round_len = 0
+            # else:
+            #     round_len = len(_decimal)
+            # if re.search("%|百分之",self.ratio):
+            #     num_value = num_value * 0.01
+            #     round_len += 2
+            # elif re.search("‰|千分之",self.ratio):
+            #     num_value = num_value * 0.001
+            #     round_len += 3
+            # num_value = str(round(num_value,round_len))
+            #
+            # if re.search("上浮",self.ratio):
+            #     floating_ratio = num_value
+            # elif re.search("下浮",self.ratio):
+            #     downward_floating_ratio = num_value
+            # else:
+            #     discount_ratio = num_value
+            ratio_type = self.ratio[1]
+            ratio_value = str(self.ratio[0])
+            if ratio_type=='floating_ratio':
+                floating_ratio = ratio_value
+            elif ratio_type=='downward_floating_ratio':
+                downward_floating_ratio = ratio_value
+            elif ratio_type=='discount_ratio':
+                discount_ratio = ratio_value
         result = {'role_name':self.role_name,'role_text':fitDataByRule(self.entity_text),
                   'role_money': {'money':self.money,'money_unit':self.money_unit,'floating_ratio':floating_ratio,'downward_floating_ratio':downward_floating_ratio,'discount_ratio':discount_ratio},
                   'linklist': self.linklist,'serviceTime':self.serviceTime,'address':self.address}

+ 21 - 18
BiddingKG/dl/interface/Preprocessing.py

@@ -1004,8 +1004,10 @@ def tableToText(soup):
             for _tr in _tbody.find_all(recursive=False):
                 len_td = len(_tr.find_all(recursive=False))
                 _td_len_list.append(len_td)
-            if len(list(set(_td_len_list)))>8:
-                return None
+            if _td_len_list:
+                if len(list(set(_td_len_list)))>=8 or max(_td_len_list)>100:
+                    return None
+
         fixSpan(tbody)
         inner_table = getTable(tbody)
         inner_table = fixTable(inner_table)
@@ -1060,12 +1062,20 @@ def tableToText(soup):
     # 遍历表格中的每个tbody
     tbodies = []
     in_attachment = False
+    tmp_part_list = []
     for _part in soup.find_all():
         if _part.name=='table':
-            tbodies.append((_part,in_attachment))
+            if _part in tmp_part_list:
+                # 表格在合并的附件中
+                tbodies.append((_part,True))
+            else:
+                tbodies.append((_part,in_attachment))
         elif _part.name=='div':
             if 'class' in _part.attrs and "richTextFetch" in _part['class']:
                 in_attachment = True
+            # 记录被合并到正文的附件信息 find_all
+            if 'filemd5' in _part.attrs and in_attachment==False:
+                tmp_part_list = _part.find_all()
     #逆序处理嵌套表格
     for tbody_index in range(1,len(tbodies)+1):
         tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
@@ -2467,9 +2477,9 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
             #                       "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万元]+)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())",
             #                       "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只]*))[\))]?)"}
             list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                                  "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|成交额|中标额)(?:[,,(\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?[美日欧]?元?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万亿]?[美日欧]?元?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
-                                  "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?[美日欧]?元)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)())",
-                                  "behind_m":"(()()(?P<money_behind_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)(人民币)?[\((]?(?P<unit_behind_m>[万亿]?[美日欧]?元(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
+                                  "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|成交额|中标额)(?:[,,(\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
+                                  "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)())",
+                                  "behind_m":"(()()(?P<money_behind_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
             # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。
 
             pattern_money = re.compile("%s|%s|%s|%s"%(list_money_pattern["cn"],list_money_pattern["key_word"],list_money_pattern["behind_m"],list_money_pattern["front_m"]))
@@ -2675,15 +2685,6 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                         # print('过滤掉金额 notSure and unit=="" and float(entity_text)>100*10000:', entity_text, unit)
                         continue
 
-                    if re.search("美元",_match.group()):
-                        Dollar2RMB = 7
-                        entity_text = str(float(entity_text)*Dollar2RMB)
-                    elif re.search("日元",_match.group()):
-                        JPyen2RMB = 0.05
-                        entity_text = str(float(entity_text)*JPyen2RMB)
-                    elif re.search("欧元",_match.group()):
-                        Euro2RMB = 6.9
-                        entity_text = str(float(entity_text)*Euro2RMB)
 
                     _exists = False
                     for item in list_sentence_entitys:
@@ -2863,9 +2864,11 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                         break
                 entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                 entity_text = ratio['body']
-                list_sentence_entitys.append(
-                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
-                           begin_index_temp, end_index_temp,in_attachment=in_attachment))
+                ratio_value = (ratio['value'],ratio['type'])
+                _entity = Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
+                           begin_index_temp, end_index_temp,in_attachment=in_attachment)
+                _entity.ratio_value = ratio_value
+                list_sentence_entitys.append(_entity)
 
             list_sentence_entitys.sort(key=lambda x:x.begin_index)
             list_entitys_temp = list_entitys_temp+list_sentence_entitys

+ 84 - 49
BiddingKG/dl/interface/getAttributes.py

@@ -31,6 +31,12 @@ dict_role_id = {"0":"tenderee",
                 "3":"second_tenderer",
                 "4":"third_tenderer"}
 
+role2id_dict = {"tenderee":0,  # Maps a role name to its integer role id.
+                "agency":1,  # Presumably the inverse of dict_role_id, with int values instead of str keys - only part of dict_role_id is visible here, confirm.
+                "win_tenderer":2,  # The ids are used in this file to index Entity.values when comparing role probabilities for address selection.
+                "second_tenderer":3,
+                "third_tenderer":4}
+
 def getPackage(packageList,sentence_index,begin_index,roleid,MAX_DIS=None,DIRECT=None):
     '''
     @param:
@@ -851,7 +857,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     def addRatioByEntity(packDict,packageName,entity,ratio):
         for i in range(len(packDict[packageName]["roleList"])):
             if packDict[packageName]["roleList"][i].entity_text==entity:
-                packDict[packageName]["roleList"][i].ratio = ratio.entity_text
+                packDict[packageName]["roleList"][i].ratio = ratio.ratio_value
     def addServiceTimeByEntity(packDict,packageName,entity,serviceTime):
         for i in range(len(packDict[packageName]["roleList"])):
             if packDict[packageName]["roleList"][i].entity_text==entity:
@@ -1253,8 +1259,9 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                        '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
                        '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|'
                        '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|'
+                       '400\d{7}转\d{1,4}|'
                        '[2-9]\d{6,7}')
-    url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$\-_@.&+=\?:/]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
+    url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[#$\-_@.&+=\?:/]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
     email_pattern = re.compile("[a-zA-Z0-9][a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*@"
                             "[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*(?:\.[a-zA-Z]{2,})")
     phone_entitys = []
@@ -1308,7 +1315,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                 pass
             else:
                 # 排除“传真号”和其它错误项
-                if re.search("传,?真|信,?箱|邮,?[箱件]|QQ|qq", phone_left):
+                if re.search("传,?真|信,?箱|邮,?[箱件]|QQ|qq", phone_left):
                     if not re.search("电,?话", phone_left):
                         error_numStr_index.append(numStr_index)
                         last_phone_mask = False
@@ -1350,6 +1357,20 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                             error_numStr_index.append(numStr_index)
                             last_phone_mask = False
                             continue
+                left_context = re.search("[\da-zA-Z\-—-―]+$",sentence_text[:item[1]])
+                if left_context:
+                    if len(left_context.group()) != len("".join(re.findall(phone, left_context.group()))):
+                    # if not re.search("(" + phone.pattern + ")$", left_context.group()):
+                        error_numStr_index.append(numStr_index)
+                        last_phone_mask = False
+                        continue
+                right_context = re.search("^[\da-zA-Z\-—-―]+", sentence_text[item[2]:])
+                if right_context:
+                    if len(right_context.group()) != len("".join(re.findall(phone, right_context.group()))):
+                    # if not re.search("^(" + phone.pattern + ")", right_context.group()):
+                        error_numStr_index.append(numStr_index)
+                        last_phone_mask = False
+                        continue
                 # if:上一个phone实体不符合条件
                 if not last_phone_mask:
                     item_start = item[1]
@@ -1525,52 +1546,58 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 break
                 # print(3,combo[0].entity_text,combo[1].entity_text)
 
-        # "公司——地址" 链接规则补充
-        company_lacation_EntityList = [ent for ent in pre_entity if ent.entity_type in ['company', 'org', 'location']]
-        company_lacation_EntityList = sorted(company_lacation_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
-        t_match_list = []
-        for ent_idx in range(len(company_lacation_EntityList)):
-            entity = company_lacation_EntityList[ent_idx]
-            if entity.entity_type in ['company', 'org']:
-                match_nums = 0
-                company_nums = 0  # 经过其他公司的数量
-                location_nums = 0  # 经过电话的数量
-                for after_index in range(ent_idx + 1, min(len(company_lacation_EntityList), ent_idx + 5)):
-                    after_entity = company_lacation_EntityList[after_index]
-                    if after_entity.entity_type == "location":
-                        distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
-                                tokens_num_dict[entity.sentence_index] + entity.end_index)
-                        location_nums += 1
-                        if distance > 100 or location_nums >= 3:
-                            break
-                        sentence_distance = after_entity.sentence_index - entity.sentence_index
-                        value = (-1 / 2 * (distance ** 2)) / 10000
-                        if sentence_distance == 0:
-                            if distance < 80:
-                                t_match_list.append(Match(entity, after_entity, value))
-                                match_nums += 1
-                                if company_nums:
-                                    break
-                        else:
-                            if distance < 50:
-                                t_match_list.append(Match(entity, after_entity, value))
-                                match_nums += 1
-                                if company_nums:
-                                    break
+    # "公司——地址" 链接规则补充
+    company_lacation_EntityList = [ent for ent in pre_entity if ent.entity_type in ['company', 'org', 'location']]
+    # company_lacation_EntityList = [ent for ent in pre_entity if (ent.entity_type in ['company', 'org'] and ent.label!=5) or ent.entity_type=="location"]
+    company_lacation_EntityList = sorted(company_lacation_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
+    t_match_list = []
+    for ent_idx in range(len(company_lacation_EntityList)):
+        entity = company_lacation_EntityList[ent_idx]
+        if entity.entity_type in ['company', 'org'] and entity.label!=5:
+            match_nums = 0
+            company_nums = 0  # 经过其他公司的数量
+            location_nums = 0  # 经过电话的数量
+            for after_index in range(ent_idx + 1, min(len(company_lacation_EntityList), ent_idx + 5)):
+                after_entity = company_lacation_EntityList[after_index]
+                if after_entity.entity_type == "location":
+                    distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
+                            tokens_num_dict[entity.sentence_index] + entity.end_index)
+                    location_nums += 1
+                    if distance > 100 or location_nums >= 3:
+                        break
+                    sentence_distance = after_entity.sentence_index - entity.sentence_index
+                    value = (-1 / 2 * (distance ** 2)) / 10000
+                    if sentence_distance == 0:
+                        if distance < 80:
+                            t_match_list.append(Match(entity, after_entity, value))
+                            match_nums += 1
+                            if company_nums:
+                                break
                     else:
-                        # type:company/org
-                        company_nums += 1
-                        if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
-                            break
+                        if distance < 50:
+                            t_match_list.append(Match(entity, after_entity, value))
+                            match_nums += 1
+                            if company_nums:
+                                break
+                else:
+                    # type:company/org
+                    company_nums += 1
+                    if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
+                        break
+                    if entity.label in [0, 1] and after_entity.label in [2, 3, 4]:
+                        break
 
-        # km算法分配求解
-        relate_location_result = dispatch(t_match_list)
-        relate_location_result = sorted(relate_location_result, key=lambda x: (x[0].sentence_index, x[0].begin_index))
-        for match in relate_location_result:
-            _company = match[0]
-            _relation = match[1]
-            if not _company.pointer_address:
-                _company.pointer_address = _relation
+    # km算法分配求解
+    # for item in t_match_list:
+    #     print("loc_rela",item.main_role.entity_text,item.attribute.entity_text)
+    relate_location_result = dispatch(t_match_list)
+    relate_location_result = sorted(relate_location_result, key=lambda x: (x[0].sentence_index, x[0].begin_index))
+    for match in relate_location_result:
+        _company = match[0]
+        _relation = match[1]
+        # print("loc_rela2", _company.entity_text, _relation.entity_text, )
+        if not _company.pointer_address:
+            _company.pointer_address = _relation
     # "联系人——联系电话" 链接规则补充
     person_phone_EntityList = [ent for ent in pre_entity+ phone_entitys if ent.entity_type not in ['company','org','location']]
     person_phone_EntityList = sorted(person_phone_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
@@ -2182,6 +2209,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                 PackDict[k]["roleList"][i].linklist.remove(_item)
 
     # PackDict更新company/org地址
+    last_role_prob = {}
     for ent in pre_entity:
         if ent.entity_type in ['company','org']:
             if ent.pointer_address:
@@ -2190,9 +2218,16 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                         if PackDict[k]["roleList"][i].entity_text == ent.entity_text:
                             if not PackDict[k]["roleList"][i].address:
                                 PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
+                                last_role_prob[PackDict[k]["roleList"][i].role_name] = ent.values[role2id_dict[PackDict[k]["roleList"][i].role_name]]
                             else:
-                                if len(ent.pointer_address.entity_text) > len(PackDict[k]["roleList"][i].address):
-                                    PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
+                                if PackDict[k]["roleList"][i].role_name in ['tenderee','agency']:
+                                    # 角色为招标/代理人时,取其实体概率高的链接地址作为角色address
+                                    if ent.values[role2id_dict[PackDict[k]["roleList"][i].role_name]] > last_role_prob[PackDict[k]["roleList"][i].role_name]:
+                                        PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
+                                        last_role_prob[PackDict[k]["roleList"][i].role_name] = ent.values[role2id_dict[PackDict[k]["roleList"][i].role_name]]
+                                else:
+                                    if len(ent.pointer_address.entity_text) > len(PackDict[k]["roleList"][i].address):
+                                        PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
 
     # 联系人——电子邮箱链接
     temporary_list3 = [entity for entity in list_entity if entity.entity_type=='email' or (entity.entity_type=='person' and entity.label in [1,2,3])]

+ 83 - 83
BiddingKG/dl/interface/predictor.py

@@ -1659,7 +1659,7 @@ class TendereeRuleRecall():
                             self.get_tenderee = True
                     else:
                         if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent.entity_text
-                                     ) or not re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent.entity_text) or re.search("自行.?采购",list_sentences[0][ent.sentence_index]):
+                                     ) or not re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent.entity_text) or re.search("自行.?采购",list_sentences[0][ent.sentence_index].sentence_text):
                             ent.label = 0
                             ent.values[0] = 0.5 + ent.values[0] / 10
                             self.get_tenderee = True
@@ -2804,88 +2804,88 @@ class ProductAttributesPredictor():
                             headers.append('_'.join(header_list))
                             headers_demand.append('_'.join(header_list2))
                             header_col.append('_'.join(tmp_head_list))
-                        # print('header_dic: ',header_dic)
-                        id1 = header_dic.get('名称', "")
-                        id2 = header_dic.get('数量', "")
-                        id3 = header_dic.get('单价', "")
-                        id4 = header_dic.get('品牌', "")
-                        id5 = header_dic.get('规格', "")
-
-                        id6 = header_dic.get('需求', "")
-                        id7 = header_dic.get('预算', "")
-                        id8 = header_dic.get('时间', "")
-                        if re.search('[a-zA-Z\u4e00-\u9fa5]', deal_list[id1]) and deal_list[id1] not in self.header_set and \
-                                re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', deal_list[id1]) == None:
-                            product = deal_list[id1]
-                            if id2 != "":
-                                if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]):
-                                    quantity = deal_list[id2]
-                                else:
-                                    quantity = ""
-                            if id3 != "":
-                                if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id3]):
-                                    _unitPrice = deal_list[id3]
-                                    re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_unitPrice)
-                                    if re_price:
-                                        _unitPrice = re_price[0]
-                                        if '万元' in header_list[2] and '万' not in _unitPrice:
-                                            _unitPrice += '万元'
-                                        unitPrice = str(getUnifyMoney(_unitPrice))
-                            if id4 != "":
-                                if re.search('\w', deal_list[id4]):
-                                    brand = deal_list[id4]
-                                else:
-                                    brand = ""
-                            if id5 != "":
-                                if re.search('\w', deal_list[id5]):
-                                    specs = deal_list[id5]
-                                else:
-                                    specs = ""
-                            if id6 != "":
-                                if re.search('\w', deal_list[id6]):
-                                    demand = deal_list[id6]
-                                else:
-                                    demand = ""
-                            if id7 != "":
-                                if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id7]):
-                                    _budget = deal_list[id7]
-                                    re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_budget)
-                                    if re_price:
-                                        _budget = re_price[0]
-                                        if '万元' in header_list2[2] and '万' not in _budget:
-                                            _budget += '万元'
-                                        budget = str(getUnifyMoney(_budget))
-
-                            if id8 != "":
-                                if re.search('\w', deal_list[id8]):
-                                    order_time = deal_list[id8].strip()
-                                    order_begin, order_end = self.fix_time(order_time, html, page_time)
-                            # print(quantity,unitPrice,brand,specs)
-                            if quantity != "" or unitPrice != "" or brand != "" or specs != "":
-                                link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice,
-                                        'brand': brand[:50], 'specs': specs}
-                                if link not in product_link:
-                                    product_link.append(link)
-                                    # mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
-                                    # if link['unitPrice'] != "" and mat:
-                                    #     try:
-                                    #         total_product_money += float(link['unitPrice']) * float(
-                                    #             mat.group(1).replace(',', ''))
-                                    #     except:
-                                    #         log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
-                                    #         link['unitPrice'], link['quantity']))
-                            if order_begin != "" and order_end != "":
-                                order_begin_year = int(order_begin.split("-")[0])
-                                order_end_year = int(order_end.split("-")[0])
-                                # 限制附件错误识别时间
-                                if order_begin_year >= 2050 or order_end_year >= 2050:
-                                    order_begin = order_end = ""
-                            # print(budget, order_time)
-                            if budget != "" and order_time != "":
-                                link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
-                                        'order_begin': order_begin, 'order_end': order_end}
-                                if link not in demand_link:
-                                    demand_link.append(link)
+                            # print('header_dic: ',header_dic)
+                            id1 = header_dic.get('名称', "")
+                            id2 = header_dic.get('数量', "")
+                            id3 = header_dic.get('单价', "")
+                            id4 = header_dic.get('品牌', "")
+                            id5 = header_dic.get('规格', "")
+
+                            id6 = header_dic.get('需求', "")
+                            id7 = header_dic.get('预算', "")
+                            id8 = header_dic.get('时间', "")
+                            if re.search('[a-zA-Z\u4e00-\u9fa5]', deal_list[id1]) and deal_list[id1] not in self.header_set and \
+                                    re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', deal_list[id1]) == None:
+                                product = deal_list[id1]
+                                if id2 != "":
+                                    if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]):
+                                        quantity = deal_list[id2]
+                                    else:
+                                        quantity = ""
+                                if id3 != "":
+                                    if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id3]):
+                                        _unitPrice = deal_list[id3]
+                                        re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_unitPrice)
+                                        if re_price:
+                                            _unitPrice = re_price[0]
+                                            if '万元' in header_list[2] and '万' not in _unitPrice:
+                                                _unitPrice += '万元'
+                                            unitPrice = str(getUnifyMoney(_unitPrice))
+                                if id4 != "":
+                                    if re.search('\w', deal_list[id4]):
+                                        brand = deal_list[id4]
+                                    else:
+                                        brand = ""
+                                if id5 != "":
+                                    if re.search('\w', deal_list[id5]):
+                                        specs = deal_list[id5]
+                                    else:
+                                        specs = ""
+                                if id6 != "":
+                                    if re.search('\w', deal_list[id6]):
+                                        demand = deal_list[id6]
+                                    else:
+                                        demand = ""
+                                if id7 != "":
+                                    if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id7]):
+                                        _budget = deal_list[id7]
+                                        re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_budget)
+                                        if re_price:
+                                            _budget = re_price[0]
+                                            if '万元' in header_list2[2] and '万' not in _budget:
+                                                _budget += '万元'
+                                            budget = str(getUnifyMoney(_budget))
+
+                                if id8 != "":
+                                    if re.search('\w', deal_list[id8]):
+                                        order_time = deal_list[id8].strip()
+                                        order_begin, order_end = self.fix_time(order_time, html, page_time)
+                                # print(quantity,unitPrice,brand,specs)
+                                if quantity != "" or unitPrice != "" or brand != "" or specs != "":
+                                    link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice,
+                                            'brand': brand[:50], 'specs': specs}
+                                    if link not in product_link:
+                                        product_link.append(link)
+                                        # mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
+                                        # if link['unitPrice'] != "" and mat:
+                                        #     try:
+                                        #         total_product_money += float(link['unitPrice']) * float(
+                                        #             mat.group(1).replace(',', ''))
+                                        #     except:
+                                        #         log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
+                                        #         link['unitPrice'], link['quantity']))
+                                if order_begin != "" and order_end != "":
+                                    order_begin_year = int(order_begin.split("-")[0])
+                                    order_end_year = int(order_end.split("-")[0])
+                                    # 限制附件错误识别时间
+                                    if order_begin_year >= 2050 or order_end_year >= 2050:
+                                        order_begin = order_end = ""
+                                # print(budget, order_time)
+                                if budget != "" and order_time != "":
+                                    link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
+                                            'order_begin': order_begin, 'order_end': order_end}
+                                    if link not in demand_link:
+                                        demand_link.append(link)
 
                     if len(product_link) > 0:
                         attr_dic = {'product_attrs': {'data': product_link, 'header': list(set(headers)), 'header_col': list(set(header_col))}}

+ 171 - 12
BiddingKG/dl/ratio/re_ratio.py

@@ -1,14 +1,16 @@
 import re
-
+from decimal import Decimal
 # ratio = '([((]?(上浮|下浮)(率|)(报价|)([((]?%[))]?|)[))]?[:: ,]{0,3}[0-9]+.?[0-9]*[((]?%?[))]?)'
 # ratio = '(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率)([((]?%[))]?|)[))]?[为:: ,]{0,3}[0-9]+\.?[0-9]{0,3}[((]?%?[))]?)'
-ratio = re.compile('(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率)([((]?%[))]?|)[))]?[为:: ,]{0,3}[0-9]+\.?[0-9]{0,3}[((]?%?[))]?'
-                   '|[0-9]+\.?[0-9]{0,3}[((]?%?[))]?[((]?(费率|折扣率|(上浮|下浮)费?率)[))]?)')
-ratio = ratio.pattern
 
+# Rate-expression pattern with two top-level alternatives:
+#   1. keyword first: a rate keyword (上浮/下浮 + optional 费/率/报价, 费率 with an
+#      optional [中投]标/报价/总价 prefix, 折扣率, 优惠率), an optional bracketed
+#      %/‰ marker, an optional 报价/取值, an optional bracketed note of 1-20
+#      characters, up to three separator characters, then either a 1-2 digit
+#      number with optional decimals and optional %/‰, or a 百分之/千分之
+#      Chinese-numeral expression;
+#   2. number first: a 1-2 digit number with optional decimals and a mandatory
+#      %/‰, followed by up to two CJK characters and a rate keyword.
+# NOTE(review): the numeric part has no trailing boundary, so a longer integer
+# such as "1800" can match on its first two digits - confirm callers filter this.
+ratio = re.compile('(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率|优惠率)([((]?[%‰][))]?|)(报价|取值|)([((].{1,20}[))])?[))]?[为是:: ,]{0,3}'
+                   '([0-9]{1,2}(?:\.[0-9]+)?[((]?[%‰]?[))]?|[百千]分之[零壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]+(?:点[零壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]+)?)'
+                   '|[0-9]{1,2}(?:\.[0-9]+)?[((]?[%‰][))]?[((]?[\u4e00-\u9fa5]{,2}(?:费率|折扣率|优惠率|(上浮|下浮)费?率)[))]?)')
+ratio = ratio.pattern  # keep only the pattern text; from here on `ratio` is a str
+# print(ratio)
 
-# 基准利率上浮率):大写:百分之叁拾点零零,小写:30.00%,
-# 基准利率上浮率:百分之三十(30%)
+# 基准利率上浮率):大写:百分之叁拾点零零,小写:30.00%, X
+# 基准利率上浮率:百分之三十(30%) X
 # 租金上浮率
 # 上浮率活期20%
 # 上浮率:活期20%、一年定期35%
@@ -25,7 +27,12 @@ def re_standard_ratio(_str):
             m_span = m.span()
             keyword_index = [m_span[0], m_span[1]]
             keyword = m_dict.get("value")
-            ratio_list.append([keyword, keyword_index])
+            left = _str[max(0,m_span[0]-15):m_span[0]]
+            right = _str[m_span[1]:m_span[1]+10]
+            context = left + keyword + right
+            print(1,keyword)
+            if not re.search("利率",context) and not re.search("^[万元]",right):
+                ratio_list.append([keyword, keyword_index])
 
     return ratio_list
 
@@ -39,20 +46,172 @@ def re_ratio(text):
 def extract_ratio(text):
+    """Turn the rate expressions found in ``text`` into normalised records.
+
+    Each item produced by ``re_ratio(text)`` is unpacked as ``(word, text_index)``.
+    The numeric part of ``word`` is parsed (Arabic digits or Chinese numerals),
+    scaled, rounded and classified by the keyword found in ``word``.
+
+    Returns a list of dicts with the keys ``body``, ``begin_index``,
+    ``end_index``, ``value`` and ``type``; ``type`` is one of
+    ``'floating_ratio'``, ``'downward_floating_ratio'`` or ``'discount_ratio'``.
+    Matches whose final value is greater than 1 are dropped.
+    """
     result_list = []
     total_money_list = re_ratio(text)
+    # print(total_money_list)
     if total_money_list:
         for word, text_index in total_money_list:
-            d = {"body": word, "begin_index": text_index[0],
-                 "end_index": text_index[1]}
-            result_list.append(d)
+            # Leftmost numeral in the match: Arabic digits (optionally followed by a
+            # bracket / percent sign) or a run of Chinese numerals not followed by 分之.
+            # NOTE(review): .group() raises AttributeError if nothing matches - this
+            # assumes every word returned by re_ratio contains a numeral; confirm.
+            # NOTE(review): the class here holds the full-width '%' whereas the
+            # module-level pattern uses ASCII '%' - confirm which forms reach this code.
+            num_value = re.search("\d+(?:\.\d+)?[((]?[%‰]?|[零壹贰叁肆伍陆柒捌玖拾佰百一二三四五六七八九十]+(?:点[零壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]+)?(?!分之)", word).group()
+            if re.search("[零壹贰叁肆伍陆柒捌玖拾佰一二三四五六七八九十]",num_value):  # Chinese-numeral branch
+                if '点' in num_value:
+                    num_split = num_value.split("点")
+                    round_len = len(num_split[1])  # one decimal place per character after 点
+                    num_integer = num_split[0]
+                    if re.search("^[十拾佰百]",num_integer):
+                        num_integer = "壹" + num_integer  # give a leading unit an explicit digit before conversion
+                    num_value = getUnifyNum(num_integer)
+                    # NOTE(review): getDigitsDic has no entry for 拾/十, which the pattern
+                    # above allows after 点; such input would raise TypeError here.
+                    for index,num_word in enumerate(list(num_split[1])):
+                        num_value = float(num_value) + getDigitsDic(num_word) * 0.1**(index+1)
+                else:
+                    round_len = 0
+                    # NOTE(review): unlike the 点 branch, no "壹" is prepended here, so a
+                    # value starting with 十/拾/佰/百 (e.g. "十") converts to 0 - confirm.
+                    num_value = getUnifyNum(num_value)
+                    num_value = float(num_value)
+                if re.search("%|百分之", word):  # percent: divide by 100
+                    num_value = num_value / 100
+                    round_len += 2
+                elif re.search("‰|千分之", word):  # per-mille: divide by 1000
+                    num_value = num_value / 1000
+                    round_len += 3
+            else:  # Arabic-digit branch
+                # if not re.search("[%‰]",word):
+                #     continue
+                match_text = num_value  # raw matched text, kept for the percent-sign check below
+                num_value = float(re.sub('[((]|[%‰]','',num_value))
+                _decimal = str(num_value).split('.')[1]
+                if _decimal == '0':
+                    round_len = 0
+                else:
+                    round_len = len(_decimal)
+                if num_value<1 and not re.search('[%‰]',match_text):
+                    pass  # a bare value below 1 is kept unscaled
+
+                else:
+                    if re.search("%|百分之",word):
+                        num_value = num_value / 100
+                        round_len += 2
+                    elif re.search("‰|千分之",word):
+                        num_value = num_value / 1000
+                        round_len += 3
+                    else:  # no unit marker in word: scaled like a percentage
+                        num_value = num_value / 100
+                        round_len += 2
+
+            num_value = round(num_value, round_len)  # round to the digit count tracked above
+            # print(word,num_value)
+            if re.search("上浮",word):
+                ratio_type = 'floating_ratio'
+            elif re.search("下浮|优惠",word):
+                ratio_type = 'downward_floating_ratio'
+            elif re.search("折扣",word):
+                if num_value>0.5:  # 折扣 above 0.5 is typed as discount, otherwise as downward floating
+                    ratio_type = 'discount_ratio'
+                else:
+                    ratio_type = 'downward_floating_ratio'
+            else:
+                ratio_type = 'discount_ratio'
+            if num_value<=1:  # values above 1 are not recorded
+                d = {"body": word, "begin_index": text_index[0],
+                     "end_index": text_index[1],"value":num_value,"type":ratio_type}
+                result_list.append(d)
     return result_list
 
 
+def getDigitsDic(unit):
+    '''
+    @summary: return the digit (0-9) for a single Chinese numeral character
+    @return: int, or None when the character is not in the table
+    '''
+    # Covers both the formal (壹贰叁...) and the everyday (一二三...) numeral sets.
+    DigitsDic = {"零": 0, "壹": 1, "贰": 2, "叁": 3, "肆": 4, "伍": 5, "陆": 6, "柒": 7, "捌": 8, "玖": 9,
+                 "〇": 0, "一": 1, "二": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
+    return DigitsDic.get(unit)  # .get: unknown characters yield None rather than KeyError
+
+
+def getMultipleFactor(unit):
+    '''
+    @summary: return the multiplier for a unit character
+    @return: Decimal, or None when the character is not in the table
+    '''
+    # 角 and 分 are built from float literals and rounded so they come out as 0.1 and 0.01.
+    MultipleFactor = {"兆": Decimal(1000000000000), "亿": Decimal(100000000), "万": Decimal(10000), "仟": Decimal(1000),
+                      "千": Decimal(1000), "佰": Decimal(100), "百": Decimal(100), "拾": Decimal(10), "十": Decimal(10),
+                      "元": Decimal(1), "圆": Decimal(1), "角": round(Decimal(0.1), 1), "分": round(Decimal(0.01), 2)}
+    return MultipleFactor.get(unit)  # .get: unknown units yield None rather than KeyError
+
+
+def getUnifyNum(money):
+    '''
+    @summary: convert a Chinese monetary amount string into a number
+    @param:
+        money: Chinese monetary amount string
+    @return: Decimal amount; a plain int is returned when the input is a single
+        Chinese digit (optionally preceded by 零), and Decimal(0) on any exception
+    '''
+
+    MAX_MONEY = 1000000000000
+    MAX_NUM = 12  # NOTE(review): not referenced anywhere in this function
+    # strip commas
+    money = re.sub("[,,]", "", money)
+    # drop every character that is not a digit, a dot, a Chinese numeral or a unit
+    # NOTE(review): 萬/億 survive this filter but are absent from chnFactorUnits, and
+    # 〇 is removed here although getDigitsDic knows it - confirm intent.
+    money = re.sub("[^0-9.一二三四五六七八九零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", money)
+    result = Decimal(0)
+    chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖","一","二","三","四","五","六","七","八","九"]
+    chnFactorUnits = ["圆", "元", "兆", "亿", "万", "仟", "佰", "拾", "角", "分", '十', '百', '千']
+
+    LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$")  # purely Arabic amount
+    BigMoneypattern = re.compile("^零?(?P<BigMoney>[%s])$" % ("".join(chnDigits)))  # a single Chinese digit
+    try:
+        if re.search(LowMoneypattern, money) is not None:
+            return Decimal(money)
+        elif re.search(BigMoneypattern, money) is not None:
+            return getDigitsDic(re.search(BigMoneypattern, money).group("BigMoney"))
+        # Use the first unit (in chnFactorUnits order) that occurs in the string, split at
+        # its last occurrence, and combine left-part * factor with the right-part value.
+        for factorUnit in chnFactorUnits:
+            if re.search(re.compile(".*%s.*" % (factorUnit)), money) is not None:
+                subMoneys = re.split(re.compile("%s(?!.*%s.*)" % (factorUnit, factorUnit)), money)
+                if re.search(re.compile("^(\d+)(\.\d+)?$"), subMoneys[0]) is not None:
+                    if MAX_MONEY / getMultipleFactor(factorUnit) < Decimal(subMoneys[0]):
+                        return Decimal(0)  # the product would exceed MAX_MONEY
+                    result += Decimal(subMoneys[0]) * (getMultipleFactor(factorUnit))
+                elif len(subMoneys[0]) == 1:
+                    if re.search(re.compile("^[%s]$" % ("".join(chnDigits))), subMoneys[0]) is not None:
+                        result += Decimal(getDigitsDic(subMoneys[0])) * (getMultipleFactor(factorUnit))
+                # subMoneys[0] contains no monetary unit and cannot be split further
+                elif subMoneys[0] == "":
+                    result += 0  # nothing precedes the unit, so it contributes nothing
+                elif re.search(re.compile("[%s]" % ("".join(chnFactorUnits))), subMoneys[0]) is None:
+                    # print(subMoneys)
+                    # subMoneys[0] = subMoneys[0][0]
+                    result += Decimal(getUnifyNum(subMoneys[0])) * (getMultipleFactor(factorUnit))
+                else:  # same computation as the branch above
+                    result += Decimal(getUnifyNum(subMoneys[0])) * (getMultipleFactor(factorUnit))
+                if len(subMoneys) > 1:
+                    # NOTE(review): this pattern accepts a trailing 百/千/万/亿/元, which
+                    # Decimal() cannot parse; such input falls into the except below.
+                    if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"), subMoneys[1]) is not None:
+                        result += Decimal(subMoneys[1])
+                    elif len(subMoneys[1]) == 1:
+                        if re.search(re.compile("^[%s]$" % ("".join(chnDigits))), subMoneys[1]) is not None:
+                            result += Decimal(getDigitsDic(subMoneys[1]))
+                    else:
+                        result += Decimal(getUnifyNum(subMoneys[1]))  # recurse on the remainder
+                break  # only the first matching unit drives the split
+    except Exception as e:
+        # traceback.print_exc()
+        return Decimal(0)  # any conversion failure is reported as 0 rather than raised
+    return result
+
+
 def test_str():
+    # Manual smoke check: prints what extract_ratio returns for a sample string.
+    # Each assignment to s replaces the previous one, so only the last sample is used.
     s = '政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区'
     s = '年利率较基准利率的上浮率(%): 30 活期存款下浮率:0.455% 协定存的下浮率,(1-下浮率)' \
         ' 上浮率....  上浮率30(%)  (下浮率%):43  下浮率报价0.5%'
-    s = '费率或单价等:报价:94.00%, 幕墙工程费率为25.08%, 投标成本警戒费率为90%, 下浮率3.15%'
-
+    s = '''费率%)61.20万
+费率(精确到小数点后两位)60.00%
+下浮率取值13%
+下浮率报价13%
+下浮率 百分之十点零陆(10.00%
+下浮率 大写:无 下浮率百分之贰拾陆 无 小写: 下浮26%
+下浮率% 30
+成交优惠率% 5.00
+下浮率 0.25
+下浮率 0.25%
+中标金额:57.75%(商业优惠率)
+费率):1800
+费率):12
+折扣率(%):99.2063
+投标报价:96.00%(折扣率
+'''
+    # s = '下浮率 百分之十点零陆(10.00%'
     print(extract_ratio(s))
 
 

تفاوت فایلی نمایش داده نمی شود زیرا این فایل بسیار بزرگ است
+ 1 - 4
BiddingKG/dl/table_head/predict.py


برخی فایل ها در این مقایسه diff نمایش داده نمی شوند زیرا تعداد فایل ها بسیار زیاد است