il y a 6 mois · 5115a05a10
--- a/BiddingKG/dl/common/Utils.py
+++ b/BiddingKG/dl/common/Utils.py
@@ -947,29 +947,36 @@ def money_process(money_text, header):
 
				     '''
			
 
				     money = 0
			
 
				     money_unit = ""
			
 
				-    # re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?[（(]?万?", money_text)
			
 
				-    money_text = re.sub('\s', '', money_text) # 2024/04/19 修复 457699044 556.46751 万元 金额与单位有空格造成万漏提取
			
 
				-    if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text) and re.search('\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?[（(]?万?', money_text):
			
 
				-        money_text = re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text).group(0)  # 如果表格同时包含大小写金额，取大写金额，避免单位取错 463310590 790000（柒拾玖万元整）
			
 
				-    re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?[（(]?万?", money_text)
			
 
				-    if re_price:
			
 
				-        money_re = re_price.group(0)
			
 
				-        if (re.search('万元|[（(]万[)）]',  header) or re.search('万元|[（(]万[)）]', money_text)) and '万' not in money_re:  # 修复37797825 控制价（万） # 修复 460307391 万元不在表头，在数字前面
			
 
				-            money_re += '万元'
			
 
				-        elif (re.search('亿元|[（(]亿[)）]',  header) or re.search('亿元|[（(]亿[)）]', money_text)) and '亿' not in money_re:  # 修复37797825 控制价（万） # 修复 460307391 万元不在表头，在数字前面
			
 
				-            money_re += '亿元'
			
 
				-        # money = float(getUnifyMoney(money_text))
			
 
				-        money = float(getUnifyMoney(money_re))
			
 
				-        if money > 10000000000000:  # 大于万亿的去除
			
 
				-            money = 0
			
 
				-        # money_unit = '万元' if '万' in money_re and re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text)==None else '元'
			
 
				-        if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text)==None:
			
 
				-            if '万' in money_re:
			
 
				-                money_unit = '万元'
			
 
				-            elif '亿' in money_re:
			
 
				-                money_unit = '亿元'
			
 
				-            else:
			
 
				-                money_unit = '元'
			
 
				+    moneys, _ = get_money_entity('%s：%s' % (header, money_text))
			
 
				+    if len(moneys) == 1:
			
 
				+        money = float(moneys[0][0])
			
 
				+        money_unit = moneys[0][3]
			
 
				+    elif len(moneys) == 2 and moneys[0][0]==moneys[1][0]:
			
 
				+        money = float(moneys[0][0])
			
 
				+        money_unit = moneys[0][3]
			
 
				+    # # re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?[（(]?万?", money_text)
			
 
				+    # money_text = re.sub('\s', '', money_text) # 2024/04/19 修复 457699044 556.46751 万元 金额与单位有空格造成万漏提取
			
 
				+    # if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text) and re.search('\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?[（(]?万?', money_text):
			
 
				+    #     money_text = re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text).group(0)  # 如果表格同时包含大小写金额，取大写金额，避免单位取错 463310590 790000（柒拾玖万元整）
			
 
				+    # re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?[（(]?万?", money_text)
			
 
				+    # if re_price:
			
 
				+    #     money_re = re_price.group(0)
			
 
				+    #     if (re.search('万元|[（(]万[)）]',  header) or re.search('万元|[（(]万[)）]', money_text)) and '万' not in money_re:  # 修复37797825 控制价（万） # 修复 460307391 万元不在表头，在数字前面
			
 
				+    #         money_re += '万元'
			
 
				+    #     elif (re.search('亿元|[（(]亿[)）]',  header) or re.search('亿元|[（(]亿[)）]', money_text)) and '亿' not in money_re:  # 修复37797825 控制价（万） # 修复 460307391 万元不在表头，在数字前面
			
 
				+    #         money_re += '亿元'
			
 
				+    #     # money = float(getUnifyMoney(money_text))
			
 
				+    #     money = float(getUnifyMoney(money_re))
			
 
				+    #     if money > 10000000000000:  # 大于万亿的去除
			
 
				+    #         money = 0
			
 
				+    #     # money_unit = '万元' if '万' in money_re and re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text)==None else '元'
			
 
				+    #     if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text)==None:
			
 
				+    #         if '万' in money_re:
			
 
				+    #             money_unit = '万元'
			
 
				+    #         elif '亿' in money_re:
			
 
				+    #             money_unit = '亿元'
			
 
				+    #         else:
			
 
				+    #             money_unit = '元'
			
 
				     return (money, money_unit)
			
 
				 
			
 
				 package_number_pattern = re.compile(
			
@@ -1146,6 +1153,223 @@ def is_deposit_project(title, name, requirement):
 
				         return True
			
 
				     return False
			
 
				 
			
 
				+def get_money_entity(sentence_text, found_yeji=0, in_attachment=False):
			
 
				+    money_list = []
			
 
				+    # 使用正则识别金额
			
 
				+    entity_type = "money"
			
 
				+    list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
			
 
				+                          "key_word": "((?P<text_key_word>(?:[￥¥]+，?|(中标|成交|合同|承租|投资|服务)）?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资)(\d：|\d=\d[-+×]\d：)?(?:[,，\[（\(]*\s*(人民币|单位：)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\]）\)]?)\s*[，,:：]*(RMB|USD|EUR|JPY|CNY)?[:：]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[：:])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d+,\d+\.\d{2,6}|\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[（\(]?(?P<filter_>[%％‰折])*\s*，?((金额)?单位[:：])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天年月日]*))\s*[）\)]?))",
			
 
				+                          "front_m": "((?P<text_front_m>(?:[（\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[）\)]?)\s*[,，:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z金额价格]{,2}?))(?P<money_front_m>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:，?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
			
 
				+                          "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:，?)[百千]*)(?P<science_behind_m>(E-?\d+))?(人民币)?[\(（]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\)）]?)"}
			
 
				+    # 2021/7/19 调整金额，单位提取正则，修复部分金额因为单位提取失败被过滤问题。  20240415 调整front_m 修复 详见合同元，合同金额：378.8万元 提取
			
 
				+
			
 
				+    pattern_money = re.compile("%s|%s|%s|%s" % (
			
 
				+    list_money_pattern["cn"], list_money_pattern["key_word"], list_money_pattern["behind_m"],
			
 
				+    list_money_pattern["front_m"]))
			
 
				+
			
 
				+    # sentence_text = re.sub('\d+[年月日]', '', sentence_text) # 修复560180018 中标价（元）：3年投标报价（元）含税6299700.00 3年作为金额
			
 
				+
			
 
				+    if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
			
 
				+        found_yeji += 1
			
 
				+    if found_yeji >= 2:  # 过滤掉业绩后面的所有金额
			
 
				+        all_match = []
			
 
				+    else:
			
 
				+        ser = re.search('((收费标准|计算[方公]?式)：|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s（）()\[\]【】\d\.%％‰\+\-*×/]{20,}[，。]?', sentence_text)  # 过滤掉收费标准里面的金额
			
 
				+        if ser:
			
 
				+            sentence_text = sentence_text.replace(ser.group(0), ' ' * len(ser.group(0)))
			
 
				+        all_match = re.finditer(pattern_money, sentence_text)
			
 
				+    # print('all_match:', all_match)
			
 
				+    for _match in all_match:
			
 
				+        # print('_match: ', _match.group())
			
 
				+        if re.search('^元/1\d{10}，$', _match.group(0)): # 修复 495042766 现场负责人 姚元 / 13488160460 预测为金额
			
 
				+            continue
			
 
				+        if len(_match.group()) > 0:
			
 
				+            # print("===",_match.group())
			
 
				+            # # print(_match.groupdict())
			
 
				+            notes = ''  # 2021/7/20 新增备注金额大写或金额单位 if 金额大写 notes=大写 elif 单位 notes=单位
			
 
				+            unit = ""
			
 
				+            entity_text = ""
			
 
				+            start_index = ""
			
 
				+            end_index = ""
			
 
				+            text_beforeMoney = ""
			
 
				+            filter = ""
			
 
				+            filter_unit = False
			
 
				+            notSure = False
			
 
				+            science = ""
			
 
				+            if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text[:_match.span()[0]]):  # 2021/7/21过滤掉业绩后面金额
			
 
				+                # print('金额在业绩后面: ', _match.group(0))
			
 
				+                found_yeji += 1
			
 
				+                break
			
 
				+            for k, v in _match.groupdict().items():
			
 
				+                if v != "" and v is not None:
			
 
				+                    if k == 'text_key_word':
			
 
				+                        notSure = True
			
 
				+                    if k.split("_")[0] == "money":
			
 
				+                        entity_text = v
			
 
				+                        # print(_match.group(k), 'entity_text: ', sentence_text[_match.start(k): _match.end(k)])
			
 
				+                        if entity_text.endswith(',00'):  # 金额逗号后面不可能为两个0结尾，应该小数点识别错，直接去掉
			
 
				+                            entity_text = entity_text[:-3]
			
 
				+                    if k.split("_")[0] == "unit":
			
 
				+                        if 'behind' in k or unit == "":  # 优先后面单位  预算金额(元)：160万元  总价（万元）：最终报价：695000.00（元）
			
 
				+                            unit = v
			
 
				+                    if k.split("_")[0] == "text":
			
 
				+                        text_beforeMoney = v
			
 
				+                    if k.split("_")[0] == "filter":
			
 
				+                        filter = v
			
 
				+                    if re.search("filter_unit", k) is not None:
			
 
				+                        filter_unit = True
			
 
				+                    if k.split("_")[0] == 'science':
			
 
				+                        science = v
			
 
				+            # print("金额：{0} ,单位：{1}, 前文：{2}, filter: {3}, filter_unit: {4}".format(entity_text,unit,text_beforeMoney,filter,filter_unit))
			
 
				+            # if re.search('(^\d{2,},\d{4,}万?$)|(^\d{2,},\d{2}万?$)', entity_text.strip()):  # 2021/7/19 修正OCR识别小数点为逗号
			
 
				+            #     if re.search('[幢栋号楼层]', sentence_text[max(0, _match.span()[0] - 2):_match.span()[0]]):
			
 
				+            #         entity_text = re.sub('\d+,', '', entity_text)
			
 
				+            #     else:
			
 
				+            #         entity_text = entity_text.replace(',', '.')
			
 
				+            #     # print(' 修正OCR识别小数点为逗号')
			
 
				+
			
 
				+            if filter != "":
			
 
				+                continue
			
 
				+            if len(entity_text)>30 or len(re.sub('[E-]', '', science))>2: # 限制数字长度，避免类似265339018附件金额错误，数值超大报错 decimal.InvalidOperation
			
 
				+                continue
			
 
				+            start_index, end_index = _match.span()
			
 
				+            start_index += len(text_beforeMoney)
			
 
				+
			
 
				+            '''过滤掉手机号码作为金额'''
			
 
				+            if re.search('电话|手机|联系|方式|编号|编码|日期|数字|时间', text_beforeMoney):
			
 
				+                # print('过滤掉手机号码作为金额')
			
 
				+                continue
			
 
				+            elif re.search('^1[3-9]\d{9}$', entity_text) and re.search('：\w{1,3}$', text_beforeMoney): # 过滤掉类似 '13863441880', '金额（万元）：季勇13863441880'
			
 
				+                # print('过滤掉手机号码作为金额')
			
 
				+                continue
			
 
				+            elif re.search('^\d(.\d{1,2})?$', entity_text) and re.search('\d$', _match.group(0)) and re.search('^[、.]', sentence_text[_match.end():]): # 170756755 控制价为：1、合理利润率上限
			
 
				+                # print('过滤错误金额：', _match.group(0))
			
 
				+                continue
			
 
				+
			
 
				+            if unit == "":  # 2021/7/21 有明显金额特征的补充单位，避免被过滤
			
 
				+                if (re.search('(￥|¥|RMB|CNY)[:：]?$', text_beforeMoney) or re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', entity_text)):
			
 
				+                    if entity_text.endswith('万元'):
			
 
				+                        unit = '万元'
			
 
				+                        entity_text = entity_text[:-2]
			
 
				+                    else:
			
 
				+                        unit = '元'
			
 
				+                    # print('1明显金额特征补充单位 元')
			
 
				+                elif re.search('USD[:：]?$', text_beforeMoney):
			
 
				+                    unit = '美元'
			
 
				+                elif re.search('EUR[:：]?$', text_beforeMoney):
			
 
				+                    unit = '欧元'
			
 
				+                elif re.search('JPY[:：]?$', text_beforeMoney):
			
 
				+                    unit = '日元'
			
 
				+                elif re.search('^[-—]+[\d,.]+万元', sentence_text[end_index:]):
			
 
				+                    # print('两个金额连接后面的有单位，用后面单位')
			
 
				+                    unit = '万元'
			
 
				+                elif re.search('^，?(价格币种：\w{2,3}，)?价格单位：万元', sentence_text[end_index:]): # 修复494731937金额单位缺漏 中标价格：39501.094425，价格币种：人民币，价格单位：万元，
			
 
				+                    unit = '万元'
			
 
				+                elif re.search('万元', sentence_text[max(0, start_index-10):start_index]): #修复511402017 价格类型：（万元）报价：13311.1582，得分：84.46，
			
 
				+                    unit = '万元'
			
 
				+                elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资|控制|拦标)）?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)(小写)?[:：为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:  # 修复
			
 
				+                    if re.search('^[\d，,.]+$', entity_text) and float(re.sub('[,，]', '', entity_text))<500 and re.search('万元', sentence_text):
			
 
				+                        unit = '万元'
			
 
				+                        # print('金额较小且句子中有万元的，补充单位为万元')
			
 
				+                    elif re.search('^\d{1,3}\.\d{4,6}$', entity_text) and re.search('0000$', entity_text) == None:
			
 
				+                        unit = '万元'
			
 
				+                    else:
			
 
				+                        unit = '元'
			
 
				+                        # print('金额前面紧接关键词的补充单位 元')
			
 
				+                elif re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7}，?)$)|(^\d{,3}(,\d{3})+，?$)', entity_text):
			
 
				+                    unit = '元'
			
 
				+                    # print('3明显金额特征补充单位 元')
			
 
				+                else:
			
 
				+                    # print('过滤掉没单位金额: ',entity_text)
			
 
				+                    continue
			
 
				+            elif unit == '万元':
			
 
				+                if end_index < len(sentence_text) and sentence_text[end_index] == '元' and re.search('\d$', entity_text):
			
 
				+                    unit = '元'
			
 
				+                elif re.search('^[5-9]\d{6,}\.\d{2}$', entity_text): # 五百亿以上的万元改为元
			
 
				+                    unit = '元'
			
 
				+            if unit.find("万") >= 0 and entity_text.find("万") >= 0:  # 2021/7/19修改为金额文本有万，不计算单位
			
 
				+                # print('修正金额及单位都有万， 金额：',entity_text, '单位:',unit)
			
 
				+                unit = "元"
			
 
				+            if re.search('.*万元万元', entity_text):  # 2021/7/19 修正两个万元
			
 
				+                # print(' 修正两个万元',entity_text)
			
 
				+                entity_text = entity_text.replace('万元万元', '万元')
			
 
				+            else:
			
 
				+                if filter_unit:
			
 
				+                    continue
			
 
				+
			
 
				+            # symbol = '-' if entity_text.startswith('-') and not entity_text.startswith('--') and re.search('\d+$', sentence_text[:begin_index_temp]) == None else ''  # 负值金额前面保留负号 ，后面这些不作为负金额 起拍价：105.29-200.46万元  预 算 --- 350000.0 2023/04/14 取消符号
			
 
				+
			
 
				+            entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", entity_text)
			
 
				+            # print('转换前金额：', entity_text, '单位:', unit, '备注:',notes, 'text_beforeMoney:',text_beforeMoney)
			
 
				+            if re.search('总投资|投资总额|总预算|总概算|(投资|招标|资金|存放|操作|融资)规模|批复概算|投资额|总规模|工程造价|总金额',
			
 
				+                         sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额  20241031工程造价作总投资
			
 
				+                # print('总投资金额: ', _match.group(0))
			
 
				+                notes = '总投资'
			
 
				+            elif re.search('投资|概算|建安费|其他费用|基本预备费',
			
 
				+                           sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/11/18 投资金额不作为招标金额
			
 
				+                notes = '投资'
			
 
				+            # elif re.search('工程造价',
			
 
				+            #                sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/12/20 工程造价不作为招标金额
			
 
				+            #     notes = '工程造价'
			
 
				+            elif (re.search('保证金', sentence_text[max(0, _match.span()[0] - 5):_match.span()[1]])
			
 
				+                  or re.search('保证金的?(缴纳)?(金额|金\?|额|\?)?[\(（]*(万?元|为?人民币|大写|调整|变更|已?修改|更改|更正)?[\)）]*[:：为]',
			
 
				+                               sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]])
			
 
				+                  or re.search('保证金由[\d.,]+.{,3}(变更|修改|更改|更正|调整?)为',
			
 
				+                               sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])):
			
 
				+                notes = '保证金'
			
 
				+                # print('保证金信息：', sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])
			
 
				+            elif re.search('成本(警戒|预警)(线|价|值)[^0-9元]{,10}',
			
 
				+                           sentence_text[max(0, _match.span()[0] - 10):_match.span()[0]]):
			
 
				+                notes = '成本警戒线'
			
 
				+            elif re.search('(监理|设计|勘察)(服务)?费(报价)?[约为：]|服务金额', sentence_text[_match.span()[0]:_match.span()[1]]):
			
 
				+                # cost_re = re.search('(监理|设计|勘察)(服务)?费', sentence_text[_match.span()[0]:_match.span()[1]])
			
 
				+                # notes = cost_re.group(1)
			
 
				+                notes = '招标或中标金额'
			
 
				+            elif re.search('单价|总金额', sentence_text[_match.span()[0]:_match.span()[1]]):
			
 
				+                notes = '单价'
			
 
				+            elif re.search('^[/每]', sentence_text[_match.end():]):
			
 
				+                # print('单价：', _match.group(0))
			
 
				+                notes = '单价'
			
 
				+            elif re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆]', entity_text) != None:
			
 
				+                notes = '大写'
			
 
				+                if entity_text[0] == "拾":  # 2021/12/16 修正大写金额省略了数字转换错误问题
			
 
				+                    entity_text = "壹" + entity_text
			
 
				+                # print("补充备注：notes = 大写")
			
 
				+            if len(unit) > 0:
			
 
				+                if unit.find('万') >= 0 and len(entity_text.split('.')[0]) >= 8:  # 2021/7/19 修正万元金额过大的情况
			
 
				+                    # print('修正单位万元金额过大的情况 金额：', entity_text, '单位:', unit)
			
 
				+                    entity_text = str(
			
 
				+                        getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]) / 10000)
			
 
				+                    unit = '元'  # 修正金额后单位 重置为元
			
 
				+                else:
			
 
				+                    # print('str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0])):')
			
 
				+                    entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]))
			
 
				+            else:
			
 
				+                if entity_text.find('万') >= 0 and entity_text.split('.')[0].isdigit() and len(
			
 
				+                        entity_text.split('.')[0]) >= 8:
			
 
				+                    entity_text = str(getUnifyMoney(entity_text) / 10000)
			
 
				+                    # print('修正金额字段含万 过大的情况')
			
 
				+                else:
			
 
				+                    entity_text = str(getUnifyMoney(entity_text))
			
 
				+            if science and re.search('^E-?\d+$', science):  # 科学计数
			
 
				+                entity_text = str(Decimal(entity_text + science)) if Decimal(entity_text + science) > 100 and Decimal(
			
 
				+                    entity_text + science) < 10000000000 else entity_text  # 结果大于100及小于100万才使用科学计算
			
 
				+
			
 
				+            if float(entity_text) > 100000000000:  # float(entity_text)<100 or  2022/3/4 取消最小金额限制
			
 
				+                # print('过滤掉金额：float(entity_text)<100 or float(entity_text)>100000000000', entity_text, unit)
			
 
				+                continue
			
 
				+
			
 
				+            if notSure and unit == "" and float(entity_text) > 100 * 10000:
			
 
				+                # print('过滤掉金额 notSure and unit=="" and float(entity_text)>100*10000：', entity_text, unit)
			
 
				+                continue
			
 
				+            # print("金额：{0} ,单位：{1}, 前文：{2}, filter: {3}, filter_unit: {4}".format(entity_text, unit, text_beforeMoney,
			
 
				+            #                                                                      filter, filter_unit))
			
 
				+            if re.search('[%％‰折]|费率|下浮率', text_beforeMoney) and float(entity_text)<1000: # 过滤掉可能是费率的金额
			
 
				+                # print('过滤掉可能是费率的金额')
			
 
				+                continue
			
 
				+            money_list.append((entity_text, start_index, end_index, unit, notes))
			
 
				+    return money_list, found_yeji
			
 
				+
			
 
				 def recall(y_true, y_pred):
			
 
				     '''
			
 
				     计算召回率
			
--- a/BiddingKG/dl/interface/Entitys.py
+++ b/BiddingKG/dl/interface/Entitys.py
@@ -300,6 +300,7 @@ class Role():
 
				         self.serviceTime = "" #2021/01/06 新增 保存服务期限(工期)
			
 
				         self.address = ""  #2022/08/08 新增 角色地址
			
 
				         self.multi_winner = multi_winner #2024/4/8 新增多中标人
			
 
				+        self.unit_price = 0 # 20241127 新增单价
			
 
				 
			
 
				     def getString(self):
			
 
				         self.linklist = [item for item in set(self.linklist)]
			
@@ -342,6 +343,8 @@ class Role():
 
				         result = {'role_name':self.role_name,'role_text':fitDataByRule(self.entity_text),
			
 
				                   'role_money': {'money':self.money,'money_unit':self.money_unit,'floating_ratio':floating_ratio,'downward_floating_ratio':downward_floating_ratio,'discount_ratio':discount_ratio},
			
 
				                   'linklist': self.linklist,'serviceTime':self.serviceTime,'address':self.address}
			
 
				+        if self.unit_price != 0: # 单价
			
 
				+            result['role_money']['unit_price'] = self.unit_price
			
 
				         if result['role_name'] in ['tenderee', 'win_tenderer']:
			
 
				             result['role_prob'] = self.role_prob
			
 
				         if result['role_name'] == 'win_tenderer' and self.multi_winner != set():
			
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -3219,221 +3219,6 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
 
				         article.content = re.sub("##attachment_begin##|##attachment_end##", "", article.content)
			
 
				     return list_sentences,list_outlines
			
 
				 
			
 
				-def get_money_entity(sentence_text, found_yeji, in_attachment=False):
			
 
				-    money_list = []
			
 
				-    # 使用正则识别金额
			
 
				-    entity_type = "money"
			
 
				-    list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
			
 
				-                          "key_word": "((?P<text_key_word>(?:[￥¥]+，?|(中标|成交|合同|承租|投资|服务)）?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资)(\d：|\d=\d[-+×]\d：)?(?:[,，\[（\(]*\s*(人民币|单位：)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\]）\)]?)\s*[，,:：]*(RMB|USD|EUR|JPY|CNY)?[:：]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[：:])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d+,\d+\.\d{2,6}|\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[（\(]?(?P<filter_>[%％‰折])*\s*，?((金额)?单位[:：])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[）\)]?))",
			
 
				-                          "front_m": "((?P<text_front_m>(?:[（\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[）\)]?)\s*[,，:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z金额价格]{,2}?))(?P<money_front_m>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:，?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
			
 
				-                          "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:，?)[百千]*)(?P<science_behind_m>(E-?\d+))?(人民币)?[\(（]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\)）]?)"}
			
 
				-    # 2021/7/19 调整金额，单位提取正则，修复部分金额因为单位提取失败被过滤问题。  20240415 调整front_m 修复 详见合同元，合同金额：378.8万元 提取
			
 
				-
			
 
				-    pattern_money = re.compile("%s|%s|%s|%s" % (
			
 
				-    list_money_pattern["cn"], list_money_pattern["key_word"], list_money_pattern["behind_m"],
			
 
				-    list_money_pattern["front_m"]))
			
 
				-
			
 
				-    if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
			
 
				-        found_yeji += 1
			
 
				-    if found_yeji >= 2:  # 过滤掉业绩后面的所有金额
			
 
				-        all_match = []
			
 
				-    else:
			
 
				-        ser = re.search('((收费标准|计算[方公]?式)：|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s（）()\[\]【】\d\.%％‰\+\-*×/]{20,}[，。]?', sentence_text)  # 过滤掉收费标准里面的金额
			
 
				-        if ser:
			
 
				-            sentence_text = sentence_text.replace(ser.group(0), ' ' * len(ser.group(0)))
			
 
				-        all_match = re.finditer(pattern_money, sentence_text)
			
 
				-    # print('all_match:', all_match)
			
 
				-    for _match in all_match:
			
 
				-        # print('_match: ', _match.group())
			
 
				-        if re.search('^元/1\d{10}，$', _match.group(0)): # 修复 495042766 现场负责人 姚元 / 13488160460 预测为金额
			
 
				-            continue
			
 
				-        if len(_match.group()) > 0:
			
 
				-            # print("===",_match.group())
			
 
				-            # # print(_match.groupdict())
			
 
				-            notes = ''  # 2021/7/20 新增备注金额大写或金额单位 if 金额大写 notes=大写 elif 单位 notes=单位
			
 
				-            unit = ""
			
 
				-            entity_text = ""
			
 
				-            start_index = ""
			
 
				-            end_index = ""
			
 
				-            text_beforeMoney = ""
			
 
				-            filter = ""
			
 
				-            filter_unit = False
			
 
				-            notSure = False
			
 
				-            science = ""
			
 
				-            if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text[:_match.span()[0]]):  # 2021/7/21过滤掉业绩后面金额
			
 
				-                # print('金额在业绩后面: ', _match.group(0))
			
 
				-                found_yeji += 1
			
 
				-                break
			
 
				-            for k, v in _match.groupdict().items():
			
 
				-                if v != "" and v is not None:
			
 
				-                    if k == 'text_key_word':
			
 
				-                        notSure = True
			
 
				-                    if k.split("_")[0] == "money":
			
 
				-                        entity_text = v
			
 
				-                        # print(_match.group(k), 'entity_text: ', sentence_text[_match.start(k): _match.end(k)])
			
 
				-                        if entity_text.endswith(',00'):  # 金额逗号后面不可能为两个0结尾，应该小数点识别错，直接去掉
			
 
				-                            entity_text = entity_text[:-3]
			
 
				-                    if k.split("_")[0] == "unit":
			
 
				-                        if 'behind' in k or unit == "":  # 优先后面单位  预算金额(元)：160万元  总价（万元）：最终报价：695000.00（元）
			
 
				-                            unit = v
			
 
				-                    if k.split("_")[0] == "text":
			
 
				-                        # print('text_before: ', _match.group(k))
			
 
				-                        text_beforeMoney = v
			
 
				-                    if k.split("_")[0] == "filter":
			
 
				-                        filter = v
			
 
				-                    if re.search("filter_unit", k) is not None:
			
 
				-                        filter_unit = True
			
 
				-                    if k.split("_")[0] == 'science':
			
 
				-                        science = v
			
 
				-            # print("金额：{0} ,单位：{1}, 前文：{2}, filter: {3}, filter_unit: {4}".format(entity_text,unit,text_beforeMoney,filter,filter_unit))
			
 
				-            # if re.search('(^\d{2,},\d{4,}万?$)|(^\d{2,},\d{2}万?$)', entity_text.strip()):  # 2021/7/19 修正OCR识别小数点为逗号
			
 
				-            #     if re.search('[幢栋号楼层]', sentence_text[max(0, _match.span()[0] - 2):_match.span()[0]]):
			
 
				-            #         entity_text = re.sub('\d+,', '', entity_text)
			
 
				-            #     else:
			
 
				-            #         entity_text = entity_text.replace(',', '.')
			
 
				-            #     # print(' 修正OCR识别小数点为逗号')
			
 
				-
			
 
				-            if filter != "":
			
 
				-                continue
			
 
				-            if len(entity_text)>30 or len(re.sub('[E-]', '', science))>2: # 限制数字长度，避免类似265339018附件金额错误，数值超大报错 decimal.InvalidOperation
			
 
				-                continue
			
 
				-            start_index, end_index = _match.span()
			
 
				-            start_index += len(text_beforeMoney)
			
 
				-
			
 
				-            '''过滤掉手机号码作为金额'''
			
 
				-            if re.search('电话|手机|联系|方式|编号|编码|日期|数字|时间', text_beforeMoney):
			
 
				-                # print('过滤掉手机号码作为金额')
			
 
				-                continue
			
 
				-            elif re.search('^1[3-9]\d{9}$', entity_text) and re.search('：\w{1,3}$', text_beforeMoney): # 过滤掉类似 '13863441880', '金额（万元）：季勇13863441880'
			
 
				-                # print('过滤掉手机号码作为金额')
			
 
				-                continue
			
 
				-            elif re.search('^\d(.\d{1,2})?$', entity_text) and re.search('\d$', _match.group(0)) and re.search('^[、.]', sentence_text[_match.end():]): # 170756755 控制价为：1、合理利润率上限
			
 
				-                # print('过滤错误金额：', _match.group(0))
			
 
				-                continue
			
 
				-
			
 
				-            if unit == "":  # 2021/7/21 有明显金额特征的补充单位，避免被过滤
			
 
				-                if (re.search('(￥|¥|RMB|CNY)[:：]?$', text_beforeMoney) or re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', entity_text)):
			
 
				-                    if entity_text.endswith('万元'):
			
 
				-                        unit = '万元'
			
 
				-                        entity_text = entity_text[:-2]
			
 
				-                    else:
			
 
				-                        unit = '元'
			
 
				-                    # print('1明显金额特征补充单位 元')
			
 
				-                elif re.search('USD[:：]?$', text_beforeMoney):
			
 
				-                    unit = '美元'
			
 
				-                elif re.search('EUR[:：]?$', text_beforeMoney):
			
 
				-                    unit = '欧元'
			
 
				-                elif re.search('JPY[:：]?$', text_beforeMoney):
			
 
				-                    unit = '日元'
			
 
				-                elif re.search('^[-—]+[\d,.]+万元', sentence_text[end_index:]):
			
 
				-                    # print('两个金额连接后面的有单位，用后面单位')
			
 
				-                    unit = '万元'
			
 
				-                elif re.search('^，?(价格币种：\w{2,3}，)?价格单位：万元', sentence_text[end_index:]): # 修复494731937金额单位缺漏 中标价格：39501.094425，价格币种：人民币，价格单位：万元，
			
 
				-                    unit = '万元'
			
 
				-                elif re.search('万元', sentence_text[max(0, start_index-10):start_index]): #修复511402017 价格类型：（万元）报价：13311.1582，得分：84.46，
			
 
				-                    unit = '万元'
			
 
				-                elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资|控制|拦标)）?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)(小写)?[:：为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:  # 修复
			
 
				-                    if re.search('^[\d，,.]+$', entity_text) and float(re.sub('[,，]', '', entity_text))<500 and re.search('万元', sentence_text):
			
 
				-                        unit = '万元'
			
 
				-                        # print('金额较小且句子中有万元的，补充单位为万元')
			
 
				-                    elif re.search('^\d{1,3}\.\d{4,6}$', entity_text) and re.search('0000$', entity_text) == None:
			
 
				-                        unit = '万元'
			
 
				-                    else:
			
 
				-                        unit = '元'
			
 
				-                        # print('金额前面紧接关键词的补充单位 元')
			
 
				-                elif re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7}，?)$)|(^\d{,3}(,\d{3})+，?$)', entity_text):
			
 
				-                    unit = '元'
			
 
				-                    # print('3明显金额特征补充单位 元')
			
 
				-                else:
			
 
				-                    # print('过滤掉没单位金额: ',entity_text)
			
 
				-                    continue
			
 
				-            elif unit == '万元':
			
 
				-                if end_index < len(sentence_text) and sentence_text[end_index] == '元' and re.search('\d$', entity_text):
			
 
				-                    unit = '元'
			
 
				-                elif re.search('^[5-9]\d{6,}\.\d{2}$', entity_text): # 五百亿以上的万元改为元
			
 
				-                    unit = '元'
			
 
				-            if unit.find("万") >= 0 and entity_text.find("万") >= 0:  # 2021/7/19修改为金额文本有万，不计算单位
			
 
				-                # print('修正金额及单位都有万， 金额：',entity_text, '单位:',unit)
			
 
				-                unit = "元"
			
 
				-            if re.search('.*万元万元', entity_text):  # 2021/7/19 修正两个万元
			
 
				-                # print(' 修正两个万元',entity_text)
			
 
				-                entity_text = entity_text.replace('万元万元', '万元')
			
 
				-            else:
			
 
				-                if filter_unit:
			
 
				-                    continue
			
 
				-
			
 
				-            # symbol = '-' if entity_text.startswith('-') and not entity_text.startswith('--') and re.search('\d+$', sentence_text[:begin_index_temp]) == None else ''  # 负值金额前面保留负号 ，后面这些不作为负金额 起拍价：105.29-200.46万元  预 算 --- 350000.0 2023/04/14 取消符号
			
 
				-
			
 
				-            entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", entity_text)
			
 
				-            # print('转换前金额：', entity_text, '单位:', unit, '备注:',notes, 'text_beforeMoney:',text_beforeMoney)
			
 
				-            if re.search('总投资|投资总额|总预算|总概算|(投资|招标|资金|存放|操作|融资)规模|批复概算|投资额|总规模|工程造价|总金额',
			
 
				-                         sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额  20241031工程造价作总投资
			
 
				-                # print('总投资金额: ', _match.group(0))
			
 
				-                notes = '总投资'
			
 
				-            elif re.search('投资|概算|建安费|其他费用|基本预备费',
			
 
				-                           sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/11/18 投资金额不作为招标金额
			
 
				-                notes = '投资'
			
 
				-            # elif re.search('工程造价',
			
 
				-            #                sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/12/20 工程造价不作为招标金额
			
 
				-            #     notes = '工程造价'
			
 
				-            elif (re.search('保证金', sentence_text[max(0, _match.span()[0] - 5):_match.span()[1]])
			
 
				-                  or re.search('保证金的?(缴纳)?(金额|金\?|额|\?)?[\(（]*(万?元|为?人民币|大写|调整|变更|已?修改|更改|更正)?[\)）]*[:：为]',
			
 
				-                               sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]])
			
 
				-                  or re.search('保证金由[\d.,]+.{,3}(变更|修改|更改|更正|调整?)为',
			
 
				-                               sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])):
			
 
				-                notes = '保证金'
			
 
				-                # print('保证金信息：', sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])
			
 
				-            elif re.search('成本(警戒|预警)(线|价|值)[^0-9元]{,10}',
			
 
				-                           sentence_text[max(0, _match.span()[0] - 10):_match.span()[0]]):
			
 
				-                notes = '成本警戒线'
			
 
				-            elif re.search('(监理|设计|勘察)(服务)?费(报价)?[约为：]|服务金额', sentence_text[_match.span()[0]:_match.span()[1]]):
			
 
				-                # cost_re = re.search('(监理|设计|勘察)(服务)?费', sentence_text[_match.span()[0]:_match.span()[1]])
			
 
				-                # notes = cost_re.group(1)
			
 
				-                notes = '招标或中标金额'
			
 
				-            elif re.search('单价|总金额', sentence_text[_match.span()[0]:_match.span()[1]]):
			
 
				-                notes = '单价'
			
 
				-            elif re.search('^[/每]', sentence_text[_match.end():]):
			
 
				-                # print('单价：', _match.group(0))
			
 
				-                notes = '单价'
			
 
				-            elif re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆]', entity_text) != None:
			
 
				-                notes = '大写'
			
 
				-                if entity_text[0] == "拾":  # 2021/12/16 修正大写金额省略了数字转换错误问题
			
 
				-                    entity_text = "壹" + entity_text
			
 
				-                # print("补充备注：notes = 大写")
			
 
				-            if len(unit) > 0:
			
 
				-                if unit.find('万') >= 0 and len(entity_text.split('.')[0]) >= 8:  # 2021/7/19 修正万元金额过大的情况
			
 
				-                    # print('修正单位万元金额过大的情况 金额：', entity_text, '单位:', unit)
			
 
				-                    entity_text = str(
			
 
				-                        getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]) / 10000)
			
 
				-                    unit = '元'  # 修正金额后单位 重置为元
			
 
				-                else:
			
 
				-                    # print('str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0])):')
			
 
				-                    entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]))
			
 
				-            else:
			
 
				-                if entity_text.find('万') >= 0 and entity_text.split('.')[0].isdigit() and len(
			
 
				-                        entity_text.split('.')[0]) >= 8:
			
 
				-                    entity_text = str(getUnifyMoney(entity_text) / 10000)
			
 
				-                    # print('修正金额字段含万 过大的情况')
			
 
				-                else:
			
 
				-                    entity_text = str(getUnifyMoney(entity_text))
			
 
				-            if science and re.search('^E-?\d+$', science):  # 科学计数
			
 
				-                entity_text = str(Decimal(entity_text + science)) if Decimal(entity_text + science) > 100 and Decimal(
			
 
				-                    entity_text + science) < 10000000000 else entity_text  # 结果大于100及小于100万才使用科学计算
			
 
				-
			
 
				-            if float(entity_text) > 100000000000:  # float(entity_text)<100 or  2022/3/4 取消最小金额限制
			
 
				-                # print('过滤掉金额：float(entity_text)<100 or float(entity_text)>100000000000', entity_text, unit)
			
 
				-                continue
			
 
				-
			
 
				-            if notSure and unit == "" and float(entity_text) > 100 * 10000:
			
 
				-                # print('过滤掉金额 notSure and unit=="" and float(entity_text)>100*10000：', entity_text, unit)
			
 
				-                continue
			
 
				-            # print("金额：{0} ,单位：{1}, 前文：{2}, filter: {3}, filter_unit: {4}".format(entity_text, unit, text_beforeMoney,
			
 
				-            #                                                                      filter, filter_unit))
			
 
				-            if re.search('[%％‰折]|费率|下浮率', text_beforeMoney) and float(entity_text)<1000: # 过滤掉可能是费率的金额
			
 
				-                # print('过滤掉可能是费率的金额')
			
 
				-                continue
			
 
				-            money_list.append((entity_text, start_index, end_index, unit, notes))
			
 
				-    return money_list, found_yeji
			
 
				 def cut_repeat_name(s):
			
 
				     '''
			
 
				     公司连续重复名称去重
			
@@ -4086,6 +3871,7 @@ if __name__=="__main__":
 
				     text = '是否拟中标人：是，评标排名：1，价格类型：（万元）报价：13311.1582，得分：84.46，项目负责人：邓焱文'
			
 
				     text = '，采购包1：采购包预算金额（元：1,500000.00，采购包最高限价（元：1,430600.00，'
			
 
				     text = '成交人：中坤电力有限公司，成交价格：11493,603.52元，质量：合格，项目工期：117天，'
			
 
				+    text = '3年投标报价（元）含税 6299700.00'
			
 
				     # text = '数量及单位1：65台，单价2：800，投标报价3=1×2：52000。'
			
 
				     print(get_money_entity(text, found_yeji=0))
			
 
				     # with open('D:/138786703.html', 'r', encoding='utf-8') as f:
			
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -442,7 +442,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     property_label = predictor.getPredictor('property_label').predict(title, product=','.join(product_list),project_name=codeName[0]['name'], prem=prem,channel_dic=channel_dic)
			
 
				 
			
 
				     '''最终验证prem'''
			
 
				-    getAttributes.confirm_prem(prem[0]['prem'], channel_dic, deposit_project, prem[0]['total_tendereeMoney'])
			
 
				+    getAttributes.confirm_prem(prem[0]['prem'], channel_dic, deposit_project, prem[0]['total_tendereeMoney'], name=codeName[0]['name'])
			
 
				 
			
 
				     # 提取拟在建所需字段
			
 
				     start_time = time.time()
			
@@ -455,7 +455,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				 
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
			
 
				-    version_date = {'version_date': '2024-11-25'}
			
 
				+    version_date = {'version_date': '2024-12-02'}
			
 
				     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
			
 
				 
			
 
				     if original_docchannel == 302:
			
--- a/BiddingKG/dl/interface/getAttributes.py
+++ b/BiddingKG/dl/interface/getAttributes.py
@@ -936,13 +936,19 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				                 #     packDict[packageName]["roleList"][i].money = money
			
 
				                 #     packDict[packageName]["roleList"][i].money_prob = money_prob
			
 
				                 if packDict[packageName]["roleList"][i].money_prob==0 :  # 2021/7/20第一次更新金额
			
 
				-                    packDict[packageName]["roleList"][i].money = money.entity_text
			
 
				+                    if money.notes == '单价':
			
 
				+                        packDict[packageName]["roleList"][i].unit_price = money.entity_text
			
 
				+                    else:
			
 
				+                        packDict[packageName]["roleList"][i].money = money.entity_text
			
 
				                     packDict[packageName]["roleList"][i].money_prob = money_prob
			
 
				                     packDict[packageName]["roleList"][i].money_unit = money.money_unit
			
 
				                 elif money_prob>packDict[packageName]["roleList"][i].money_prob+0.2 or (money.notes in ['大写'] and money.in_attachment==False): # 2021/7/20改为优先选择大写金额,
			
 
				                     # print('已连接金额概率：money_prob:',packDict[packageName]["roleList"][i].money_prob)
			
 
				                     # print('链接金额备注 ',money.notes, money.entity_text, money.values)
			
 
				-                    packDict[packageName]["roleList"][i].money = money.entity_text
			
 
				+                    if money.notes == '单价':
			
 
				+                        packDict[packageName]["roleList"][i].unit_price = money.entity_text
			
 
				+                    else:
			
 
				+                        packDict[packageName]["roleList"][i].money = money.entity_text
			
 
				                     packDict[packageName]["roleList"][i].money_prob = money_prob
			
 
				                     packDict[packageName]["roleList"][i].money_unit = money.money_unit
			
 
				                 # print('链接中的金额：{0}, 单位：{1}'.format(money.entity_text, money.money_unit))
			
@@ -2707,12 +2713,12 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				                         PackDict[packageName]["cost_warning"] = str(Decimal(entity.entity_text))
			
 
				 
			
 
				             elif entity.values[entity.label]>=on_value:
			
 
				-                if str(entity.label)=="1":
			
 
				+                if str(entity.label)=="1" and entity.notes != '单价':
			
 
				                     set_tenderer_money.add(float(entity.entity_text))
			
 
				                     list_tenderer_money.append(float(entity.entity_text))  # 2021/7/16 新增列表，倒序保存所有中标金额
			
 
				                     unit_list.append(entity.money_unit)
			
 
				                 # if str(entity.label)=="0":
			
 
				-                if str(entity.label)=="0" and entity.notes!='总投资':
			
 
				+                if str(entity.label)=="0" and (entity.notes!='总投资' or float(entity.entity_text)<100000000):
			
 
				                     '''
			
 
				                     if p_entity>0:
			
 
				                         p_before = list_entity[p_entity-1]
			
@@ -2731,16 +2737,119 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				                         #     PackDict["Project"]["tendereeMoney"] = str(Decimal(entity.entity_text))
			
 
				                         # if entity.values[entity.label]>on_value:
			
 
				                         if entity.values[entity.label]>max_prob-0.005: # 选择最大概率招标金额 2024/05/23 相差0.005尽量选前面的
			
 
				-                            PackDict["Project"]["tendereeMoney"] = str(Decimal(entity.entity_text))
			
 
				+                            if entity.notes == '单价':
			
 
				+                                PackDict["Project"]["unit_tendereeMoney"] = str(Decimal(entity.entity_text))
			
 
				+                            else:
			
 
				+                                PackDict["Project"]["tendereeMoney"] = str(Decimal(entity.entity_text))
			
 
				                             PackDict["Project"]["tendereeMoneyUnit"] = entity.money_unit
			
 
				                             max_prob = entity.values[entity.label]
			
 
				                     else:
			
 
				-                        PackDict[packageName]["tendereeMoney"] = str(Decimal(entity.entity_text))
			
 
				+                        if entity.notes == '单价':
			
 
				+                            PackDict[packageName]["unit_tendereeMoney"] = str(Decimal(entity.entity_text))
			
 
				+                        else:
			
 
				+                            PackDict[packageName]["tendereeMoney"] = str(Decimal(entity.entity_text))
			
 
				                         PackDict[packageName]["tendereeMoneyUnit"] = entity.money_unit
			
 
				                         #add pointer_tendereeMoney
			
 
				                         packagePointer.pointer_tendereeMoney = entity
			
 
				         p_entity -= 1            
			
 
				-    
			
 
				+
			
 
				+    '''包名与标段号链接'''
			
 
				+    l_main = []
			
 
				+    l_attn = []
			
 
				+    pack_num_main = 0
			
 
				+    name_num_main = 0
			
 
				+    pack_num_attn = 0
			
 
				+    name_num_attn = 0
			
 
				+    for entity in list_entity:
			
 
				+        if entity.entity_type in  ['name', 'package']:
			
 
				+            if entity.in_attachment:
			
 
				+                l_attn.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end))
			
 
				+                if entity.entity_type == 'name':
			
 
				+                    name_num_attn += 1
			
 
				+                else:
			
 
				+                    pack_num_attn += 1
			
 
				+            else:
			
 
				+                l_main.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end))
			
 
				+                if entity.entity_type == 'name':
			
 
				+                    name_num_main += 1
			
 
				+                else:
			
 
				+                    pack_num_main += 1
			
 
				+    if name_num_main > 0 and pack_num_main > 0:
			
 
				+        l_main.sort(key=lambda x: [x[2],x[3]])
			
 
				+        # print('正文名称：',l_main)
			
 
				+        link_dic = {}
			
 
				+        i = 1
			
 
				+        pre_ty = l_main[0][0]
			
 
				+        while i < len(l_main):
			
 
				+            if l_main[i][0] != pre_ty:
			
 
				+                ty1, ent1, s1, b1, e1 = l_main[i-1]
			
 
				+                ty2, ent2, s2, b2, e2 = l_main[i]
			
 
				+                if ty1 == 'package':
			
 
				+                    if ent1 not in link_dic:
			
 
				+                        link_dic[ent1] = []
			
 
				+                    if s1 == s2:
			
 
				+                        dist = abs(b2 - b1)
			
 
				+                    else:
			
 
				+                        dist = len(list_sentence[s1].sentence_text) - b1
			
 
				+                        for id in range(s1+1, s2):
			
 
				+                            dist += len(list_sentence[id].sentence_text)
			
 
				+                        dist += b2
			
 
				+                    link_dic[ent1].append((s2-s1, dist, ent2))
			
 
				+                elif ty2 == 'package':
			
 
				+                    if ent2 not in link_dic:
			
 
				+                        link_dic[ent2] = []
			
 
				+                    if s1 == s2:
			
 
				+                        dist = abs(b2 - b1)
			
 
				+                    else:
			
 
				+                        dist = len(list_sentence[s1].sentence_text) - b1
			
 
				+                        for id in range(s1+1, s2):
			
 
				+                            dist += len(list_sentence[id].sentence_text)
			
 
				+                        dist += b2
			
 
				+                    link_dic[ent2].append((s2-s1, dist, ent1))
			
 
				+            pre_ty = l_main[i][0]
			
 
				+            i += 1
			
 
				+        for k, v in link_dic.items():
			
 
				+            v.sort(key=lambda x: [x[0], x[1]])
			
 
				+            # print('各包排序后项目名：', k, v)
			
 
				+            PackDict[k]["name"] = v[0][2]
			
 
				+    elif name_num_attn > 0 and pack_num_attn > 0:
			
 
				+        # print("附件名称：", l_attn)
			
 
				+        l_attn.sort(key=lambda x: [x[2],x[3]])
			
 
				+        link_dic = {}
			
 
				+        i = 1
			
 
				+        pre_ty = l_attn[0][0]
			
 
				+        while i < len(l_attn):
			
 
				+            if l_attn[i][0] != pre_ty:
			
 
				+                ty1, ent1, s1, b1, e1 = l_attn[i-1]
			
 
				+                ty2, ent2, s2, b2, e2 = l_attn[i]
			
 
				+                if ty1 == 'package':
			
 
				+                    if ent1 not in link_dic:
			
 
				+                        link_dic[ent1] = []
			
 
				+                    if s1 == s2:
			
 
				+                        dist = abs(b2 - b1)
			
 
				+                    else:
			
 
				+                        dist = len(list_sentence[s1].sentence_text) - b1
			
 
				+                        for id in range(s1+1, s2):
			
 
				+                            dist += len(list_sentence[id].sentence_text)
			
 
				+                        dist += b2
			
 
				+                    link_dic[ent1].append((s2-s1, dist, ent2))
			
 
				+                elif ty2 == 'package':
			
 
				+                    if ent2 not in link_dic:
			
 
				+                        link_dic[ent2] = []
			
 
				+                    if s1 == s2:
			
 
				+                        dist = abs(b2 - b1)
			
 
				+                    else:
			
 
				+                        dist = len(list_sentence[s1].sentence_text) - b1
			
 
				+                        for id in range(s1+1, s2):
			
 
				+                            dist += len(list_sentence[id].sentence_text)
			
 
				+                        dist += b2
			
 
				+                    link_dic[ent2].append((s2-s1, dist, ent1))
			
 
				+            pre_ty = l_attn[i][0]
			
 
				+            i += 1
			
 
				+        for k, v in link_dic.items():
			
 
				+            v.sort(key=lambda x: [x[0], x[1]])
			
 
				+            # print('各包排序后项目名：', k, v)
			
 
				+            PackDict[k]["name"] = v[0][2]
			
 
				         
			
 
				     #删除一个机构有多个角色的数据
			
 
				     #删除重复人、概率不回传
			
@@ -2804,8 +2913,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				             _flag_pack_money = False
			
 
				     if _flag_pack_money and len(PackageSet)==len(dict_pack_tenderer_money.keys()):
			
 
				         for k,v in dict_pack_tenderer_money.items():
			
 
				-            v[0].money = list(v[1])[0]
			
 
				-            # print('k,v in dict_pack_tenderer_money.items', k, v)
			
 
				+            if float(v[0].unit_price) < float(list(v[1])[0]): # 20241128 金额大于单价时才作链接金额
			
 
				+                v[0].money = list(v[1])[0]
			
 
				     # 2021/7/16 #增加判断中标金额是否远大于招标金额逻辑
			
 
				     for pack in PackDict.keys():
			
 
				         for i in range(len(PackDict[pack]["roleList"])):
			
@@ -4217,7 +4326,7 @@ def correct_rolemoney(prem, total_product_money, list_articles): # 2022/9/26修
 
				                         #     l[2] = total_product_money
			
 
				                         #     log('修改中标金额为所有产品总金额')
			
 
				                         # if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money']) == 0 and float(l["role_money"]['money'])<total_product_money/10:
			
 
				-                        if l["role_name"] == 'win_tenderer' and (float(l["role_money"]['money']) == 0 or float(l["role_money"]['money'])<ree_money/2): # 改为小于一半招标金额或为0时替换为合计金额
			
 
				+                        if l["role_name"] == 'win_tenderer' and (float(l["role_money"]['money']) == 0 or (float(l["role_money"]['money'])<ree_money/2 and float(l["role_money"]['money'])<total_product_money<ree_money)): # 改为小于一半招标金额或为0时替换为合计金额
			
 
				                             l["role_money"]['money'] = total_product_money
			
 
				                             # print('修改中标金额为所有产品总金额')
			
 
				                     except Exception as e:
			
@@ -4622,6 +4731,11 @@ def update_prem(old_prem, new_prem, in_attachment=False):
 
				                     del_k.append(k)
			
 
				             for k in del_k:
			
 
				                 old_prem.pop(k)
			
 
				+        if in_attachment: # 附件表格提取的，原来提取有中标人，停止替换
			
 
				+            for v in old_prem.values():
			
 
				+                for d in v['roleList']:
			
 
				+                    if d['role_name'] in ['win_tenderer', 'pre_win_tenderer']:
			
 
				+                        return 0
			
 
				 
			
 
				         # if len(new_prem) > len(old_prem) and [k for k in new_prem if '自增' not in k] == []:  # 如果表格提取包号都为自增编号且包数大于非表格提取，不进行更新 例 244355092  281854766
			
 
				         #     return None
			
@@ -4707,7 +4821,7 @@ def update_prem(old_prem, new_prem, in_attachment=False):
 
				 
			
 
				     # return old_prem
			
 
				 
			
 
				-def confirm_prem(prem, channel_dic, is_deposit_project=False, total_tendereeMoney=0):
			
 
				+def confirm_prem(prem, channel_dic, is_deposit_project=False, total_tendereeMoney=0, name=""):
			
 
				     '''
			
 
				     规则检查纠正prem，如果Project包中标人在其他包中标人，去掉project包中标角色；如果有其他包中标人，去掉roleList为空的包；
			
 
				     :param prem: prem 字段字典
			
@@ -4758,6 +4872,10 @@ def confirm_prem(prem, channel_dic, is_deposit_project=False, total_tendereeMone
 
				         for k in prem:
			
 
				             if float(prem[k]['tendereeMoney'])==0:
			
 
				                 prem[k]['tendereeMoney'] = total_tendereeMoney
			
 
				+    if name != '' and len(prem)<=2: # 20241129 小于等于两个包且无包名称，取项目名称
			
 
				+        for k in prem:
			
 
				+            if prem[k].get('name', '') == '':
			
 
				+                prem[k]['name'] = name
			
 
				 
			
 
				 
			
 
				 def fix_single_source(prem, channel_dic, original_docchannel):
			
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -531,9 +531,12 @@ class CodeNamePredict():
 
				             if len(dict_name_freq_score) == 0:
			
 
				                 # name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[:：\s]+([^，。：；]{2,60})[，。]'
			
 
				                 name_re1 = '(项目|工程|招标|采购(条目)?|合同|标项|标的|计划|询价|询价单|询价通知书|申购单|申购)(名称|标名|标题|主题)[:：\s]+(?P<name>[^，。：；]{2,60})[，。]'
			
 
				+                name_re2 = '(合同|采购)包\d（(?P<name>[^，。：；]{2,60})）[：，。]' # 20241202 补充合同包 包名表达 558410976
			
 
				                 for sentence in list_sentence:
			
 
				                     # pad_sentence = sentence.sentence_text
			
 
				                     othername = re.search(name_re1, sentence.sentence_text)
			
 
				+                    if othername == None:
			
 
				+                        othername = re.search(name_re2, sentence.sentence_text)
			
 
				                     if othername != None:
			
 
				                         project_name = othername.group('name')
			
 
				                         if re.search('[\u4e00-\u9fa5]+', project_name) == None:  # 没有中文的项目名称去除
			
@@ -869,7 +872,7 @@ class PREMPredict():
 
				                 elif re.search('^放弃中标资格|是否中标：否|^(中标|成交)(公示|公告)', behind):
			
 
				                     values[2] = 0.5
			
 
				                     label = 5
			
 
				-                elif re.search('^，?(投标报价|(资格性审查：|符合性审查：)?(不通过|不符合))', behind) and re.search('中标|成交|中选|排名|排序|名次|第[一1]名', front)==None:
			
 
				+                elif re.search('^，?(投标报价|(资格性审查：|符合性审查：)?(不通过|不符合))', behind) and re.search('中标|成交|中选|排名|排序|名次|第[一1]', front)==None and values[2]<0.7: #20241126补充条件避免漏提 560768263 第一候选人：单位名称： 上海理想信息产业（集团）有限公司 ，投标报价：
			
 
				                     values[2] = 0.5
			
 
				                     label = 5
			
 
				                 elif re.search('(承包权人|帐户名称|债务人|推荐预审合格投标人名单)：$|确定为标的的受让方，$|[主次出]入口?，?$|确定(项目|\w{,2})成交供应商，$|，承刻单位：$|乙方接受为$|丙方：$', front):  # 234501112 民币元，序号：1，债务人： 东营市海宁工贸有限责任公司 ，债权本金： 262414286 八、中标后签约单位，合同签约单位： 241929628 1月9，承刻单位： 肃宁县超凡网络光敏印章刻印部 ，印章预留印模
			
@@ -982,8 +985,8 @@ class PREMPredict():
 
				                     values[label] = 0.49
			
 
				                 elif re.search('(含|在|包括|[大小等高低]于|达到)$|[\d.%]+[+×*-]$', front):
			
 
				                     values[label] = 0.49
			
 
				-                elif entity.notes == '单价' and float(entity.entity_text)<5000:
			
 
				-                    label = 2
			
 
				+                # elif entity.notes == '单价' and float(entity.entity_text)<5000: # 20241128 注释，单价单独存放
			
 
				+                #     label = 2
			
 
				             elif label ==0: # 错误招标金额处理
			
 
				                 if re.search('投资(金额|规模)：$', front): # 545988699 金额不大的投资金额作为备选招标金额
			
 
				                     values[label] = 0.51
			
@@ -994,8 +997,8 @@ class PREMPredict():
 
				                     values[label] = 0.49
			
 
				                 # elif re.search('(含|在|包括|[大小等高低]于|如预算金额为)$|[\d.%]+(（含）)?[+×*-]$', front):  # 2024/10/30 注销，避免漏提 预算金额：控制在26000元以内由合作银行出资 ；投资金额不低于人民币500万元
			
 
				                 #     values[label] = 0.49
			
 
				-                elif entity.notes == '单价' and float(entity.entity_text)<5000:
			
 
				-                    label = 2
			
 
				+                # elif entity.notes == '单价' and float(entity.entity_text)<5000: # 20241128 注释，单价单独存放
			
 
				+                #     label = 2
			
 
				             elif re.search('报价：预估不?含税总价[为：]$', front) and (label != 1 or values[label]<0.5):
			
 
				                 label = 1
			
 
				                 values[label] = 0.8
			
@@ -2334,12 +2337,12 @@ class RoleGrade():
 
				         self.tenderee_left_6 = "(?P<tenderee_left_6>(业主|建设|委托)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|业主|买方)"
			
 
				         self.tenderee_left_5 = "(?P<tenderee_left_5>(发布)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|买方|发布机构)"
			
 
				         self.agency_left_9 = "(?P<agency_left_9>代理)"
			
 
				-        self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得)|第[1一]名|排[名序]：1|名次：1)"
			
 
				+        self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得)|第[1一](名|候选)|排[名序]：1|名次：1)"
			
 
				         self.winTenderer_left_8 = "(?P<winTenderer_left_8>(入选供应商|供货商|乙方|最[终后]选[择取]))"  # 229435497 最后选择西平，县中原彩印有限公司，作为此项目中标供应商，
			
 
				         self.winTenderer_left_6 = "(?P<winTenderer_left_6>(入围|承[接建包修做制担租销]))"
			
 
				         self.winTenderer_right_9 = "(?P<winTenderer_right_9>^(为(中标|成交|中选)(人|单位|供应商|公司)|以\d+[\d.,]+万?元中标))"
			
 
				-        self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排[名序]：2|名次：2))"
			
 
				-        self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排[名序]：3|名次：3))"
			
 
				+        self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2](名|候选)|排[名序]：2|名次：2))"
			
 
				+        self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3](名|候选)|排[名序]：3|名次：3))"
			
 
				         self.pattern_list = [self.tenderee_left_9,self.tenderee_center_8, self.tenderee_left_8,self.tenderee_left_6,self.tenderee_left_5,self.agency_left_9,
			
 
				                              self.winTenderer_left_9,self.winTenderer_left_8, self.winTenderer_right_9, self.winTenderer_left_6, self.secondTenderer_left_9, self.thirdTenderer_left_9] # 概率要由高到低 274941849
			
 
				     def predict(self, list_sentences, list_entitys, original_docchannel, span=15, min_prob=0.7):
			
@@ -2456,8 +2459,8 @@ class RoleGrade():
 
				         for entity in low_prob_winner: # 如果低概率中标人在招标或代理列表，改为非角色
			
 
				             if entity.entity_text in all_tenderee_agency:
			
 
				                 entity.label = 5
			
 
				-            elif entity.in_attachment: # 附件低概率中标角色不要 避免：516109391 桂林银行崇左宁明支行，宁明县城中镇兴宁大道中70号，预测为中标
			
 
				-                entity.label = 5
			
 
				+            # elif entity.in_attachment: # 附件低概率中标角色不要 避免：516109391 桂林银行崇左宁明支行，宁明县城中镇兴宁大道中70号，预测为中标 20241126 注释掉，558294326 附件单个候选人漏提取
			
 
				+            #     entity.label = 5
			
 
				 
			
 
				         if org_winner != []:
			
 
				             flag = 0
			
@@ -2499,7 +2502,7 @@ class MoneyGrade():
 
				                     if ser:
			
 
				                         groupdict = pattern.split('>')[0].replace('(?P<', '')
			
 
				                         _role, _direct, _prob = groupdict.split('_')
			
 
				-                        if re.search('单价', context[-4:]) or re.search('(最低|风险)控制价', context):# or float(entity.entity_text)<100:
			
 
				+                        if re.search('单价', context[-4:]) or re.search('(最低|风险)控制价', context) or entity.notes == '总投资':# or float(entity.entity_text)<100:
			
 
				                             _prob = 6
			
 
				                         _label = role2id.get(_role)
			
 
				                         if _label != entity.label:
			
@@ -2522,8 +2525,8 @@ class MoneyGrade():
 
				                     # _prob = min_prob - 0.1 if in_att else min_prob
			
 
				                     entity.values[entity.label] = _prob + entity.values[entity.label] / 20
			
 
				                     # print('找不到规则修改金额概率：', entity.entity_text, entity.label, entity.values)
			
 
				-            if entity.entity_type in ['money'] and entity.label in [0, 1] and 0.5<=entity.values[entity.label]<0.75 and float(entity.entity_text)<100: # 20241011 低概率小金额改为其他金额
			
 
				-                entity.label = 2
			
 
				+            # if entity.entity_type in ['money'] and entity.label in [0, 1] and 0.5<=entity.values[entity.label]<0.75 and float(entity.entity_text)<100: # 20241011 低概率小金额改为其他金额 # 20241128 小金额可能为单价，放单价存放
			
 
				+            #     entity.label = 2
			
 
				 
			
 
				 
			
 
				 # 时间类别
			
@@ -5765,16 +5768,233 @@ class DistrictPredictor():
 
				         with open(os.path.dirname(__file__) + "/area_variance_dic.pkl", 'rb') as f: # 20241113 地区变更新旧名称对照字典
			
 
				             self.area_variance_dic = pickle.load(f)
			
 
				 
			
 
				-    def predict_backup(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
			
 
				-        '''
			
 
				-        先匹配 project_name+tenderee+tenderee_address， 如果缺少省或市 再匹配 title+content
			
 
				-        :param project_name:
			
 
				-        :param prem:
			
 
				-        :param title:
			
 
				-        :param list_articles:
			
 
				-        :param web_source_name:
			
 
				-        :return:
			
 
				-        '''
			
 
				+    def predict_area(self, title, ree, addr, web_source_name):
			
 
				+        p_pro, p_city, p_dis, idx_dic, full_dic, short_dic = self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic
			
 
				+
			
 
				+        def find_whole_areas(text, weight=1):
			
 
				+            '''
			
 
				+            Find province / city / district names in a text with the area regexes.
			
 
				+            The text is normalised first (listed phrases are blanked out and some place names are rewritten),
			
 
				+            then scanned with one pattern built from p_pro, p_city and p_dis; every accepted hit gets a score.
			
 
				+            :param text: text to search; it is converted with str() first
			
 
				+            :param weight: multiplier applied to the score of every hit
			
 
				+            :return: (province_l, city_l, district_l), three lists of (name, score) tuples
			
 
				+            '''
			
 
				+            province_l, city_l, district_l = [], [], []
			
 
				+
			
 
				+            text = str(text)
			
 
				+            # blank out the listed phrases before matching (presumably they contain area names only by
			
 
				+            # coincidence, e.g. "复合肥" contains "合肥" - confirm with the rule owner)
			
 
				+            text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间|福田汽车|中山(大学|公园|纪念堂)|孙中山|海天水泥|阳光采购|示范县',
			
 
				+                          ' ', text)  # e.g. doc 544151395: "赤壁市老城区燃气管道老化更新改造"
			
 
				+            text = re.sub('珠海城市', '珠海', text)  # fix for doc 426624023: "珠海城市" was predicted as "海城市"
			
 
				+            text = re.sub('怒江州', '怒江傈僳族自治州', text)  # fix for doc 423589589: "所属地域：怒江州" was recognised as 广西 - 崇左 - 江州
			
 
				+            text = re.sub('茂名滨海新区', '茂名市', text)
			
 
				+            text = re.sub('中山([东南西][部区环]|黄圃|南头|东凤|小榄|石岐|翠亨|南朗)', '中山市', text)
			
 
				+            text = re.sub('横州市', '横县', text)  # e.g. doc 547363890: fix "广西南宁横州" not being in the area table
			
 
				+            # if "海南<county>" is found without "黎族", append "黎族" to it
			
 
				+            ser = re.search('海南(昌江|白沙|乐东|陵水|保亭|琼中)(黎族)?', text)
			
 
				+            if ser and '黎族' not in ser.group(0):
			
 
				+                text = text.replace(ser.group(0), ser.group(0) + '黎族')
			
 
				+            for k, v in self.area_variance_dic.items():  # 20241113: replace text according to the area-change mapping
			
 
				+                text = text.replace(k, v)
			
 
				+
			
 
				+            # nothing to match when the text has no character in the range \u4e00-\u9fa5
			
 
				+            if re.search('[\u4e00-\u9fa5]', text) == None:
			
 
				+                return province_l, city_l, district_l
			
 
				+
			
 
				+            # alternatives: province with optional city and district | city with optional district | district alone
			
 
				+            pettern = "((?P<prov>%s)(?P<city>%s)?(?P<dist>%s)?)|((?P<city1>%s)(?P<dist1>%s)?)|(?P<dist2>%s)" % (
			
 
				+                p_pro, p_city, p_dis, p_city, p_dis, p_dis)
			
 
				+
			
 
				+            for it in re.finditer(pettern, text):
			
 
				+                if it.group(0) == '站前':  # 20240314: fix texts like "中铁二局新建沪苏湖铁路工程站前VI标项目" being recognised as province 辽宁, city 营口, district 站前
			
 
				+                    continue
			
 
				+                for k, v in it.groupdict().items():
			
 
				+                    if v != None:
			
 
				+                        # skip a name that ends the whole match, has no administrative suffix and is directly
			
 
				+                        # followed by village/town/street/road, hotel or development-zone wording
			
 
				+                        if it.end() == it.end(k) and re.search('[省市区县州旗盟]$', v) == None and re.search(
			
 
				+                                '^([东南西北中一二三四五六七八九十大小]?(村|镇|街|路|道|社区)|酒店|宾馆|经济开发区|开发区|新区)',
			
 
				+                                # a city name must not be taken from a zone-level address; fixes "滨州北海经济开发区", "北海新区" etc. being extracted as 北海
			
 
				+                                text[it.end(k):]) != None:
			
 
				+                            continue
			
 
				+                        # scoring (same scheme for every level): 2 when the name is in full_dic (districts also need
			
 
				+                        # more than 2 characters), otherwise 1 (0.5 for districts) plus 1 when the hit starts the text,
			
 
				+                        # is followed by branch/campus/section/metro/station wording or sits between full-width
			
 
				+                        # parentheses; a position term it.end(k) / len(text) / 10 is always added
			
 
				+                        if k in ['prov']:
			
 
				+                            if v in full_dic['province']:
			
 
				+                                score = 2
			
 
				+                            else:
			
 
				+                                score = 1
			
 
				+                                if it.start(k)==0 or re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
			
 
				+                                        , text[it.end(k):]) or (it.start(k)>0 and it.end(k)<len(text) and text[it.start(k)-1]=='（' and text[it.end(k)]=='）'):
			
 
				+                                    score += 1
			
 
				+                            score += it.end(k) / len(text) / 10
			
 
				+                            province_l.append((v, score * weight))
			
 
				+                        elif k in ['city', 'city1']:
			
 
				+                            if v in full_dic['city']:
			
 
				+                                score = 2
			
 
				+                            else:
			
 
				+                                score = 1
			
 
				+                                if it.start(k)==0 or re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
			
 
				+                                        , text[it.end(k):]) or (it.start(k)>0 and it.end(k)<len(text) and text[it.start(k)-1]=='（' and text[it.end(k)]=='）'):
			
 
				+                                    score += 1
			
 
				+                            score += it.end(k) / len(text) / 10
			
 
				+                            city_l.append((v, score * weight))
			
 
				+                        elif k in ['dist', 'dist1', 'dist2']:
			
 
				+                            # these generic district names are ignored
			
 
				+                            if v in ['东区', '西区', '城区', '郊区', '矿区']:
			
 
				+                                continue
			
 
				+                            if v in full_dic['district'] and len(v)>2:
			
 
				+                                score = 2
			
 
				+                            else:
			
 
				+                                score = 0.5
			
 
				+                                if it.start(k)==0 or re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
			
 
				+                                        , text[it.end(k):]) or (it.start(k)>0 and it.end(k)<len(text) and text[it.start(k)-1]=='（' and text[it.end(k)]=='）'):
			
 
				+                                    score += 1
			
 
				+                                    # print('district bonus:', v, text)
			
 
				+                            score += it.end(k) / len(text) / 10
			
 
				+                            # "昌江" is recorded as "昌江黎族" unless the match contains "景德镇"
			
 
				+                            if v == '昌江' and '景德镇' not in it.group(0):
			
 
				+                                district_l.append(('昌江黎族', score * weight))
			
 
				+                            else:
			
 
				+                                district_l.append((v, score * weight))
			
 
				+            return province_l, city_l, district_l
			
 
				+
			
 
				+        def merge_score(province_l, city_l, district_l, filter_short_dist=True):
			
 
				+            '''
			
 
				+            Turn matched area names into area ids and merge their scores; the score of a lower-level area is
			
 
				+            also added to its parent city / province.
			
 
				+            :param province_l: extracted provinces, [(name, score)]
			
 
				+            :param city_l: extracted cities, [(name, score)]
			
 
				+            :param district_l: extracted districts, [(name, score)]
			
 
				+            :param filter_short_dist: when True, a district found in short_dic does not add score to its city /
			
 
				+                                      province if that province is not already in pro_ids
			
 
				+            :return: (pro_ids, city_ids, dis_ids), dicts of area id -> accumulated score
			
 
				+            '''
			
 
				+            pro_ids = dict()
			
 
				+            city_ids = dict()
			
 
				+            dis_ids = dict()
			
 
				+            for pro in province_l:
			
 
				+                name, score = pro
			
 
				+                # full_dic is checked first, short_dic is the fallback
			
 
				+                idx = full_dic['province'][name] if name in full_dic['province'] else short_dic['province'][name]
			
 
				+                if idx not in pro_ids:
			
 
				+                    pro_ids[idx] = 0
			
 
				+                pro_ids[idx] += score
			
 
				+
			
 
				+            tmp_pro = {}  # province id -> score contributed by the matched cities
			
 
				+            for city in city_l:
			
 
				+                name, score = city
			
 
				+                if name in full_dic['city']:
			
 
				+                    for idx in full_dic['city'][name]:  # every id listed for the name gets the score
			
 
				+                        if idx not in city_ids:
			
 
				+                            city_ids[idx] = 0
			
 
				+                        city_ids[idx] += score
			
 
				+                        pro_idx = idx_dic[idx]['省']
			
 
				+                        if pro_idx in tmp_pro:
			
 
				+                            tmp_pro[pro_idx] += score
			
 
				+                        else:
			
 
				+                            tmp_pro[pro_idx] = score
			
 
				+                elif name in short_dic['city']:
			
 
				+                    for idx in short_dic['city'][name]:
			
 
				+                        if idx not in city_ids:
			
 
				+                            city_ids[idx] = 0
			
 
				+                        city_ids[idx] += score
			
 
				+                        pro_idx = idx_dic[idx]['省']
			
 
				+                        if pro_idx in tmp_pro:
			
 
				+                            tmp_pro[pro_idx] += score
			
 
				+                        else:
			
 
				+                            tmp_pro[pro_idx] = score
			
 
				+            # if a province implied by the cities was also matched directly, only the directly matched
			
 
				+            # provinces are boosted; otherwise the implied provinces become candidates themselves
			
 
				+            if set(tmp_pro) & set(pro_ids) != set():
			
 
				+                for k, v in tmp_pro.items():
			
 
				+                    if k in pro_ids:
			
 
				+                        pro_ids[k] += v
			
 
				+            else:
			
 
				+                pro_ids.update(tmp_pro)
			
 
				+            tmp_pro = {}  # province id -> score contributed by the matched districts
			
 
				+            tmp_city = {}  # city id -> score contributed by the matched districts
			
 
				+            for dis in district_l:
			
 
				+                name, score = dis
			
 
				+                if name in full_dic['district']:
			
 
				+                    for idx in full_dic['district'][name]:
			
 
				+                        if idx not in dis_ids:
			
 
				+                            dis_ids[idx] = 0
			
 
				+                        dis_ids[idx] += score
			
 
				+                        pro_idx = idx_dic[idx]['省']
			
 
				+                        if pro_idx in tmp_pro:
			
 
				+                            tmp_pro[pro_idx] += score
			
 
				+                        else:
			
 
				+                            tmp_pro[pro_idx] = score
			
 
				+                        city_idx = idx_dic[idx]['市']
			
 
				+                        if city_idx in tmp_city:
			
 
				+                            tmp_city[city_idx] += score
			
 
				+                        else:
			
 
				+                            tmp_city[city_idx] = score
			
 
				+                elif name in short_dic['district']:
			
 
				+                    for idx in short_dic['district'][name]:
			
 
				+                        if idx not in dis_ids:
			
 
				+                            dis_ids[idx] = 0
			
 
				+                        dis_ids[idx] += score
			
 
				+                        pro_idx = idx_dic[idx]['省']
			
 
				+                        # the district itself is already scored above; only the propagation to its city /
			
 
				+                        # province is skipped when the province is not among the candidates
			
 
				+                        if filter_short_dist and pro_idx not in pro_ids:
			
 
				+                            continue
			
 
				+                        if pro_idx in tmp_pro:
			
 
				+                            tmp_pro[pro_idx] += score
			
 
				+                        else:
			
 
				+                            tmp_pro[pro_idx] = score
			
 
				+                        city_idx = idx_dic[idx]['市']
			
 
				+                        if city_idx in tmp_city:
			
 
				+                            tmp_city[city_idx] += score
			
 
				+                        else:
			
 
				+                            tmp_city[city_idx] = score
			
 
				+            # same merge rule as for the cities above, applied to the district contributions
			
 
				+            if set(tmp_pro) & set(pro_ids) != set():
			
 
				+                for k, v in tmp_pro.items():
			
 
				+                    if k in pro_ids:
			
 
				+                        pro_ids[k] += v
			
 
				+            else:
			
 
				+                pro_ids.update(tmp_pro)
			
 
				+            if set(tmp_city) & set(city_ids) != set():
			
 
				+                for k, v in tmp_city.items():
			
 
				+                    if k in city_ids:
			
 
				+                        city_ids[k] += v
			
 
				+            else:
			
 
				+                city_ids.update(tmp_city)
			
 
				+            return pro_ids, city_ids, dis_ids
			
 
				+
			
 
				+        def get_final_addr(pro_ids, city_ids, dis_ids):
			
 
				+            '''
			
 
				+            Pick the final province, city and district from the scored id dicts.
			
 
				+            The best-scored province is taken first; the city is the best-scored city inside that province and
			
 
				+            the district is the best-scored district inside that city. If no city was found, the best-scored
			
 
				+            district inside the province supplies both the city and the district.
			
 
				+            :param pro_ids: dict of province id -> score
			
 
				+            :param city_ids: dict of city id -> score
			
 
				+            :param dis_ids: dict of district id -> score
			
 
				+            :return: (big_area, pred_pro, pred_city, pred_dis, prob, max_score); names are "" when not found,
			
 
				+                     prob is the top province score divided by the sum of all province scores and max_score is
			
 
				+                     the top province score
			
 
				+            '''
			
 
				+            big_area = ""
			
 
				+            pred_pro = ""
			
 
				+            pred_city = ""
			
 
				+            pred_dis = ""
			
 
				+
			
 
				+            final_pro = ""
			
 
				+            final_city = ""
			
 
				+            prob = 0
			
 
				+            max_score = 0
			
 
				+            if len(pro_ids) >= 1:
			
 
				+                pro_l = sorted([(k, v) for k, v in pro_ids.items()], key=lambda x: x[1], reverse=True)
			
 
				+                scores = [it[1] for it in pro_l]
			
 
				+                prob = max(scores)/sum(scores)
			
 
				+                max_score = max(scores)
			
 
				+                final_pro, score = pro_l[0]
			
 
				+                if score >= 0.01:
			
 
				+                    pred_pro = idx_dic[final_pro]['返回名称']
			
 
				+                    big_area = idx_dic[final_pro]['大区']
			
 
				+            if pred_pro != "" and len(city_ids) >= 1:
			
 
				+                city_l = sorted([(k, v) for k, v in city_ids.items()], key=lambda x: x[1], reverse=True)
			
 
				+                for it in city_l:
			
 
				+                    if idx_dic[it[0]]['省'] == final_pro:
			
 
				+                        final_city = it[0]
			
 
				+                        pred_city = idx_dic[final_city]['返回名称']
			
 
				+                        break
			
 
				+            if final_city != "" and len(set(dis_ids)) >= 1:
			
 
				+                dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
			
 
				+                for it in dis_l:
			
 
				+                    if idx_dic[it[0]]['市'] == final_city:
			
 
				+                        pred_dis = idx_dic[it[0]]['返回名称']
			
 
				+                        # dis_l is sorted best-first: stop here so the best-scored district of the city wins
			
 
				+                        # (without the break the lowest-scored match overwrote it)
			
 
				+                        break
			
 
				+            elif pred_pro != "" and pred_city == "" and len(set(dis_ids)) >= 1:  # 20241111: province found but no city; if a district lies in the province, fill in its city and the district
			
 
				+                dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
			
 
				+                for it in dis_l:
			
 
				+                    if idx_dic[it[0]]['省'] == final_pro:
			
 
				+                        pred_city = idx_dic[idx_dic[it[0]]['市']]['返回名称']
			
 
				+                        pred_dis = idx_dic[it[0]]['返回名称']
			
 
				+                        # best-first order again: keep the best-scored district of the province
			
 
				+                        break
			
 
				+            # for the four municipalities the district is reported at city level and the district is cleared
			
 
				+            if pred_city in ['北京', '天津', '上海', '重庆']:
			
 
				+                pred_city = pred_dis
			
 
				+                pred_dis = ""
			
 
				+            return big_area, pred_pro, pred_city, pred_dis, prob, max_score
			
 
				+
			
 
				         def get_ree_addr(prem):
			
 
				             tenderee = ""
			
 
				             tenderee_address = ""
			
@@ -5787,92 +6007,6 @@ class DistrictPredictor():
 
				             except Exception as e:
			
 
				                 print('解析prem 获取招标人、及地址出错')
			
 
				             return tenderee, tenderee_address
			
 
				-        def get_area(text, web_source_name, not_in_content=True):
			
 
				-            score_l = []
			
 
				-            id_set = set()
			
 
				-
			
 
				-            if re.search(self.short_name, text):
			
 
				-                for it in re.finditer(self.full_name, text):
			
 
				-                    name = it.group(0)
			
 
				-                    score = len(name) / len(text)
			
 
				-                    for _id in self.full2id[name]:
			
 
				-                        area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
			
 
				-                        # score_l.append([_id, score] + area)
			
 
				-                        # w = self.dist_dic[_id]['权重']
			
 
				-                        score_l.append([_id, score + 1] + area) # 匹配全称的加1 ，不加权重，因为权重某些赋值不好
			
 
				-
			
 
				-                flag = 0
			
 
				-                for it in re.finditer(self.short_name, text):
			
 
				-                    if it.end() < len(text) and re.search('^(村|镇|街|路|江|河|湖|北路|南路|东路|大道|社区)', text[it.end():]) == None:
			
 
				-                        name = it.group(0)
			
 
				-                        score = (it.start() + len(name)) / len(text)
			
 
				-                        for _id in self.short2id[name]:
			
 
				-                            score2 = 0
			
 
				-                            w = self.dist_dic[_id]['权重']
			
 
				-                            _type = self.dist_dic[_id]['类型']
			
 
				-                            area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
			
 
				-                            if area[0] in ['2', '16', '20', '30']:
			
 
				-                                _type += 10
			
 
				-                            if w < 1 and it.end() < len(text) and text[it.end()] in ['省', '市', '县']: # 如果简称后面 有省市县权重改为1
			
 
				-                                w = 1
			
 
				-                            score2 += w
			
 
				-                            if _id not in id_set:
			
 
				-                                if _type == 20:
			
 
				-                                    type_w = 3
			
 
				-                                elif _type == 30:
			
 
				-                                    if it.start()>3 and text[it.start()-1] == '市': # 城市后面 简称不能作为市
			
 
				-                                        type_w = 0
			
 
				-                                    else:
			
 
				-                                        type_w = 2
			
 
				-                                else:
			
 
				-                                    if it.end()<len(text) and text[it.end()] == '市': # 简称后面 有市字 改为市级
			
 
				-                                        type_w = 2
			
 
				-                                    else:
			
 
				-                                        type_w = 0.5
			
 
				-                                id_set.add(_id)
			
 
				-                                score2 += w * type_w
			
 
				-                            score_l.append([_id, score * w + score2] + area)
			
 
				-
			
 
				-                if flag == 1:
			
 
				-                    pass
			
 
				-                #         print('score', score)
			
 
				-            if re.search('公司', web_source_name) == None:
			
 
				-                for it in re.finditer(self.short_name, web_source_name):
			
 
				-                    name = it.group(0)
			
 
				-                    for _id in self.short2id[name]:
			
 
				-                        area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
			
 
				-                        w = self.dist_dic[_id]['权重']
			
 
				-                        score = w * 0.2
			
 
				-                        score_l.append([_id, score] + area)
			
 
				-            area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
			
 
				-            if len(score_l) == 0:
			
 
				-                return {'district': area_dic}
			
 
				-            else:
			
 
				-                df = pd.DataFrame(score_l, columns=['id', 'score', 'province', 'city', 'district'])
			
 
				-                df['简称'] = df['id'].apply(lambda x: self.dist_dic[x]['地区'])
			
 
				-                # print('地区评分：')
			
 
				-                # print(df)
			
 
				-                df_pro = df.groupby('province').sum().sort_values(by=['score'], ascending=False)
			
 
				-                pro_id = df_pro.index[0]
			
 
				-                if df_pro.loc[pro_id, 'score'] < 0.1 and not_in_content:  # 不是二次全文匹配的 省级评分小于0.1的不要
			
 
				-                    # print('评分低于0.1', df_pro.loc[pro_id, 'score'], self.dist_dic[pro_id]['地区'])
			
 
				-                    return {'district': area_dic}
			
 
				-                area_dic['province'] = self.dist_dic[pro_id]['地区']
			
 
				-                area_dic['area'] = self.dist_dic[pro_id]['大区']
			
 
				-                df = df[df['city'] != ""]
			
 
				-                df = df[df['province'] == pro_id]
			
 
				-                if len(df) > 0:
			
 
				-                    df_city = df.groupby('city').sum().sort_values(by=['score'], ascending=False)
			
 
				-                    city_id = df_city.index[0]
			
 
				-                    area_dic['city'] = self.dist_dic[city_id]['地区']
			
 
				-                    df = df[df['district'] != ""]
			
 
				-                    df = df[df['city'] == city_id]
			
 
				-                    if len(df) > 0:
			
 
				-                        df_dist = df.groupby('district').sum().sort_values(by=['score'], ascending=False)
			
 
				-                        dist_id = df_dist.index[0]
			
 
				-                        area_dic['district'] = self.dist_dic[dist_id]['地区']
			
 
				-                # print(area_dic)
			
 
				-                return {'district': area_dic}
			
 
				 
			
 
				         def get_role_address(text):
			
 
				             '''正则匹配获取招标人地址
			
@@ -5892,14 +6026,17 @@ class DistrictPredictor():
 
				                 return ''
			
 
				 
			
 
				         def get_project_addr(text):
			
 
				+            '''
			
 
				+            Extract an address from the text with two regex rules.
			
 
				+            p1 matches a labelled field (a word such as 项目/施工/交货 followed by 地址/地点/位置/所在地 and "："),
			
 
				+            p2 matches the phrase "项目位于XX市XX区".
			
 
				+            :param text: text to search
			
 
				+            :return: the 'addr' group of the first rule that matches (p1 is tried before p2), '' when neither matches
			
 
				+            '''
			
 
				-            p1 = '(项目(施工|实施)?|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
			
 
				+            p1 = '(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)(位于)?：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+([\w（）]{,20}[，。])?|\w{2,15}[，。])'
			
 
				+            p2 = '项目位于(?P<addr>\w{2}市\w{2,4}区)'
			
 
				             if re.search(p1, text):
			
 
				                 return re.search(p1, text).group('addr')
			
 
				+            elif re.search(p2, text):
			
 
				+                return re.search(p2, text).group('addr')
			
 
				             else:
			
 
				                 return ''
			
 
				 
			
 
				         def get_bid_addr(text):
			
 
				-            p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?)：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
			
 
				+            p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售|所属)(地址|地点|所在地区?|地域)：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
			
 
				             if re.search(p2, text):
			
 
				                 return re.search(p2, text).group('addr')
			
 
				             else:
			
@@ -5909,7 +6046,7 @@ class DistrictPredictor():
 
				             tenderee_l = []
			
 
				             addr_l = []
			
 
				             for ent in list_entitys[0]:
			
 
				-                if ent.entity_type == 'location' and len(ent.entity_text)>2:
			
 
				+                if ent.entity_type == 'location' and len(ent.entity_text) > 2:
			
 
				                     addr_l.append(ent.entity_text)
			
 
				                 elif ent.entity_type in ['org', 'company']:
			
 
				                     if ent.label in [0, 1]:  # 加招标或代理
			
@@ -5923,85 +6060,43 @@ class DistrictPredictor():
 
				             else:
			
 
				                 return ''
			
 
				 
			
 
				-        if '##attachment##' in list_articles[0].content:
			
 
				-            content, attachment = list_articles[0].content.split('##attachment##')
			
 
				-            if len(content) < 200:
			
 
				-                content += attachment
			
 
				-        else:
			
 
				-            content = list_articles[0].content
			
 
				-
			
 
				-        tenderee, tenderee_address = get_ree_addr(prem)
			
 
				-        msc = ""
			
 
				-        pro_addr = get_project_addr(content)
			
 
				-        if pro_addr != "":
			
 
				-            msc += '使用规则提取的项目地址；'
			
 
				-            tenderee_address = pro_addr
			
 
				-        else:
			
 
				-            role_addr = get_role_address(content)
			
 
				-            if role_addr != "":
			
 
				-                msc += '使用规则提取的联系人地址；'
			
 
				-                tenderee_address = role_addr
			
 
				-
			
 
				-        if tenderee_address == "":
			
 
				-            title_addr = get_title_addr(title)
			
 
				-            if title_addr != "":
			
 
				-                msc += '使用规则提取的标题地址；'
			
 
				-                tenderee_address = title_addr
			
 
				-            else:
			
 
				-                bid_addr = get_bid_addr(content)
			
 
				-                if bid_addr != "":
			
 
				-                    msc += '使用规则提取的开标地址；'
			
 
				-                    tenderee_address = bid_addr
			
 
				-
			
 
				-        project_name = str(project_name)
			
 
				-        tenderee = str(tenderee)
			
 
				-
			
 
				-        # print('招标人地址',role_addr, tenderee_address)
			
 
				-
			
 
				-        project_name = project_name + title if project_name not in title else project_name
			
 
				-        project_name = project_name.replace(tenderee, '')
			
 
				-
			
 
				-        text1 = "{0} {1} {2}".format(project_name, tenderee, tenderee_address)
			
 
				-
			
 
				-        web_source_name = str(web_source_name)  # 修复某些不是字符串类型造成报错
			
 
				-        text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1)  #预防提取错 合肥 路南 新会 等地区
			
 
				-
			
 
				-        if pro_addr:
			
 
				-            msc += '## 使用项目地址输入：%s ##；' % pro_addr
			
 
				-            rs = get_area(pro_addr, '')
			
 
				-            msc += '预测结果：省份：%s， 城市：%s，区县：%s；' % (
			
 
				-                rs['district']['province'], rs['district']['city'], rs['district']['district'])
			
 
				-            if rs['district']['province'] != '全国':
			
 
				-                # print('地区匹配：', msc)
			
 
				-                return rs
			
 
				-
			
 
				-        # print('text1:', text1)
			
 
				-        msc += '## 第一次预测输入：%s ##；'%text1
			
 
				-        rs = get_area(text1, web_source_name)
			
 
				-        msc += '预测结果：省份：%s， 城市：%s，区县：%s；' % (
			
 
				-        rs['district']['province'], rs['district']['city'], rs['district']['district'])
			
 
				-        # self.f.write('%s %s \n' % (list_articles[0].id, msc))
			
 
				-        # print('地区匹配：', msc)
			
 
				-        if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
			
 
				-            msc = ""
			
 
				-            all_addr, tenderees = get_all_addr(list_entitys)
			
 
				-            text2 = tenderees + " " + all_addr + ' ' + title
			
 
				-            msc += '使用实体列表所有招标人+所有地址；'
			
 
				-            # text2 += title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
			
 
				-            text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
			
 
				-            # print('text2:', text2)
			
 
				-            msc += '## 第二次预测输入：%s ##'%text2
			
 
				-            rs2 = get_area(text2, web_source_name, not_in_content=False)
			
 
				-            rs2['district']['is_in_text'] = True
			
 
				-            if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':
			
 
				-                rs = rs2
			
 
				-            elif rs['district']['province'] == rs2['district']['province'] and rs2['district']['city'] != '未知':
			
 
				-                rs = rs2
			
 
				-            msc += '预测结果：省份：%s， 城市：%s，区县：%s'%(
			
 
				-                rs['district']['province'],rs['district']['city'],rs['district']['district'])
			
 
				-        # self.f.write('%s %s \n'%(list_articles[0].id, msc))
			
 
				-        # print('地区匹配：', msc)
			
 
				-        return rs
			
 
				+        area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
			
 
				+        province_l, city_l, district_l = find_whole_areas(title)
			
 
				+        pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
			
 
				+        big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
			
 
				+        # print('关键词1：', province_l, city_l, district_l)
			
 
				+        # print('分数：', pro_ids, city_ids, dis_ids, prob, max_score)
			
 
				+        if pred_city == "" or prob < 0.7 or max_score<2:
			
 
				+            province_l2, city_l2, district_l2 = find_whole_areas('%s %s' % (ree, addr), weight=0.8)
			
 
				+            province_l.extend(province_l2)
			
 
				+            city_l.extend(city_l2)
			
 
				+            district_l.extend(district_l2)
			
 
				+            pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
			
 
				+            big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
			
 
				+            # print('关键词2：', province_l, city_l, district_l)
			
 
				+            # print('分数：', pro_ids, city_ids, dis_ids, prob, max_score)
			
 
				+            if pred_city == "" or prob < 0.7 or max_score<2:
			
 
				+                province_l3, city_l3, district_l3 = find_whole_areas(web_source_name, weight=0.6)
			
 
				+                province_l.extend(province_l3)
			
 
				+                city_l.extend(city_l3)
			
 
				+                district_l.extend(district_l3)
			
 
				+                pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
			
 
				+                big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
			
 
				+                # print('关键词3：', province_l, city_l, district_l)
			
 
				+                # print('分数：', pro_ids, city_ids, dis_ids, prob, max_score)
			
 
				+
			
 
				+        in_content = False
			
 
				+        if big_area != "":
			
 
				+            area_dic['area'] = big_area
			
 
				+        if pred_pro != "":
			
 
				+            area_dic['province'] = pred_pro
			
 
				+        if pred_city != "":
			
 
				+            area_dic['city'] = pred_city
			
 
				+        if pred_dis != "":
			
 
				+            area_dic['district'] = pred_dis
			
 
				+        if in_content:
			
 
				+            area_dic['is_in_text'] = True
			
 
				+        return {'district': area_dic}
			
 
				 
			
 
				     def get_area(self, text, web_name, in_content=False):
			
 
				         p_pro, p_city, p_dis, idx_dic, full_dic, short_dic = self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic
			
@@ -6651,6 +6746,8 @@ class TablePremExtractor(object):
 
				                         continue
			
 
				                     # print('表头错误，一个td匹配到两个表头：', header_dic)
			
 
				                     return flag, contain_header, dict(), not_sure_winner
			
 
				+                if text == '单位': # 20241128 补充金额单位
			
 
				+                    header_dic['amount_unit'] = (i, text)
			
 
				             if re.search('；金额(（万?元）)?；', '；'.join(td_list)):  # 召回某些表格只写 金额 作为表头，不能识别为招标或中标金额
			
 
				                 if 'tenderer' in header_dic and 'bid_amount' not in header_dic:
			
 
				                     for i in range(len(td_list)):
			
@@ -6750,6 +6847,7 @@ class TablePremExtractor(object):
 
				             win_sort = df.loc[i, headers['win_sort'][0]].strip() if "win_sort" in headers else ""
			
 
				             win_or_not = df.loc[i, headers['win_or_not'][0]].strip() if "win_or_not" in headers else ""
			
 
				             serviceTime = df.loc[i, headers['serviceTime'][0]].strip() if "serviceTime" in headers else ""
			
 
				+            amount_unit = df.loc[i, headers['amount_unit'][0]].strip() if "amount_unit" in headers else ""
			
 
				 
			
 
				             if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_]) & self.headerset != set(): # 只要有一项为表头 停止匹配
			
 
				                 # print('只要有一项为表头 停止匹配', set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset)
			
@@ -6764,7 +6862,7 @@ class TablePremExtractor(object):
 
				                 project_name = ""
			
 
				 
			
 
				             package_code = package_code_raw
			
 
				-            if re.search('合计|总计', package_code+project_code):
			
 
				+            if re.search('合计|总计', package_code+project_code+project_name):
			
 
				                 continue
			
 
				             if package_code + project_code == previous_package:  # 处理 208162730 一个包采购多种东西情况
			
 
				                 same_package = True
			
@@ -6843,7 +6941,14 @@ class TablePremExtractor(object):
 
				                     prem_dic.pop(package)
			
 
				                     break
			
 
				                 budget_header = headers['budget'][1] if 'budget' in headers else ''
			
 
				+                if amount_unit!='' and re.search('^[万亿]?元|%|折[\w/]{,6}$', amount_unit) and re.search('元', budget_+budget_header)==None : # 20241128 补充某些表格价格单位分开两列， 例：557953660
			
 
				+                    budget_ += amount_unit
			
 
				                 budget, money_unit = money_process(budget_, budget_header) if re.search('[%％‰折]|浮率|期加点\d+BP', budget_)==None else (0, '')
			
 
				+                if re.search('元[/每]', amount_unit) or re.search('单价', budget_header):
			
 
				+                    unit_tendereeMoney = budget
			
 
				+                    budget = 0
			
 
				+                else:
			
 
				+                    unit_tendereeMoney = 0
			
 
				 
			
 
				                 if (re.search('费率|下浮率|[%％‰折]|优惠率',
			
 
				                               budget_header + budget_) and budget < 100) or budget > 50000000000:  # 如果是费率或大于500亿的金额改为0
			
@@ -6854,6 +6959,13 @@ class TablePremExtractor(object):
 
				                     else:
			
 
				                         prem_dic[package]['tendereeMoney'] = budget
			
 
				                     prem_dic[package]['tendereeMoneyUnit'] = money_unit
			
 
				+                if unit_tendereeMoney > 0:
			
 
				+                    if 'unit_tendereeMoney' not in prem_dic[package]:
			
 
				+                        prem_dic[package]['unit_tendereeMoney'] = 0
			
 
				+                    if same_package and prem_dic[package]['unit_tendereeMoney'] != unit_tendereeMoney:  # 处理 类似 136839070 一包多物品多预算
			
 
				+                        prem_dic[package]['unit_tendereeMoney'] += unit_tendereeMoney
			
 
				+                    else:
			
 
				+                        prem_dic[package]['unit_tendereeMoney'] = unit_tendereeMoney
			
 
				             if tenderee and not same_package:
			
 
				                 prem_dic[package]['roleList'].append({
			
 
				                         "address": "",
			
@@ -6874,8 +6986,16 @@ class TablePremExtractor(object):
 
				                               bid_amount_)) > 5:  # 金额字段出现超过5个非金额字符，中断匹配
			
 
				                     prem_dic.pop(package)
			
 
				                     break
			
 
				-
			
 
				+                bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else ''
			
 
				+                if amount_unit != '' and re.search('^[万亿]?元|%|折[\w/]{,6}$', amount_unit) and bid_amount_!='' and re.search('元',
			
 
				+                                                                                                       bid_amount_ + bid_amount_header) == None:
			
 
				+                    bid_amount_ += amount_unit
			
 
				                 bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and re.search('[%％‰折]|浮率|期加点\d+BP', bid_amount_)==None and 'bid_amount' in headers else (0, '')
			
 
				+                if re.search('元[/每]', amount_unit) or re.search('单价', bid_amount_header):
			
 
				+                    unit_price = bid_amount
			
 
				+                    bid_amount = 0
			
 
				+                else:
			
 
				+                    unit_price = 0
			
 
				                 if web_source_name == '河钢供应链管理平台' and 'bid_amount' in headers and re.search('[%％‰折]|浮率', bid_amount_) == None and bid_amount == 0: # 有中标金额字段却金额为0的过滤掉，防止类似 河钢供应链管理平台 站源错误，金额不为0的才算中标
			
 
				                     if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的包 丢弃
			
 
				                         prem_dic.pop(package)
			
@@ -6885,7 +7005,6 @@ class TablePremExtractor(object):
 
				                         prem_dic.pop(package)
			
 
				                     continue
			
 
				 
			
 
				-                bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else ''
			
 
				                 if (re.search('费率|下浮率|[%％‰折]|优惠率',
			
 
				                               bid_amount_header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000:  # 如果是费率或大于500亿的金额改为0
			
 
				                     bid_amount = 0
			
@@ -6897,7 +7016,7 @@ class TablePremExtractor(object):
 
				                     serviceTime = extract_serviceTime(serviceTime[0]['body'],"") if serviceTime else ""
			
 
				                     # print(serviceTime)
			
 
				                 if not same_package or len(prem_dic[package]['roleList'])==0:
			
 
				-                    prem_dic[package]['roleList'].append({
			
 
				+                    role_dic = {
			
 
				                             "address": "",
			
 
				                             "linklist": [],
			
 
				                             "role_money": {
			
@@ -6910,17 +7029,20 @@ class TablePremExtractor(object):
 
				                             "role_name": "win_tenderer",
			
 
				                             "role_text": tenderer,
			
 
				                             "serviceTime": serviceTime
			
 
				-                    })
			
 
				+                    }
			
 
				+                    if unit_price > 0:
			
 
				+                        role_dic['role_money']['unit_price'] = unit_price
			
 
				+                    prem_dic[package]['roleList'].append(role_dic)
			
 
				                 elif prem_dic[package]['roleList'] and prem_dic[package]['roleList'][-1].get('role_name', '')=='win_tenderer':
			
 
				                     if 'multi_winner' not in prem_dic[package]['roleList'][-1]:
			
 
				                         prem_dic[package]['roleList'][-1]['multi_winner'] = prem_dic[package]['roleList'][-1]['role_text']
			
 
				                         prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
			
 
				                     elif tenderer not in prem_dic[package]['roleList'][-1]['multi_winner']:
			
 
				                         prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
			
 
				-                    if bid_amount != 0: # 有中标金额的才放进去
			
 
				+                    if bid_amount != 0 or unit_price > 0: # 有中标金额的才放进去
			
 
				                         if 'other_winner_dic' not in prem_dic[package]['roleList'][-1]:
			
 
				                             prem_dic[package]['roleList'][-1]['other_winner_dic'] = []
			
 
				-                        prem_dic[package]['roleList'][-1]['other_winner_dic'].append({'role_text': tenderer, "money": bid_amount, "money_unit": money_unit,"serviceTime":serviceTime})
			
 
				+                        prem_dic[package]['roleList'][-1]['other_winner_dic'].append({'role_text': tenderer, "money": bid_amount, "money_unit": money_unit, "serviceTime": serviceTime})
			
 
				                 tenderer_list.append(tenderer)
			
 
				                 serviceTime_list.append(serviceTime)
			
 
				             if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的 丢弃 并不再继续往下匹配
			
@@ -7113,6 +7235,7 @@ class CandidateExtractor(object):
 
				             flag = True
			
 
				             for i in range(len(td_list)) :
			
 
				                 text = td_list[i]
			
 
				+                text = re.sub('\s|[（(]排名不分先后[)）]', '', text)
			
 
				                 if len(text) > 15: # 长度大于15 不进行表头匹配
			
 
				                     continue
			
 
				                 if re.search('未(中标|成交)原因', text):  # 不提取此种表格
			
@@ -7134,6 +7257,8 @@ class CandidateExtractor(object):
 
				                 if num>1:
			
 
				                     # print('表头错误，一个td匹配到两个表头：', header_dic)
			
 
				                     return flag, contain_header, dict()
			
 
				+                if text == '单位': # 20241128 补充金额单位
			
 
				+                    header_dic['amount_unit'] = (i, text)
			
 
				             if ('candidate' in header_dic and 'win_sort' in header_dic) or ('win_tenderer' in header_dic and 'second_tenderer' in header_dic): # 有排名才返回表头进行提取
			
 
				                 return flag, contain_header, header_dic
			
 
				         elif len(set(fix_td_list) & self.headerset) >= 2  or (len(set(fix_td_list)) == 2 and len(set(fix_td_list) & self.headerset) >= 1):  # 如果包含两个表头以上或 只有两列且包含一个表头
			
@@ -7210,6 +7335,7 @@ class CandidateExtractor(object):
 
				             win_tenderer = df.loc[i, headers['win_tenderer'][0]].strip() if "win_tenderer" in headers else ""
			
 
				             second_tenderer = df.loc[i, headers['second_tenderer'][0]].strip() if "second_tenderer" in headers else ""
			
 
				             third_tenderer = df.loc[i, headers['third_tenderer'][0]].strip() if "third_tenderer" in headers else ""
			
 
				+            amount_unit = df.loc[i, headers['amount_unit'][0]].strip() if "amount_unit" in headers else ""
			
 
				 
			
 
				             if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头， 停止匹配 # 排除 ,win_sort 避免367940050漏提取
			
 
				                 # print('包含表头， 停止匹配')
			
@@ -7286,7 +7412,14 @@ class CandidateExtractor(object):
 
				                         if len(re.sub('[金额万元（）():：零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分￥整\s\d,.]|人民币|不?含税', '',
			
 
				                                       text)) > 5:  # 金额字段出现超过5个非金额字符，中断匹配
			
 
				                             break
			
 
				+                        if amount_unit != '' and re.search('^[万亿]?元|%|折[\w/]{,6}$', amount_unit) and re.search('元', text+header)==None: # 补充另外在一列的金额单位
			
 
				+                            text += amount_unit
			
 
				                         money, money_unit = money_process(text, header)
			
 
				+                        if re.search('元[/每]', amount_unit) or re.search('单价', header):
			
 
				+                            unit_price = money
			
 
				+                            money = 0
			
 
				+                        else:
			
 
				+                            unit_price = 0
			
 
				 
			
 
				                         if (re.search('费率|下浮率|[%％‰折]|优惠率', header+text) and money < 100) or money > 50000000000: # 如果是费率或大于500亿的金额改为0
			
 
				                             money = 0
			
@@ -7295,6 +7428,11 @@ class CandidateExtractor(object):
 
				                                 role_dic[type] = dict()
			
 
				                             role_dic[type]['money'] = money
			
 
				                             role_dic[type]['money_unit'] = money_unit
			
 
				+                        if unit_price > 0:
			
 
				+                            if type not in role_dic:
			
 
				+                                role_dic[type] = dict()
			
 
				+                            role_dic[type]['unit_price'] = unit_price
			
 
				+                            role_dic[type]['money_unit'] = money_unit
			
 
				                 else:
			
 
				                     line_num += 1
			
 
				                     if findtop3 and findmoney:
			
@@ -7322,13 +7460,21 @@ class CandidateExtractor(object):
 
				                         prem_dic[package]['name'] = project_name
			
 
				                     if len(re.sub('[金额万元（）():：零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分￥整\s\d,.]|人民币|不?含税', '', bid_amount_))> 5:  # 金额字段出现超过5个非金额字符，中断匹配
			
 
				                         break
			
 
				+                    header = headers['bid_amount'][1] if "bid_amount" in headers else ''
			
 
				+                    if amount_unit != '' and re.search('^[万亿]?元|%|折[\w/]{,6}$', amount_unit) and re.search('元',
			
 
				+                                                                                                           bid_amount_ + header) == None:  # 补充另外在一列的金额单位
			
 
				+                        bid_amount_ += amount_unit
			
 
				                     bid_amount, money_unit  = money_process(bid_amount_, headers['bid_amount'][1])  if "bid_amount" in headers else (0, "")
			
 
				+                    if re.search('元[/每]', amount_unit) or re.search('单价', header):
			
 
				+                        unit_price = bid_amount
			
 
				+                        bid_amount = 0
			
 
				+                    else:
			
 
				+                        unit_price = 0
			
 
				 
			
 
				-                    header = headers['bid_amount'][1] if "bid_amount" in headers else ''
			
 
				                     if (re.search('费率|下浮率|[%％‰折]|优惠率',
			
 
				                                   header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000:  # 如果是费率或大于500亿的金额改为0
			
 
				                         bid_amount = 0
			
 
				-                    prem_dic[package]['roleList'].append({
			
 
				+                    tmp_role_dic = {
			
 
				                             "address": "",
			
 
				                             "linklist": [],
			
 
				                             "role_money": {
			
@@ -7341,7 +7487,10 @@ class CandidateExtractor(object):
 
				                             "role_name": role_type,
			
 
				                             "role_text": candidate,
			
 
				                             "serviceTime": ""
			
 
				-                    })
			
 
				+                    }
			
 
				+                    if unit_price > 0:
			
 
				+                        tmp_role_dic['role_money']['unit_price'] = unit_price
			
 
				+                    prem_dic[package]['roleList'].append(tmp_role_dic)
			
 
				                     if len(prem_dic[package]['roleList']) == 0:  # 只有项目编号和名称的 丢弃
			
 
				                         prem_dic.pop(package)
			
 
				         if role_dic and prem_dic == dict():