6 months ago · 5115a05a10
--- a/BiddingKG/dl/common/Utils.py
+++ b/BiddingKG/dl/common/Utils.py
@@ -947,29 +947,36 @@ def money_process(money_text, header):
 
															     '''
														
 
															     money = 0
														
 
															     money_unit = ""
														
 
															-    # re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?[（(]?万?", money_text)
														
 
															-    money_text = re.sub('\s', '', money_text) # 2024/04/19 修复 457699044 556.46751 万元 金额与单位有空格造成万漏提取
														
 
															-    if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text) and re.search('\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?[（(]?万?', money_text):
														
 
															-        money_text = re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text).group(0)  # 如果表格同时包含大小写金额，取大写金额，避免单位取错 463310590 790000（柒拾玖万元整）
														
 
															-    re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?[（(]?万?", money_text)
														
 
															-    if re_price:
														
 
															-        money_re = re_price.group(0)
														
 
															-        if (re.search('万元|[（(]万[)）]',  header) or re.search('万元|[（(]万[)）]', money_text)) and '万' not in money_re:  # 修复37797825 控制价（万） # 修复 460307391 万元不在表头，在数字前面
														
 
															-            money_re += '万元'
														
 
															-        elif (re.search('亿元|[（(]亿[)）]',  header) or re.search('亿元|[（(]亿[)）]', money_text)) and '亿' not in money_re:  # 修复37797825 控制价（万） # 修复 460307391 万元不在表头，在数字前面
														
 
															-            money_re += '亿元'
														
 
															-        # money = float(getUnifyMoney(money_text))
														
 
															-        money = float(getUnifyMoney(money_re))
														
 
															-        if money > 10000000000000:  # 大于万亿的去除
														
 
															-            money = 0
														
 
															-        # money_unit = '万元' if '万' in money_re and re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text)==None else '元'
														
 
															-        if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text)==None:
														
 
															-            if '万' in money_re:
														
 
															-                money_unit = '万元'
														
 
															-            elif '亿' in money_re:
														
 
															-                money_unit = '亿元'
														
 
															-            else:
														
 
															-                money_unit = '元'
														
 
															+    moneys, _ = get_money_entity('%s：%s' % (header, money_text))
														
 
															+    if len(moneys) == 1:
														
 
															+        money = float(moneys[0][0])
														
 
															+        money_unit = moneys[0][3]
														
 
															+    elif len(moneys) == 2 and moneys[0][0]==moneys[1][0]:
														
 
															+        money = float(moneys[0][0])
														
 
															+        money_unit = moneys[0][3]
														
 
															+    # # re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?[（(]?万?", money_text)
														
 
															+    # money_text = re.sub('\s', '', money_text) # 2024/04/19 修复 457699044 556.46751 万元 金额与单位有空格造成万漏提取
														
 
															+    # if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text) and re.search('\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?[（(]?万?', money_text):
														
 
															+    #     money_text = re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text).group(0)  # 如果表格同时包含大小写金额，取大写金额，避免单位取错 463310590 790000（柒拾玖万元整）
														
 
															+    # re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?[（(]?万?", money_text)
														
 
															+    # if re_price:
														
 
															+    #     money_re = re_price.group(0)
														
 
															+    #     if (re.search('万元|[（(]万[)）]',  header) or re.search('万元|[（(]万[)）]', money_text)) and '万' not in money_re:  # 修复37797825 控制价（万） # 修复 460307391 万元不在表头，在数字前面
														
 
															+    #         money_re += '万元'
														
 
															+    #     elif (re.search('亿元|[（(]亿[)）]',  header) or re.search('亿元|[（(]亿[)）]', money_text)) and '亿' not in money_re:  # 修复37797825 控制价（万） # 修复 460307391 万元不在表头，在数字前面
														
 
															+    #         money_re += '亿元'
														
 
															+    #     # money = float(getUnifyMoney(money_text))
														
 
															+    #     money = float(getUnifyMoney(money_re))
														
 
															+    #     if money > 10000000000000:  # 大于万亿的去除
														
 
															+    #         money = 0
														
 
															+    #     # money_unit = '万元' if '万' in money_re and re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text)==None else '元'
														
 
															+    #     if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text)==None:
														
 
															+    #         if '万' in money_re:
														
 
															+    #             money_unit = '万元'
														
 
															+    #         elif '亿' in money_re:
														
 
															+    #             money_unit = '亿元'
														
 
															+    #         else:
														
 
															+    #             money_unit = '元'
														
 
															     return (money, money_unit)
														
 
															 package_number_pattern = re.compile(
														
@@ -1146,6 +1153,223 @@ def is_deposit_project(title, name, requirement):
 
															         return True
														
 
															     return False
														
 
															+def get_money_entity(sentence_text, found_yeji=0, in_attachment=False):
														
 
															+    money_list = []
														
 
															+    # 使用正则识别金额
														
 
															+    entity_type = "money"
														
 
															+    list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
														
 
															+                          "key_word": "((?P<text_key_word>(?:[￥¥]+，?|(中标|成交|合同|承租|投资|服务)）?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资)(\d：|\d=\d[-+×]\d：)?(?:[,，\[（\(]*\s*(人民币|单位：)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\]）\)]?)\s*[，,:：]*(RMB|USD|EUR|JPY|CNY)?[:：]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[：:])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d+,\d+\.\d{2,6}|\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[（\(]?(?P<filter_>[%％‰折])*\s*，?((金额)?单位[:：])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天年月日]*))\s*[）\)]?))",
														
 
															+                          "front_m": "((?P<text_front_m>(?:[（\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[）\)]?)\s*[,，:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z金额价格]{,2}?))(?P<money_front_m>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:，?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
														
 
															+                          "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:，?)[百千]*)(?P<science_behind_m>(E-?\d+))?(人民币)?[\(（]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\)）]?)"}
														
 
															+    # 2021/7/19 调整金额，单位提取正则，修复部分金额因为单位提取失败被过滤问题。  20240415 调整front_m 修复 详见合同元，合同金额：378.8万元 提取
														
 
															+
														
 
															+    pattern_money = re.compile("%s|%s|%s|%s" % (
														
 
															+    list_money_pattern["cn"], list_money_pattern["key_word"], list_money_pattern["behind_m"],
														
 
															+    list_money_pattern["front_m"]))
														
 
															+
														
 
															+    # sentence_text = re.sub('\d+[年月日]', '', sentence_text) # 修复560180018 中标价（元）：3年投标报价（元）含税6299700.00 3年作为金额
														
 
															+
														
 
															+    if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
														
 
															+        found_yeji += 1
														
 
															+    if found_yeji >= 2:  # 过滤掉业绩后面的所有金额
														
 
															+        all_match = []
														
 
															+    else:
														
 
															+        ser = re.search('((收费标准|计算[方公]?式)：|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s（）()\[\]【】\d\.%％‰\+\-*×/]{20,}[，。]?', sentence_text)  # 过滤掉收费标准里面的金额
														
 
															+        if ser:
														
 
															+            sentence_text = sentence_text.replace(ser.group(0), ' ' * len(ser.group(0)))
														
 
															+        all_match = re.finditer(pattern_money, sentence_text)
														
 
															+    # print('all_match:', all_match)
														
 
															+    for _match in all_match:
														
 
															+        # print('_match: ', _match.group())
														
 
															+        if re.search('^元/1\d{10}，$', _match.group(0)): # 修复 495042766 现场负责人 姚元 / 13488160460 预测为金额
														
 
															+            continue
														
 
															+        if len(_match.group()) > 0:
														
 
															+            # print("===",_match.group())
														
 
															+            # # print(_match.groupdict())
														
 
															+            notes = ''  # 2021/7/20 新增备注金额大写或金额单位 if 金额大写 notes=大写 elif 单位 notes=单位
														
 
															+            unit = ""
														
 
															+            entity_text = ""
														
 
															+            start_index = ""
														
 
															+            end_index = ""
														
 
															+            text_beforeMoney = ""
														
 
															+            filter = ""
														
 
															+            filter_unit = False
														
 
															+            notSure = False
														
 
															+            science = ""
														
 
															+            if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text[:_match.span()[0]]):  # 2021/7/21过滤掉业绩后面金额
														
 
															+                # print('金额在业绩后面: ', _match.group(0))
														
 
															+                found_yeji += 1
														
 
															+                break
														
 
															+            for k, v in _match.groupdict().items():
														
 
															+                if v != "" and v is not None:
														
 
															+                    if k == 'text_key_word':
														
 
															+                        notSure = True
														
 
															+                    if k.split("_")[0] == "money":
														
 
															+                        entity_text = v
														
 
															+                        # print(_match.group(k), 'entity_text: ', sentence_text[_match.start(k): _match.end(k)])
														
 
															+                        if entity_text.endswith(',00'):  # 金额逗号后面不可能为两个0结尾，应该小数点识别错，直接去掉
														
 
															+                            entity_text = entity_text[:-3]
														
 
															+                    if k.split("_")[0] == "unit":
														
 
															+                        if 'behind' in k or unit == "":  # 优先后面单位  预算金额(元)：160万元  总价（万元）：最终报价：695000.00（元）
														
 
															+                            unit = v
														
 
															+                    if k.split("_")[0] == "text":
														
 
															+                        text_beforeMoney = v
														
 
															+                    if k.split("_")[0] == "filter":
														
 
															+                        filter = v
														
 
															+                    if re.search("filter_unit", k) is not None:
														
 
															+                        filter_unit = True
														
 
															+                    if k.split("_")[0] == 'science':
														
 
															+                        science = v
														
 
															+            # print("金额：{0} ,单位：{1}, 前文：{2}, filter: {3}, filter_unit: {4}".format(entity_text,unit,text_beforeMoney,filter,filter_unit))
														
 
															+            # if re.search('(^\d{2,},\d{4,}万?$)|(^\d{2,},\d{2}万?$)', entity_text.strip()):  # 2021/7/19 修正OCR识别小数点为逗号
														
 
															+            #     if re.search('[幢栋号楼层]', sentence_text[max(0, _match.span()[0] - 2):_match.span()[0]]):
														
 
															+            #         entity_text = re.sub('\d+,', '', entity_text)
														
 
															+            #     else:
														
 
															+            #         entity_text = entity_text.replace(',', '.')
														
 
															+            #     # print(' 修正OCR识别小数点为逗号')
														
 
															+
														
 
															+            if filter != "":
														
 
															+                continue
														
 
															+            if len(entity_text)>30 or len(re.sub('[E-]', '', science))>2: # 限制数字长度，避免类似265339018附件金额错误，数值超大报错 decimal.InvalidOperation
														
 
															+                continue
														
 
															+            start_index, end_index = _match.span()
														
 
															+            start_index += len(text_beforeMoney)
														
 
															+
														
 
															+            '''过滤掉手机号码作为金额'''
														
 
															+            if re.search('电话|手机|联系|方式|编号|编码|日期|数字|时间', text_beforeMoney):
														
 
															+                # print('过滤掉手机号码作为金额')
														
 
															+                continue
														
 
															+            elif re.search('^1[3-9]\d{9}$', entity_text) and re.search('：\w{1,3}$', text_beforeMoney): # 过滤掉类似 '13863441880', '金额（万元）：季勇13863441880'
														
 
															+                # print('过滤掉手机号码作为金额')
														
 
															+                continue
														
 
															+            elif re.search('^\d(.\d{1,2})?$', entity_text) and re.search('\d$', _match.group(0)) and re.search('^[、.]', sentence_text[_match.end():]): # 170756755 控制价为：1、合理利润率上限
														
 
															+                # print('过滤错误金额：', _match.group(0))
														
 
															+                continue
														
 
															+
														
 
															+            if unit == "":  # 2021/7/21 有明显金额特征的补充单位，避免被过滤
														
 
															+                if (re.search('(￥|¥|RMB|CNY)[:：]?$', text_beforeMoney) or re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', entity_text)):
														
 
															+                    if entity_text.endswith('万元'):
														
 
															+                        unit = '万元'
														
 
															+                        entity_text = entity_text[:-2]
														
 
															+                    else:
														
 
															+                        unit = '元'
														
 
															+                    # print('1明显金额特征补充单位 元')
														
 
															+                elif re.search('USD[:：]?$', text_beforeMoney):
														
 
															+                    unit = '美元'
														
 
															+                elif re.search('EUR[:：]?$', text_beforeMoney):
														
 
															+                    unit = '欧元'
														
 
															+                elif re.search('JPY[:：]?$', text_beforeMoney):
														
 
															+                    unit = '日元'
														
 
															+                elif re.search('^[-—]+[\d,.]+万元', sentence_text[end_index:]):
														
 
															+                    # print('两个金额连接后面的有单位，用后面单位')
														
 
															+                    unit = '万元'
														
 
															+                elif re.search('^，?(价格币种：\w{2,3}，)?价格单位：万元', sentence_text[end_index:]): # 修复494731937金额单位缺漏 中标价格：39501.094425，价格币种：人民币，价格单位：万元，
														
 
															+                    unit = '万元'
														
 
															+                elif re.search('万元', sentence_text[max(0, start_index-10):start_index]): #修复511402017 价格类型：（万元）报价：13311.1582，得分：84.46，
														
 
															+                    unit = '万元'
														
 
															+                elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资|控制|拦标)）?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)(小写)?[:：为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:  # 修复
														
 
															+                    if re.search('^[\d，,.]+$', entity_text) and float(re.sub('[,，]', '', entity_text))<500 and re.search('万元', sentence_text):
														
 
															+                        unit = '万元'
														
 
															+                        # print('金额较小且句子中有万元的，补充单位为万元')
														
 
															+                    elif re.search('^\d{1,3}\.\d{4,6}$', entity_text) and re.search('0000$', entity_text) == None:
														
 
															+                        unit = '万元'
														
 
															+                    else:
														
 
															+                        unit = '元'
														
 
															+                        # print('金额前面紧接关键词的补充单位 元')
														
 
															+                elif re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7}，?)$)|(^\d{,3}(,\d{3})+，?$)', entity_text):
														
 
															+                    unit = '元'
														
 
															+                    # print('3明显金额特征补充单位 元')
														
 
															+                else:
														
 
															+                    # print('过滤掉没单位金额: ',entity_text)
														
 
															+                    continue
														
 
															+            elif unit == '万元':
														
 
															+                if end_index < len(sentence_text) and sentence_text[end_index] == '元' and re.search('\d$', entity_text):
														
 
															+                    unit = '元'
														
 
															+                elif re.search('^[5-9]\d{6,}\.\d{2}$', entity_text): # 五百亿以上的万元改为元
														
 
															+                    unit = '元'
														
 
															+            if unit.find("万") >= 0 and entity_text.find("万") >= 0:  # 2021/7/19修改为金额文本有万，不计算单位
														
 
															+                # print('修正金额及单位都有万， 金额：',entity_text, '单位:',unit)
														
 
															+                unit = "元"
														
 
															+            if re.search('.*万元万元', entity_text):  # 2021/7/19 修正两个万元
														
 
															+                # print(' 修正两个万元',entity_text)
														
 
															+                entity_text = entity_text.replace('万元万元', '万元')
														
 
															+            else:
														
 
															+                if filter_unit:
														
 
															+                    continue
														
 
															+
														
 
															+            # symbol = '-' if entity_text.startswith('-') and not entity_text.startswith('--') and re.search('\d+$', sentence_text[:begin_index_temp]) == None else ''  # 负值金额前面保留负号 ，后面这些不作为负金额 起拍价：105.29-200.46万元  预 算 --- 350000.0 2023/04/14 取消符号
														
 
															+
														
 
															+            entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", entity_text)
														
 
															+            # print('转换前金额：', entity_text, '单位:', unit, '备注:',notes, 'text_beforeMoney:',text_beforeMoney)
														
 
															+            if re.search('总投资|投资总额|总预算|总概算|(投资|招标|资金|存放|操作|融资)规模|批复概算|投资额|总规模|工程造价|总金额',
														
 
															+                         sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额  20241031工程造价作总投资
														
 
															+                # print('总投资金额: ', _match.group(0))
														
 
															+                notes = '总投资'
														
 
															+            elif re.search('投资|概算|建安费|其他费用|基本预备费',
														
 
															+                           sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/11/18 投资金额不作为招标金额
														
 
															+                notes = '投资'
														
 
															+            # elif re.search('工程造价',
														
 
															+            #                sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/12/20 工程造价不作为招标金额
														
 
															+            #     notes = '工程造价'
														
 
															+            elif (re.search('保证金', sentence_text[max(0, _match.span()[0] - 5):_match.span()[1]])
														
 
															+                  or re.search('保证金的?(缴纳)?(金额|金\?|额|\?)?[\(（]*(万?元|为?人民币|大写|调整|变更|已?修改|更改|更正)?[\)）]*[:：为]',
														
 
															+                               sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]])
														
 
															+                  or re.search('保证金由[\d.,]+.{,3}(变更|修改|更改|更正|调整?)为',
														
 
															+                               sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])):
														
 
															+                notes = '保证金'
														
 
															+                # print('保证金信息：', sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])
														
 
															+            elif re.search('成本(警戒|预警)(线|价|值)[^0-9元]{,10}',
														
 
															+                           sentence_text[max(0, _match.span()[0] - 10):_match.span()[0]]):
														
 
															+                notes = '成本警戒线'
														
 
															+            elif re.search('(监理|设计|勘察)(服务)?费(报价)?[约为：]|服务金额', sentence_text[_match.span()[0]:_match.span()[1]]):
														
 
															+                # cost_re = re.search('(监理|设计|勘察)(服务)?费', sentence_text[_match.span()[0]:_match.span()[1]])
														
 
															+                # notes = cost_re.group(1)
														
 
															+                notes = '招标或中标金额'
														
 
															+            elif re.search('单价|总金额', sentence_text[_match.span()[0]:_match.span()[1]]):
														
 
															+                notes = '单价'
														
 
															+            elif re.search('^[/每]', sentence_text[_match.end():]):
														
 
															+                # print('单价：', _match.group(0))
														
 
															+                notes = '单价'
														
 
															+            elif re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆]', entity_text) != None:
														
 
															+                notes = '大写'
														
 
															+                if entity_text[0] == "拾":  # 2021/12/16 修正大写金额省略了数字转换错误问题
														
 
															+                    entity_text = "壹" + entity_text
														
 
															+                # print("补充备注：notes = 大写")
														
 
															+            if len(unit) > 0:
														
 
															+                if unit.find('万') >= 0 and len(entity_text.split('.')[0]) >= 8:  # 2021/7/19 修正万元金额过大的情况
														
 
															+                    # print('修正单位万元金额过大的情况 金额：', entity_text, '单位:', unit)
														
 
															+                    entity_text = str(
														
 
															+                        getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]) / 10000)
														
 
															+                    unit = '元'  # 修正金额后单位 重置为元
														
 
															+                else:
														
 
															+                    # print('str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0])):')
														
 
															+                    entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]))
														
 
															+            else:
														
 
															+                if entity_text.find('万') >= 0 and entity_text.split('.')[0].isdigit() and len(
														
 
															+                        entity_text.split('.')[0]) >= 8:
														
 
															+                    entity_text = str(getUnifyMoney(entity_text) / 10000)
														
 
															+                    # print('修正金额字段含万 过大的情况')
														
 
															+                else:
														
 
															+                    entity_text = str(getUnifyMoney(entity_text))
														
 
															+            if science and re.search('^E-?\d+$', science):  # 科学计数
														
 
															+                entity_text = str(Decimal(entity_text + science)) if Decimal(entity_text + science) > 100 and Decimal(
														
 
															+                    entity_text + science) < 10000000000 else entity_text  # 结果大于100及小于100万才使用科学计算
														
 
															+
														
 
															+            if float(entity_text) > 100000000000:  # float(entity_text)<100 or  2022/3/4 取消最小金额限制
														
 
															+                # print('过滤掉金额：float(entity_text)<100 or float(entity_text)>100000000000', entity_text, unit)
														
 
															+                continue
														
 
															+
														
 
															+            if notSure and unit == "" and float(entity_text) > 100 * 10000:
														
 
															+                # print('过滤掉金额 notSure and unit=="" and float(entity_text)>100*10000：', entity_text, unit)
														
 
															+                continue
														
 
															+            # print("金额：{0} ,单位：{1}, 前文：{2}, filter: {3}, filter_unit: {4}".format(entity_text, unit, text_beforeMoney,
														
 
															+            #                                                                      filter, filter_unit))
														
 
															+            if re.search('[%％‰折]|费率|下浮率', text_beforeMoney) and float(entity_text)<1000: # 过滤掉可能是费率的金额
														
 
															+                # print('过滤掉可能是费率的金额')
														
 
															+                continue
														
 
															+            money_list.append((entity_text, start_index, end_index, unit, notes))
														
 
															+    return money_list, found_yeji
														
 
															+
														
 
															 def recall(y_true, y_pred):
														
 
															     '''
														
 
															     计算召回率
														
--- a/BiddingKG/dl/interface/Entitys.py
+++ b/BiddingKG/dl/interface/Entitys.py
@@ -300,6 +300,7 @@ class Role():
 
															         self.serviceTime = "" #2021/01/06 新增 保存服务期限(工期)
														
 
															         self.address = ""  #2022/08/08 新增 角色地址
														
 
															         self.multi_winner = multi_winner #2024/4/8 新增多中标人
														
 
															+        self.unit_price = 0 # 20241127 新增单价
														
 
															     def getString(self):
														
 
															         self.linklist = [item for item in set(self.linklist)]
														
@@ -342,6 +343,8 @@ class Role():
 
															         result = {'role_name':self.role_name,'role_text':fitDataByRule(self.entity_text),
														
 
															                   'role_money': {'money':self.money,'money_unit':self.money_unit,'floating_ratio':floating_ratio,'downward_floating_ratio':downward_floating_ratio,'discount_ratio':discount_ratio},
														
 
															                   'linklist': self.linklist,'serviceTime':self.serviceTime,'address':self.address}
														
 
															+        if self.unit_price != 0: # 单价
														
 
															+            result['role_money']['unit_price'] = self.unit_price
														
 
															         if result['role_name'] in ['tenderee', 'win_tenderer']:
														
 
															             result['role_prob'] = self.role_prob
														
 
															         if result['role_name'] == 'win_tenderer' and self.multi_winner != set():
														
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -3219,221 +3219,6 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
 
															         article.content = re.sub("##attachment_begin##|##attachment_end##", "", article.content)
														
 
															     return list_sentences,list_outlines
														
 
															-def get_money_entity(sentence_text, found_yeji, in_attachment=False):
														
 
															-    money_list = []
														
 
															-    # 使用正则识别金额
														
 
															-    entity_type = "money"
														
 
															-    list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
														
 
															-                          "key_word": "((?P<text_key_word>(?:[￥¥]+，?|(中标|成交|合同|承租|投资|服务)）?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资)(\d：|\d=\d[-+×]\d：)?(?:[,，\[（\(]*\s*(人民币|单位：)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\]）\)]?)\s*[，,:：]*(RMB|USD|EUR|JPY|CNY)?[:：]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[：:])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d+,\d+\.\d{2,6}|\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[（\(]?(?P<filter_>[%％‰折])*\s*，?((金额)?单位[:：])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[）\)]?))",
														
 
															-                          "front_m": "((?P<text_front_m>(?:[（\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[）\)]?)\s*[,，:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z金额价格]{,2}?))(?P<money_front_m>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:，?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
														
 
															-                          "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:，?)[百千]*)(?P<science_behind_m>(E-?\d+))?(人民币)?[\(（]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\)）]?)"}
														
 
															-    # 2021/7/19 调整金额，单位提取正则，修复部分金额因为单位提取失败被过滤问题。  20240415 调整front_m 修复 详见合同元，合同金额：378.8万元 提取
														
 
															-
														
 
															-    pattern_money = re.compile("%s|%s|%s|%s" % (
														
 
															-    list_money_pattern["cn"], list_money_pattern["key_word"], list_money_pattern["behind_m"],
														
 
															-    list_money_pattern["front_m"]))
														
 
															-
														
 
															-    if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
														
 
															-        found_yeji += 1
														
 
															-    if found_yeji >= 2:  # 过滤掉业绩后面的所有金额
														
 
															-        all_match = []
														
 
															-    else:
														
 
															-        ser = re.search('((收费标准|计算[方公]?式)：|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s（）()\[\]【】\d\.%％‰\+\-*×/]{20,}[，。]?', sentence_text)  # 过滤掉收费标准里面的金额
														
 
															-        if ser:
														
 
															-            sentence_text = sentence_text.replace(ser.group(0), ' ' * len(ser.group(0)))
														
 
															-        all_match = re.finditer(pattern_money, sentence_text)
														
 
															-    # print('all_match:', all_match)
														
 
															-    for _match in all_match:
														
 
															-        # print('_match: ', _match.group())
														
 
															-        if re.search('^元/1\d{10}，$', _match.group(0)): # 修复 495042766 现场负责人 姚元 / 13488160460 预测为金额
														
 
															-            continue
														
 
															-        if len(_match.group()) > 0:
														
 
															-            # print("===",_match.group())
														
 
															-            # # print(_match.groupdict())
														
 
															-            notes = ''  # 2021/7/20 新增备注金额大写或金额单位 if 金额大写 notes=大写 elif 单位 notes=单位
														
 
															-            unit = ""
														
 
															-            entity_text = ""
														
 
															-            start_index = ""
														
 
															-            end_index = ""
														
 
															-            text_beforeMoney = ""
														
 
															-            filter = ""
														
 
															-            filter_unit = False
														
 
															-            notSure = False
														
 
															-            science = ""
														
 
															-            if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text[:_match.span()[0]]):  # 2021/7/21过滤掉业绩后面金额
														
 
															-                # print('金额在业绩后面: ', _match.group(0))
														
 
															-                found_yeji += 1
														
 
															-                break
														
 
															-            for k, v in _match.groupdict().items():
														
 
															-                if v != "" and v is not None:
														
 
															-                    if k == 'text_key_word':
														
 
															-                        notSure = True
														
 
															-                    if k.split("_")[0] == "money":
														
 
															-                        entity_text = v
														
 
															-                        # print(_match.group(k), 'entity_text: ', sentence_text[_match.start(k): _match.end(k)])
														
 
															-                        if entity_text.endswith(',00'):  # 金额逗号后面不可能为两个0结尾，应该小数点识别错，直接去掉
														
 
															-                            entity_text = entity_text[:-3]
														
 
															-                    if k.split("_")[0] == "unit":
														
 
															-                        if 'behind' in k or unit == "":  # 优先后面单位  预算金额(元)：160万元  总价（万元）：最终报价：695000.00（元）
														
 
															-                            unit = v
														
 
															-                    if k.split("_")[0] == "text":
														
 
															-                        # print('text_before: ', _match.group(k))
														
 
															-                        text_beforeMoney = v
														
 
															-                    if k.split("_")[0] == "filter":
														
 
															-                        filter = v
														
 
															-                    if re.search("filter_unit", k) is not None:
														
 
															-                        filter_unit = True
														
 
															-                    if k.split("_")[0] == 'science':
														
 
															-                        science = v
														
 
															-            # print("金额：{0} ,单位：{1}, 前文：{2}, filter: {3}, filter_unit: {4}".format(entity_text,unit,text_beforeMoney,filter,filter_unit))
														
 
															-            # if re.search('(^\d{2,},\d{4,}万?$)|(^\d{2,},\d{2}万?$)', entity_text.strip()):  # 2021/7/19 修正OCR识别小数点为逗号
														
 
															-            #     if re.search('[幢栋号楼层]', sentence_text[max(0, _match.span()[0] - 2):_match.span()[0]]):
														
 
															-            #         entity_text = re.sub('\d+,', '', entity_text)
														
 
															-            #     else:
														
 
															-            #         entity_text = entity_text.replace(',', '.')
														
 
															-            #     # print(' 修正OCR识别小数点为逗号')
														
 
															-
														
 
															-            if filter != "":
														
 
															-                continue
														
 
															-            if len(entity_text)>30 or len(re.sub('[E-]', '', science))>2: # 限制数字长度，避免类似265339018附件金额错误，数值超大报错 decimal.InvalidOperation
														
 
															-                continue
														
 
															-            start_index, end_index = _match.span()
														
 
															-            start_index += len(text_beforeMoney)
														
 
															-
														
 
															-            '''过滤掉手机号码作为金额'''
														
 
															-            if re.search('电话|手机|联系|方式|编号|编码|日期|数字|时间', text_beforeMoney):
														
 
															-                # print('过滤掉手机号码作为金额')
														
 
															-                continue
														
 
															-            elif re.search('^1[3-9]\d{9}$', entity_text) and re.search('：\w{1,3}$', text_beforeMoney): # 过滤掉类似 '13863441880', '金额（万元）：季勇13863441880'
														
 
															-                # print('过滤掉手机号码作为金额')
														
 
															-                continue
														
 
															-            elif re.search('^\d(.\d{1,2})?$', entity_text) and re.search('\d$', _match.group(0)) and re.search('^[、.]', sentence_text[_match.end():]): # 170756755 控制价为：1、合理利润率上限
														
 
															-                # print('过滤错误金额：', _match.group(0))
														
 
															-                continue
														
 
															-
														
 
															-            if unit == "":  # 2021/7/21 有明显金额特征的补充单位，避免被过滤
														
 
															-                if (re.search('(￥|¥|RMB|CNY)[:：]?$', text_beforeMoney) or re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', entity_text)):
														
 
															-                    if entity_text.endswith('万元'):
														
 
															-                        unit = '万元'
														
 
															-                        entity_text = entity_text[:-2]
														
 
															-                    else:
														
 
															-                        unit = '元'
														
 
															-                    # print('1明显金额特征补充单位 元')
														
 
															-                elif re.search('USD[:：]?$', text_beforeMoney):
														
 
															-                    unit = '美元'
														
 
															-                elif re.search('EUR[:：]?$', text_beforeMoney):
														
 
															-                    unit = '欧元'
														
 
															-                elif re.search('JPY[:：]?$', text_beforeMoney):
														
 
															-                    unit = '日元'
														
 
															-                elif re.search('^[-—]+[\d,.]+万元', sentence_text[end_index:]):
														
 
															-                    # print('两个金额连接后面的有单位，用后面单位')
														
 
															-                    unit = '万元'
														
 
															-                elif re.search('^，?(价格币种：\w{2,3}，)?价格单位：万元', sentence_text[end_index:]): # 修复494731937金额单位缺漏 中标价格：39501.094425，价格币种：人民币，价格单位：万元，
														
 
															-                    unit = '万元'
														
 
															-                elif re.search('万元', sentence_text[max(0, start_index-10):start_index]): #修复511402017 价格类型：（万元）报价：13311.1582，得分：84.46，
														
 
															-                    unit = '万元'
														
 
															-                elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资|控制|拦标)）?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)(小写)?[:：为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:  # 修复
														
 
															-                    if re.search('^[\d，,.]+$', entity_text) and float(re.sub('[,，]', '', entity_text))<500 and re.search('万元', sentence_text):
														
 
															-                        unit = '万元'
														
 
															-                        # print('金额较小且句子中有万元的，补充单位为万元')
														
 
															-                    elif re.search('^\d{1,3}\.\d{4,6}$', entity_text) and re.search('0000$', entity_text) == None:
														
 
															-                        unit = '万元'
														
 
															-                    else:
														
 
															-                        unit = '元'
														
 
															-                        # print('金额前面紧接关键词的补充单位 元')
														
 
															-                elif re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7}，?)$)|(^\d{,3}(,\d{3})+，?$)', entity_text):
														
 
															-                    unit = '元'
														
 
															-                    # print('3明显金额特征补充单位 元')
														
 
															-                else:
														
 
															-                    # print('过滤掉没单位金额: ',entity_text)
														
 
															-                    continue
														
 
															-            elif unit == '万元':
														
 
															-                if end_index < len(sentence_text) and sentence_text[end_index] == '元' and re.search('\d$', entity_text):
														
 
															-                    unit = '元'
														
 
															-                elif re.search('^[5-9]\d{6,}\.\d{2}$', entity_text): # 五百亿以上的万元改为元
														
 
															-                    unit = '元'
														
 
															-            if unit.find("万") >= 0 and entity_text.find("万") >= 0:  # 2021/7/19修改为金额文本有万，不计算单位
														
 
															-                # print('修正金额及单位都有万， 金额：',entity_text, '单位:',unit)
														
 
															-                unit = "元"
														
 
															-            if re.search('.*万元万元', entity_text):  # 2021/7/19 修正两个万元
														
 
															-                # print(' 修正两个万元',entity_text)
														
 
															-                entity_text = entity_text.replace('万元万元', '万元')
														
 
															-            else:
														
 
															-                if filter_unit:
														
 
															-                    continue
														
 
															-
														
 
															-            # symbol = '-' if entity_text.startswith('-') and not entity_text.startswith('--') and re.search('\d+$', sentence_text[:begin_index_temp]) == None else ''  # 负值金额前面保留负号 ，后面这些不作为负金额 起拍价：105.29-200.46万元  预 算 --- 350000.0 2023/04/14 取消符号
														
 
															-
														
 
															-            entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", entity_text)
														
 
															-            # print('转换前金额：', entity_text, '单位:', unit, '备注:',notes, 'text_beforeMoney:',text_beforeMoney)
														
 
															-            if re.search('总投资|投资总额|总预算|总概算|(投资|招标|资金|存放|操作|融资)规模|批复概算|投资额|总规模|工程造价|总金额',
														
 
															-                         sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额  20241031工程造价作总投资
														
 
															-                # print('总投资金额: ', _match.group(0))
														
 
															-                notes = '总投资'
														
 
															-            elif re.search('投资|概算|建安费|其他费用|基本预备费',
														
 
															-                           sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/11/18 投资金额不作为招标金额
														
 
															-                notes = '投资'
														
 
															-            # elif re.search('工程造价',
														
 
															-            #                sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/12/20 工程造价不作为招标金额
														
 
															-            #     notes = '工程造价'
														
 
															-            elif (re.search('保证金', sentence_text[max(0, _match.span()[0] - 5):_match.span()[1]])
														
 
															-                  or re.search('保证金的?(缴纳)?(金额|金\?|额|\?)?[\(（]*(万?元|为?人民币|大写|调整|变更|已?修改|更改|更正)?[\)）]*[:：为]',
														
 
															-                               sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]])
														
 
															-                  or re.search('保证金由[\d.,]+.{,3}(变更|修改|更改|更正|调整?)为',
														
 
															-                               sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])):
														
 
															-                notes = '保证金'
														
 
															-                # print('保证金信息：', sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])
														
 
															-            elif re.search('成本(警戒|预警)(线|价|值)[^0-9元]{,10}',
														
 
															-                           sentence_text[max(0, _match.span()[0] - 10):_match.span()[0]]):
														
 
															-                notes = '成本警戒线'
														
 
															-            elif re.search('(监理|设计|勘察)(服务)?费(报价)?[约为：]|服务金额', sentence_text[_match.span()[0]:_match.span()[1]]):
														
 
															-                # cost_re = re.search('(监理|设计|勘察)(服务)?费', sentence_text[_match.span()[0]:_match.span()[1]])
														
 
															-                # notes = cost_re.group(1)
														
 
															-                notes = '招标或中标金额'
														
 
															-            elif re.search('单价|总金额', sentence_text[_match.span()[0]:_match.span()[1]]):
														
 
															-                notes = '单价'
														
 
															-            elif re.search('^[/每]', sentence_text[_match.end():]):
														
 
															-                # print('单价：', _match.group(0))
														
 
															-                notes = '单价'
														
 
															-            elif re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆]', entity_text) != None:
														
 
															-                notes = '大写'
														
 
															-                if entity_text[0] == "拾":  # 2021/12/16 修正大写金额省略了数字转换错误问题
														
 
															-                    entity_text = "壹" + entity_text
														
 
															-                # print("补充备注：notes = 大写")
														
 
															-            if len(unit) > 0:
														
 
															-                if unit.find('万') >= 0 and len(entity_text.split('.')[0]) >= 8:  # 2021/7/19 修正万元金额过大的情况
														
 
															-                    # print('修正单位万元金额过大的情况 金额：', entity_text, '单位:', unit)
														
 
															-                    entity_text = str(
														
 
															-                        getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]) / 10000)
														
 
															-                    unit = '元'  # 修正金额后单位 重置为元
														
 
															-                else:
														
 
															-                    # print('str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0])):')
														
 
															-                    entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]))
														
 
															-            else:
														
 
															-                if entity_text.find('万') >= 0 and entity_text.split('.')[0].isdigit() and len(
														
 
															-                        entity_text.split('.')[0]) >= 8:
														
 
															-                    entity_text = str(getUnifyMoney(entity_text) / 10000)
														
 
															-                    # print('修正金额字段含万 过大的情况')
														
 
															-                else:
														
 
															-                    entity_text = str(getUnifyMoney(entity_text))
														
 
															-            if science and re.search('^E-?\d+$', science):  # 科学计数
														
 
															-                entity_text = str(Decimal(entity_text + science)) if Decimal(entity_text + science) > 100 and Decimal(
														
 
															-                    entity_text + science) < 10000000000 else entity_text  # 结果大于100及小于100万才使用科学计算
														
 
															-
														
 
															-            if float(entity_text) > 100000000000:  # float(entity_text)<100 or  2022/3/4 取消最小金额限制
														
 
															-                # print('过滤掉金额：float(entity_text)<100 or float(entity_text)>100000000000', entity_text, unit)
														
 
															-                continue
														
 
															-
														
 
															-            if notSure and unit == "" and float(entity_text) > 100 * 10000:
														
 
															-                # print('过滤掉金额 notSure and unit=="" and float(entity_text)>100*10000：', entity_text, unit)
														
 
															-                continue
														
 
															-            # print("金额：{0} ,单位：{1}, 前文：{2}, filter: {3}, filter_unit: {4}".format(entity_text, unit, text_beforeMoney,
														
 
															-            #                                                                      filter, filter_unit))
														
 
															-            if re.search('[%％‰折]|费率|下浮率', text_beforeMoney) and float(entity_text)<1000: # 过滤掉可能是费率的金额
														
 
															-                # print('过滤掉可能是费率的金额')
														
 
															-                continue
														
 
															-            money_list.append((entity_text, start_index, end_index, unit, notes))
														
 
															-    return money_list, found_yeji
														
 
															 def cut_repeat_name(s):
														
 
															     '''
														
 
															     公司连续重复名称去重
														
@@ -4086,6 +3871,7 @@ if __name__=="__main__":
 
															     text = '是否拟中标人：是，评标排名：1，价格类型：（万元）报价：13311.1582，得分：84.46，项目负责人：邓焱文'
														
 
															     text = '，采购包1：采购包预算金额（元：1,500000.00，采购包最高限价（元：1,430600.00，'
														
 
															     text = '成交人：中坤电力有限公司，成交价格：11493,603.52元，质量：合格，项目工期：117天，'
														
 
															+    text = '3年投标报价（元）含税 6299700.00'
														
 
															     # text = '数量及单位1：65台，单价2：800，投标报价3=1×2：52000。'
														
 
															     print(get_money_entity(text, found_yeji=0))
														
 
															     # with open('D:/138786703.html', 'r', encoding='utf-8') as f:
														
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -442,7 +442,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
															     property_label = predictor.getPredictor('property_label').predict(title, product=','.join(product_list),project_name=codeName[0]['name'], prem=prem,channel_dic=channel_dic)
														
 
															     '''最终验证prem'''
														
 
															-    getAttributes.confirm_prem(prem[0]['prem'], channel_dic, deposit_project, prem[0]['total_tendereeMoney'])
														
 
															+    getAttributes.confirm_prem(prem[0]['prem'], channel_dic, deposit_project, prem[0]['total_tendereeMoney'], name=codeName[0]['name'])
														
 
															     # 提取拟在建所需字段
														
 
															     start_time = time.time()
														
@@ -455,7 +455,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
															     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
														
 
															     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
														
 
															-    version_date = {'version_date': '2024-11-25'}
														
 
															+    version_date = {'version_date': '2024-12-02'}
														
 
															     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
														
 
															     if original_docchannel == 302:
														
--- a/BiddingKG/dl/interface/getAttributes.py
+++ b/BiddingKG/dl/interface/getAttributes.py
@@ -936,13 +936,19 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
															                 #     packDict[packageName]["roleList"][i].money = money
														
 
															                 #     packDict[packageName]["roleList"][i].money_prob = money_prob
														
 
															                 if packDict[packageName]["roleList"][i].money_prob==0 :  # 2021/7/20第一次更新金额
														
 
															-                    packDict[packageName]["roleList"][i].money = money.entity_text
														
 
															+                    if money.notes == '单价':
														
 
															+                        packDict[packageName]["roleList"][i].unit_price = money.entity_text
														
 
															+                    else:
														
 
															+                        packDict[packageName]["roleList"][i].money = money.entity_text
														
 
															                     packDict[packageName]["roleList"][i].money_prob = money_prob
														
 
															                     packDict[packageName]["roleList"][i].money_unit = money.money_unit
														
 
															                 elif money_prob>packDict[packageName]["roleList"][i].money_prob+0.2 or (money.notes in ['大写'] and money.in_attachment==False): # 2021/7/20改为优先选择大写金额,
														
 
															                     # print('已连接金额概率：money_prob:',packDict[packageName]["roleList"][i].money_prob)
														
 
															                     # print('链接金额备注 ',money.notes, money.entity_text, money.values)
														
 
															-                    packDict[packageName]["roleList"][i].money = money.entity_text
														
 
															+                    if money.notes == '单价':
														
 
															+                        packDict[packageName]["roleList"][i].unit_price = money.entity_text
														
 
															+                    else:
														
 
															+                        packDict[packageName]["roleList"][i].money = money.entity_text
														
 
															                     packDict[packageName]["roleList"][i].money_prob = money_prob
														
 
															                     packDict[packageName]["roleList"][i].money_unit = money.money_unit
														
 
															                 # print('链接中的金额：{0}, 单位：{1}'.format(money.entity_text, money.money_unit))
														
@@ -2707,12 +2713,12 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
															                         PackDict[packageName]["cost_warning"] = str(Decimal(entity.entity_text))
														
 
															             elif entity.values[entity.label]>=on_value:
														
 
															-                if str(entity.label)=="1":
														
 
															+                if str(entity.label)=="1" and entity.notes != '单价':
														
 
															                     set_tenderer_money.add(float(entity.entity_text))
														
 
															                     list_tenderer_money.append(float(entity.entity_text))  # 2021/7/16 新增列表，倒序保存所有中标金额
														
 
															                     unit_list.append(entity.money_unit)
														
 
															                 # if str(entity.label)=="0":
														
 
															-                if str(entity.label)=="0" and entity.notes!='总投资':
														
 
															+                if str(entity.label)=="0" and (entity.notes!='总投资' or float(entity.entity_text)<100000000):
														
 
															                     '''
														
 
															                     if p_entity>0:
														
 
															                         p_before = list_entity[p_entity-1]
														
@@ -2731,16 +2737,119 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
															                         #     PackDict["Project"]["tendereeMoney"] = str(Decimal(entity.entity_text))
														
 
															                         # if entity.values[entity.label]>on_value:
														
 
															                         if entity.values[entity.label]>max_prob-0.005: # 选择最大概率招标金额 2024/05/23 相差0.005尽量选前面的
														
 
															-                            PackDict["Project"]["tendereeMoney"] = str(Decimal(entity.entity_text))
														
 
															+                            if entity.notes == '单价':
														
 
															+                                PackDict["Project"]["unit_tendereeMoney"] = str(Decimal(entity.entity_text))
														
 
															+                            else:
														
 
															+                                PackDict["Project"]["tendereeMoney"] = str(Decimal(entity.entity_text))
														
 
															                             PackDict["Project"]["tendereeMoneyUnit"] = entity.money_unit
														
 
															                             max_prob = entity.values[entity.label]
														
 
															                     else:
														
 
															-                        PackDict[packageName]["tendereeMoney"] = str(Decimal(entity.entity_text))
														
 
															+                        if entity.notes == '单价':
														
 
															+                            PackDict[packageName]["unit_tendereeMoney"] = str(Decimal(entity.entity_text))
														
 
															+                        else:
														
 
															+                            PackDict[packageName]["tendereeMoney"] = str(Decimal(entity.entity_text))
														
 
															                         PackDict[packageName]["tendereeMoneyUnit"] = entity.money_unit
														
 
															                         #add pointer_tendereeMoney
														
 
															                         packagePointer.pointer_tendereeMoney = entity
														
 
															         p_entity -= 1            
														
 
															-    
														
 
															+
														
 
															+    '''包名与标段号链接'''
														
 
															+    l_main = []
														
 
															+    l_attn = []
														
 
															+    pack_num_main = 0
														
 
															+    name_num_main = 0
														
 
															+    pack_num_attn = 0
														
 
															+    name_num_attn = 0
														
 
															+    for entity in list_entity:
														
 
															+        if entity.entity_type in  ['name', 'package']:
														
 
															+            if entity.in_attachment:
														
 
															+                l_attn.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end))
														
 
															+                if entity.entity_type == 'name':
														
 
															+                    name_num_attn += 1
														
 
															+                else:
														
 
															+                    pack_num_attn += 1
														
 
															+            else:
														
 
															+                l_main.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end))
														
 
															+                if entity.entity_type == 'name':
														
 
															+                    name_num_main += 1
														
 
															+                else:
														
 
															+                    pack_num_main += 1
														
 
															+    if name_num_main > 0 and pack_num_main > 0:
														
 
															+        l_main.sort(key=lambda x: [x[2],x[3]])
														
 
															+        # print('正文名称：',l_main)
														
 
															+        link_dic = {}
														
 
															+        i = 1
														
 
															+        pre_ty = l_main[0][0]
														
 
															+        while i < len(l_main):
														
 
															+            if l_main[i][0] != pre_ty:
														
 
															+                ty1, ent1, s1, b1, e1 = l_main[i-1]
														
 
															+                ty2, ent2, s2, b2, e2 = l_main[i]
														
 
															+                if ty1 == 'package':
														
 
															+                    if ent1 not in link_dic:
														
 
															+                        link_dic[ent1] = []
														
 
															+                    if s1 == s2:
														
 
															+                        dist = abs(b2 - b1)
														
 
															+                    else:
														
 
															+                        dist = len(list_sentence[s1].sentence_text) - b1
														
 
															+                        for id in range(s1+1, s2):
														
 
															+                            dist += len(list_sentence[id].sentence_text)
														
 
															+                        dist += b2
														
 
															+                    link_dic[ent1].append((s2-s1, dist, ent2))
														
 
															+                elif ty2 == 'package':
														
 
															+                    if ent2 not in link_dic:
														
 
															+                        link_dic[ent2] = []
														
 
															+                    if s1 == s2:
														
 
															+                        dist = abs(b2 - b1)
														
 
															+                    else:
														
 
															+                        dist = len(list_sentence[s1].sentence_text) - b1
														
 
															+                        for id in range(s1+1, s2):
														
 
															+                            dist += len(list_sentence[id].sentence_text)
														
 
															+                        dist += b2
														
 
															+                    link_dic[ent2].append((s2-s1, dist, ent1))
														
 
															+            pre_ty = l_main[i][0]
														
 
															+            i += 1
														
 
															+        for k, v in link_dic.items():
														
 
															+            v.sort(key=lambda x: [x[0], x[1]])
														
 
															+            # print('各包排序后项目名：', k, v)
														
 
															+            PackDict[k]["name"] = v[0][2]
														
 
															+    elif name_num_attn > 0 and pack_num_attn > 0:
														
 
															+        # print("附件名称：", l_attn)
														
 
															+        l_attn.sort(key=lambda x: [x[2],x[3]])
														
 
															+        link_dic = {}
														
 
															+        i = 1
														
 
															+        pre_ty = l_attn[0][0]
														
 
															+        while i < len(l_attn):
														
 
															+            if l_attn[i][0] != pre_ty:
														
 
															+                ty1, ent1, s1, b1, e1 = l_attn[i-1]
														
 
															+                ty2, ent2, s2, b2, e2 = l_attn[i]
														
 
															+                if ty1 == 'package':
														
 
															+                    if ent1 not in link_dic:
														
 
															+                        link_dic[ent1] = []
														
 
															+                    if s1 == s2:
														
 
															+                        dist = abs(b2 - b1)
														
 
															+                    else:
														
 
															+                        dist = len(list_sentence[s1].sentence_text) - b1
														
 
															+                        for id in range(s1+1, s2):
														
 
															+                            dist += len(list_sentence[id].sentence_text)
														
 
															+                        dist += b2
														
 
															+                    link_dic[ent1].append((s2-s1, dist, ent2))
														
 
															+                elif ty2 == 'package':
														
 
															+                    if ent2 not in link_dic:
														
 
															+                        link_dic[ent2] = []
														
 
															+                    if s1 == s2:
														
 
															+                        dist = abs(b2 - b1)
														
 
															+                    else:
														
 
															+                        dist = len(list_sentence[s1].sentence_text) - b1
														
 
															+                        for id in range(s1+1, s2):
														
 
															+                            dist += len(list_sentence[id].sentence_text)
														
 
															+                        dist += b2
														
 
															+                    link_dic[ent2].append((s2-s1, dist, ent1))
														
 
															+            pre_ty = l_attn[i][0]
														
 
															+            i += 1
														
 
															+        for k, v in link_dic.items():
														
 
															+            v.sort(key=lambda x: [x[0], x[1]])
														
 
															+            # print('各包排序后项目名：', k, v)
														
 
															+            PackDict[k]["name"] = v[0][2]
														
 
															     #删除一个机构有多个角色的数据
														
 
															     #删除重复人、概率不回传
														
@@ -2804,8 +2913,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
															             _flag_pack_money = False
														
 
															     if _flag_pack_money and len(PackageSet)==len(dict_pack_tenderer_money.keys()):
														
 
															         for k,v in dict_pack_tenderer_money.items():
														
 
															-            v[0].money = list(v[1])[0]
														
 
															-            # print('k,v in dict_pack_tenderer_money.items', k, v)
														
 
															+            if float(v[0].unit_price) < float(list(v[1])[0]): # 20241128 金额大于单价时才作链接金额
														
 
															+                v[0].money = list(v[1])[0]
														
 
															     # 2021/7/16 #增加判断中标金额是否远大于招标金额逻辑
														
 
															     for pack in PackDict.keys():
														
 
															         for i in range(len(PackDict[pack]["roleList"])):
														
@@ -4217,7 +4326,7 @@ def correct_rolemoney(prem, total_product_money, list_articles): # 2022/9/26修
 
															                         #     l[2] = total_product_money
														
 
															                         #     log('修改中标金额为所有产品总金额')
														
 
															                         # if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money']) == 0 and float(l["role_money"]['money'])<total_product_money/10:
														
 
															-                        if l["role_name"] == 'win_tenderer' and (float(l["role_money"]['money']) == 0 or float(l["role_money"]['money'])<ree_money/2): # 改为小于一半招标金额或为0时替换为合计金额
														
 
															+                        if l["role_name"] == 'win_tenderer' and (float(l["role_money"]['money']) == 0 or (float(l["role_money"]['money'])<ree_money/2 and float(l["role_money"]['money'])<total_product_money<ree_money)): # 改为小于一半招标金额或为0时替换为合计金额
														
 
															                             l["role_money"]['money'] = total_product_money
														
 
															                             # print('修改中标金额为所有产品总金额')
														
 
															                     except Exception as e:
														
@@ -4622,6 +4731,11 @@ def update_prem(old_prem, new_prem, in_attachment=False):
 
															                     del_k.append(k)
														
 
															             for k in del_k:
														
 
															                 old_prem.pop(k)
														
 
															+        if in_attachment: # 附件表格提取的，原来提取有中标人，停止替换
														
 
															+            for v in old_prem.values():
														
 
															+                for d in v['roleList']:
														
 
															+                    if d['role_name'] in ['win_tenderer', 'pre_win_tenderer']:
														
 
															+                        return 0
														
 
															         # if len(new_prem) > len(old_prem) and [k for k in new_prem if '自增' not in k] == []:  # 如果表格提取包号都为自增编号且包数大于非表格提取，不进行更新 例 244355092  281854766
														
 
															         #     return None
														
@@ -4707,7 +4821,7 @@ def update_prem(old_prem, new_prem, in_attachment=False):
 
															     # return old_prem
														
 
															-def confirm_prem(prem, channel_dic, is_deposit_project=False, total_tendereeMoney=0):
														
 
															+def confirm_prem(prem, channel_dic, is_deposit_project=False, total_tendereeMoney=0, name=""):
														
 
															     '''
														
 
															     规则检查纠正prem，如果Project包中标人在其他包中标人，去掉project包中标角色；如果有其他包中标人，去掉roleList为空的包；
														
 
															     :param prem: prem 字段字典
														
@@ -4758,6 +4872,10 @@ def confirm_prem(prem, channel_dic, is_deposit_project=False, total_tendereeMone
 
															         for k in prem:
														
 
															             if float(prem[k]['tendereeMoney'])==0:
														
 
															                 prem[k]['tendereeMoney'] = total_tendereeMoney
														
 
															+    if name != '' and len(prem)<=2: # 20241129 小于等于两个包且无包名称，取项目名称
														
 
															+        for k in prem:
														
 
															+            if prem[k].get('name', '') == '':
														
 
															+                prem[k]['name'] = name
														
 
															 def fix_single_source(prem, channel_dic, original_docchannel):
														
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -531,9 +531,12 @@ class CodeNamePredict():
 
															             if len(dict_name_freq_score) == 0:
														
 
															                 # name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[:：\s]+([^，。：；]{2,60})[，。]'
														
 
															                 name_re1 = '(项目|工程|招标|采购(条目)?|合同|标项|标的|计划|询价|询价单|询价通知书|申购单|申购)(名称|标名|标题|主题)[:：\s]+(?P<name>[^，。：；]{2,60})[，。]'
														
 
															+                name_re2 = '(合同|采购)包\d（(?P<name>[^，。：；]{2,60})）[：，。]' # 20241202 补充合同包 包名表达 558410976
														
 
															                 for sentence in list_sentence:
														
 
															                     # pad_sentence = sentence.sentence_text
														
 
															                     othername = re.search(name_re1, sentence.sentence_text)
														
 
															+                    if othername == None:
														
 
															+                        othername = re.search(name_re2, sentence.sentence_text)
														
 
															                     if othername != None:
														
 
															                         project_name = othername.group('name')
														
 
															                         if re.search('[\u4e00-\u9fa5]+', project_name) == None:  # 没有中文的项目名称去除
														
@@ -869,7 +872,7 @@ class PREMPredict():
 
															                 elif re.search('^放弃中标资格|是否中标：否|^(中标|成交)(公示|公告)', behind):
														
 
															                     values[2] = 0.5
														
 
															                     label = 5
														
 
															-                elif re.search('^，?(投标报价|(资格性审查：|符合性审查：)?(不通过|不符合))', behind) and re.search('中标|成交|中选|排名|排序|名次|第[一1]名', front)==None:
														
 
															+                elif re.search('^，?(投标报价|(资格性审查：|符合性审查：)?(不通过|不符合))', behind) and re.search('中标|成交|中选|排名|排序|名次|第[一1]', front)==None and values[2]<0.7: #20241126补充条件避免漏提 560768263 第一候选人：单位名称： 上海理想信息产业（集团）有限公司 ，投标报价：
														
 
															                     values[2] = 0.5
														
 
															                     label = 5
														
 
															                 elif re.search('(承包权人|帐户名称|债务人|推荐预审合格投标人名单)：$|确定为标的的受让方，$|[主次出]入口?，?$|确定(项目|\w{,2})成交供应商，$|，承刻单位：$|乙方接受为$|丙方：$', front):  # 234501112 民币元，序号：1，债务人： 东营市海宁工贸有限责任公司 ，债权本金： 262414286 八、中标后签约单位，合同签约单位： 241929628 1月9，承刻单位： 肃宁县超凡网络光敏印章刻印部 ，印章预留印模
														
@@ -982,8 +985,8 @@ class PREMPredict():
 
															                     values[label] = 0.49
														
 
															                 elif re.search('(含|在|包括|[大小等高低]于|达到)$|[\d.%]+[+×*-]$', front):
														
 
															                     values[label] = 0.49
														
 
															-                elif entity.notes == '单价' and float(entity.entity_text)<5000:
														
 
															-                    label = 2
														
 
															+                # elif entity.notes == '单价' and float(entity.entity_text)<5000: # 20241128 注释，单价单独存放
														
 
															+                #     label = 2
														
 
															             elif label ==0: # 错误招标金额处理
														
 
															                 if re.search('投资(金额|规模)：$', front): # 545988699 金额不大的投资金额作为备选招标金额
														
 
															                     values[label] = 0.51
														
@@ -994,8 +997,8 @@ class PREMPredict():
 
															                     values[label] = 0.49
														
 
															                 # elif re.search('(含|在|包括|[大小等高低]于|如预算金额为)$|[\d.%]+(（含）)?[+×*-]$', front):  # 2024/10/30 注销，避免漏提 预算金额：控制在26000元以内由合作银行出资 ；投资金额不低于人民币500万元
														
 
															                 #     values[label] = 0.49
														
 
															-                elif entity.notes == '单价' and float(entity.entity_text)<5000:
														
 
															-                    label = 2
														
 
															+                # elif entity.notes == '单价' and float(entity.entity_text)<5000: # 20241128 注释，单价单独存放
														
 
															+                #     label = 2
														
 
															             elif re.search('报价：预估不?含税总价[为：]$', front) and (label != 1 or values[label]<0.5):
														
 
															                 label = 1
														
 
															                 values[label] = 0.8
														
@@ -2334,12 +2337,12 @@ class RoleGrade():
 
															         self.tenderee_left_6 = "(?P<tenderee_left_6>(业主|建设|委托)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|业主|买方)"
														
 
															         self.tenderee_left_5 = "(?P<tenderee_left_5>(发布)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|买方|发布机构)"
														
 
															         self.agency_left_9 = "(?P<agency_left_9>代理)"
														
 
															-        self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得)|第[1一]名|排[名序]：1|名次：1)"
														
 
															+        self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得)|第[1一](名|候选)|排[名序]：1|名次：1)"
														
 
															         self.winTenderer_left_8 = "(?P<winTenderer_left_8>(入选供应商|供货商|乙方|最[终后]选[择取]))"  # 229435497 最后选择西平，县中原彩印有限公司，作为此项目中标供应商，
														
 
															         self.winTenderer_left_6 = "(?P<winTenderer_left_6>(入围|承[接建包修做制担租销]))"
														
 
															         self.winTenderer_right_9 = "(?P<winTenderer_right_9>^(为(中标|成交|中选)(人|单位|供应商|公司)|以\d+[\d.,]+万?元中标))"
														
 
															-        self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排[名序]：2|名次：2))"
														
 
															-        self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排[名序]：3|名次：3))"
														
 
															+        self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2](名|候选)|排[名序]：2|名次：2))"
														
 
															+        self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3](名|候选)|排[名序]：3|名次：3))"
														
 
															         self.pattern_list = [self.tenderee_left_9,self.tenderee_center_8, self.tenderee_left_8,self.tenderee_left_6,self.tenderee_left_5,self.agency_left_9,
														
 
															                              self.winTenderer_left_9,self.winTenderer_left_8, self.winTenderer_right_9, self.winTenderer_left_6, self.secondTenderer_left_9, self.thirdTenderer_left_9] # 概率要由高到低 274941849
														
 
															     def predict(self, list_sentences, list_entitys, original_docchannel, span=15, min_prob=0.7):
														
@@ -2456,8 +2459,8 @@ class RoleGrade():
 
															         for entity in low_prob_winner: # 如果低概率中标人在招标或代理列表，改为非角色
														
 
															             if entity.entity_text in all_tenderee_agency:
														
 
															                 entity.label = 5
														
 
															-            elif entity.in_attachment: # 附件低概率中标角色不要 避免：516109391 桂林银行崇左宁明支行，宁明县城中镇兴宁大道中70号，预测为中标
														
 
															-                entity.label = 5
														
 
															+            # elif entity.in_attachment: # 附件低概率中标角色不要 避免：516109391 桂林银行崇左宁明支行，宁明县城中镇兴宁大道中70号，预测为中标 20241126 注释掉，558294326 附件单个候选人漏提取
														
 
															+            #     entity.label = 5
														
 
															         if org_winner != []:
														
 
															             flag = 0
														
@@ -2499,7 +2502,7 @@ class MoneyGrade():
 
															                     if ser:
														
 
															                         groupdict = pattern.split('>')[0].replace('(?P<', '')
														
 
															                         _role, _direct, _prob = groupdict.split('_')
														
 
															-                        if re.search('单价', context[-4:]) or re.search('(最低|风险)控制价', context):# or float(entity.entity_text)<100:
														
 
															+                        if re.search('单价', context[-4:]) or re.search('(最低|风险)控制价', context) or entity.notes == '总投资':# or float(entity.entity_text)<100:
														
 
															                             _prob = 6
														
 
															                         _label = role2id.get(_role)
														
 
															                         if _label != entity.label:
														
@@ -2522,8 +2525,8 @@ class MoneyGrade():
 
															                     # _prob = min_prob - 0.1 if in_att else min_prob
														
 
															                     entity.values[entity.label] = _prob + entity.values[entity.label] / 20
														
 
															                     # print('找不到规则修改金额概率：', entity.entity_text, entity.label, entity.values)
														
 
															-            if entity.entity_type in ['money'] and entity.label in [0, 1] and 0.5<=entity.values[entity.label]<0.75 and float(entity.entity_text)<100: # 20241011 低概率小金额改为其他金额
														
 
															-                entity.label = 2
														
 
															+            # if entity.entity_type in ['money'] and entity.label in [0, 1] and 0.5<=entity.values[entity.label]<0.75 and float(entity.entity_text)<100: # 20241011 低概率小金额改为其他金额 # 20241128 小金额可能为单价，放单价存放
														
 
															+            #     entity.label = 2
														
 
															 # 时间类别
														
@@ -5765,16 +5768,233 @@ class DistrictPredictor():
 
															         with open(os.path.dirname(__file__) + "/area_variance_dic.pkl", 'rb') as f: # 20241113 地区变更新旧名称对照字典
														
 
															             self.area_variance_dic = pickle.load(f)
														
 
															-    def predict_backup(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
														
 
															-        '''
														
 
															-        先匹配 project_name+tenderee+tenderee_address， 如果缺少省或市 再匹配 title+content
														
 
															-        :param project_name:
														
 
															-        :param prem:
														
 
															-        :param title:
														
 
															-        :param list_articles:
														
 
															-        :param web_source_name:
														
 
															-        :return:
														
 
															-        '''
														
 
															+    def predict_area(self, title, ree, addr, web_source_name):
														
 
															+        p_pro, p_city, p_dis, idx_dic, full_dic, short_dic = self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic
														
 
															+
														
 
															+        def find_whole_areas(text, weight=1):
														
 
															+            '''
														
 
															+            通过正则匹配字符串返回地址
														
 
															+            :param pettern: 地址正则 广东省|广西省|...
														
 
															+            :param text: 待匹配文本
														
 
															+            :return:
														
 
															+            '''
														
 
															+            province_l, city_l, district_l = [], [], []
														
 
															+
														
 
															+            text = str(text)
														
 
															+            text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间|福田汽车|中山(大学|公园|纪念堂)|孙中山|海天水泥|阳光采购|示范县',
														
 
															+                          ' ', text)  # 544151395 赤壁市老城区燃气管道老化更新改造
														
 
															+            text = re.sub('珠海城市', '珠海', text)  # 修复 426624023 珠海城市 预测为海城市
														
 
															+            text = re.sub('怒江州', '怒江傈僳族自治州', text)  # 修复 423589589  所属地域：怒江州 识别为广西 - 崇左 - 江州
														
 
															+            text = re.sub('茂名滨海新区', '茂名市', text)
														
 
															+            text = re.sub('中山([东南西][部区环]|黄圃|南头|东凤|小榄|石岐|翠亨|南朗)', '中山市', text)
														
 
															+            text = re.sub('横州市', '横县', text)  # 例：547363890 修复广西南宁横州 不在地区表问题
														
 
															+            ser = re.search('海南(昌江|白沙|乐东|陵水|保亭|琼中)(黎族)?', text)
														
 
															+            if ser and '黎族' not in ser.group(0):
														
 
															+                text = text.replace(ser.group(0), ser.group(0) + '黎族')
														
 
															+            for k, v in self.area_variance_dic.items():  # 20241113 根据地区变更信息替换文本
														
 
															+                text = text.replace(k, v)
														
 
															+
														
 
															+            if re.search('[\u4e00-\u9fa5]', text) == None:
														
 
															+                return province_l, city_l, district_l
														
 
															+
														
 
															+            pettern = "((?P<prov>%s)(?P<city>%s)?(?P<dist>%s)?)|((?P<city1>%s)(?P<dist1>%s)?)|(?P<dist2>%s)" % (
														
 
															+                p_pro, p_city, p_dis, p_city, p_dis, p_dis)
														
 
															+
														
 
															+            for it in re.finditer(pettern, text):
														
 
															+                if it.group(0) == '站前':  # 20240314 修复类似 中铁二局新建沪苏湖铁路工程站前VI标项目 错识别为 省份：辽宁， 城市：营口，区县：站前
														
 
															+                    continue
														
 
															+                for k, v in it.groupdict().items():
														
 
															+                    if v != None:
														
 
															+                        if it.end() == it.end(k) and re.search('[省市区县州旗盟]$', v) == None and re.search(
														
 
															+                                '^([东南西北中一二三四五六七八九十大小]?(村|镇|街|路|道|社区)|酒店|宾馆|经济开发区|开发区|新区)',
														
 
															+                                # 城市不匹配为区的地址 修复 滨州北海经济开发区 北海新区 等提取为北海
														
 
															+                                text[it.end(k):]) != None:
														
 
															+                            continue
														
 
															+                        if k in ['prov']:
														
 
															+                            if v in full_dic['province']:
														
 
															+                                score = 2
														
 
															+                            else:
														
 
															+                                score = 1
														
 
															+                                if it.start(k)==0 or re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
														
 
															+                                        , text[it.end(k):]) or (it.start(k)>0 and it.end(k)<len(text) and text[it.start(k)-1]=='（' and text[it.end(k)]=='）'):
														
 
															+                                    score += 1
														
 
															+                            score += it.end(k) / len(text) / 10
														
 
															+                            province_l.append((v, score * weight))
														
 
															+                        elif k in ['city', 'city1']:
														
 
															+                            if v in full_dic['city']:
														
 
															+                                score = 2
														
 
															+                            else:
														
 
															+                                score = 1
														
 
															+                                if it.start(k)==0 or re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
														
 
															+                                        , text[it.end(k):]) or (it.start(k)>0 and it.end(k)<len(text) and text[it.start(k)-1]=='（' and text[it.end(k)]=='）'):
														
 
															+                                    score += 1
														
 
															+                            score += it.end(k) / len(text) / 10
														
 
															+                            city_l.append((v, score * weight))
														
 
															+                        elif k in ['dist', 'dist1', 'dist2']:
														
 
															+                            if v in ['东区', '西区', '城区', '郊区', '矿区']:
														
 
															+                                continue
														
 
															+                            if v in full_dic['district'] and len(v)>2:
														
 
															+                                score = 2
														
 
															+                            else:
														
 
															+                                score = 0.5
														
 
															+                                if it.start(k)==0 or re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
														
 
															+                                        , text[it.end(k):]) or (it.start(k)>0 and it.end(k)<len(text) and text[it.start(k)-1]=='（' and text[it.end(k)]=='）'):
														
 
															+                                    score += 1
														
 
															+                                    # print('县区加分：', v, text)
														
 
															+                            score += it.end(k) / len(text) / 10
														
 
															+                            if v == '昌江' and '景德镇' not in it.group(0):
														
 
															+                                district_l.append(('昌江黎族', score * weight))
														
 
															+                            else:
														
 
															+                                district_l.append((v, score * weight))
														
 
															+            return province_l, city_l, district_l
														
 
															+
														
 
															+        def merge_score(province_l, city_l, district_l, filter_short_dist=True):
														
 
															+            '''
														
 
															+            合并分数，下级地区分数加到上级
														
 
															+            :param province_l: 提取到的省份列表 [(name, score)]
														
 
															+            :param city_l: 提取到的城市列表 [(name, score)]
														
 
															+            :param district_l: 提取到的区县列表 [(name, score)]
														
 
															+            :param filter_short_dist: 是否过滤不在省份下的区县简称权重
														
 
															+            :return:
														
 
															+            '''
														
 
															+            pro_ids = dict()
														
 
															+            city_ids = dict()
														
 
															+            dis_ids = dict()
														
 
															+            for pro in province_l:
														
 
															+                name, score = pro
														
 
															+                idx = full_dic['province'][name] if name in full_dic['province'] else short_dic['province'][name]
														
 
															+                if idx not in pro_ids:
														
 
															+                    pro_ids[idx] = 0
														
 
															+                pro_ids[idx] += score
														
 
															+
														
 
															+            tmp_pro = {}
														
 
															+            for city in city_l:
														
 
															+                name, score = city
														
 
															+                if name in full_dic['city']:
														
 
															+                    for idx in full_dic['city'][name]:
														
 
															+                        if idx not in city_ids:
														
 
															+                            city_ids[idx] = 0
														
 
															+                        city_ids[idx] += score
														
 
															+                        pro_idx = idx_dic[idx]['省']
														
 
															+                        if pro_idx in tmp_pro:
														
 
															+                            tmp_pro[pro_idx] += score
														
 
															+                        else:
														
 
															+                            tmp_pro[pro_idx] = score
														
 
															+                elif name in short_dic['city']:
														
 
															+                    for idx in short_dic['city'][name]:
														
 
															+                        if idx not in city_ids:
														
 
															+                            city_ids[idx] = 0
														
 
															+                        city_ids[idx] += score
														
 
															+                        pro_idx = idx_dic[idx]['省']
														
 
															+                        if pro_idx in tmp_pro:
														
 
															+                            tmp_pro[pro_idx] += score
														
 
															+                        else:
														
 
															+                            tmp_pro[pro_idx] = score
														
 
															+            if set(tmp_pro) & set(pro_ids) != set():
														
 
															+                for k, v in tmp_pro.items():
														
 
															+                    if k in pro_ids:
														
 
															+                        pro_ids[k] += v
														
 
															+            else:
														
 
															+                pro_ids.update(tmp_pro)
														
 
															+            tmp_pro = {}
														
 
															+            tmp_city = {}
														
 
															+            for dis in district_l:
														
 
															+                name, score = dis
														
 
															+                if name in full_dic['district']:
														
 
															+                    for idx in full_dic['district'][name]:
														
 
															+                        if idx not in dis_ids:
														
 
															+                            dis_ids[idx] = 0
														
 
															+                        dis_ids[idx] += score
														
 
															+                        pro_idx = idx_dic[idx]['省']
														
 
															+                        if pro_idx in tmp_pro:
														
 
															+                            tmp_pro[pro_idx] += score
														
 
															+                        else:
														
 
															+                            tmp_pro[pro_idx] = score
														
 
															+                        city_idx = idx_dic[idx]['市']
														
 
															+                        if city_idx in tmp_city:
														
 
															+                            tmp_city[city_idx] += score
														
 
															+                        else:
														
 
															+                            tmp_city[city_idx] = score
														
 
															+                elif name in short_dic['district']:
														
 
															+                    for idx in short_dic['district'][name]:
														
 
															+                        if idx not in dis_ids:
														
 
															+                            dis_ids[idx] = 0
														
 
															+                        dis_ids[idx] += score
														
 
															+                        pro_idx = idx_dic[idx]['省']
														
 
															+                        if filter_short_dist and pro_idx not in pro_ids:
														
 
															+                            continue
														
 
															+                        if pro_idx in tmp_pro:
														
 
															+                            tmp_pro[pro_idx] += score
														
 
															+                        else:
														
 
															+                            tmp_pro[pro_idx] = score
														
 
															+                        city_idx = idx_dic[idx]['市']
														
 
															+                        if city_idx in tmp_city:
														
 
															+                            tmp_city[city_idx] += score
														
 
															+                        else:
														
 
															+                            tmp_city[city_idx] = score
														
 
															+            if set(tmp_pro) & set(pro_ids) != set():
														
 
															+                for k, v in tmp_pro.items():
														
 
															+                    if k in pro_ids:
														
 
															+                        pro_ids[k] += v
														
 
															+            else:
														
 
															+                pro_ids.update(tmp_pro)
														
 
															+            if set(tmp_city) & set(city_ids) != set():
														
 
															+                for k, v in tmp_city.items():
														
 
															+                    if k in city_ids:
														
 
															+                        city_ids[k] += v
														
 
															+            else:
														
 
															+                city_ids.update(tmp_city)
														
 
															+            return pro_ids, city_ids, dis_ids
														
 
															+
														
 
															+        def get_final_addr(pro_ids, city_ids, dis_ids):
														
 
															+            '''
														
 
															+            先把所有匹配的全称、简称转为id,如果省份不为空，城市不为空且有城市属于省份的取该城市
														
 
															+            :param province_l: 匹配到的所有省份
														
 
															+            :param city_l: 匹配到的所有城市
														
 
															+            :param district_l: 匹配到的所有区县
														
 
															+            :return:
														
 
															+            '''
														
 
															+            big_area = ""
														
 
															+            pred_pro = ""
														
 
															+            pred_city = ""
														
 
															+            pred_dis = ""
														
 
															+
														
 
															+            final_pro = ""
														
 
															+            final_city = ""
														
 
															+            prob = 0
														
 
															+            max_score = 0
														
 
															+            if len(pro_ids) >= 1:
														
 
															+                pro_l = sorted([(k, v) for k, v in pro_ids.items()], key=lambda x: x[1], reverse=True)
														
 
															+                scores = [it[1] for it in pro_l]
														
 
															+                prob = max(scores)/sum(scores)
														
 
															+                max_score = max(scores)
														
 
															+                final_pro, score = pro_l[0]
														
 
															+                if score >= 0.01:
														
 
															+                    pred_pro = idx_dic[final_pro]['返回名称']
														
 
															+                    big_area = idx_dic[final_pro]['大区']
														
 
															+            if pred_pro != "" and len(city_ids) >= 1:
														
 
															+                city_l = sorted([(k, v) for k, v in city_ids.items()], key=lambda x: x[1], reverse=True)
														
 
															+                for it in city_l:
														
 
															+                    if idx_dic[it[0]]['省'] == final_pro:
														
 
															+                        final_city = it[0]
														
 
															+                        pred_city = idx_dic[final_city]['返回名称']
														
 
															+                        break
														
 
															+            if final_city != "" and len(set(dis_ids)) >= 1:
														
 
															+                dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
														
 
															+                for it in dis_l:
														
 
															+                    if idx_dic[it[0]]['市'] == final_city:
														
 
															+                        pred_dis = idx_dic[it[0]]['返回名称']
														
 
															+            elif pred_pro != "" and pred_city == "" and len(set(dis_ids)) >= 1:  # 20241111 省份不为空，市为空，如果区县在省份下，补充对应的市县
														
 
															+                dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
														
 
															+                for it in dis_l:
														
 
															+                    if idx_dic[it[0]]['省'] == final_pro:
														
 
															+                        pred_city = idx_dic[idx_dic[it[0]]['市']]['返回名称']
														
 
															+                        pred_dis = idx_dic[it[0]]['返回名称']
														
 
															+            if pred_city in ['北京', '天津', '上海', '重庆']:
														
 
															+                pred_city = pred_dis
														
 
															+                pred_dis = ""
														
 
															+            return big_area, pred_pro, pred_city, pred_dis, prob, max_score
														
 
															+
														
 
															         def get_ree_addr(prem):
														
 
															             tenderee = ""
														
 
															             tenderee_address = ""
														
@@ -5787,92 +6007,6 @@ class DistrictPredictor():
 
															             except Exception as e:
														
 
															                 print('解析prem 获取招标人、及地址出错')
														
 
															             return tenderee, tenderee_address
														
 
															-        def get_area(text, web_source_name, not_in_content=True):
														
 
															-            score_l = []
														
 
															-            id_set = set()
														
 
															-
														
 
															-            if re.search(self.short_name, text):
														
 
															-                for it in re.finditer(self.full_name, text):
														
 
															-                    name = it.group(0)
														
 
															-                    score = len(name) / len(text)
														
 
															-                    for _id in self.full2id[name]:
														
 
															-                        area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
														
 
															-                        # score_l.append([_id, score] + area)
														
 
															-                        # w = self.dist_dic[_id]['权重']
														
 
															-                        score_l.append([_id, score + 1] + area) # 匹配全称的加1 ，不加权重，因为权重某些赋值不好
														
 
															-
														
 
															-                flag = 0
														
 
															-                for it in re.finditer(self.short_name, text):
														
 
															-                    if it.end() < len(text) and re.search('^(村|镇|街|路|江|河|湖|北路|南路|东路|大道|社区)', text[it.end():]) == None:
														
 
															-                        name = it.group(0)
														
 
															-                        score = (it.start() + len(name)) / len(text)
														
 
															-                        for _id in self.short2id[name]:
														
 
															-                            score2 = 0
														
 
															-                            w = self.dist_dic[_id]['权重']
														
 
															-                            _type = self.dist_dic[_id]['类型']
														
 
															-                            area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
														
 
															-                            if area[0] in ['2', '16', '20', '30']:
														
 
															-                                _type += 10
														
 
															-                            if w < 1 and it.end() < len(text) and text[it.end()] in ['省', '市', '县']: # 如果简称后面 有省市县权重改为1
														
 
															-                                w = 1
														
 
															-                            score2 += w
														
 
															-                            if _id not in id_set:
														
 
															-                                if _type == 20:
														
 
															-                                    type_w = 3
														
 
															-                                elif _type == 30:
														
 
															-                                    if it.start()>3 and text[it.start()-1] == '市': # 城市后面 简称不能作为市
														
 
															-                                        type_w = 0
														
 
															-                                    else:
														
 
															-                                        type_w = 2
														
 
															-                                else:
														
 
															-                                    if it.end()<len(text) and text[it.end()] == '市': # 简称后面 有市字 改为市级
														
 
															-                                        type_w = 2
														
 
															-                                    else:
														
 
															-                                        type_w = 0.5
														
 
															-                                id_set.add(_id)
														
 
															-                                score2 += w * type_w
														
 
															-                            score_l.append([_id, score * w + score2] + area)
														
 
															-
														
 
															-                if flag == 1:
														
 
															-                    pass
														
 
															-                #         print('score', score)
														
 
															-            if re.search('公司', web_source_name) == None:
														
 
															-                for it in re.finditer(self.short_name, web_source_name):
														
 
															-                    name = it.group(0)
														
 
															-                    for _id in self.short2id[name]:
														
 
															-                        area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
														
 
															-                        w = self.dist_dic[_id]['权重']
														
 
															-                        score = w * 0.2
														
 
															-                        score_l.append([_id, score] + area)
														
 
															-            area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
														
 
															-            if len(score_l) == 0:
														
 
															-                return {'district': area_dic}
														
 
															-            else:
														
 
															-                df = pd.DataFrame(score_l, columns=['id', 'score', 'province', 'city', 'district'])
														
 
															-                df['简称'] = df['id'].apply(lambda x: self.dist_dic[x]['地区'])
														
 
															-                # print('地区评分：')
														
 
															-                # print(df)
														
 
															-                df_pro = df.groupby('province').sum().sort_values(by=['score'], ascending=False)
														
 
															-                pro_id = df_pro.index[0]
														
 
															-                if df_pro.loc[pro_id, 'score'] < 0.1 and not_in_content:  # 不是二次全文匹配的 省级评分小于0.1的不要
														
 
															-                    # print('评分低于0.1', df_pro.loc[pro_id, 'score'], self.dist_dic[pro_id]['地区'])
														
 
															-                    return {'district': area_dic}
														
 
															-                area_dic['province'] = self.dist_dic[pro_id]['地区']
														
 
															-                area_dic['area'] = self.dist_dic[pro_id]['大区']
														
 
															-                df = df[df['city'] != ""]
														
 
															-                df = df[df['province'] == pro_id]
														
 
															-                if len(df) > 0:
														
 
															-                    df_city = df.groupby('city').sum().sort_values(by=['score'], ascending=False)
														
 
															-                    city_id = df_city.index[0]
														
 
															-                    area_dic['city'] = self.dist_dic[city_id]['地区']
														
 
															-                    df = df[df['district'] != ""]
														
 
															-                    df = df[df['city'] == city_id]
														
 
															-                    if len(df) > 0:
														
 
															-                        df_dist = df.groupby('district').sum().sort_values(by=['score'], ascending=False)
														
 
															-                        dist_id = df_dist.index[0]
														
 
															-                        area_dic['district'] = self.dist_dic[dist_id]['地区']
														
 
															-                # print(area_dic)
														
 
															-                return {'district': area_dic}
														
 
															         def get_role_address(text):
														
 
															             '''正则匹配获取招标人地址
														
@@ -5892,14 +6026,17 @@ class DistrictPredictor():
 
															                 return ''
														
 
															         def get_project_addr(text):
														
 
															+            '''
															+            Extract a project address from text with two regex rules.
															+
															+            p1 is tried first and p2 only when p1 does not match.
															+            :param text: text to search
															+            :return: the `addr` named group of the first rule that matches, or '' when neither matches
															+            '''
															-            p1 = '(项目(施工|实施)?|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
														
 
															+            p1 = '(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)(位于)?：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+([\w（）]{,20}[，。])?|\w{2,15}[，。])'
														
 
															+            p2 = '项目位于(?P<addr>\w{2}市\w{2,4}区)'
														
 
															             if re.search(p1, text):
														
 
															                 return re.search(p1, text).group('addr')
														
 
															+            elif re.search(p2, text):
														
 
															+                return re.search(p2, text).group('addr')
														
 
															             else:
														
 
															                 return ''
														
 
															         def get_bid_addr(text):
														
 
															-            p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?)：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
														
 
															+            p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售|所属)(地址|地点|所在地区?|地域)：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
														
 
															             if re.search(p2, text):
														
 
															                 return re.search(p2, text).group('addr')
														
 
															             else:
														
@@ -5909,7 +6046,7 @@ class DistrictPredictor():
 
															             tenderee_l = []
														
 
															             addr_l = []
														
 
															             for ent in list_entitys[0]:
														
 
															-                if ent.entity_type == 'location' and len(ent.entity_text)>2:
														
 
															+                if ent.entity_type == 'location' and len(ent.entity_text) > 2:
														
 
															                     addr_l.append(ent.entity_text)
														
 
															                 elif ent.entity_type in ['org', 'company']:
														
 
															                     if ent.label in [0, 1]:  # 加招标或代理
														
@@ -5923,85 +6060,43 @@ class DistrictPredictor():
 
															             else:
														
 
															                 return ''
														
 
															-        if '##attachment##' in list_articles[0].content:
														
 
															-            content, attachment = list_articles[0].content.split('##attachment##')
														
 
															-            if len(content) < 200:
														
 
															-                content += attachment
														
 
															-        else:
														
 
															-            content = list_articles[0].content
														
 
															-
														
 
															-        tenderee, tenderee_address = get_ree_addr(prem)
														
 
															-        msc = ""
														
 
															-        pro_addr = get_project_addr(content)
														
 
															-        if pro_addr != "":
														
 
															-            msc += '使用规则提取的项目地址；'
														
 
															-            tenderee_address = pro_addr
														
 
															-        else:
														
 
															-            role_addr = get_role_address(content)
														
 
															-            if role_addr != "":
														
 
															-                msc += '使用规则提取的联系人地址；'
														
 
															-                tenderee_address = role_addr
														
 
															-
														
 
															-        if tenderee_address == "":
														
 
															-            title_addr = get_title_addr(title)
														
 
															-            if title_addr != "":
														
 
															-                msc += '使用规则提取的标题地址；'
														
 
															-                tenderee_address = title_addr
														
 
															-            else:
														
 
															-                bid_addr = get_bid_addr(content)
														
 
															-                if bid_addr != "":
														
 
															-                    msc += '使用规则提取的开标地址；'
														
 
															-                    tenderee_address = bid_addr
														
 
															-
														
 
															-        project_name = str(project_name)
														
 
															-        tenderee = str(tenderee)
														
 
															-
														
 
															-        # print('招标人地址',role_addr, tenderee_address)
														
 
															-
														
 
															-        project_name = project_name + title if project_name not in title else project_name
														
 
															-        project_name = project_name.replace(tenderee, '')
														
 
															-
														
 
															-        text1 = "{0} {1} {2}".format(project_name, tenderee, tenderee_address)
														
 
															-
														
 
															-        web_source_name = str(web_source_name)  # 修复某些不是字符串类型造成报错
														
 
															-        text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1)  #预防提取错 合肥 路南 新会 等地区
														
 
															-
														
 
															-        if pro_addr:
														
 
															-            msc += '## 使用项目地址输入：%s ##；' % pro_addr
														
 
															-            rs = get_area(pro_addr, '')
														
 
															-            msc += '预测结果：省份：%s， 城市：%s，区县：%s；' % (
														
 
															-                rs['district']['province'], rs['district']['city'], rs['district']['district'])
														
 
															-            if rs['district']['province'] != '全国':
														
 
															-                # print('地区匹配：', msc)
														
 
															-                return rs
														
 
															-
														
 
															-        # print('text1:', text1)
														
 
															-        msc += '## 第一次预测输入：%s ##；'%text1
														
 
															-        rs = get_area(text1, web_source_name)
														
 
															-        msc += '预测结果：省份：%s， 城市：%s，区县：%s；' % (
														
 
															-        rs['district']['province'], rs['district']['city'], rs['district']['district'])
														
 
															-        # self.f.write('%s %s \n' % (list_articles[0].id, msc))
														
 
															-        # print('地区匹配：', msc)
														
 
															-        if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
														
 
															-            msc = ""
														
 
															-            all_addr, tenderees = get_all_addr(list_entitys)
														
 
															-            text2 = tenderees + " " + all_addr + ' ' + title
														
 
															-            msc += '使用实体列表所有招标人+所有地址；'
														
 
															-            # text2 += title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
														
 
															-            text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
														
 
															-            # print('text2:', text2)
														
 
															-            msc += '## 第二次预测输入：%s ##'%text2
														
 
															-            rs2 = get_area(text2, web_source_name, not_in_content=False)
														
 
															-            rs2['district']['is_in_text'] = True
														
 
															-            if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':
														
 
															-                rs = rs2
														
 
															-            elif rs['district']['province'] == rs2['district']['province'] and rs2['district']['city'] != '未知':
														
 
															-                rs = rs2
														
 
															-            msc += '预测结果：省份：%s， 城市：%s，区县：%s'%(
														
 
															-                rs['district']['province'],rs['district']['city'],rs['district']['district'])
														
 
															-        # self.f.write('%s %s \n'%(list_articles[0].id, msc))
														
 
															-        # print('地区匹配：', msc)
														
 
															-        return rs
														
 
															+        area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
														
 
															+        province_l, city_l, district_l = find_whole_areas(title)
														
 
															+        pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
														
 
															+        big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
														
 
															+        # print('关键词1：', province_l, city_l, district_l)
														
 
															+        # print('分数：', pro_ids, city_ids, dis_ids, prob, max_score)
														
 
															+        if pred_city == "" or prob < 0.7 or max_score<2:
														
 
															+            province_l2, city_l2, district_l2 = find_whole_areas('%s %s' % (ree, addr), weight=0.8)
														
 
															+            province_l.extend(province_l2)
														
 
															+            city_l.extend(city_l2)
														
 
															+            district_l.extend(district_l2)
														
 
															+            pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
														
 
															+            big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
														
 
															+            # print('关键词2：', province_l, city_l, district_l)
														
 
															+            # print('分数：', pro_ids, city_ids, dis_ids, prob, max_score)
														
 
															+            if pred_city == "" or prob < 0.7 or max_score<2:
														
 
															+                province_l3, city_l3, district_l3 = find_whole_areas(web_source_name, weight=0.6)
														
 
															+                province_l.extend(province_l3)
														
 
															+                city_l.extend(city_l3)
														
 
															+                district_l.extend(district_l3)
														
 
															+                pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
														
 
															+                big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
														
 
															+                # print('关键词3：', province_l, city_l, district_l)
														
 
															+                # print('分数：', pro_ids, city_ids, dis_ids, prob, max_score)
														
 
															+
														
 
															+        in_content = False
														
 
															+        if big_area != "":
														
 
															+            area_dic['area'] = big_area
														
 
															+        if pred_pro != "":
														
 
															+            area_dic['province'] = pred_pro
														
 
															+        if pred_city != "":
														
 
															+            area_dic['city'] = pred_city
														
 
															+        if pred_dis != "":
														
 
															+            area_dic['district'] = pred_dis
														
 
															+        if in_content:
														
 
															+            area_dic['is_in_text'] = True
														
 
															+        return {'district': area_dic}
														
 
															     def get_area(self, text, web_name, in_content=False):
														
 
															         p_pro, p_city, p_dis, idx_dic, full_dic, short_dic = self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic
														
@@ -6651,6 +6746,8 @@ class TablePremExtractor(object):
 
															                         continue
														
 
															                     # print('表头错误，一个td匹配到两个表头：', header_dic)
														
 
															                     return flag, contain_header, dict(), not_sure_winner
														
 
															+                if text == '单位': # 20241128 补充金额单位
														
 
															+                    header_dic['amount_unit'] = (i, text)
														
 
															             if re.search('；金额(（万?元）)?；', '；'.join(td_list)):  # 召回某些表格只写 金额 作为表头，不能识别为招标或中标金额
														
 
															                 if 'tenderer' in header_dic and 'bid_amount' not in header_dic:
														
 
															                     for i in range(len(td_list)):
														
@@ -6750,6 +6847,7 @@ class TablePremExtractor(object):
 
															             win_sort = df.loc[i, headers['win_sort'][0]].strip() if "win_sort" in headers else ""
														
 
															             win_or_not = df.loc[i, headers['win_or_not'][0]].strip() if "win_or_not" in headers else ""
														
 
															             serviceTime = df.loc[i, headers['serviceTime'][0]].strip() if "serviceTime" in headers else ""
														
 
															+            amount_unit = df.loc[i, headers['amount_unit'][0]].strip() if "amount_unit" in headers else ""
														
 
															             if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_]) & self.headerset != set(): # 只要有一项为表头 停止匹配
														
 
															                 # print('只要有一项为表头 停止匹配', set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset)
														
@@ -6764,7 +6862,7 @@ class TablePremExtractor(object):
 
															                 project_name = ""
														
 
															             package_code = package_code_raw
														
 
															-            if re.search('合计|总计', package_code+project_code):
														
 
															+            if re.search('合计|总计', package_code+project_code+project_name):
														
 
															                 continue
														
 
															             if package_code + project_code == previous_package:  # 处理 208162730 一个包采购多种东西情况
														
 
															                 same_package = True
														
@@ -6843,7 +6941,14 @@ class TablePremExtractor(object):
 
															                     prem_dic.pop(package)
														
 
															                     break
														
 
															                 budget_header = headers['budget'][1] if 'budget' in headers else ''
														
 
															+                if amount_unit!='' and re.search('^[万亿]?元|%|折[\w/]{,6}$', amount_unit) and re.search('元', budget_+budget_header)==None : # 20241128 补充某些表格价格单位分开两列， 例：557953660
														
 
															+                    budget_ += amount_unit
														
 
															                 budget, money_unit = money_process(budget_, budget_header) if re.search('[%％‰折]|浮率|期加点\d+BP', budget_)==None else (0, '')
														
 
															+                if re.search('元[/每]', amount_unit) or re.search('单价', budget_header):
														
 
															+                    unit_tendereeMoney = budget
														
 
															+                    budget = 0
														
 
															+                else:
														
 
															+                    unit_tendereeMoney = 0
														
 
															                 if (re.search('费率|下浮率|[%％‰折]|优惠率',
														
 
															                               budget_header + budget_) and budget < 100) or budget > 50000000000:  # 如果是费率或大于500亿的金额改为0
														
@@ -6854,6 +6959,13 @@ class TablePremExtractor(object):
 
															                     else:
														
 
															                         prem_dic[package]['tendereeMoney'] = budget
														
 
															                     prem_dic[package]['tendereeMoneyUnit'] = money_unit
														
 
															+                if unit_tendereeMoney > 0:
														
 
															+                    if 'unit_tendereeMoney' not in prem_dic[package]:
														
 
															+                        prem_dic[package]['unit_tendereeMoney'] = 0
														
 
															+                    if same_package and prem_dic[package]['unit_tendereeMoney'] != unit_tendereeMoney:  # 处理 类似 136839070 一包多物品多预算
														
 
															+                        prem_dic[package]['unit_tendereeMoney'] += unit_tendereeMoney
														
 
															+                    else:
														
 
															+                        prem_dic[package]['unit_tendereeMoney'] = unit_tendereeMoney
														
 
															             if tenderee and not same_package:
														
 
															                 prem_dic[package]['roleList'].append({
														
 
															                         "address": "",
														
@@ -6874,8 +6986,16 @@ class TablePremExtractor(object):
 
															                               bid_amount_)) > 5:  # 金额字段出现超过5个非金额字符，中断匹配
														
 
															                     prem_dic.pop(package)
														
 
															                     break
														
 
															-
														
 
															+                bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else ''
														
 
															+                if amount_unit != '' and re.search('^[万亿]?元|%|折[\w/]{,6}$', amount_unit) and bid_amount_!='' and re.search('元',
														
 
															+                                                                                                       bid_amount_ + bid_amount_header) == None:
														
 
															+                    bid_amount_ += amount_unit
														
 
															                 bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and re.search('[%％‰折]|浮率|期加点\d+BP', bid_amount_)==None and 'bid_amount' in headers else (0, '')
														
 
															+                if re.search('元[/每]', amount_unit) or re.search('单价', bid_amount_header):
														
 
															+                    unit_price = bid_amount
														
 
															+                    bid_amount = 0
														
 
															+                else:
														
 
															+                    unit_price = 0
														
 
															                 if web_source_name == '河钢供应链管理平台' and 'bid_amount' in headers and re.search('[%％‰折]|浮率', bid_amount_) == None and bid_amount == 0: # 有中标金额字段却金额为0的过滤掉，防止类似 河钢供应链管理平台 站源错误，金额不为0的才算中标
														
 
															                     if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的包 丢弃
														
 
															                         prem_dic.pop(package)
														
@@ -6885,7 +7005,6 @@ class TablePremExtractor(object):
 
															                         prem_dic.pop(package)
														
 
															                     continue
														
 
															-                bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else ''
														
 
															                 if (re.search('费率|下浮率|[%％‰折]|优惠率',
														
 
															                               bid_amount_header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000:  # 如果是费率或大于500亿的金额改为0
														
 
															                     bid_amount = 0
														
@@ -6897,7 +7016,7 @@ class TablePremExtractor(object):
 
															                     serviceTime = extract_serviceTime(serviceTime[0]['body'],"") if serviceTime else ""
														
 
															                     # print(serviceTime)
														
 
															                 if not same_package or len(prem_dic[package]['roleList'])==0:
														
 
															-                    prem_dic[package]['roleList'].append({
														
 
															+                    role_dic = {
														
 
															                             "address": "",
														
 
															                             "linklist": [],
														
 
															                             "role_money": {
														
@@ -6910,17 +7029,20 @@ class TablePremExtractor(object):
 
															                             "role_name": "win_tenderer",
														
 
															                             "role_text": tenderer,
														
 
															                             "serviceTime": serviceTime
														
 
															-                    })
														
 
															+                    }
														
 
															+                    if unit_price > 0:
														
 
															+                        role_dic['role_money']['unit_price'] = unit_price
														
 
															+                    prem_dic[package]['roleList'].append(role_dic)
														
 
															                 elif prem_dic[package]['roleList'] and prem_dic[package]['roleList'][-1].get('role_name', '')=='win_tenderer':
														
 
															                     if 'multi_winner' not in prem_dic[package]['roleList'][-1]:
														
 
															                         prem_dic[package]['roleList'][-1]['multi_winner'] = prem_dic[package]['roleList'][-1]['role_text']
														
 
															                         prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
														
 
															                     elif tenderer not in prem_dic[package]['roleList'][-1]['multi_winner']:
														
 
															                         prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
														
 
															-                    if bid_amount != 0: # 有中标金额的才放进去
														
 
															+                    if bid_amount != 0 or unit_price > 0: # 有中标金额的才放进去
														
 
															                         if 'other_winner_dic' not in prem_dic[package]['roleList'][-1]:
														
 
															                             prem_dic[package]['roleList'][-1]['other_winner_dic'] = []
														
 
															-                        prem_dic[package]['roleList'][-1]['other_winner_dic'].append({'role_text': tenderer, "money": bid_amount, "money_unit": money_unit,"serviceTime":serviceTime})
														
 
															+                        prem_dic[package]['roleList'][-1]['other_winner_dic'].append({'role_text': tenderer, "money": bid_amount, "money_unit": money_unit, "serviceTime": serviceTime})
														
 
															                 tenderer_list.append(tenderer)
														
 
															                 serviceTime_list.append(serviceTime)
														
 
															             if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的 丢弃 并不再继续往下匹配
														
@@ -7113,6 +7235,7 @@ class CandidateExtractor(object):
 
															             flag = True
														
 
															             for i in range(len(td_list)) :
														
 
															                 text = td_list[i]
														
 
															+                text = re.sub('\s|[（(]排名不分先后[)）]', '', text)
														
 
															                 if len(text) > 15: # 长度大于15 不进行表头匹配
														
 
															                     continue
														
 
															                 if re.search('未(中标|成交)原因', text):  # 不提取此种表格
														
@@ -7134,6 +7257,8 @@ class CandidateExtractor(object):
 
															                 if num>1:
														
 
															                     # print('表头错误，一个td匹配到两个表头：', header_dic)
														
 
															                     return flag, contain_header, dict()
														
 
															+                if text == '单位': # 20241128 补充金额单位
														
 
															+                    header_dic['amount_unit'] = (i, text)
														
 
															             if ('candidate' in header_dic and 'win_sort' in header_dic) or ('win_tenderer' in header_dic and 'second_tenderer' in header_dic): # 有排名才返回表头进行提取
														
 
															                 return flag, contain_header, header_dic
														
 
															         elif len(set(fix_td_list) & self.headerset) >= 2  or (len(set(fix_td_list)) == 2 and len(set(fix_td_list) & self.headerset) >= 1):  # 如果包含两个表头以上或 只有两列且包含一个表头
														
@@ -7210,6 +7335,7 @@ class CandidateExtractor(object):
 
															             win_tenderer = df.loc[i, headers['win_tenderer'][0]].strip() if "win_tenderer" in headers else ""
														
 
															             second_tenderer = df.loc[i, headers['second_tenderer'][0]].strip() if "second_tenderer" in headers else ""
														
 
															             third_tenderer = df.loc[i, headers['third_tenderer'][0]].strip() if "third_tenderer" in headers else ""
														
 
															+            amount_unit = df.loc[i, headers['amount_unit'][0]].strip() if "amount_unit" in headers else ""
														
 
															             if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头， 停止匹配 # 排除 ,win_sort 避免367940050漏提取
														
 
															                 # print('包含表头， 停止匹配')
														
@@ -7286,7 +7412,14 @@ class CandidateExtractor(object):
 
															                         if len(re.sub('[金额万元（）():：零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分￥整\s\d,.]|人民币|不?含税', '',
														
 
															                                       text)) > 5:  # 金额字段出现超过5个非金额字符，中断匹配
														
 
															                             break
														
 
															+                        if amount_unit != '' and re.search('^[万亿]?元|%|折[\w/]{,6}$', amount_unit) and re.search('元', text+header)==None: # 补充另外在一列的金额单位
														
 
															+                            text += amount_unit
														
 
															                         money, money_unit = money_process(text, header)
														
 
															+                        if re.search('元[/每]', amount_unit) or re.search('单价', header):
														
 
															+                            unit_price = money
														
 
															+                            money = 0
														
 
															+                        else:
														
 
															+                            unit_price = 0
														
 
															                         if (re.search('费率|下浮率|[%％‰折]|优惠率', header+text) and money < 100) or money > 50000000000: # 如果是费率或大于500亿的金额改为0
														
 
															                             money = 0
														
@@ -7295,6 +7428,11 @@ class CandidateExtractor(object):
 
															                                 role_dic[type] = dict()
														
 
															                             role_dic[type]['money'] = money
														
 
															                             role_dic[type]['money_unit'] = money_unit
														
 
															+                        if unit_price > 0:
														
 
															+                            if type not in role_dic:
														
 
															+                                role_dic[type] = dict()
														
 
															+                            role_dic[type]['unit_price'] = unit_price
														
 
															+                            role_dic[type]['money_unit'] = money_unit
														
 
															                 else:
														
 
															                     line_num += 1
														
 
															                     if findtop3 and findmoney:
														
@@ -7322,13 +7460,21 @@ class CandidateExtractor(object):
 
															                         prem_dic[package]['name'] = project_name
														
 
															                     if len(re.sub('[金额万元（）():：零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分￥整\s\d,.]|人民币|不?含税', '', bid_amount_))> 5:  # 金额字段出现超过5个非金额字符，中断匹配
														
 
															                         break
														
 
															+                    header = headers['bid_amount'][1] if "bid_amount" in headers else ''
														
 
															+                    if amount_unit != '' and re.search('^[万亿]?元|%|折[\w/]{,6}$', amount_unit) and re.search('元',
														
 
															+                                                                                                           bid_amount_ + header) == None:  # 补充另外在一列的金额单位
														
 
															+                        bid_amount_ += amount_unit
														
 
															                     bid_amount, money_unit  = money_process(bid_amount_, headers['bid_amount'][1])  if "bid_amount" in headers else (0, "")
														
 
															+                    if re.search('元[/每]', amount_unit) or re.search('单价', header):
														
 
															+                        unit_price = bid_amount
														
 
															+                        bid_amount = 0
														
 
															+                    else:
														
 
															+                        unit_price = 0
														
 
															-                    header = headers['bid_amount'][1] if "bid_amount" in headers else ''
														
 
															                     if (re.search('费率|下浮率|[%％‰折]|优惠率',
														
 
															                                   header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000:  # 如果是费率或大于500亿的金额改为0
														
 
															                         bid_amount = 0
														
 
															-                    prem_dic[package]['roleList'].append({
														
 
															+                    tmp_role_dic = {
														
 
															                             "address": "",
														
 
															                             "linklist": [],
														
 
															                             "role_money": {
														
@@ -7341,7 +7487,10 @@ class CandidateExtractor(object):
 
															                             "role_name": role_type,
														
 
															                             "role_text": candidate,
														
 
															                             "serviceTime": ""
														
 
															-                    })
														
 
															+                    }
														
 
															+                    if unit_price > 0:
														
 
															+                        tmp_role_dic['role_money']['unit_price'] = unit_price
														
 
															+                    prem_dic[package]['roleList'].append(tmp_role_dic)
														
 
															                     if len(prem_dic[package]['roleList']) == 0:  # 只有项目编号和名称的 丢弃
														
 
															                         prem_dic.pop(package)
														
 
															         if role_dic and prem_dic == dict():