Parcourir la source

Merge branch 'master' of http://192.168.2.103:3000/luojiehua/BIDI_ML_INFO_EXTRACTION

znj il y a 6 mois
Parent
commit
5115a05a10

+ 247 - 23
BiddingKG/dl/common/Utils.py

@@ -947,29 +947,36 @@ def money_process(money_text, header):
     '''
     money = 0
     money_unit = ""
-    # re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?[((]?万?", money_text)
-    money_text = re.sub('\s', '', money_text) # 2024/04/19 修复 457699044 556.46751 万元 金额与单位有空格造成万漏提取
-    if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text) and re.search('\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[((]?万?', money_text):
-        money_text = re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text).group(0)  # 如果表格同时包含大小写金额,取大写金额,避免单位取错 463310590 790000(柒拾玖万元整)
-    re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[((]?万?", money_text)
-    if re_price:
-        money_re = re_price.group(0)
-        if (re.search('万元|[((]万[))]',  header) or re.search('万元|[((]万[))]', money_text)) and '万' not in money_re:  # 修复37797825 控制价(万) # 修复 460307391 万元不在表头,在数字前面
-            money_re += '万元'
-        elif (re.search('亿元|[((]亿[))]',  header) or re.search('亿元|[((]亿[))]', money_text)) and '亿' not in money_re:  # 修复37797825 控制价(万) # 修复 460307391 万元不在表头,在数字前面
-            money_re += '亿元'
-        # money = float(getUnifyMoney(money_text))
-        money = float(getUnifyMoney(money_re))
-        if money > 10000000000000:  # 大于万亿的去除
-            money = 0
-        # money_unit = '万元' if '万' in money_re and re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text)==None else '元'
-        if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text)==None:
-            if '万' in money_re:
-                money_unit = '万元'
-            elif '亿' in money_re:
-                money_unit = '亿元'
-            else:
-                money_unit = '元'
+    moneys, _ = get_money_entity('%s:%s' % (header, money_text))
+    if len(moneys) == 1:
+        money = float(moneys[0][0])
+        money_unit = moneys[0][3]
+    elif len(moneys) == 2 and moneys[0][0]==moneys[1][0]:
+        money = float(moneys[0][0])
+        money_unit = moneys[0][3]
+    # # re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?[((]?万?", money_text)
+    # money_text = re.sub('\s', '', money_text) # 2024/04/19 修复 457699044 556.46751 万元 金额与单位有空格造成万漏提取
+    # if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text) and re.search('\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[((]?万?', money_text):
+    #     money_text = re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text).group(0)  # 如果表格同时包含大小写金额,取大写金额,避免单位取错 463310590 790000(柒拾玖万元整)
+    # re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[((]?万?", money_text)
+    # if re_price:
+    #     money_re = re_price.group(0)
+    #     if (re.search('万元|[((]万[))]',  header) or re.search('万元|[((]万[))]', money_text)) and '万' not in money_re:  # 修复37797825 控制价(万) # 修复 460307391 万元不在表头,在数字前面
+    #         money_re += '万元'
+    #     elif (re.search('亿元|[((]亿[))]',  header) or re.search('亿元|[((]亿[))]', money_text)) and '亿' not in money_re:  # 修复37797825 控制价(万) # 修复 460307391 万元不在表头,在数字前面
+    #         money_re += '亿元'
+    #     # money = float(getUnifyMoney(money_text))
+    #     money = float(getUnifyMoney(money_re))
+    #     if money > 10000000000000:  # 大于万亿的去除
+    #         money = 0
+    #     # money_unit = '万元' if '万' in money_re and re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text)==None else '元'
+    #     if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text)==None:
+    #         if '万' in money_re:
+    #             money_unit = '万元'
+    #         elif '亿' in money_re:
+    #             money_unit = '亿元'
+    #         else:
+    #             money_unit = '元'
     return (money, money_unit)
 
 package_number_pattern = re.compile(
@@ -1146,6 +1153,223 @@ def is_deposit_project(title, name, requirement):
         return True
     return False
 
+def get_money_entity(sentence_text, found_yeji=0, in_attachment=False):
+    money_list = []
+    # 使用正则识别金额
+    entity_type = "money"
+    list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
+                          "key_word": "((?P<text_key_word>(?:[¥¥]+,?|(中标|成交|合同|承租|投资|服务))?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资)(\d:|\d=\d[-+×]\d:)?(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d+,\d+\.\d{2,6}|\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[(\(]?(?P<filter_>[%%‰折])*\s*,?((金额)?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天年月日]*))\s*[)\)]?))",
+                          "front_m": "((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z金额价格]{,2}?))(?P<money_front_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
+                          "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_behind_m>(E-?\d+))?(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
+    # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。  20240415 调整front_m 修复 详见合同元,合同金额:378.8万元 提取
+
+    pattern_money = re.compile("%s|%s|%s|%s" % (
+    list_money_pattern["cn"], list_money_pattern["key_word"], list_money_pattern["behind_m"],
+    list_money_pattern["front_m"]))
+
+    # sentence_text = re.sub('\d+[年月日]', '', sentence_text) # 修复560180018 中标价(元):3年投标报价(元)含税6299700.00 3年作为金额
+
+    if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
+        found_yeji += 1
+    if found_yeji >= 2:  # 过滤掉业绩后面的所有金额
+        all_match = []
+    else:
+        ser = re.search('((收费标准|计算[方公]?式):|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s()()\[\]【】\d\.%%‰\+\-*×/]{20,}[,。]?', sentence_text)  # 过滤掉收费标准里面的金额
+        if ser:
+            sentence_text = sentence_text.replace(ser.group(0), ' ' * len(ser.group(0)))
+        all_match = re.finditer(pattern_money, sentence_text)
+    # print('all_match:', all_match)
+    for _match in all_match:
+        # print('_match: ', _match.group())
+        if re.search('^元/1\d{10},$', _match.group(0)): # 修复 495042766 现场负责人 姚元 / 13488160460 预测为金额
+            continue
+        if len(_match.group()) > 0:
+            # print("===",_match.group())
+            # # print(_match.groupdict())
+            notes = ''  # 2021/7/20 新增备注金额大写或金额单位 if 金额大写 notes=大写 elif 单位 notes=单位
+            unit = ""
+            entity_text = ""
+            start_index = ""
+            end_index = ""
+            text_beforeMoney = ""
+            filter = ""
+            filter_unit = False
+            notSure = False
+            science = ""
+            if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text[:_match.span()[0]]):  # 2021/7/21过滤掉业绩后面金额
+                # print('金额在业绩后面: ', _match.group(0))
+                found_yeji += 1
+                break
+            for k, v in _match.groupdict().items():
+                if v != "" and v is not None:
+                    if k == 'text_key_word':
+                        notSure = True
+                    if k.split("_")[0] == "money":
+                        entity_text = v
+                        # print(_match.group(k), 'entity_text: ', sentence_text[_match.start(k): _match.end(k)])
+                        if entity_text.endswith(',00'):  # 金额逗号后面不可能为两个0结尾,应该小数点识别错,直接去掉
+                            entity_text = entity_text[:-3]
+                    if k.split("_")[0] == "unit":
+                        if 'behind' in k or unit == "":  # 优先后面单位  预算金额(元):160万元  总价(万元):最终报价:695000.00(元)
+                            unit = v
+                    if k.split("_")[0] == "text":
+                        text_beforeMoney = v
+                    if k.split("_")[0] == "filter":
+                        filter = v
+                    if re.search("filter_unit", k) is not None:
+                        filter_unit = True
+                    if k.split("_")[0] == 'science':
+                        science = v
+            # print("金额:{0} ,单位:{1}, 前文:{2}, filter: {3}, filter_unit: {4}".format(entity_text,unit,text_beforeMoney,filter,filter_unit))
+            # if re.search('(^\d{2,},\d{4,}万?$)|(^\d{2,},\d{2}万?$)', entity_text.strip()):  # 2021/7/19 修正OCR识别小数点为逗号
+            #     if re.search('[幢栋号楼层]', sentence_text[max(0, _match.span()[0] - 2):_match.span()[0]]):
+            #         entity_text = re.sub('\d+,', '', entity_text)
+            #     else:
+            #         entity_text = entity_text.replace(',', '.')
+            #     # print(' 修正OCR识别小数点为逗号')
+
+            if filter != "":
+                continue
+            if len(entity_text)>30 or len(re.sub('[E-]', '', science))>2: # 限制数字长度,避免类似265339018附件金额错误,数值超大报错 decimal.InvalidOperation
+                continue
+            start_index, end_index = _match.span()
+            start_index += len(text_beforeMoney)
+
+            '''过滤掉手机号码作为金额'''
+            if re.search('电话|手机|联系|方式|编号|编码|日期|数字|时间', text_beforeMoney):
+                # print('过滤掉手机号码作为金额')
+                continue
+            elif re.search('^1[3-9]\d{9}$', entity_text) and re.search(':\w{1,3}$', text_beforeMoney): # 过滤掉类似 '13863441880', '金额(万元):季勇13863441880'
+                # print('过滤掉手机号码作为金额')
+                continue
+            elif re.search('^\d(.\d{1,2})?$', entity_text) and re.search('\d$', _match.group(0)) and re.search('^[、.]', sentence_text[_match.end():]): # 170756755 控制价为:1、合理利润率上限
+                # print('过滤错误金额:', _match.group(0))
+                continue
+
+            if unit == "":  # 2021/7/21 有明显金额特征的补充单位,避免被过滤
+                if (re.search('(¥|¥|RMB|CNY)[::]?$', text_beforeMoney) or re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', entity_text)):
+                    if entity_text.endswith('万元'):
+                        unit = '万元'
+                        entity_text = entity_text[:-2]
+                    else:
+                        unit = '元'
+                    # print('1明显金额特征补充单位 元')
+                elif re.search('USD[::]?$', text_beforeMoney):
+                    unit = '美元'
+                elif re.search('EUR[::]?$', text_beforeMoney):
+                    unit = '欧元'
+                elif re.search('JPY[::]?$', text_beforeMoney):
+                    unit = '日元'
+                elif re.search('^[-—]+[\d,.]+万元', sentence_text[end_index:]):
+                    # print('两个金额连接后面的有单位,用后面单位')
+                    unit = '万元'
+                elif re.search('^,?(价格币种:\w{2,3},)?价格单位:万元', sentence_text[end_index:]): # 修复494731937金额单位缺漏 中标价格:39501.094425,价格币种:人民币,价格单位:万元,
+                    unit = '万元'
+                elif re.search('万元', sentence_text[max(0, start_index-10):start_index]): #修复511402017 价格类型:(万元)报价:13311.1582,得分:84.46,
+                    unit = '万元'
+                elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资|控制|拦标))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)(小写)?[::为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:  # 修复
+                    if re.search('^[\d,,.]+$', entity_text) and float(re.sub('[,,]', '', entity_text))<500 and re.search('万元', sentence_text):
+                        unit = '万元'
+                        # print('金额较小且句子中有万元的,补充单位为万元')
+                    elif re.search('^\d{1,3}\.\d{4,6}$', entity_text) and re.search('0000$', entity_text) == None:
+                        unit = '万元'
+                    else:
+                        unit = '元'
+                        # print('金额前面紧接关键词的补充单位 元')
+                elif re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7},?)$)|(^\d{,3}(,\d{3})+,?$)', entity_text):
+                    unit = '元'
+                    # print('3明显金额特征补充单位 元')
+                else:
+                    # print('过滤掉没单位金额: ',entity_text)
+                    continue
+            elif unit == '万元':
+                if end_index < len(sentence_text) and sentence_text[end_index] == '元' and re.search('\d$', entity_text):
+                    unit = '元'
+                elif re.search('^[5-9]\d{6,}\.\d{2}$', entity_text): # 五百亿以上的万元改为元
+                    unit = '元'
+            if unit.find("万") >= 0 and entity_text.find("万") >= 0:  # 2021/7/19修改为金额文本有万,不计算单位
+                # print('修正金额及单位都有万, 金额:',entity_text, '单位:',unit)
+                unit = "元"
+            if re.search('.*万元万元', entity_text):  # 2021/7/19 修正两个万元
+                # print(' 修正两个万元',entity_text)
+                entity_text = entity_text.replace('万元万元', '万元')
+            else:
+                if filter_unit:
+                    continue
+
+            # symbol = '-' if entity_text.startswith('-') and not entity_text.startswith('--') and re.search('\d+$', sentence_text[:begin_index_temp]) == None else ''  # 负值金额前面保留负号 ,后面这些不作为负金额 起拍价:105.29-200.46万元  预 算 --- 350000.0 2023/04/14 取消符号
+
+            entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", entity_text)
+            # print('转换前金额:', entity_text, '单位:', unit, '备注:',notes, 'text_beforeMoney:',text_beforeMoney)
+            if re.search('总投资|投资总额|总预算|总概算|(投资|招标|资金|存放|操作|融资)规模|批复概算|投资额|总规模|工程造价|总金额',
+                         sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额  20241031工程造价作总投资
+                # print('总投资金额: ', _match.group(0))
+                notes = '总投资'
+            elif re.search('投资|概算|建安费|其他费用|基本预备费',
+                           sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/11/18 投资金额不作为招标金额
+                notes = '投资'
+            # elif re.search('工程造价',
+            #                sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/12/20 工程造价不作为招标金额
+            #     notes = '工程造价'
+            elif (re.search('保证金', sentence_text[max(0, _match.span()[0] - 5):_match.span()[1]])
+                  or re.search('保证金的?(缴纳)?(金额|金\?|额|\?)?[\((]*(万?元|为?人民币|大写|调整|变更|已?修改|更改|更正)?[\))]*[::为]',
+                               sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]])
+                  or re.search('保证金由[\d.,]+.{,3}(变更|修改|更改|更正|调整?)为',
+                               sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])):
+                notes = '保证金'
+                # print('保证金信息:', sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])
+            elif re.search('成本(警戒|预警)(线|价|值)[^0-9元]{,10}',
+                           sentence_text[max(0, _match.span()[0] - 10):_match.span()[0]]):
+                notes = '成本警戒线'
+            elif re.search('(监理|设计|勘察)(服务)?费(报价)?[约为:]|服务金额', sentence_text[_match.span()[0]:_match.span()[1]]):
+                # cost_re = re.search('(监理|设计|勘察)(服务)?费', sentence_text[_match.span()[0]:_match.span()[1]])
+                # notes = cost_re.group(1)
+                notes = '招标或中标金额'
+            elif re.search('单价|总金额', sentence_text[_match.span()[0]:_match.span()[1]]):
+                notes = '单价'
+            elif re.search('^[/每]', sentence_text[_match.end():]):
+                # print('单价:', _match.group(0))
+                notes = '单价'
+            elif re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆]', entity_text) != None:
+                notes = '大写'
+                if entity_text[0] == "拾":  # 2021/12/16 修正大写金额省略了数字转换错误问题
+                    entity_text = "壹" + entity_text
+                # print("补充备注:notes = 大写")
+            if len(unit) > 0:
+                if unit.find('万') >= 0 and len(entity_text.split('.')[0]) >= 8:  # 2021/7/19 修正万元金额过大的情况
+                    # print('修正单位万元金额过大的情况 金额:', entity_text, '单位:', unit)
+                    entity_text = str(
+                        getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]) / 10000)
+                    unit = '元'  # 修正金额后单位 重置为元
+                else:
+                    # print('str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0])):')
+                    entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]))
+            else:
+                if entity_text.find('万') >= 0 and entity_text.split('.')[0].isdigit() and len(
+                        entity_text.split('.')[0]) >= 8:
+                    entity_text = str(getUnifyMoney(entity_text) / 10000)
+                    # print('修正金额字段含万 过大的情况')
+                else:
+                    entity_text = str(getUnifyMoney(entity_text))
+            if science and re.search('^E-?\d+$', science):  # 科学计数
+                entity_text = str(Decimal(entity_text + science)) if Decimal(entity_text + science) > 100 and Decimal(
+                    entity_text + science) < 10000000000 else entity_text  # use the scientific-notation value only when it is > 100 and < 10000000000 (1e10); otherwise keep the plain number
+
+            if float(entity_text) > 100000000000:  # float(entity_text)<100 or  2022/3/4 取消最小金额限制
+                # print('过滤掉金额:float(entity_text)<100 or float(entity_text)>100000000000', entity_text, unit)
+                continue
+
+            if notSure and unit == "" and float(entity_text) > 100 * 10000:
+                # print('过滤掉金额 notSure and unit=="" and float(entity_text)>100*10000:', entity_text, unit)
+                continue
+            # print("金额:{0} ,单位:{1}, 前文:{2}, filter: {3}, filter_unit: {4}".format(entity_text, unit, text_beforeMoney,
+            #                                                                      filter, filter_unit))
+            if re.search('[%%‰折]|费率|下浮率', text_beforeMoney) and float(entity_text)<1000: # 过滤掉可能是费率的金额
+                # print('过滤掉可能是费率的金额')
+                continue
+            money_list.append((entity_text, start_index, end_index, unit, notes))
+    return money_list, found_yeji
+
 def recall(y_true, y_pred):
     '''
     计算召回率

+ 3 - 0
BiddingKG/dl/interface/Entitys.py

@@ -300,6 +300,7 @@ class Role():
         self.serviceTime = "" #2021/01/06 新增 保存服务期限(工期)
         self.address = ""  #2022/08/08 新增 角色地址
         self.multi_winner = multi_winner #2024/4/8 新增多中标人
+        self.unit_price = 0 # 20241127 新增单价
 
     def getString(self):
         self.linklist = [item for item in set(self.linklist)]
@@ -342,6 +343,8 @@ class Role():
         result = {'role_name':self.role_name,'role_text':fitDataByRule(self.entity_text),
                   'role_money': {'money':self.money,'money_unit':self.money_unit,'floating_ratio':floating_ratio,'downward_floating_ratio':downward_floating_ratio,'discount_ratio':discount_ratio},
                   'linklist': self.linklist,'serviceTime':self.serviceTime,'address':self.address}
+        if self.unit_price != 0: # 单价
+            result['role_money']['unit_price'] = self.unit_price
         if result['role_name'] in ['tenderee', 'win_tenderer']:
             result['role_prob'] = self.role_prob
         if result['role_name'] == 'win_tenderer' and self.multi_winner != set():

+ 1 - 215
BiddingKG/dl/interface/Preprocessing.py

@@ -3219,221 +3219,6 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
         article.content = re.sub("##attachment_begin##|##attachment_end##", "", article.content)
     return list_sentences,list_outlines
 
-def get_money_entity(sentence_text, found_yeji, in_attachment=False):
-    money_list = []
-    # 使用正则识别金额
-    entity_type = "money"
-    list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                          "key_word": "((?P<text_key_word>(?:[¥¥]+,?|(中标|成交|合同|承租|投资|服务))?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资)(\d:|\d=\d[-+×]\d:)?(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d+,\d+\.\d{2,6}|\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[(\(]?(?P<filter_>[%%‰折])*\s*,?((金额)?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
-                          "front_m": "((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z金额价格]{,2}?))(?P<money_front_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
-                          "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_behind_m>(E-?\d+))?(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
-    # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。  20240415 调整front_m 修复 详见合同元,合同金额:378.8万元 提取
-
-    pattern_money = re.compile("%s|%s|%s|%s" % (
-    list_money_pattern["cn"], list_money_pattern["key_word"], list_money_pattern["behind_m"],
-    list_money_pattern["front_m"]))
-
-    if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
-        found_yeji += 1
-    if found_yeji >= 2:  # 过滤掉业绩后面的所有金额
-        all_match = []
-    else:
-        ser = re.search('((收费标准|计算[方公]?式):|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s()()\[\]【】\d\.%%‰\+\-*×/]{20,}[,。]?', sentence_text)  # 过滤掉收费标准里面的金额
-        if ser:
-            sentence_text = sentence_text.replace(ser.group(0), ' ' * len(ser.group(0)))
-        all_match = re.finditer(pattern_money, sentence_text)
-    # print('all_match:', all_match)
-    for _match in all_match:
-        # print('_match: ', _match.group())
-        if re.search('^元/1\d{10},$', _match.group(0)): # 修复 495042766 现场负责人 姚元 / 13488160460 预测为金额
-            continue
-        if len(_match.group()) > 0:
-            # print("===",_match.group())
-            # # print(_match.groupdict())
-            notes = ''  # 2021/7/20 新增备注金额大写或金额单位 if 金额大写 notes=大写 elif 单位 notes=单位
-            unit = ""
-            entity_text = ""
-            start_index = ""
-            end_index = ""
-            text_beforeMoney = ""
-            filter = ""
-            filter_unit = False
-            notSure = False
-            science = ""
-            if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text[:_match.span()[0]]):  # 2021/7/21过滤掉业绩后面金额
-                # print('金额在业绩后面: ', _match.group(0))
-                found_yeji += 1
-                break
-            for k, v in _match.groupdict().items():
-                if v != "" and v is not None:
-                    if k == 'text_key_word':
-                        notSure = True
-                    if k.split("_")[0] == "money":
-                        entity_text = v
-                        # print(_match.group(k), 'entity_text: ', sentence_text[_match.start(k): _match.end(k)])
-                        if entity_text.endswith(',00'):  # 金额逗号后面不可能为两个0结尾,应该小数点识别错,直接去掉
-                            entity_text = entity_text[:-3]
-                    if k.split("_")[0] == "unit":
-                        if 'behind' in k or unit == "":  # 优先后面单位  预算金额(元):160万元  总价(万元):最终报价:695000.00(元)
-                            unit = v
-                    if k.split("_")[0] == "text":
-                        # print('text_before: ', _match.group(k))
-                        text_beforeMoney = v
-                    if k.split("_")[0] == "filter":
-                        filter = v
-                    if re.search("filter_unit", k) is not None:
-                        filter_unit = True
-                    if k.split("_")[0] == 'science':
-                        science = v
-            # print("金额:{0} ,单位:{1}, 前文:{2}, filter: {3}, filter_unit: {4}".format(entity_text,unit,text_beforeMoney,filter,filter_unit))
-            # if re.search('(^\d{2,},\d{4,}万?$)|(^\d{2,},\d{2}万?$)', entity_text.strip()):  # 2021/7/19 修正OCR识别小数点为逗号
-            #     if re.search('[幢栋号楼层]', sentence_text[max(0, _match.span()[0] - 2):_match.span()[0]]):
-            #         entity_text = re.sub('\d+,', '', entity_text)
-            #     else:
-            #         entity_text = entity_text.replace(',', '.')
-            #     # print(' 修正OCR识别小数点为逗号')
-
-            if filter != "":
-                continue
-            if len(entity_text)>30 or len(re.sub('[E-]', '', science))>2: # 限制数字长度,避免类似265339018附件金额错误,数值超大报错 decimal.InvalidOperation
-                continue
-            start_index, end_index = _match.span()
-            start_index += len(text_beforeMoney)
-
-            '''过滤掉手机号码作为金额'''
-            if re.search('电话|手机|联系|方式|编号|编码|日期|数字|时间', text_beforeMoney):
-                # print('过滤掉手机号码作为金额')
-                continue
-            elif re.search('^1[3-9]\d{9}$', entity_text) and re.search(':\w{1,3}$', text_beforeMoney): # 过滤掉类似 '13863441880', '金额(万元):季勇13863441880'
-                # print('过滤掉手机号码作为金额')
-                continue
-            elif re.search('^\d(.\d{1,2})?$', entity_text) and re.search('\d$', _match.group(0)) and re.search('^[、.]', sentence_text[_match.end():]): # 170756755 控制价为:1、合理利润率上限
-                # print('过滤错误金额:', _match.group(0))
-                continue
-
-            if unit == "":  # 2021/7/21 有明显金额特征的补充单位,避免被过滤
-                if (re.search('(¥|¥|RMB|CNY)[::]?$', text_beforeMoney) or re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', entity_text)):
-                    if entity_text.endswith('万元'):
-                        unit = '万元'
-                        entity_text = entity_text[:-2]
-                    else:
-                        unit = '元'
-                    # print('1明显金额特征补充单位 元')
-                elif re.search('USD[::]?$', text_beforeMoney):
-                    unit = '美元'
-                elif re.search('EUR[::]?$', text_beforeMoney):
-                    unit = '欧元'
-                elif re.search('JPY[::]?$', text_beforeMoney):
-                    unit = '日元'
-                elif re.search('^[-—]+[\d,.]+万元', sentence_text[end_index:]):
-                    # print('两个金额连接后面的有单位,用后面单位')
-                    unit = '万元'
-                elif re.search('^,?(价格币种:\w{2,3},)?价格单位:万元', sentence_text[end_index:]): # 修复494731937金额单位缺漏 中标价格:39501.094425,价格币种:人民币,价格单位:万元,
-                    unit = '万元'
-                elif re.search('万元', sentence_text[max(0, start_index-10):start_index]): #修复511402017 价格类型:(万元)报价:13311.1582,得分:84.46,
-                    unit = '万元'
-                elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资|控制|拦标))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)(小写)?[::为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:  # 修复
-                    if re.search('^[\d,,.]+$', entity_text) and float(re.sub('[,,]', '', entity_text))<500 and re.search('万元', sentence_text):
-                        unit = '万元'
-                        # print('金额较小且句子中有万元的,补充单位为万元')
-                    elif re.search('^\d{1,3}\.\d{4,6}$', entity_text) and re.search('0000$', entity_text) == None:
-                        unit = '万元'
-                    else:
-                        unit = '元'
-                        # print('金额前面紧接关键词的补充单位 元')
-                elif re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7},?)$)|(^\d{,3}(,\d{3})+,?$)', entity_text):
-                    unit = '元'
-                    # print('3明显金额特征补充单位 元')
-                else:
-                    # print('过滤掉没单位金额: ',entity_text)
-                    continue
-            elif unit == '万元':
-                if end_index < len(sentence_text) and sentence_text[end_index] == '元' and re.search('\d$', entity_text):
-                    unit = '元'
-                elif re.search('^[5-9]\d{6,}\.\d{2}$', entity_text): # 五百亿以上的万元改为元
-                    unit = '元'
-            if unit.find("万") >= 0 and entity_text.find("万") >= 0:  # 2021/7/19修改为金额文本有万,不计算单位
-                # print('修正金额及单位都有万, 金额:',entity_text, '单位:',unit)
-                unit = "元"
-            if re.search('.*万元万元', entity_text):  # 2021/7/19 修正两个万元
-                # print(' 修正两个万元',entity_text)
-                entity_text = entity_text.replace('万元万元', '万元')
-            else:
-                if filter_unit:
-                    continue
-
-            # symbol = '-' if entity_text.startswith('-') and not entity_text.startswith('--') and re.search('\d+$', sentence_text[:begin_index_temp]) == None else ''  # 负值金额前面保留负号 ,后面这些不作为负金额 起拍价:105.29-200.46万元  预 算 --- 350000.0 2023/04/14 取消符号
-
-            entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", entity_text)
-            # print('转换前金额:', entity_text, '单位:', unit, '备注:',notes, 'text_beforeMoney:',text_beforeMoney)
-            if re.search('总投资|投资总额|总预算|总概算|(投资|招标|资金|存放|操作|融资)规模|批复概算|投资额|总规模|工程造价|总金额',
-                         sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额  20241031工程造价作总投资
-                # print('总投资金额: ', _match.group(0))
-                notes = '总投资'
-            elif re.search('投资|概算|建安费|其他费用|基本预备费',
-                           sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/11/18 投资金额不作为招标金额
-                notes = '投资'
-            # elif re.search('工程造价',
-            #                sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/12/20 工程造价不作为招标金额
-            #     notes = '工程造价'
-            elif (re.search('保证金', sentence_text[max(0, _match.span()[0] - 5):_match.span()[1]])
-                  or re.search('保证金的?(缴纳)?(金额|金\?|额|\?)?[\((]*(万?元|为?人民币|大写|调整|变更|已?修改|更改|更正)?[\))]*[::为]',
-                               sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]])
-                  or re.search('保证金由[\d.,]+.{,3}(变更|修改|更改|更正|调整?)为',
-                               sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])):
-                notes = '保证金'
-                # print('保证金信息:', sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])
-            elif re.search('成本(警戒|预警)(线|价|值)[^0-9元]{,10}',
-                           sentence_text[max(0, _match.span()[0] - 10):_match.span()[0]]):
-                notes = '成本警戒线'
-            elif re.search('(监理|设计|勘察)(服务)?费(报价)?[约为:]|服务金额', sentence_text[_match.span()[0]:_match.span()[1]]):
-                # cost_re = re.search('(监理|设计|勘察)(服务)?费', sentence_text[_match.span()[0]:_match.span()[1]])
-                # notes = cost_re.group(1)
-                notes = '招标或中标金额'
-            elif re.search('单价|总金额', sentence_text[_match.span()[0]:_match.span()[1]]):
-                notes = '单价'
-            elif re.search('^[/每]', sentence_text[_match.end():]):
-                # print('单价:', _match.group(0))
-                notes = '单价'
-            elif re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆]', entity_text) != None:
-                notes = '大写'
-                if entity_text[0] == "拾":  # 2021/12/16 修正大写金额省略了数字转换错误问题
-                    entity_text = "壹" + entity_text
-                # print("补充备注:notes = 大写")
-            if len(unit) > 0:
-                if unit.find('万') >= 0 and len(entity_text.split('.')[0]) >= 8:  # 2021/7/19 修正万元金额过大的情况
-                    # print('修正单位万元金额过大的情况 金额:', entity_text, '单位:', unit)
-                    entity_text = str(
-                        getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]) / 10000)
-                    unit = '元'  # 修正金额后单位 重置为元
-                else:
-                    # print('str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0])):')
-                    entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]))
-            else:
-                if entity_text.find('万') >= 0 and entity_text.split('.')[0].isdigit() and len(
-                        entity_text.split('.')[0]) >= 8:
-                    entity_text = str(getUnifyMoney(entity_text) / 10000)
-                    # print('修正金额字段含万 过大的情况')
-                else:
-                    entity_text = str(getUnifyMoney(entity_text))
-            if science and re.search('^E-?\d+$', science):  # 科学计数
-                entity_text = str(Decimal(entity_text + science)) if Decimal(entity_text + science) > 100 and Decimal(
-                    entity_text + science) < 10000000000 else entity_text  # 结果大于100及小于100万才使用科学计算
-
-            if float(entity_text) > 100000000000:  # float(entity_text)<100 or  2022/3/4 取消最小金额限制
-                # print('过滤掉金额:float(entity_text)<100 or float(entity_text)>100000000000', entity_text, unit)
-                continue
-
-            if notSure and unit == "" and float(entity_text) > 100 * 10000:
-                # print('过滤掉金额 notSure and unit=="" and float(entity_text)>100*10000:', entity_text, unit)
-                continue
-            # print("金额:{0} ,单位:{1}, 前文:{2}, filter: {3}, filter_unit: {4}".format(entity_text, unit, text_beforeMoney,
-            #                                                                      filter, filter_unit))
-            if re.search('[%%‰折]|费率|下浮率', text_beforeMoney) and float(entity_text)<1000: # 过滤掉可能是费率的金额
-                # print('过滤掉可能是费率的金额')
-                continue
-            money_list.append((entity_text, start_index, end_index, unit, notes))
-    return money_list, found_yeji
 def cut_repeat_name(s):
     '''
     公司连续重复名称去重
@@ -4086,6 +3871,7 @@ if __name__=="__main__":
     text = '是否拟中标人:是,评标排名:1,价格类型:(万元)报价:13311.1582,得分:84.46,项目负责人:邓焱文'
     text = ',采购包1:采购包预算金额(元:1,500000.00,采购包最高限价(元:1,430600.00,'
     text = '成交人:中坤电力有限公司,成交价格:11493,603.52元,质量:合格,项目工期:117天,'
+    text = '3年投标报价(元)含税 6299700.00'
     # text = '数量及单位1:65台,单价2:800,投标报价3=1×2:52000。'
     print(get_money_entity(text, found_yeji=0))
     # with open('D:/138786703.html', 'r', encoding='utf-8') as f:

+ 2 - 2
BiddingKG/dl/interface/extract.py

@@ -442,7 +442,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     property_label = predictor.getPredictor('property_label').predict(title, product=','.join(product_list),project_name=codeName[0]['name'], prem=prem,channel_dic=channel_dic)
 
     '''最终验证prem'''
-    getAttributes.confirm_prem(prem[0]['prem'], channel_dic, deposit_project, prem[0]['total_tendereeMoney'])
+    getAttributes.confirm_prem(prem[0]['prem'], channel_dic, deposit_project, prem[0]['total_tendereeMoney'], name=codeName[0]['name'])
 
     # 提取拟在建所需字段
     start_time = time.time()
@@ -455,7 +455,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-11-25'}
+    version_date = {'version_date': '2024-12-02'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:

+ 129 - 11
BiddingKG/dl/interface/getAttributes.py

@@ -936,13 +936,19 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                 #     packDict[packageName]["roleList"][i].money = money
                 #     packDict[packageName]["roleList"][i].money_prob = money_prob
                 if packDict[packageName]["roleList"][i].money_prob==0 :  # 2021/7/20第一次更新金额
-                    packDict[packageName]["roleList"][i].money = money.entity_text
+                    if money.notes == '单价':
+                        packDict[packageName]["roleList"][i].unit_price = money.entity_text
+                    else:
+                        packDict[packageName]["roleList"][i].money = money.entity_text
                     packDict[packageName]["roleList"][i].money_prob = money_prob
                     packDict[packageName]["roleList"][i].money_unit = money.money_unit
                 elif money_prob>packDict[packageName]["roleList"][i].money_prob+0.2 or (money.notes in ['大写'] and money.in_attachment==False): # 2021/7/20改为优先选择大写金额,
                     # print('已连接金额概率:money_prob:',packDict[packageName]["roleList"][i].money_prob)
                     # print('链接金额备注 ',money.notes, money.entity_text, money.values)
-                    packDict[packageName]["roleList"][i].money = money.entity_text
+                    if money.notes == '单价':
+                        packDict[packageName]["roleList"][i].unit_price = money.entity_text
+                    else:
+                        packDict[packageName]["roleList"][i].money = money.entity_text
                     packDict[packageName]["roleList"][i].money_prob = money_prob
                     packDict[packageName]["roleList"][i].money_unit = money.money_unit
                 # print('链接中的金额:{0}, 单位:{1}'.format(money.entity_text, money.money_unit))
@@ -2707,12 +2713,12 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                         PackDict[packageName]["cost_warning"] = str(Decimal(entity.entity_text))
 
             elif entity.values[entity.label]>=on_value:
-                if str(entity.label)=="1":
+                if str(entity.label)=="1" and entity.notes != '单价':
                     set_tenderer_money.add(float(entity.entity_text))
                     list_tenderer_money.append(float(entity.entity_text))  # 2021/7/16 新增列表,倒序保存所有中标金额
                     unit_list.append(entity.money_unit)
                 # if str(entity.label)=="0":
-                if str(entity.label)=="0" and entity.notes!='总投资':
+                if str(entity.label)=="0" and (entity.notes!='总投资' or float(entity.entity_text)<100000000):
                     '''
                     if p_entity>0:
                         p_before = list_entity[p_entity-1]
@@ -2731,16 +2737,119 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                         #     PackDict["Project"]["tendereeMoney"] = str(Decimal(entity.entity_text))
                         # if entity.values[entity.label]>on_value:
                         if entity.values[entity.label]>max_prob-0.005: # 选择最大概率招标金额 2024/05/23 相差0.005尽量选前面的
-                            PackDict["Project"]["tendereeMoney"] = str(Decimal(entity.entity_text))
+                            if entity.notes == '单价':
+                                PackDict["Project"]["unit_tendereeMoney"] = str(Decimal(entity.entity_text))
+                            else:
+                                PackDict["Project"]["tendereeMoney"] = str(Decimal(entity.entity_text))
                             PackDict["Project"]["tendereeMoneyUnit"] = entity.money_unit
                             max_prob = entity.values[entity.label]
                     else:
-                        PackDict[packageName]["tendereeMoney"] = str(Decimal(entity.entity_text))
+                        if entity.notes == '单价':
+                            PackDict[packageName]["unit_tendereeMoney"] = str(Decimal(entity.entity_text))
+                        else:
+                            PackDict[packageName]["tendereeMoney"] = str(Decimal(entity.entity_text))
                         PackDict[packageName]["tendereeMoneyUnit"] = entity.money_unit
                         #add pointer_tendereeMoney
                         packagePointer.pointer_tendereeMoney = entity
         p_entity -= 1            
-    
+
+    '''包名与标段号链接'''
+    l_main = []
+    l_attn = []
+    pack_num_main = 0
+    name_num_main = 0
+    pack_num_attn = 0
+    name_num_attn = 0
+    for entity in list_entity:
+        if entity.entity_type in  ['name', 'package']:
+            if entity.in_attachment:
+                l_attn.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end))
+                if entity.entity_type == 'name':
+                    name_num_attn += 1
+                else:
+                    pack_num_attn += 1
+            else:
+                l_main.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end))
+                if entity.entity_type == 'name':
+                    name_num_main += 1
+                else:
+                    pack_num_main += 1
+    if name_num_main > 0 and pack_num_main > 0:
+        l_main.sort(key=lambda x: [x[2],x[3]])
+        # print('正文名称:',l_main)
+        link_dic = {}
+        i = 1
+        pre_ty = l_main[0][0]
+        while i < len(l_main):
+            if l_main[i][0] != pre_ty:
+                ty1, ent1, s1, b1, e1 = l_main[i-1]
+                ty2, ent2, s2, b2, e2 = l_main[i]
+                if ty1 == 'package':
+                    if ent1 not in link_dic:
+                        link_dic[ent1] = []
+                    if s1 == s2:
+                        dist = abs(b2 - b1)
+                    else:
+                        dist = len(list_sentence[s1].sentence_text) - b1
+                        for id in range(s1+1, s2):
+                            dist += len(list_sentence[id].sentence_text)
+                        dist += b2
+                    link_dic[ent1].append((s2-s1, dist, ent2))
+                elif ty2 == 'package':
+                    if ent2 not in link_dic:
+                        link_dic[ent2] = []
+                    if s1 == s2:
+                        dist = abs(b2 - b1)
+                    else:
+                        dist = len(list_sentence[s1].sentence_text) - b1
+                        for id in range(s1+1, s2):
+                            dist += len(list_sentence[id].sentence_text)
+                        dist += b2
+                    link_dic[ent2].append((s2-s1, dist, ent1))
+            pre_ty = l_main[i][0]
+            i += 1
+        for k, v in link_dic.items():
+            v.sort(key=lambda x: [x[0], x[1]])
+            # print('各包排序后项目名:', k, v)
+            PackDict[k]["name"] = v[0][2]
+    elif name_num_attn > 0 and pack_num_attn > 0:
+        # print("附件名称:", l_attn)
+        l_attn.sort(key=lambda x: [x[2],x[3]])
+        link_dic = {}
+        i = 1
+        pre_ty = l_attn[0][0]
+        while i < len(l_attn):
+            if l_attn[i][0] != pre_ty:
+                ty1, ent1, s1, b1, e1 = l_attn[i-1]
+                ty2, ent2, s2, b2, e2 = l_attn[i]
+                if ty1 == 'package':
+                    if ent1 not in link_dic:
+                        link_dic[ent1] = []
+                    if s1 == s2:
+                        dist = abs(b2 - b1)
+                    else:
+                        dist = len(list_sentence[s1].sentence_text) - b1
+                        for id in range(s1+1, s2):
+                            dist += len(list_sentence[id].sentence_text)
+                        dist += b2
+                    link_dic[ent1].append((s2-s1, dist, ent2))
+                elif ty2 == 'package':
+                    if ent2 not in link_dic:
+                        link_dic[ent2] = []
+                    if s1 == s2:
+                        dist = abs(b2 - b1)
+                    else:
+                        dist = len(list_sentence[s1].sentence_text) - b1
+                        for id in range(s1+1, s2):
+                            dist += len(list_sentence[id].sentence_text)
+                        dist += b2
+                    link_dic[ent2].append((s2-s1, dist, ent1))
+            pre_ty = l_attn[i][0]
+            i += 1
+        for k, v in link_dic.items():
+            v.sort(key=lambda x: [x[0], x[1]])
+            # print('各包排序后项目名:', k, v)
+            PackDict[k]["name"] = v[0][2]
         
     #删除一个机构有多个角色的数据
     #删除重复人、概率不回传
@@ -2804,8 +2913,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
             _flag_pack_money = False
     if _flag_pack_money and len(PackageSet)==len(dict_pack_tenderer_money.keys()):
         for k,v in dict_pack_tenderer_money.items():
-            v[0].money = list(v[1])[0]
-            # print('k,v in dict_pack_tenderer_money.items', k, v)
+            if float(v[0].unit_price) < float(list(v[1])[0]): # 20241128 金额大于单价时才作链接金额
+                v[0].money = list(v[1])[0]
     # 2021/7/16 #增加判断中标金额是否远大于招标金额逻辑
     for pack in PackDict.keys():
         for i in range(len(PackDict[pack]["roleList"])):
@@ -4217,7 +4326,7 @@ def correct_rolemoney(prem, total_product_money, list_articles): # 2022/9/26修
                         #     l[2] = total_product_money
                         #     log('修改中标金额为所有产品总金额')
                         # if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money']) == 0 and float(l["role_money"]['money'])<total_product_money/10:
-                        if l["role_name"] == 'win_tenderer' and (float(l["role_money"]['money']) == 0 or float(l["role_money"]['money'])<ree_money/2): # 改为小于一半招标金额或为0时替换为合计金额
+                        if l["role_name"] == 'win_tenderer' and (float(l["role_money"]['money']) == 0 or (float(l["role_money"]['money'])<ree_money/2 and float(l["role_money"]['money'])<total_product_money<ree_money)): # 改为小于一半招标金额或为0时替换为合计金额
                             l["role_money"]['money'] = total_product_money
                             # print('修改中标金额为所有产品总金额')
                     except Exception as e:
@@ -4622,6 +4731,11 @@ def update_prem(old_prem, new_prem, in_attachment=False):
                     del_k.append(k)
             for k in del_k:
                 old_prem.pop(k)
+        if in_attachment: # 附件表格提取的,原来提取有中标人,停止替换
+            for v in old_prem.values():
+                for d in v['roleList']:
+                    if d['role_name'] in ['win_tenderer', 'pre_win_tenderer']:
+                        return 0
 
         # if len(new_prem) > len(old_prem) and [k for k in new_prem if '自增' not in k] == []:  # 如果表格提取包号都为自增编号且包数大于非表格提取,不进行更新 例 244355092  281854766
         #     return None
@@ -4707,7 +4821,7 @@ def update_prem(old_prem, new_prem, in_attachment=False):
 
     # return old_prem
 
-def confirm_prem(prem, channel_dic, is_deposit_project=False, total_tendereeMoney=0):
+def confirm_prem(prem, channel_dic, is_deposit_project=False, total_tendereeMoney=0, name=""):
     '''
     规则检查纠正prem,如果Project包中标人在其他包中标人,去掉project包中标角色;如果有其他包中标人,去掉roleList为空的包;
     :param prem: prem 字段字典
@@ -4758,6 +4872,10 @@ def confirm_prem(prem, channel_dic, is_deposit_project=False, total_tendereeMone
         for k in prem:
             if float(prem[k]['tendereeMoney'])==0:
                 prem[k]['tendereeMoney'] = total_tendereeMoney
+    if name != '' and len(prem)<=2: # 20241129 小于等于两个包且无包名称,取项目名称
+        for k in prem:
+            if prem[k].get('name', '') == '':
+                prem[k]['name'] = name
 
 
 def fix_single_source(prem, channel_dic, original_docchannel):

+ 350 - 201
BiddingKG/dl/interface/predictor.py

@@ -531,9 +531,12 @@ class CodeNamePredict():
             if len(dict_name_freq_score) == 0:
                 # name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]'
                 name_re1 = '(项目|工程|招标|采购(条目)?|合同|标项|标的|计划|询价|询价单|询价通知书|申购单|申购)(名称|标名|标题|主题)[::\s]+(?P<name>[^,。:;]{2,60})[,。]'
+                name_re2 = '(合同|采购)包\d((?P<name>[^,。:;]{2,60}))[:,。]' # 20241202 补充合同包 包名表达 558410976
                 for sentence in list_sentence:
                     # pad_sentence = sentence.sentence_text
                     othername = re.search(name_re1, sentence.sentence_text)
+                    if othername == None:
+                        othername = re.search(name_re2, sentence.sentence_text)
                     if othername != None:
                         project_name = othername.group('name')
                         if re.search('[\u4e00-\u9fa5]+', project_name) == None:  # 没有中文的项目名称去除
@@ -869,7 +872,7 @@ class PREMPredict():
                 elif re.search('^放弃中标资格|是否中标:否|^(中标|成交)(公示|公告)', behind):
                     values[2] = 0.5
                     label = 5
-                elif re.search('^,?(投标报价|(资格性审查:|符合性审查:)?(不通过|不符合))', behind) and re.search('中标|成交|中选|排名|排序|名次|第[一1]', front)==None:
+                elif re.search('^,?(投标报价|(资格性审查:|符合性审查:)?(不通过|不符合))', behind) and re.search('中标|成交|中选|排名|排序|名次|第[一1]', front)==None and values[2]<0.7: #20241126补充条件避免漏提 560768263 第一候选人:单位名称: 上海理想信息产业(集团)有限公司 ,投标报价:
                     values[2] = 0.5
                     label = 5
                 elif re.search('(承包权人|帐户名称|债务人|推荐预审合格投标人名单):$|确定为标的的受让方,$|[主次出]入口?,?$|确定(项目|\w{,2})成交供应商,$|,承刻单位:$|乙方接受为$|丙方:$', front):  # 234501112 民币元,序号:1,债务人: 东营市海宁工贸有限责任公司 ,债权本金: 262414286 八、中标后签约单位,合同签约单位: 241929628 1月9,承刻单位: 肃宁县超凡网络光敏印章刻印部 ,印章预留印模
@@ -982,8 +985,8 @@ class PREMPredict():
                     values[label] = 0.49
                 elif re.search('(含|在|包括|[大小等高低]于|达到)$|[\d.%]+[+×*-]$', front):
                     values[label] = 0.49
-                elif entity.notes == '单价' and float(entity.entity_text)<5000:
-                    label = 2
+                # elif entity.notes == '单价' and float(entity.entity_text)<5000: # 20241128 注释,单价单独存放
+                #     label = 2
             elif label ==0: # 错误招标金额处理
                 if re.search('投资(金额|规模):$', front): # 545988699 金额不大的投资金额作为备选招标金额
                     values[label] = 0.51
@@ -994,8 +997,8 @@ class PREMPredict():
                     values[label] = 0.49
                 # elif re.search('(含|在|包括|[大小等高低]于|如预算金额为)$|[\d.%]+((含))?[+×*-]$', front):  # 2024/10/30 注销,避免漏提 预算金额:控制在26000元以内由合作银行出资 ;投资金额不低于人民币500万元
                 #     values[label] = 0.49
-                elif entity.notes == '单价' and float(entity.entity_text)<5000:
-                    label = 2
+                # elif entity.notes == '单价' and float(entity.entity_text)<5000: # 20241128 注释,单价单独存放
+                #     label = 2
             elif re.search('报价:预估不?含税总价[为:]$', front) and (label != 1 or values[label]<0.5):
                 label = 1
                 values[label] = 0.8
@@ -2334,12 +2337,12 @@ class RoleGrade():
         self.tenderee_left_6 = "(?P<tenderee_left_6>(业主|建设|委托)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|业主|买方)"
         self.tenderee_left_5 = "(?P<tenderee_left_5>(发布)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|买方|发布机构)"
         self.agency_left_9 = "(?P<agency_left_9>代理)"
-        self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得)|第[1一]名|排[名序]:1|名次:1)"
+        self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得)|第[1一](|候选)|排[名序]:1|名次:1)"
         self.winTenderer_left_8 = "(?P<winTenderer_left_8>(入选供应商|供货商|乙方|最[终后]选[择取]))"  # 229435497 最后选择西平,县中原彩印有限公司,作为此项目中标供应商,
         self.winTenderer_left_6 = "(?P<winTenderer_left_6>(入围|承[接建包修做制担租销]))"
         self.winTenderer_right_9 = "(?P<winTenderer_right_9>^(为(中标|成交|中选)(人|单位|供应商|公司)|以\d+[\d.,]+万?元中标))"
-        self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排[名序]:2|名次:2))"
-        self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排[名序]:3|名次:3))"
+        self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2](|候选)|排[名序]:2|名次:2))"
+        self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3](|候选)|排[名序]:3|名次:3))"
         self.pattern_list = [self.tenderee_left_9,self.tenderee_center_8, self.tenderee_left_8,self.tenderee_left_6,self.tenderee_left_5,self.agency_left_9,
                              self.winTenderer_left_9,self.winTenderer_left_8, self.winTenderer_right_9, self.winTenderer_left_6, self.secondTenderer_left_9, self.thirdTenderer_left_9] # 概率要由高到低 274941849
     def predict(self, list_sentences, list_entitys, original_docchannel, span=15, min_prob=0.7):
@@ -2456,8 +2459,8 @@ class RoleGrade():
         for entity in low_prob_winner: # 如果低概率中标人在招标或代理列表,改为非角色
             if entity.entity_text in all_tenderee_agency:
                 entity.label = 5
-            elif entity.in_attachment: # 附件低概率中标角色不要 避免:516109391 桂林银行崇左宁明支行,宁明县城中镇兴宁大道中70号,预测为中标
-                entity.label = 5
+            # elif entity.in_attachment: # 附件低概率中标角色不要 避免:516109391 桂林银行崇左宁明支行,宁明县城中镇兴宁大道中70号,预测为中标 20241126 注释掉,558294326 附件单个候选人漏提取
+            #     entity.label = 5
 
         if org_winner != []:
             flag = 0
@@ -2499,7 +2502,7 @@ class MoneyGrade():
                     if ser:
                         groupdict = pattern.split('>')[0].replace('(?P<', '')
                         _role, _direct, _prob = groupdict.split('_')
-                        if re.search('单价', context[-4:]) or re.search('(最低|风险)控制价', context):# or float(entity.entity_text)<100:
+                        if re.search('单价', context[-4:]) or re.search('(最低|风险)控制价', context) or entity.notes == '总投资':# or float(entity.entity_text)<100:
                             _prob = 6
                         _label = role2id.get(_role)
                         if _label != entity.label:
@@ -2522,8 +2525,8 @@ class MoneyGrade():
                     # _prob = min_prob - 0.1 if in_att else min_prob
                     entity.values[entity.label] = _prob + entity.values[entity.label] / 20
                     # print('找不到规则修改金额概率:', entity.entity_text, entity.label, entity.values)
-            if entity.entity_type in ['money'] and entity.label in [0, 1] and 0.5<=entity.values[entity.label]<0.75 and float(entity.entity_text)<100: # 20241011 低概率小金额改为其他金额
-                entity.label = 2
+            # if entity.entity_type in ['money'] and entity.label in [0, 1] and 0.5<=entity.values[entity.label]<0.75 and float(entity.entity_text)<100: # 20241011 低概率小金额改为其他金额 # 20241128 小金额可能为单价,放单价存放
+            #     entity.label = 2
 
 
 # 时间类别
@@ -5765,16 +5768,233 @@ class DistrictPredictor():
         with open(os.path.dirname(__file__) + "/area_variance_dic.pkl", 'rb') as f: # 20241113 地区变更新旧名称对照字典
             self.area_variance_dic = pickle.load(f)
 
-    def predict_backup(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
-        '''
-        先匹配 project_name+tenderee+tenderee_address, 如果缺少省或市 再匹配 title+content
-        :param project_name:
-        :param prem:
-        :param title:
-        :param list_articles:
-        :param web_source_name:
-        :return:
-        '''
+    def predict_area(self, title, ree, addr, web_source_name):
+        p_pro, p_city, p_dis, idx_dic, full_dic, short_dic = self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic
+
+        def find_whole_areas(text, weight=1):
+            '''
+            Find province / city / district names in ``text`` with a regex and score every hit.
+
+            The text is normalised first (misleading phrases blanked out, a few aliases and renamed
+            regions rewritten), then scanned with a pattern built from the enclosing ``p_pro``,
+            ``p_city`` and ``p_dis`` alternations.
+            :param text: text to search; it is converted with ``str()`` first
+            :param weight: multiplier applied to every score (default 1)
+            :return: ``(province_l, city_l, district_l)``; each is a list of ``(name, score * weight)``
+                     tuples. All three are empty when the text has no Chinese character (U+4E00..U+9FA5).
+            '''
+            province_l, city_l, district_l = [], [], []
+
+            text = str(text)
+            # Blank out phrases that merely contain a place name (brands, landmarks, generic wording).
+            text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间|福田汽车|中山(大学|公园|纪念堂)|孙中山|海天水泥|阳光采购|示范县',
+                          ' ', text)  # case 544151395: "赤壁市老城区燃气管道老化更新改造"
+            text = re.sub('珠海城市', '珠海', text)  # fix for case 426624023: "珠海城市" was predicted as "海城市"
+            text = re.sub('怒江州', '怒江傈僳族自治州', text)  # fix for case 423589589: "所属地域:怒江州" was recognised as 广西 - 崇左 - 江州
+            text = re.sub('茂名滨海新区', '茂名市', text)
+            text = re.sub('中山([东南西][部区环]|黄圃|南头|东凤|小榄|石岐|翠亨|南朗)', '中山市', text)
+            text = re.sub('横州市', '横县', text)  # case 547363890: fix 横州 (Nanning, Guangxi) not being in the region table
+            # Append "黎族" to these Hainan county names when it is missing.
+            ser = re.search('海南(昌江|白沙|乐东|陵水|保亭|琼中)(黎族)?', text)
+            if ser and '黎族' not in ser.group(0):
+                text = text.replace(ser.group(0), ser.group(0) + '黎族')
+            for k, v in self.area_variance_dic.items():  # 2024-11-13: rewrite the text according to the region-rename table
+                text = text.replace(k, v)
+
+            # Nothing to match when no Chinese character is left.
+            if re.search('[\u4e00-\u9fa5]', text) == None:
+                return province_l, city_l, district_l
+
+            # Three alternatives: province[+city][+district], city[+district], or a lone district.
+            pettern = "((?P<prov>%s)(?P<city>%s)?(?P<dist>%s)?)|((?P<city1>%s)(?P<dist1>%s)?)|(?P<dist2>%s)" % (
+                p_pro, p_city, p_dis, p_city, p_dis, p_dis)
+
+            for it in re.finditer(pettern, text):
+                if it.group(0) == '站前':  # 2024-03-14: fix texts like "中铁二局新建沪苏湖铁路工程站前VI标项目" being misread as province 辽宁, city 营口, district 站前
+                    continue
+                for k, v in it.groupdict().items():
+                    if v != None:
+                        # Skip a group that ends the match, has no administrative suffix of its own,
+                        # and is directly followed by a village/street/hotel/development-zone word.
+                        if it.end() == it.end(k) and re.search('[省市区县州旗盟]$', v) == None and re.search(
+                                '^([东南西北中一二三四五六七八九十大小]?(村|镇|街|路|道|社区)|酒店|宾馆|经济开发区|开发区|新区)',
+                                # do not take a city name out of a zone-level address; fixes "滨州北海经济开发区", "北海新区" etc. being extracted as 北海
+                                text[it.end(k):]) != None:
+                            continue
+                        if k in ['prov']:
+                            # Full names score 2; short names score 1, plus 1 when the name opens the
+                            # text, is followed by a branch/station-like suffix, or sits inside parentheses.
+                            if v in full_dic['province']:
+                                score = 2
+                            else:
+                                score = 1
+                                if it.start(k)==0 or re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
+                                        , text[it.end(k):]) or (it.start(k)>0 and it.end(k)<len(text) and text[it.start(k)-1]=='(' and text[it.end(k)]==')'):
+                                    score += 1
+                            # Small positional bonus: hits that end later in the text score slightly higher.
+                            score += it.end(k) / len(text) / 10
+                            province_l.append((v, score * weight))
+                        elif k in ['city', 'city1']:
+                            # Same scoring scheme as for provinces.
+                            if v in full_dic['city']:
+                                score = 2
+                            else:
+                                score = 1
+                                if it.start(k)==0 or re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
+                                        , text[it.end(k):]) or (it.start(k)>0 and it.end(k)<len(text) and text[it.start(k)-1]=='(' and text[it.end(k)]==')'):
+                                    score += 1
+                            score += it.end(k) / len(text) / 10
+                            city_l.append((v, score * weight))
+                        elif k in ['dist', 'dist1', 'dist2']:
+                            # These generic district names are never accepted.
+                            if v in ['东区', '西区', '城区', '郊区', '矿区']:
+                                continue
+                            # Districts need a full name longer than two characters to score 2;
+                            # anything else starts at 0.5 and can earn the same +1 bonus.
+                            if v in full_dic['district'] and len(v)>2:
+                                score = 2
+                            else:
+                                score = 0.5
+                                if it.start(k)==0 or re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
+                                        , text[it.end(k):]) or (it.start(k)>0 and it.end(k)<len(text) and text[it.start(k)-1]=='(' and text[it.end(k)]==')'):
+                                    score += 1
+                                    # print('district bonus:', v, text)
+                            score += it.end(k) / len(text) / 10
+                            # '昌江' is recorded as '昌江黎族' unless the match itself contains '景德镇'.
+                            if v == '昌江' and '景德镇' not in it.group(0):
+                                district_l.append(('昌江黎族', score * weight))
+                            else:
+                                district_l.append((v, score * weight))
+            return province_l, city_l, district_l
+
+        def merge_score(province_l, city_l, district_l, filter_short_dist=True):
+            '''
+            合并分数,下级地区分数加到上级
+            :param province_l: 提取到的省份列表 [(name, score)]
+            :param city_l: 提取到的城市列表 [(name, score)]
+            :param district_l: 提取到的区县列表 [(name, score)]
+            :param filter_short_dist: 是否过滤不在省份下的区县简称权重
+            :return:
+            '''
+            pro_ids = dict()
+            city_ids = dict()
+            dis_ids = dict()
+            for pro in province_l:
+                name, score = pro
+                idx = full_dic['province'][name] if name in full_dic['province'] else short_dic['province'][name]
+                if idx not in pro_ids:
+                    pro_ids[idx] = 0
+                pro_ids[idx] += score
+
+            tmp_pro = {}
+            for city in city_l:
+                name, score = city
+                if name in full_dic['city']:
+                    for idx in full_dic['city'][name]:
+                        if idx not in city_ids:
+                            city_ids[idx] = 0
+                        city_ids[idx] += score
+                        pro_idx = idx_dic[idx]['省']
+                        if pro_idx in tmp_pro:
+                            tmp_pro[pro_idx] += score
+                        else:
+                            tmp_pro[pro_idx] = score
+                elif name in short_dic['city']:
+                    for idx in short_dic['city'][name]:
+                        if idx not in city_ids:
+                            city_ids[idx] = 0
+                        city_ids[idx] += score
+                        pro_idx = idx_dic[idx]['省']
+                        if pro_idx in tmp_pro:
+                            tmp_pro[pro_idx] += score
+                        else:
+                            tmp_pro[pro_idx] = score
+            if set(tmp_pro) & set(pro_ids) != set():
+                for k, v in tmp_pro.items():
+                    if k in pro_ids:
+                        pro_ids[k] += v
+            else:
+                pro_ids.update(tmp_pro)
+            tmp_pro = {}
+            tmp_city = {}
+            for dis in district_l:
+                name, score = dis
+                if name in full_dic['district']:
+                    for idx in full_dic['district'][name]:
+                        if idx not in dis_ids:
+                            dis_ids[idx] = 0
+                        dis_ids[idx] += score
+                        pro_idx = idx_dic[idx]['省']
+                        if pro_idx in tmp_pro:
+                            tmp_pro[pro_idx] += score
+                        else:
+                            tmp_pro[pro_idx] = score
+                        city_idx = idx_dic[idx]['市']
+                        if city_idx in tmp_city:
+                            tmp_city[city_idx] += score
+                        else:
+                            tmp_city[city_idx] = score
+                elif name in short_dic['district']:
+                    for idx in short_dic['district'][name]:
+                        if idx not in dis_ids:
+                            dis_ids[idx] = 0
+                        dis_ids[idx] += score
+                        pro_idx = idx_dic[idx]['省']
+                        if filter_short_dist and pro_idx not in pro_ids:
+                            continue
+                        if pro_idx in tmp_pro:
+                            tmp_pro[pro_idx] += score
+                        else:
+                            tmp_pro[pro_idx] = score
+                        city_idx = idx_dic[idx]['市']
+                        if city_idx in tmp_city:
+                            tmp_city[city_idx] += score
+                        else:
+                            tmp_city[city_idx] = score
+            if set(tmp_pro) & set(pro_ids) != set():
+                for k, v in tmp_pro.items():
+                    if k in pro_ids:
+                        pro_ids[k] += v
+            else:
+                pro_ids.update(tmp_pro)
+            if set(tmp_city) & set(city_ids) != set():
+                for k, v in tmp_city.items():
+                    if k in city_ids:
+                        city_ids[k] += v
+            else:
+                city_ids.update(tmp_city)
+            return pro_ids, city_ids, dis_ids
+
+        def get_final_addr(pro_ids, city_ids, dis_ids):
+            '''
+            先把所有匹配的全称、简称转为id,如果省份不为空,城市不为空且有城市属于省份的取该城市
+            :param province_l: 匹配到的所有省份
+            :param city_l: 匹配到的所有城市
+            :param district_l: 匹配到的所有区县
+            :return:
+            '''
+            big_area = ""
+            pred_pro = ""
+            pred_city = ""
+            pred_dis = ""
+
+            final_pro = ""
+            final_city = ""
+            prob = 0
+            max_score = 0
+            if len(pro_ids) >= 1:
+                pro_l = sorted([(k, v) for k, v in pro_ids.items()], key=lambda x: x[1], reverse=True)
+                scores = [it[1] for it in pro_l]
+                prob = max(scores)/sum(scores)
+                max_score = max(scores)
+                final_pro, score = pro_l[0]
+                if score >= 0.01:
+                    pred_pro = idx_dic[final_pro]['返回名称']
+                    big_area = idx_dic[final_pro]['大区']
+            if pred_pro != "" and len(city_ids) >= 1:
+                city_l = sorted([(k, v) for k, v in city_ids.items()], key=lambda x: x[1], reverse=True)
+                for it in city_l:
+                    if idx_dic[it[0]]['省'] == final_pro:
+                        final_city = it[0]
+                        pred_city = idx_dic[final_city]['返回名称']
+                        break
+            if final_city != "" and len(set(dis_ids)) >= 1:
+                dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
+                for it in dis_l:
+                    if idx_dic[it[0]]['市'] == final_city:
+                        pred_dis = idx_dic[it[0]]['返回名称']
+            elif pred_pro != "" and pred_city == "" and len(set(dis_ids)) >= 1:  # 20241111 省份不为空,市为空,如果区县在省份下,补充对应的市县
+                dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
+                for it in dis_l:
+                    if idx_dic[it[0]]['省'] == final_pro:
+                        pred_city = idx_dic[idx_dic[it[0]]['市']]['返回名称']
+                        pred_dis = idx_dic[it[0]]['返回名称']
+            if pred_city in ['北京', '天津', '上海', '重庆']:
+                pred_city = pred_dis
+                pred_dis = ""
+            return big_area, pred_pro, pred_city, pred_dis, prob, max_score
+
         def get_ree_addr(prem):
             tenderee = ""
             tenderee_address = ""
@@ -5787,92 +6007,6 @@ class DistrictPredictor():
             except Exception as e:
                 print('解析prem 获取招标人、及地址出错')
             return tenderee, tenderee_address
-        def get_area(text, web_source_name, not_in_content=True):
-            score_l = []
-            id_set = set()
-
-            if re.search(self.short_name, text):
-                for it in re.finditer(self.full_name, text):
-                    name = it.group(0)
-                    score = len(name) / len(text)
-                    for _id in self.full2id[name]:
-                        area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
-                        # score_l.append([_id, score] + area)
-                        # w = self.dist_dic[_id]['权重']
-                        score_l.append([_id, score + 1] + area) # 匹配全称的加1 ,不加权重,因为权重某些赋值不好
-
-                flag = 0
-                for it in re.finditer(self.short_name, text):
-                    if it.end() < len(text) and re.search('^(村|镇|街|路|江|河|湖|北路|南路|东路|大道|社区)', text[it.end():]) == None:
-                        name = it.group(0)
-                        score = (it.start() + len(name)) / len(text)
-                        for _id in self.short2id[name]:
-                            score2 = 0
-                            w = self.dist_dic[_id]['权重']
-                            _type = self.dist_dic[_id]['类型']
-                            area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
-                            if area[0] in ['2', '16', '20', '30']:
-                                _type += 10
-                            if w < 1 and it.end() < len(text) and text[it.end()] in ['省', '市', '县']: # 如果简称后面 有省市县权重改为1
-                                w = 1
-                            score2 += w
-                            if _id not in id_set:
-                                if _type == 20:
-                                    type_w = 3
-                                elif _type == 30:
-                                    if it.start()>3 and text[it.start()-1] == '市': # 城市后面 简称不能作为市
-                                        type_w = 0
-                                    else:
-                                        type_w = 2
-                                else:
-                                    if it.end()<len(text) and text[it.end()] == '市': # 简称后面 有市字 改为市级
-                                        type_w = 2
-                                    else:
-                                        type_w = 0.5
-                                id_set.add(_id)
-                                score2 += w * type_w
-                            score_l.append([_id, score * w + score2] + area)
-
-                if flag == 1:
-                    pass
-                #         print('score', score)
-            if re.search('公司', web_source_name) == None:
-                for it in re.finditer(self.short_name, web_source_name):
-                    name = it.group(0)
-                    for _id in self.short2id[name]:
-                        area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
-                        w = self.dist_dic[_id]['权重']
-                        score = w * 0.2
-                        score_l.append([_id, score] + area)
-            area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
-            if len(score_l) == 0:
-                return {'district': area_dic}
-            else:
-                df = pd.DataFrame(score_l, columns=['id', 'score', 'province', 'city', 'district'])
-                df['简称'] = df['id'].apply(lambda x: self.dist_dic[x]['地区'])
-                # print('地区评分:')
-                # print(df)
-                df_pro = df.groupby('province').sum().sort_values(by=['score'], ascending=False)
-                pro_id = df_pro.index[0]
-                if df_pro.loc[pro_id, 'score'] < 0.1 and not_in_content:  # 不是二次全文匹配的 省级评分小于0.1的不要
-                    # print('评分低于0.1', df_pro.loc[pro_id, 'score'], self.dist_dic[pro_id]['地区'])
-                    return {'district': area_dic}
-                area_dic['province'] = self.dist_dic[pro_id]['地区']
-                area_dic['area'] = self.dist_dic[pro_id]['大区']
-                df = df[df['city'] != ""]
-                df = df[df['province'] == pro_id]
-                if len(df) > 0:
-                    df_city = df.groupby('city').sum().sort_values(by=['score'], ascending=False)
-                    city_id = df_city.index[0]
-                    area_dic['city'] = self.dist_dic[city_id]['地区']
-                    df = df[df['district'] != ""]
-                    df = df[df['city'] == city_id]
-                    if len(df) > 0:
-                        df_dist = df.groupby('district').sum().sort_values(by=['score'], ascending=False)
-                        dist_id = df_dist.index[0]
-                        area_dic['district'] = self.dist_dic[dist_id]['地区']
-                # print(area_dic)
-                return {'district': area_dic}
 
         def get_role_address(text):
             '''正则匹配获取招标人地址
@@ -5892,14 +6026,17 @@ class DistrictPredictor():
                 return ''
 
         def get_project_addr(text):
-            p1 = '(项目(施工|实施)?|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?):(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
+            p1 = r'(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)(位于)?:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+([\w()]{,20}[,。])?|\w{2,15}[,。])'  # raw string so \w is not an invalid str escape; labelled form: keyword + 地址/地点/位置/所在地(区) + optional 位于 + ':' and the address is captured in group 'addr'
+            p2 = r'项目位于(?P<addr>\w{2}市\w{2,4}区)'  # unlabelled fallback: "项目位于" directly followed by <2 chars>市<2-4 chars>区
             if re.search(p1, text):
                 return re.search(p1, text).group('addr')
+            elif re.search(p2, text):  # p2 is consulted only when p1 finds no match
+                return re.search(p2, text).group('addr')
             else:
                 return ''
 
         def get_bid_addr(text):
-            p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?):(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
+            p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售|所属)(地址|地点|所在地区?|地域):(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
             if re.search(p2, text):
                 return re.search(p2, text).group('addr')
             else:
@@ -5909,7 +6046,7 @@ class DistrictPredictor():
             tenderee_l = []
             addr_l = []
             for ent in list_entitys[0]:
-                if ent.entity_type == 'location' and len(ent.entity_text)>2:
+                if ent.entity_type == 'location' and len(ent.entity_text) > 2:
                     addr_l.append(ent.entity_text)
                 elif ent.entity_type in ['org', 'company']:
                     if ent.label in [0, 1]:  # 加招标或代理
@@ -5923,85 +6060,43 @@ class DistrictPredictor():
             else:
                 return ''
 
-        if '##attachment##' in list_articles[0].content:
-            content, attachment = list_articles[0].content.split('##attachment##')
-            if len(content) < 200:
-                content += attachment
-        else:
-            content = list_articles[0].content
-
-        tenderee, tenderee_address = get_ree_addr(prem)
-        msc = ""
-        pro_addr = get_project_addr(content)
-        if pro_addr != "":
-            msc += '使用规则提取的项目地址;'
-            tenderee_address = pro_addr
-        else:
-            role_addr = get_role_address(content)
-            if role_addr != "":
-                msc += '使用规则提取的联系人地址;'
-                tenderee_address = role_addr
-
-        if tenderee_address == "":
-            title_addr = get_title_addr(title)
-            if title_addr != "":
-                msc += '使用规则提取的标题地址;'
-                tenderee_address = title_addr
-            else:
-                bid_addr = get_bid_addr(content)
-                if bid_addr != "":
-                    msc += '使用规则提取的开标地址;'
-                    tenderee_address = bid_addr
-
-        project_name = str(project_name)
-        tenderee = str(tenderee)
-
-        # print('招标人地址',role_addr, tenderee_address)
-
-        project_name = project_name + title if project_name not in title else project_name
-        project_name = project_name.replace(tenderee, '')
-
-        text1 = "{0} {1} {2}".format(project_name, tenderee, tenderee_address)
-
-        web_source_name = str(web_source_name)  # 修复某些不是字符串类型造成报错
-        text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1)  #预防提取错 合肥 路南 新会 等地区
-
-        if pro_addr:
-            msc += '## 使用项目地址输入:%s ##;' % pro_addr
-            rs = get_area(pro_addr, '')
-            msc += '预测结果:省份:%s, 城市:%s,区县:%s;' % (
-                rs['district']['province'], rs['district']['city'], rs['district']['district'])
-            if rs['district']['province'] != '全国':
-                # print('地区匹配:', msc)
-                return rs
-
-        # print('text1:', text1)
-        msc += '## 第一次预测输入:%s ##;'%text1
-        rs = get_area(text1, web_source_name)
-        msc += '预测结果:省份:%s, 城市:%s,区县:%s;' % (
-        rs['district']['province'], rs['district']['city'], rs['district']['district'])
-        # self.f.write('%s %s \n' % (list_articles[0].id, msc))
-        # print('地区匹配:', msc)
-        if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
-            msc = ""
-            all_addr, tenderees = get_all_addr(list_entitys)
-            text2 = tenderees + " " + all_addr + ' ' + title
-            msc += '使用实体列表所有招标人+所有地址;'
-            # text2 += title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
-            text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
-            # print('text2:', text2)
-            msc += '## 第二次预测输入:%s ##'%text2
-            rs2 = get_area(text2, web_source_name, not_in_content=False)
-            rs2['district']['is_in_text'] = True
-            if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':
-                rs = rs2
-            elif rs['district']['province'] == rs2['district']['province'] and rs2['district']['city'] != '未知':
-                rs = rs2
-            msc += '预测结果:省份:%s, 城市:%s,区县:%s'%(
-                rs['district']['province'],rs['district']['city'],rs['district']['district'])
-        # self.f.write('%s %s \n'%(list_articles[0].id, msc))
-        # print('地区匹配:', msc)
-        return rs
+        area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
+        province_l, city_l, district_l = find_whole_areas(title)
+        pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
+        big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
+        # print('关键词1:', province_l, city_l, district_l)
+        # print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
+        if pred_city == "" or prob < 0.7 or max_score<2:
+            province_l2, city_l2, district_l2 = find_whole_areas('%s %s' % (ree, addr), weight=0.8)
+            province_l.extend(province_l2)
+            city_l.extend(city_l2)
+            district_l.extend(district_l2)
+            pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
+            big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
+            # print('关键词2:', province_l, city_l, district_l)
+            # print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
+            if pred_city == "" or prob < 0.7 or max_score<2:
+                province_l3, city_l3, district_l3 = find_whole_areas(web_source_name, weight=0.6)
+                province_l.extend(province_l3)
+                city_l.extend(city_l3)
+                district_l.extend(district_l3)
+                pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
+                big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
+                # print('关键词3:', province_l, city_l, district_l)
+                # print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
+
+        in_content = False
+        if big_area != "":
+            area_dic['area'] = big_area
+        if pred_pro != "":
+            area_dic['province'] = pred_pro
+        if pred_city != "":
+            area_dic['city'] = pred_city
+        if pred_dis != "":
+            area_dic['district'] = pred_dis
+        if in_content:
+            area_dic['is_in_text'] = True
+        return {'district': area_dic}
 
     def get_area(self, text, web_name, in_content=False):
         p_pro, p_city, p_dis, idx_dic, full_dic, short_dic = self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic
@@ -6651,6 +6746,8 @@ class TablePremExtractor(object):
                         continue
                     # print('表头错误,一个td匹配到两个表头:', header_dic)
                     return flag, contain_header, dict(), not_sure_winner
+                if text == '单位': # 20241128 补充金额单位
+                    header_dic['amount_unit'] = (i, text)
             if re.search(';金额((万?元))?;', ';'.join(td_list)):  # 召回某些表格只写 金额 作为表头,不能识别为招标或中标金额
                 if 'tenderer' in header_dic and 'bid_amount' not in header_dic:
                     for i in range(len(td_list)):
@@ -6750,6 +6847,7 @@ class TablePremExtractor(object):
             win_sort = df.loc[i, headers['win_sort'][0]].strip() if "win_sort" in headers else ""
             win_or_not = df.loc[i, headers['win_or_not'][0]].strip() if "win_or_not" in headers else ""
             serviceTime = df.loc[i, headers['serviceTime'][0]].strip() if "serviceTime" in headers else ""
+            amount_unit = df.loc[i, headers['amount_unit'][0]].strip() if "amount_unit" in headers else ""
 
             if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_]) & self.headerset != set(): # 只要有一项为表头 停止匹配
                 # print('只要有一项为表头 停止匹配', set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset)
@@ -6764,7 +6862,7 @@ class TablePremExtractor(object):
                 project_name = ""
 
             package_code = package_code_raw
-            if re.search('合计|总计', package_code+project_code):
+            if re.search('合计|总计', package_code+project_code+project_name):
                 continue
             if package_code + project_code == previous_package:  # 处理 208162730 一个包采购多种东西情况
                 same_package = True
@@ -6843,7 +6941,14 @@ class TablePremExtractor(object):
                     prem_dic.pop(package)
                     break
                 budget_header = headers['budget'][1] if 'budget' in headers else ''
+                if amount_unit!='' and re.search('^[万亿]?元|%|折[\w/]{,6}$', amount_unit) and re.search('元', budget_+budget_header)==None : # 20241128 补充某些表格价格单位分开两列, 例:557953660
+                    budget_ += amount_unit
                 budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率|期加点\d+BP', budget_)==None else (0, '')
+                if re.search('元[/每]', amount_unit) or re.search('单价', budget_header):
+                    unit_tendereeMoney = budget
+                    budget = 0
+                else:
+                    unit_tendereeMoney = 0
 
                 if (re.search('费率|下浮率|[%%‰折]|优惠率',
                               budget_header + budget_) and budget < 100) or budget > 50000000000:  # 如果是费率或大于500亿的金额改为0
@@ -6854,6 +6959,13 @@ class TablePremExtractor(object):
                     else:
                         prem_dic[package]['tendereeMoney'] = budget
                     prem_dic[package]['tendereeMoneyUnit'] = money_unit
+                if unit_tendereeMoney > 0:
+                    if 'unit_tendereeMoney' not in prem_dic[package]:
+                        prem_dic[package]['unit_tendereeMoney'] = 0
+                    if same_package and prem_dic[package]['unit_tendereeMoney'] != unit_tendereeMoney:  # 处理 类似 136839070 一包多物品多预算
+                        prem_dic[package]['unit_tendereeMoney'] += unit_tendereeMoney
+                    else:
+                        prem_dic[package]['unit_tendereeMoney'] = unit_tendereeMoney
             if tenderee and not same_package:
                 prem_dic[package]['roleList'].append({
                         "address": "",
@@ -6874,8 +6986,16 @@ class TablePremExtractor(object):
                               bid_amount_)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
                     prem_dic.pop(package)
                     break
-
+                bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else ''
+                if amount_unit != '' and re.search('^[万亿]?元|%|折[\w/]{,6}$', amount_unit) and bid_amount_!='' and re.search('元',
+                                                                                                       bid_amount_ + bid_amount_header) == None:
+                    bid_amount_ += amount_unit
                 bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and re.search('[%%‰折]|浮率|期加点\d+BP', bid_amount_)==None and 'bid_amount' in headers else (0, '')
+                if re.search('元[/每]', amount_unit) or re.search('单价', bid_amount_header):
+                    unit_price = bid_amount
+                    bid_amount = 0
+                else:
+                    unit_price = 0
                 if web_source_name == '河钢供应链管理平台' and 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and bid_amount == 0: # 有中标金额字段却金额为0的过滤掉,防止类似 河钢供应链管理平台 站源错误,金额不为0的才算中标
                     if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的包 丢弃
                         prem_dic.pop(package)
@@ -6885,7 +7005,6 @@ class TablePremExtractor(object):
                         prem_dic.pop(package)
                     continue
 
-                bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else ''
                 if (re.search('费率|下浮率|[%%‰折]|优惠率',
                               bid_amount_header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000:  # 如果是费率或大于500亿的金额改为0
                     bid_amount = 0
@@ -6897,7 +7016,7 @@ class TablePremExtractor(object):
                     serviceTime = extract_serviceTime(serviceTime[0]['body'],"") if serviceTime else ""
                     # print(serviceTime)
                 if not same_package or len(prem_dic[package]['roleList'])==0:
-                    prem_dic[package]['roleList'].append({
+                    role_dic = {
                             "address": "",
                             "linklist": [],
                             "role_money": {
@@ -6910,17 +7029,20 @@ class TablePremExtractor(object):
                             "role_name": "win_tenderer",
                             "role_text": tenderer,
                             "serviceTime": serviceTime
-                    })
+                    }
+                    if unit_price > 0:
+                        role_dic['role_money']['unit_price'] = unit_price
+                    prem_dic[package]['roleList'].append(role_dic)
                 elif prem_dic[package]['roleList'] and prem_dic[package]['roleList'][-1].get('role_name', '')=='win_tenderer':
                     if 'multi_winner' not in prem_dic[package]['roleList'][-1]:
                         prem_dic[package]['roleList'][-1]['multi_winner'] = prem_dic[package]['roleList'][-1]['role_text']
                         prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
                     elif tenderer not in prem_dic[package]['roleList'][-1]['multi_winner']:
                         prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
-                    if bid_amount != 0: # 有中标金额的才放进去
+                    if bid_amount != 0 or unit_price > 0: # 有中标金额的才放进去
                         if 'other_winner_dic' not in prem_dic[package]['roleList'][-1]:
                             prem_dic[package]['roleList'][-1]['other_winner_dic'] = []
-                        prem_dic[package]['roleList'][-1]['other_winner_dic'].append({'role_text': tenderer, "money": bid_amount, "money_unit": money_unit,"serviceTime":serviceTime})
+                        prem_dic[package]['roleList'][-1]['other_winner_dic'].append({'role_text': tenderer, "money": bid_amount, "money_unit": money_unit, "serviceTime": serviceTime})
                 tenderer_list.append(tenderer)
                 serviceTime_list.append(serviceTime)
             if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的 丢弃 并不再继续往下匹配
@@ -7113,6 +7235,7 @@ class CandidateExtractor(object):
             flag = True
             for i in range(len(td_list)) :
                 text = td_list[i]
+                text = re.sub('\s|[((]排名不分先后[))]', '', text)
                 if len(text) > 15: # 长度大于15 不进行表头匹配
                     continue
                 if re.search('未(中标|成交)原因', text):  # 不提取此种表格
@@ -7134,6 +7257,8 @@ class CandidateExtractor(object):
                 if num>1:
                     # print('表头错误,一个td匹配到两个表头:', header_dic)
                     return flag, contain_header, dict()
+                if text == '单位': # 20241128 补充金额单位
+                    header_dic['amount_unit'] = (i, text)
             if ('candidate' in header_dic and 'win_sort' in header_dic) or ('win_tenderer' in header_dic and 'second_tenderer' in header_dic): # 有排名才返回表头进行提取
                 return flag, contain_header, header_dic
         elif len(set(fix_td_list) & self.headerset) >= 2  or (len(set(fix_td_list)) == 2 and len(set(fix_td_list) & self.headerset) >= 1):  # 如果包含两个表头以上或 只有两列且包含一个表头
@@ -7210,6 +7335,7 @@ class CandidateExtractor(object):
             win_tenderer = df.loc[i, headers['win_tenderer'][0]].strip() if "win_tenderer" in headers else ""
             second_tenderer = df.loc[i, headers['second_tenderer'][0]].strip() if "second_tenderer" in headers else ""
             third_tenderer = df.loc[i, headers['third_tenderer'][0]].strip() if "third_tenderer" in headers else ""
+            amount_unit = df.loc[i, headers['amount_unit'][0]].strip() if "amount_unit" in headers else ""
 
             if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头, 停止匹配 # 排除 ,win_sort 避免367940050漏提取
                 # print('包含表头, 停止匹配')
@@ -7286,7 +7412,14 @@ class CandidateExtractor(object):
                         if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '',
                                       text)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
                             break
+                        if amount_unit != '' and re.search('^[万亿]?元|%|折[\w/]{,6}$', amount_unit) and re.search('元', text+header)==None: # 补充另外在一列的金额单位
+                            text += amount_unit
                         money, money_unit = money_process(text, header)
+                        if re.search('元[/每]', amount_unit) or re.search('单价', header):
+                            unit_price = money
+                            money = 0
+                        else:
+                            unit_price = 0
 
                         if (re.search('费率|下浮率|[%%‰折]|优惠率', header+text) and money < 100) or money > 50000000000: # 如果是费率或大于500亿的金额改为0
                             money = 0
@@ -7295,6 +7428,11 @@ class CandidateExtractor(object):
                                 role_dic[type] = dict()
                             role_dic[type]['money'] = money
                             role_dic[type]['money_unit'] = money_unit
+                        if unit_price > 0:
+                            if type not in role_dic:
+                                role_dic[type] = dict()
+                            role_dic[type]['unit_price'] = unit_price
+                            role_dic[type]['money_unit'] = money_unit
                 else:
                     line_num += 1
                     if findtop3 and findmoney:
@@ -7322,13 +7460,21 @@ class CandidateExtractor(object):
                         prem_dic[package]['name'] = project_name
                     if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '', bid_amount_))> 5:  # 金额字段出现超过5个非金额字符,中断匹配
                         break
+                    header = headers['bid_amount'][1] if "bid_amount" in headers else ''
+                    if amount_unit != '' and re.search('^[万亿]?元|%|折[\w/]{,6}$', amount_unit) and re.search('元',
+                                                                                                           bid_amount_ + header) == None:  # 补充另外在一列的金额单位
+                        bid_amount_ += amount_unit
                     bid_amount, money_unit  = money_process(bid_amount_, headers['bid_amount'][1])  if "bid_amount" in headers else (0, "")
+                    if re.search('元[/每]', amount_unit) or re.search('单价', header):
+                        unit_price = bid_amount
+                        bid_amount = 0
+                    else:
+                        unit_price = 0
 
-                    header = headers['bid_amount'][1] if "bid_amount" in headers else ''
                     if (re.search('费率|下浮率|[%%‰折]|优惠率',
                                   header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000:  # 如果是费率或大于500亿的金额改为0
                         bid_amount = 0
-                    prem_dic[package]['roleList'].append({
+                    tmp_role_dic = {
                             "address": "",
                             "linklist": [],
                             "role_money": {
@@ -7341,7 +7487,10 @@ class CandidateExtractor(object):
                             "role_name": role_type,
                             "role_text": candidate,
                             "serviceTime": ""
-                    })
+                    }
+                    if unit_price > 0:
+                        tmp_role_dic['role_money']['unit_price'] = unit_price
+                    prem_dic[package]['roleList'].append(tmp_role_dic)
                     if len(prem_dic[package]['roleList']) == 0:  # 只有项目编号和名称的 丢弃
                         prem_dic.pop(package)
         if role_dic and prem_dic == dict():