'''
Created on 2018年12月20日

@author: User
'''

import numpy as np
import re
import gensim
from keras import backend as K
import os,sys
import time

import traceback

from threading import RLock

# from pai_tf_predict_proto import tf_predict_pb2
import requests


# Lazily populated cache for the word-level embedding model; it is filled by
# getModel_w2v() while holding lock_model_w2v.
model_w2v = None
lock_model_w2v = RLock()

# NOTE(review): not read anywhere in the visible part of this file; presumably
# toggles remote PAI-EAS inference - confirm against callers.
USE_PAI_EAS = False

# Flag returned by getLazyLoad().
Lazy_load = False
# API_URL = "http://192.168.2.103:8802"
API_URL = "http://127.0.0.1:888"
# USE_API = True
# NOTE(review): API_URL / USE_API are not read in the visible part of this file.
USE_API = False

def getCurrent_date(format="%Y-%m-%d %H:%M:%S"):
    """Return the current local time rendered with ``format``.

    :param format: a ``time.strftime`` format string.
    :return: the formatted local time as a string.
    """
    now = time.localtime()
    return time.strftime(format, now)

def getw2vfilepath():
    """Return the path of the word-embedding file.

    The file is looked up through getFileFromSysPath(); when it is not found
    the bare file name is returned instead.
    """
    filename = "wiki_128_word_embedding_new.vector"
    located = getFileFromSysPath(filename)
    return filename if located is None else located

def getLazyLoad():
    """Return the current value of the module-level ``Lazy_load`` flag."""
    # Reading a module global needs no ``global`` declaration.
    return Lazy_load


def getFileFromSysPath(filename):
    """Find ``filename`` directly inside one of the ``sys.path`` directories.

    Only the top level of each directory is inspected (no recursion) and the
    name must match a directory entry exactly.

    :param filename: bare file name to look for.
    :return: the joined path of the first regular file found, or None.
    """
    for _path in sys.path:
        if not os.path.isdir(_path):
            continue
        try:
            entries = os.listdir(_path)
        except OSError:
            # An unreadable directory (e.g. permission denied) must not abort
            # the whole search; just move on to the next sys.path entry.
            continue
        if filename in entries:
            _abspath = os.path.join(_path, filename)
            if os.path.isfile(_abspath):
                return _abspath
    return None


# Path of the character-level embedding file, one directory above this module.
model_word_file = os.path.dirname(__file__)+"/../singlew2v_model.vector"
# Lazily populated cache for the character-level model; it is filled by
# getModel_word() while holding lock_model_word.
model_word = None
lock_model_word = RLock()

from decimal import Decimal
import logging
logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
import pickle
import os

import json

# Custom JSON encoder
class MyEncoder(json.JSONEncoder):
    """JSON encoder that also serialises numpy arrays/scalars and ``bytes``.

    Usable both as ``MyEncoder().encode(obj)`` and as
    ``json.dumps(obj, cls=MyEncoder)``.
    """

    def __init__(self, *args, **kwargs):
        # json.dumps(..., cls=MyEncoder) instantiates the encoder with keyword
        # options (skipkeys, ensure_ascii, ...); they must be accepted and
        # forwarded so the base class is fully initialised.
        super().__init__(*args, **kwargs)

    def default(self, obj):
        """Convert objects the stock encoder rejects into JSON-friendly values."""
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        # Abstract numpy scalar bases cover every float/int width and avoid
        # ``np.float_``, which no longer exists in NumPy 2.0.
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        return super().default(obj)

# Token -> index lookup tables, built on first use by getIndexOfWord()
# (vocab_word) and getIndexOfWords() (vocab_words).
vocab_word = None
vocab_words = None

# Pickle files (relative paths) in which the vocabularies are cached.
file_vocab_word = "vocab_word.pk"
file_vocab_words = "vocab_words.pk"

# Endpoint URL / authorization token pairs, one pair per remote model.
# NOTE(review): these tokens are hard-coded in source control; consider loading
# them from configuration or the environment instead.
# NOTE(review): presumably consumed through vpc_requests(url, authorization, ...)
# by callers outside this part of the file - confirm.
selffool_authorization = "NjlhMWFjMjVmNWYyNzI0MjY1OGQ1M2Y0ZmY4ZGY0Mzg3Yjc2MTVjYg=="
selffool_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/selffool_gpu"
selffool_seg_authorization = "OWUwM2Q0ZmE3YjYxNzU4YzFiMjliNGVkMTA3MzJkNjQ2MzJiYzBhZg=="
selffool_seg_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/selffool_seg_gpu"
codename_authorization = "Y2M5MDUxMzU1MTU4OGM3ZDk2ZmEzYjkxYmYyYzJiZmUyYTgwYTg5NA=="
codename_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/codename_gpu"

form_item_authorization = "ODdkZWY1YWY0NmNhNjU2OTI2NWY4YmUyM2ZlMDg1NTZjOWRkYTVjMw=="
form_item_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/form"
person_authorization = "N2I2MDU2N2Q2MGQ0ZWZlZGM3NDkyNTA1Nzc4YmM5OTlhY2MxZGU1Mw=="
person_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/person"
role_authorization = "OWM1ZDg5ZDEwYTEwYWI4OGNjYmRlMmQ1NzYwNWNlZGZkZmRmMjE4OQ=="
role_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/role"
money_authorization = "MDQyNjc2ZDczYjBhYmM4Yzc4ZGI4YjRmMjc3NGI5NTdlNzJiY2IwZA=="
money_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/money"
codeclasses_authorization = "MmUyNWIxZjQ2NjAzMWJlMGIzYzkxMjMzNWY5OWI3NzJlMWQ1ZjY4Yw=="
codeclasses_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/codeclasses"

def viterbi_decode(score, transition_params):
    """Find the best-scoring tag sequence with the Viterbi algorithm (numpy only).

    Intended for inference, outside of TensorFlow.

    Args:
      score: A [seq_len, num_tags] matrix of unary potentials.
      transition_params: A [num_tags, num_tags] matrix of binary potentials.

    Returns:
      viterbi: A [seq_len] list of integer tag indices of the best path.
      viterbi_score: The score of that path.
    """
    best_scores = np.zeros_like(score)
    best_prev = np.zeros_like(score, dtype=np.int32)
    best_scores[0] = score[0]

    # Forward pass: for each step keep, per tag, the best total score and the
    # previous tag that produced it.
    for step in range(1, score.shape[0]):
        candidates = best_scores[step - 1][:, np.newaxis] + transition_params
        best_prev[step] = np.argmax(candidates, 0)
        best_scores[step] = score[step] + np.max(candidates, 0)

    # Backward pass: follow the back-pointers from the best final tag.
    final_row = best_scores[-1]
    path = [np.argmax(final_row)]
    for pointers in best_prev[:0:-1]:
        path.append(pointers[path[-1]])
    path.reverse()

    return path, np.max(final_row)


def limitRun(sess,list_output,feed_dict,MAX_BATCH=1024):
    """Run ``sess.run`` in slices of at most ``MAX_BATCH`` samples.

    The sample count is the length of the first value in ``feed_dict``. When it
    does not exceed ``MAX_BATCH`` the session is run once and its result is
    returned unchanged. Otherwise every non-scalar feed value is sliced, the
    session is run per slice and the per-output results are concatenated.

    :param sess: object exposing ``run(list_output, feed_dict=...)``.
    :param list_output: list of outputs to fetch.
    :param feed_dict: mapping of inputs; scalar values are passed to every slice as-is.
    :param MAX_BATCH: maximum number of samples per ``run`` call.
    :return: ``sess.run`` result, or a list with one concatenated list per output.
    """
    len_sample = 0
    if len(feed_dict) > 0:
        len_sample = len(next(iter(feed_dict.values())))
    if len_sample <= MAX_BATCH:
        return sess.run(list_output, feed_dict=feed_dict)

    # Abstract numpy scalar bases cover all int/float widths and avoid
    # ``np.float_``, which no longer exists in NumPy 2.0.
    scalar_types = (float, int, np.integer, np.floating)
    list_result = [[] for _ in list_output]
    for _begin in range(0, len_sample, MAX_BATCH):
        new_dict = dict()
        for _key, _value in feed_dict.items():
            if isinstance(_value, scalar_types):
                new_dict[_key] = _value
            else:
                new_dict[_key] = _value[_begin:_begin + MAX_BATCH]
        _output = sess.run(list_output, feed_dict=new_dict)
        for _collected, _part in zip(list_result, _output):
            _collected.extend(_part)
    return list_result


def get_values(response,output_name):
    """
    Extract one output tensor from a prediction response as a numpy array.

    :param response: response object exposing ``outputs[output_name]``
    :param output_name: name of the output tensor
    :return: the tensor content reshaped to ``array_shape.dim``

    NOTE(review): ``tf_predict_pb2`` is referenced here but its import is
    commented out near the top of this file; unless the name is provided some
    other way this function raises NameError - confirm.
    """
    tensor = response.outputs[output_name]
    dtype = tensor.dtype
    if dtype == tf_predict_pb2.DT_FLOAT:
        raw = tensor.float_val
    elif dtype in (tf_predict_pb2.DT_INT8, tf_predict_pb2.DT_INT16, tf_predict_pb2.DT_INT32):
        raw = tensor.int_val
    elif dtype == tf_predict_pb2.DT_INT64:
        raw = tensor.int64_val
    elif dtype == tf_predict_pb2.DT_DOUBLE:
        raw = tensor.double_val
    elif dtype == tf_predict_pb2.DT_STRING:
        raw = tensor.string_val
    elif dtype == tf_predict_pb2.DT_BOOL:
        raw = tensor.bool_val
    # Any other dtype leaves ``raw`` unbound and fails on the next line.
    return np.array(raw).reshape(tensor.array_shape.dim)

def vpc_requests(url,authorization,request_data,list_outputs,timeout=None):
    """POST a prediction request and decode the requested output tensors.

    :param url: endpoint to POST to.
    :param authorization: value sent in the ``Authorization`` header.
    :param request_data: request body passed to ``requests.post`` as ``data``.
    :param list_outputs: names of the output tensors to extract.
    :param timeout: optional ``requests`` timeout in seconds; the default None
        keeps the previous behaviour of waiting indefinitely.
    :return: dict mapping each output name to the value from get_values(), or
        None when the HTTP status is not 200.
    """
    headers = {"Authorization": authorization}
    resp = requests.post(url, data=request_data, headers=headers, timeout=timeout)

    if resp.status_code != 200:
        print(resp.status_code,resp.content)
        # NOTE(review): this writes the authorization token to the log.
        log("调用pai-eas接口出错,authorization:"+str(authorization))
        return None

    # The response message is only needed once the call has succeeded.
    response = tf_predict_pb2.PredictResponse()
    response.ParseFromString(resp.content)
    return {_output: get_values(response, _output) for _output in list_outputs}

def encodeInput(data,word_len,word_flag=True,userFool=False):
    """Turn each token sequence in ``data`` into exactly padded/truncated index lists.

    The first sequence keeps its last ``word_len`` tokens, every other sequence
    keeps its first ``word_len`` tokens.

    :param data: list of token sequences.
    :param word_len: target length of every encoded sequence.
    :param word_flag: True -> encode with getIndexOfWord (or getIndexOfWord_fool
        when ``userFool``); False -> encode with getIndexOfWords.
    :param userFool: with ``word_flag``, use the "fool" vocabulary and pad with 0.
    :return: list of index lists, one per input sequence.
    """
    encoded = []
    for position, item in enumerate(data):
        is_first = position == 0
        tokens = item[-word_len:] if is_first else item[:word_len]
        if word_flag:
            if userFool:
                ids = [getIndexOfWord_fool(token) for token in tokens]
            else:
                ids = [getIndexOfWord(token) for token in tokens]
            padding = []
            for _ in range(word_len - len(ids)):
                padding.append(0 if userFool else getIndexOfWord("<pad>"))
            # Only the first sequence is padded on the left.
            pad_left = is_first
        else:
            ids = [getIndexOfWords(token) for token in tokens]
            padding = []
            for _ in range(word_len - len(ids)):
                padding.append(getIndexOfWords("<pad>"))
            # In this mode the first two sequences are padded on the left.
            pad_left = position in (0, 1)
        encoded.append(padding + ids if pad_left else ids + padding)
    return encoded

def encodeInput_form(input,MAX_LEN=30):
    """Encode the first ``MAX_LEN`` items of ``input`` as vocabulary indices.

    :param input: indexable sequence of tokens.
    :param MAX_LEN: length of the returned vector; unused slots stay 0.
    :return: 1-D numpy array of length ``MAX_LEN``.
    """
    encoded = np.zeros([MAX_LEN])
    for pos in range(min(len(input), MAX_LEN)):
        encoded[pos] = getIndexOfWord(input[pos])
    return encoded
    

def getVocabAndMatrix(model,Embedding_size = 60):
    '''
    @summary: Build the vocabulary list and the embedding matrix of a model.
    @param:
        model: object exposing ``index2word`` (a list) and ``model[token]``
        Embedding_size: width of every embedding row
    @return: (vocab, embedding_matrix); index 0 is "<pad>" with an all-zero row
    '''
    vocab = ["<pad>"]+model.index2word

    embedding_matrix = np.zeros((len(vocab),Embedding_size))
    for row, token in enumerate(vocab[1:], start=1):
        embedding_matrix[row] = model[token]

    return vocab,embedding_matrix

def getIndexOfWord(word):
    """Return the vocabulary index of ``word`` (character-level vocabulary).

    The lookup table is built on first use: from the pickle ``file_vocab_word``
    when it exists, otherwise from the model returned by getModel_word(), in
    which case the vocabulary is also saved to that file. Unknown tokens map to
    the index of "<pad>".
    """
    global vocab_word
    if vocab_word is None:
        cached = os.path.exists(file_vocab_word)
        if cached:
            vocab = load(file_vocab_word)
        else:
            vocab,_ = getVocabAndMatrix(getModel_word(), Embedding_size=60)
        vocab_word = {token: idx for idx, token in enumerate(np.array(vocab))}
        if not cached:
            save(vocab,file_vocab_word)
    return vocab_word[word] if word in vocab_word else vocab_word['<pad>']

def changeIndexFromWordToWords(tokens,word_index):
    '''
    @summary: Convert a character offset into the index of the token containing it.
    @param:
        tokens: list of tokens whose lengths are accumulated in order
        word_index: character offset into the concatenated tokens
    @return: index of the token covering ``word_index``; ``len(tokens)`` when the
             offset lies beyond the last token (0 for an empty token list)
    '''
    consumed = 0
    for i, token in enumerate(tokens):
        next_consumed = consumed + len(token)
        if consumed <= word_index < next_consumed:
            return i
        consumed = next_consumed
    # Equivalent to the former ``i+1`` but also defined for empty ``tokens``.
    return len(tokens)

        
def getIndexOfWords(words):
    """Return the vocabulary index of ``words`` (word-level vocabulary).

    The lookup table is built on first use: from the pickle ``file_vocab_words``
    when it exists, otherwise from the model returned by getModel_w2v(), in
    which case the vocabulary is also saved to that file. Unknown tokens map to
    the index of "<pad>".
    """
    global vocab_words
    if vocab_words is None:
        cached = os.path.exists(file_vocab_words)
        if cached:
            vocab = load(file_vocab_words)
        else:
            vocab,_ = getVocabAndMatrix(getModel_w2v(), Embedding_size=128)
        vocab_words = {token: idx for idx, token in enumerate(np.array(vocab))}
        if not cached:
            save(vocab,file_vocab_words)
    return vocab_words[words] if words in vocab_words else vocab_words["<pad>"]

    
def log(msg):
    '''
    @summary: Log ``msg`` at INFO level through the module-level ``logger``.
    '''
    logger.info(msg)

def debug(msg):
    '''
    @summary: Log ``msg`` at DEBUG level through the module-level ``logger``.
    '''
    logger.debug(msg)


def save(object_to_save, path):
    '''
    Pickle an object to a file.
    @Args:
        object_to_save: the object to serialise
        path: destination file path (overwritten if it exists)

    @Return:
        the path the object was written to
    '''
    with open(path, 'wb') as f:
        pickle.dump(object_to_save, f)
    # The documented contract is to hand the destination back to the caller.
    return path

def load(path):
    '''
    Read back an object that was pickled to a file.
    @Args:
        path: file path to read from

    @Return:
        the unpickled object
    '''
    # NOTE(review): pickle.load can execute arbitrary code; only use this on
    # files that come from a trusted source.
    with open(path, 'rb') as source:
        return pickle.load(source)
    
# Mapping used by getIndexOfWord_fool(); it is unpickled at import time from a
# file next to this module, so importing the module fails if that file is missing.
fool_char_to_id = load(os.path.dirname(__file__)+"/fool_char_to_id.pk")

def getIndexOfWord_fool(word):
    """Return the id of ``word`` in ``fool_char_to_id``, or the "[UNK]" id when absent."""
    known = word in fool_char_to_id
    return fool_char_to_id[word] if known else fool_char_to_id["[UNK]"]


def find_index(list_tofind,text):
    '''
    @summary: Locate the first occurrence of every search term in a string.
    @param:
        list_tofind: terms to look for
        text: string to search in
    @return: list with, for each term, the offset of its first occurrence
             in ``text`` or -1 when it does not occur
    '''
    # str.find already yields -1 for a missing term.
    return [text.find(term) for term in list_tofind]


def combine(list1,list2):
    '''
    @summary: Concatenate every item of list1 with every item of list2.
    @param:
        list1: items used as prefixes (converted with str)
        list2: items used as suffixes (converted with str)
    @return: list of concatenations, ordered by list1 first and list2 second
    '''
    return [str(head)+str(tail) for head in list1 for tail in list2]


def getDigitsDic(unit):
    '''
    @summary: Map one Chinese numeral character (formal or plain form) to 0-9.
    @return: the integer value, or None for any other input
    '''
    formal_digits = "零壹贰叁肆伍陆柒捌玖"
    plain_digits = "〇一二三四五六七八九"
    table = {}
    for value, (formal, plain) in enumerate(zip(formal_digits, plain_digits)):
        table[formal] = value
        table[plain] = value
    return table.get(unit)

def getMultipleFactor(unit):
    '''
    @summary: Map a Chinese magnitude/currency unit character to its multiplier.
    @return: the multiplier as a Decimal, or None for any other input
    '''
    factors = {
        "兆": Decimal(1000000000000),
        "亿": Decimal(100000000),
        "万": Decimal(10000),
        "仟": Decimal(1000),
        "千": Decimal(1000),
        "佰": Decimal(100),
        "百": Decimal(100),
        "拾": Decimal(10),
        "十": Decimal(10),
        "元": Decimal(1),
        "圆": Decimal(1),
        # Built from strings so the values are exactly 0.1 and 0.01.
        "角": Decimal("0.1"),
        "分": Decimal("0.01"),
    }
    return factors.get(unit)

def getUnifyMoney(money):
    '''
    @summary: Convert a money string (Arabic digits and/or Chinese numerals with
              units) into a numeric amount.
    @param:
        money: the money string; commas and every character that is not a digit,
               a dot, a formal Chinese digit or a unit character are dropped first
    @return: Decimal amount; Decimal(0) when nothing can be parsed, when an
             amount exceeds MAX_MONEY, or when any error occurs while parsing
    '''

    MAX_MONEY = 1000000000000
    # Drop thousands separators, then every character that cannot be part of an amount.
    money = re.sub("[，,]","",money)
    money = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",money)
    result = Decimal(0)
    chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
    # Ordered from the largest unit to the smallest. 2024-06-11: 千/百/十 were
    # added to fix amounts such as '陆拾陆亿伍千柒佰零叁万肆千叁佰陆拾伍元'.
    chnFactorUnits = ["兆", "亿", "万", "仟", '千', "佰", '百', "拾", '十',"圆", "元", "角", "分"]
    digit_class = "".join(chnDigits)

    LowMoneypattern = re.compile(r"^[\d,]+(\.\d+)?$")
    BigMoneypattern = re.compile("^零?(?P<BigMoney>[%s])$"%(digit_class))
    plain_number = re.compile(r"^\d+(\.\d+)?$")
    single_digit = re.compile("^[%s]$"%(digit_class))
    try:
        if re.search(LowMoneypattern,money) is not None:
            return Decimal(money)
        big_match = re.search(BigMoneypattern,money)
        if big_match is not None:
            # Always hand back a Decimal, like every other path.
            return Decimal(getDigitsDic(big_match.group("BigMoney")))
        for factorUnit in chnFactorUnits:
            if factorUnit not in money:
                continue
            # Split at the LAST occurrence of the largest unit present:
            # ``head`` is multiplied by the unit, ``tail`` is added on top.
            head, _, tail = money.rpartition(factorUnit)
            factor = getMultipleFactor(factorUnit)
            if re.search(plain_number,head) is not None:
                if MAX_MONEY/factor<Decimal(head):
                    return Decimal(0)
                result += Decimal(head)*factor
            elif len(head)==1:
                if re.search(single_digit,head) is not None:
                    result += Decimal(getDigitsDic(head))*factor
            elif head!="":
                # Anything longer is parsed recursively.
                result += Decimal(getUnifyMoney(head))*factor

            # Commas and whitespace were removed above, so a purely numeric
            # tail is just digits with an optional fraction. A tail that still
            # carries a unit (e.g. "5元") must be parsed recursively:
            # Decimal("5元") raises and used to zero the whole amount.
            if re.search(plain_number,tail) is not None:
                result += Decimal(tail)
            elif len(tail)==1:
                if re.search(single_digit,tail) is not None:
                    result += Decimal(getDigitsDic(tail))
            else:
                result += Decimal(getUnifyMoney(tail))
            break
    except Exception as e:
        # Best effort by design: any parsing failure yields 0.
        return Decimal(0)
    return result


def getModel_w2v():
    '''
    @summary: Return the word-embedding model, loading it on first use.

    The load happens at most once; access is serialised with lock_model_w2v.
    '''
    global model_w2v
    with lock_model_w2v:
        if model_w2v is not None:
            return model_w2v
        w2v_path = getw2vfilepath()
        model_w2v = gensim.models.KeyedVectors.load_word2vec_format(w2v_path,binary=True)
        return model_w2v

def getModel_word():
    '''
    @summary: Return the character-embedding model, loading it on first use.

    The model is read from ``model_word_file`` at most once; access is
    serialised with lock_model_word.
    '''

    # Declare the lock this function actually uses (previously the word-model
    # lock ``lock_model_w2v`` was named here by mistake).
    global model_word,lock_model_word
    with lock_model_word:
        if model_word is None:
            model_word = gensim.models.KeyedVectors.load_word2vec_format(model_word_file,binary=True)
        return model_word

# getModel_w2v()
# getModel_word()

def findAllIndex(substr,wholestr):
    '''
    @summary: Find the start offsets of all non-overlapping occurrences of a substring.
    @param:
        substr: the substring to look for
        wholestr: the string to search in
    @return: list of start offsets in ascending order; empty when ``substr``
             is empty or does not occur
    '''
    # An empty needle matches everywhere and used to loop forever.
    if not substr:
        return []
    result = []
    step = len(substr)
    index = wholestr.find(substr)
    while index >= 0:
        result.append(index)
        # Continue after the current match so occurrences never overlap.
        index = wholestr.find(substr, index + step)
    return result
    
  
def spanWindow(tokens,begin_index,end_index,size,center_include=False,word_flag = False,use_text = False,text = None):
    '''
    @summary: Collect the context around a span of tokens.
    @param:
        tokens: list of tokens of the sentence
        begin_index: index of the first token of the span
        end_index: index of the last token of the span (inclusive)
        size: number of tokens to take on each side
        center_include: also return the span itself between the two contexts
        word_flag: True -> every part is joined into one string; False -> token lists
        use_text: with center_include, return ``text`` as the middle part
        text: replacement for the middle part (required when use_text is set)
    @return: [left, right] or [left, middle, right]
    '''
    if use_text:
        assert text is not None
    left_start = max(begin_index - size, 0)
    right_stop = min(end_index + size + 1, len(tokens))
    render = "".join if word_flag else (lambda part: part)

    result = [render(tokens[left_start:begin_index])]
    if center_include:
        # ``text`` is used verbatim; only token slices are rendered.
        result.append(text if use_text else render(tokens[begin_index:end_index+1]))
    result.append(render(tokens[end_index+1:right_stop]))
    return result

def get_context(sentence_text, begin_index, end_index, size=20, center_include=False):
    '''
    Return the text surrounding a character span.
    :param sentence_text: the sentence text
    :param begin_index: character offset where the span starts
    :param end_index: character offset where the span ends (exclusive)
    :param size: number of characters to take on each side
    :param center_include: also return the span itself between the two contexts
    :return: [left, right] or [left, span, right]
    '''
    left_start = max(begin_index - size, 0)
    pieces = [sentence_text[left_start:begin_index]]
    if center_include:
        pieces.append(sentence_text[begin_index:end_index])
    pieces.append(sentence_text[end_index:end_index + size])
    return pieces


# Repair an unbalanced bracket at either end of a code or a name.
def fitDataByRule(data):
    """Balance a single unmatched bracket in ``data``.

    When opening and closing brackets are equally frequent (including none at
    all) ``data`` is returned untouched. When there is exactly one extra
    opening bracket, a bracket at the very start is dropped, otherwise the
    counterpart of the first opening bracket is appended; the mirrored rule
    applies to one extra closing bracket. In every unbalanced case the
    full stop "。" is removed from the result.

    :param data: the string to repair.
    :return: the repaired string.
    """
    symbol_dict = {"(":")",
                   "（":"）",
                   "[":"]",
                   "【":"】",
                   ")":"(",
                   "）":"（",
                   "]":"[",
                   "】":"【"}
    # Raw strings: "\(" and "\[" are invalid escapes in ordinary literals.
    leftfinds = re.findall(r"[\(（\[【]",data)
    rightfinds = re.findall(r"[\)）\]】]",data)
    surplus = len(leftfinds)-len(rightfinds)
    if surplus==0:
        return data
    result = data
    if surplus==1:
        if data[0] in symbol_dict:
            result = data[1:]
        else:
            result = data+symbol_dict[leftfinds[0]]
    elif surplus==-1:
        if data[-1] in symbol_dict:
            result = data[:-1]
        else:
            result = symbol_dict[rightfinds[0]]+data
    return re.sub("[。]","",result)

from datetime import date
# Check whether a year/month/day triple is a real calendar date.
def isValidDate(year, month, day):
    """Return True when ``date(year, month, day)`` can be constructed.

    :param year: year number.
    :param month: month number.
    :param day: day-of-month number.
    :return: True for a valid calendar date, False otherwise.
    """
    try:
        date(year, month, day)
    except (ValueError, TypeError, OverflowError):
        # Out-of-range, non-integer or oversized components; anything else
        # (e.g. KeyboardInterrupt) is no longer swallowed.
        return False
    return True

# Date pattern used by timeFormat(): a year (20xx, two digits, or Chinese
# numerals starting with 二), a mandatory separator (- / 年 .), a month, an
# optional separator (- / 月 .) and an optional day; month and day may be digits
# or Chinese numerals. Named groups: year, month, day.
time_format_pattern = re.compile("((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]?\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3})?)")
from BiddingKG.dl.ratio.re_ratio import getUnifyNum
import calendar

def get_maxday(year, month):
    """Return the number of days in the given month of the given year."""
    # calendar.monthrange returns (weekday of the first day, number of days).
    return calendar.monthrange(year, month)[1]

def timeFormat(_time, default_first_day=True):
    '''
    Normalise the first valid date found in a string to "YYYY-MM-DD".
    :param _time: text that may contain a date matching time_format_pattern
    :param default_first_day: when the day is missing, True uses the first day
                              of the month, False uses the last day
    :return: the formatted date, or "" when no valid date is found
    '''
    current_year = time.strftime("%Y",time.localtime())
    for _match in re.finditer(time_format_pattern,_time):
        if len(_match.group())==0:
            continue
        legal = True
        groups = _match.groupdict()
        year = groups.get("year","")
        month = groups.get("month","")
        day = groups.get("day","")
        if year!="":
            if re.search(r"^\d+$",year):
                if len(year)==2:
                    year = "20"+year
                # Reject years more than ten years in the future.
                if int(year)-int(current_year)>10:
                    legal = False
            else:
                # Chinese numerals, possibly mixed with the ASCII digit 0.
                year = "".join(word if word == '0' else str(getDigitsDic(word)) for word in year)
        else:
            legal = False
        if month!="":
            if re.search(r"^\d+$", month):
                # Month 0 must be rejected here: it used to reach
                # get_maxday() and crash when default_first_day is False.
                if not 1<=int(month)<=12:
                    legal = False
            else:
                month = int(getUnifyNum(month))
                if 1<=month<=12:
                    month = str(month)
                else:
                    legal = False
        else:
            legal = False
        if day is None:
            # The day group is optional in the pattern.
            day = "01" if (default_first_day or not legal) else str(get_maxday(int(year), int(month)))
        if day!="":
            if re.search(r"^\d+$", day):
                if int(day)>31:
                    legal = False
            else:
                day = int(getUnifyNum(day))
                if 1<=day<=31:
                    day = str(day)
                else:
                    legal = False
        else:
            legal = False
        if not isValidDate(int(year),int(month),int(day)):
            legal = False
        if legal:
            return "%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0"))
    return ""


def embedding(datas,shape):
    '''
    @summary: Look up the word vector of every token.
    @param:
        datas: list of token sequences
        shape: shape of the result; shape[1] caps the tokens used per sequence
    @return: array of the given shape; tokens that are not in the model
             vocabulary keep an all-zero row
    '''
    w2v = getModel_w2v()
    embed = np.zeros(shape)
    length = shape[1]
    for out_index, data in enumerate(datas):
        for index, item in enumerate(data):
            if index>=length:
                break
            # Whitespace inside a token is ignored for the lookup.
            item_not_space = re.sub(r"\s+","",item)
            if item_not_space in w2v.vocab:
                embed[out_index][index] = w2v[item_not_space]
    return embed

def embedding_word(datas,shape):
    '''
    @summary: Look up character vectors for the tail of every text.
    @param:
        datas: list of texts (each is converted with str)
        shape: shape of the result; the last shape[1] characters of each text are used
    @return: array of the given shape; characters that are not in the model
             vocabulary keep an all-zero row
    '''
    char_model = getModel_word()
    embed = np.zeros(shape)
    width = shape[1]
    for row, data in enumerate(datas):
        for col, char in enumerate(str(data)[-width:]):
            if col>=width:
                break
            if char in char_model.vocab:
                embed[row][col] = char_model[char]
    return embed


def embedding_word_forward(datas,shape):
    '''
    @summary: Look up character vectors for the head of every text.
    @param:
        datas: list of texts (each is converted with str)
        shape: shape of the result; the first shape[1] characters of each text are used
    @return: array of the given shape; characters that are not in the model
             vocabulary keep an all-zero row
    '''
    char_model = getModel_word()
    embed = np.zeros(shape)
    width = shape[1]
    for row, data in enumerate(datas):
        for col, char in enumerate(str(data)[:width]):
            if col>=width:
                break
            if char in char_model.vocab:
                embed[row][col] = char_model[char]
    return embed


def formEncoding(text,shape=(100,60),expand=False):
    """Encode ``text`` item by item with the character-embedding model.

    :param text: indexable sequence of characters.
    :param shape: shape of the result; at most shape[0] items are encoded.
    :param expand: when True, add a leading axis of size 1 to the result.
    :return: array of the given shape; items that are not in the model
        vocabulary keep an all-zero row.
    """
    matrix = np.zeros(shape)
    word_model = getModel_word()
    for row in range(min(len(text), shape[0])):
        char = text[row]
        if char in word_model.vocab:
            matrix[row] = word_model[char]
    return np.expand_dims(matrix,0) if expand else matrix

def partMoney(entity_text,input2_shape = [7]):
    '''
    @summary: Bucket an amount by order of magnitude.
    @param:
        entity_text: the amount (anything accepted by float())
        input2_shape: shape of the one-hot result
    @return: array with a 1 in the bucket of the amount: below 100, below 1000,
             ... below 10000000, and a last bucket for everything else
    '''
    money = float(entity_text)
    parts = np.zeros(input2_shape)
    upper_bounds = (100, 1000, 10000, 100000, 1000000, 10000000)
    bucket = len(upper_bounds)
    for position, bound in enumerate(upper_bounds):
        if money<bound:
            bucket = position
            break
    parts[bucket] = 1
    return parts

def uniform_num(num):
    """Normalise a small number written in digits, Chinese or Roman numerals.

    * Digit strings are returned as-is, except that a two-character string
      with a leading zero loses that zero ("01" -> "1").
    * Strings made only of 一..十 are converted when they have one of the
      forms X, 十X, X十 or X十Y; other forms are returned unchanged.
    * Otherwise the first Roman numeral character Ⅰ..Ⅶ found is converted.
    * Anything else is returned unchanged.

    :param num: the number string.
    :return: the normalised string.
    """
    d1 = {'一': '1', '二': '2', '三': '3', '四': '4', '五': '5', '六': '6', '七': '7', '八': '8', '九': '9', '十': '10'}
    d3 = {'Ⅰ': '1', 'Ⅱ': '2', 'Ⅲ': '3', 'Ⅳ': '4', 'Ⅴ': '5', 'Ⅵ': '6', 'Ⅶ': '7'}
    if num.isdigit():
        if re.search(r'^0[\d]$', num):
            num = num[1:]
        return num

    chinese = re.search('^[一二三四五六七八九十]+$', num)
    if chinese:
        _digit = chinese.group(0)
        if len(_digit) == 1:
            num = d1[_digit]
        elif len(_digit) == 2 and _digit[0] == '十':
            num = '1'+ d1[_digit[1]]
        elif len(_digit) == 2 and _digit[1] == '十':
            num = d1[_digit[0]] + '0'
        elif len(_digit) == 3 and _digit[1] == '十':
            num = d1[_digit[0]] + d1[_digit[2]]
        return num

    roman = re.search('[ⅠⅡⅢⅣⅤⅥⅦ]', num)
    if roman:
        num = d3[roman.group(0)]
    return num

def _extract_package_code(package_name):
    """Return the normalised code of the first rule matching ``package_name``, or ""."""
    # Rule 1: a bare identifier of five or more characters.
    ser = re.search('^[a-zA-Z0-9-]{5,}$', package_name)
    if ser:
        return ser.group(0).upper()
    # Rule 2: letter + 包 + number + 标, e.g. "A包2标段".
    ser = re.search('(?P<eng>[a-zA-Z])包[：）]?第?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))标段?', package_name)
    if ser:
        return ser.group('eng').upper() + uniform_num(ser.group('num'))
    # Rules 3 and 4: optional alphanumeric prefix plus a number, placed before
    # (rule 3) or after (rule 4) the package/section keyword.
    for pattern in (
            '第?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))(标[段号的包项]?|合同[包段]|([分子]?[包标]))',
            '(标[段号的包项]?|项目|子项目?|([分子]?包|包[组件号]))编?号?[:：]?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))'):
        ser = re.search(pattern, package_name)
        if ser:
            _char = ser.group('eng')
            prefix = _char.upper() if _char else ""
            return prefix + uniform_num(ser.group('num'))
    # Rules 5 and 6: letters only, after (rule 5) or before (rule 6) the keyword.
    for pattern in (
            '(标[段号的包项]|([分子]?包|包[组件号]))编?号?[:：]?(?P<eng>[a-zA-Z-]{1,5})',
            '(?P<eng>[a-zA-Z]{1,4})(标[段号的包项]|([分子]?[包标]|包[组件号]))'):
        ser = re.search(pattern, package_name)
        if ser:
            return ser.group('eng').upper()
    # Rule 7: nothing but a number.
    ser = re.search('^([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4})$', package_name)
    if ser:
        return uniform_num(ser.group(0))
    # Rule 8: any other bare identifier.
    ser = re.search('^[a-zA-Z0-9-]+$', package_name)
    if ser:
        return ser.group(0).upper()
    return ""


def uniform_package_name(package_name):
    '''
    Normalise a package/section number: numbers become Arabic digits, letters
    become upper case, and a keyword such as 施工/监理 is moved to the front.
    Examples: "A包监理一标段" -> "监理A1"; "包Ⅱ" -> "2".
    :param package_name: the package number as a string
    :return: the normalised name, or the original string when no rule matches
    '''
    package_name_raw = package_name
    # Strip file extensions / "YYYY年" and bracket noise before matching.
    package_name = re.sub(r'pdf|doc|docs|xlsx|rar|\d{4}年', ' ', package_name)
    package_name = package_name.replace('标段（包）', '标段').replace('№', '')
    package_name = re.sub(r'\[|【', '', package_name)
    kw = re.search('(施工|监理|监测|勘察|设计|劳务)', package_name)
    name = kw.group(0) if kw else ""
    name += _extract_package_code(package_name)
    if name == "":
        return package_name_raw
    if name.isdigit():
        # Drop leading zeros from purely numeric results.
        name = str(int(name))
    return name

def money_process(money_text, header):
    '''
    Turn a money cell of a table into a numeric amount and its unit.

    The header and the cell text are joined as "<header>：<money_text>" and
    passed to get_money_entity. A result is accepted only when exactly one
    amount is recognised, or when two amounts are recognised whose amount
    fields are equal; the first entry is then used.

    (An earlier regex-based implementation that lived here as commented-out
    code has been superseded by the get_money_entity call.)

    :param money_text: money string taken from the table cell
    :param header: header of the money column, used for unit extraction
    :return: tuple (money, money_unit); (0, "") when no amount is accepted
    '''
    found, _ = get_money_entity('%s：%s' % (header, money_text))
    exactly_one = len(found) == 1
    same_amount_twice = len(found) == 2 and found[0][0] == found[1][0]
    if not (exactly_one or same_amount_twice):
        return (0, "")
    # Each entry is (amount_text, start, end, unit, notes).
    first = found[0]
    return (float(first[0]), first[3])

# Compiled regex used by find_package to recognise package / bid-section numbers.
# Each physical line below is one group of alternatives (number followed by a
# 标段/包/合同包 style suffix, a 标段/包/品目 style keyword followed by a number,
# "...编号" forms, etc.).
# The backslash at the end of each physical line is a string line continuation, so
# all alternatives are joined into one pattern without embedded newlines.
# NOTE(review): turning these literals into raw strings would keep the
# backslash-newline pairs inside the pattern and change its meaning; the
# continuations would have to be restructured first.
package_number_pattern = re.compile(
        '((施工|监理|监测|勘察|设计|劳务)(标段)?：?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})?[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
|(([a-zA-Z]包[：（）]?)?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|合同[包段]))\
|(([，；。、：（]|第)?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
|((标[段包项]|品目|标段（包）|包[组件标]|[标分子（]包)(\[|【)?：?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9}))\
|([，；。、：（]|^)(标的?|(招标|采购)?项目|子项目?)(\[|【)?：?([一二三四五六七八九十]+|[0-9]{1,9})\
|((([标分子（]|合同|项目|采购)包|[，。]标的|子项目|[分子]标|标[段包项]|包[组件标]?)编?号[:：]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,9}[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{0,9})\
|[，；。、：（]?(合同|分|子)?包：?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})')
# Plain (uncompiled) pattern string of expressions that look like package numbers
# but are not (e.g. "每个标段", "承包", file extensions, dates, money amounts).
# find_package blanks every match of this pattern out of the text before it
# applies package_number_pattern.
filter_package_pattern =  'CA标|(每个?|所有|相关|个|各|不分)[分子]?(标[段包项]?|包[组件标]?|合同包)|(质量|责任)三包|包[/每]|标段(划分|范围)|(承|压缩|软|皮|书|挂)包\
|标[识注签贴配]|[商油]标号|第X包|第[一二三四五六七八九十]+至[一二三四五六七八九十]+(标[段包项]?|包[组件标]?|合同[包段])\
|\.(docx|doc|pdf|xlsx|xls|jpg)|[一二三四五]次|五金|\d+[年月]|[\d.,]+万?元|\d+\.\d+' # filter out expressions wrongly taken as package numbers
def find_package(content):
    '''
    Find package / bid-section numbers in a text with regular expressions.

    The text is first normalised (ASCII colon and parentheses become their
    full-width forms, "号，" becomes "号："), then every match of
    filter_package_pattern is overwritten with spaces, and finally
    package_number_pattern is applied. Every substitution keeps the text
    length unchanged, so the offsets of the returned matches line up with
    the caller's original text.

    :param content: text to scan
    :return: list of re.Match objects (matched against the normalised,
             masked copy of content) that survived all filters
    '''
    packages = []
    content = content.replace('号，', '号：').replace(':', '：').replace('(', '（').replace(')', '）')
    # .replace('-包',' 包').replace('包-', '包 ').replace('-标', ' 标').replace('标段-', '标段 ').replace('-合同包', ' 合同包') # 72760191 标段：№10
    content = re.sub(r'[一二三四五六七八九十\d](标[段包项]|包[组件标])编号', ' 标段编号', content)

    # Mask expressions that merely look like package numbers. The iterator keeps
    # scanning the string it was created on; each hit is blanked everywhere.
    for noise in re.finditer(filter_package_pattern, content):
        content = content.replace(noise.group(0), ' ' * len(noise.group(0)))

    for match in re.finditer(package_number_pattern, content):
        text = match.group(0)
        start, end = match.span()
        before = content[:start]

        # Anything that follows achievement / credit-requirement wording is dropped.
        if re.search(r'(业绩|信誉要求)：|业绩(如下)?\d*[、：]', before):
            continue
        # Section numbering such as "2.10标段3" or "5.4标段划分".
        if re.match(r'\d', text) and re.search(r'\d\.$', before):
            continue
        if (re.search('[承每书/]包|XX|xx', text)
                or re.search(r'\d包[/每]\w|一包[0-9一二三四五六七八九十]+', content[start:end + 3])
                or re.search('[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{6,}', text)):
            continue
        if end + 2 < len(content) and re.search('标的物|包装|划分|标(书|准|志|记|识|签|贴|帜|本|底|价|量)',
                                                content[start:end + 2]):
            continue
        nearby = content[max(0, start - 2):end]
        # "不得参加同一标段" style wording.
        if re.search('同一(标段?|包)', nearby):
            continue
        # "三包" (warranty wording) unless it is really "第三包".
        if re.search('三包', nearby) and re.search('第三包', nearby) is None:
            continue
        if re.search(r'[1-9]\d{2,}$|\d{4,}|^[1-9]\d{2,}|合同包[A-Za-z]{2,}', text):
            continue
        # Case 463166661: "单位：包X10根" is a packing unit, not a package number.
        if re.search(r'单位：包|1包\d|[张箱]', content[max(0, start - 3):end + 2]):
            continue
        # 2024-12-03, case 562534840: "劳务分包" is not a package number.
        if text == '劳务分包':
            continue
        packages.append(match)
    return packages

def cut_repeat_name(s):
    '''
    Collapse a company name that is the same text repeated back to back.

    The last four characters are used as a marker: the candidate unit is the
    text up to and including the first occurrence of the marker, and the name
    is collapsed only when it equals that unit repeated once per marker
    occurrence.

    :param s: company name
    :return: the single unit when s is a pure repetition, otherwise s unchanged
    '''
    if len(s) < 8:
        return s
    marker = s[-4:]
    repeats = s.count(marker)
    unit = s[:s.find(marker) + 4]
    if repeats >= 2 and unit * repeats == s:
        return unit
    return s

def del_tabel_achievement(soup):
    '''
    Remove tables and table rows that publish past achievements ("业绩"), in place.

    Nothing is done unless the first 800 characters of soup.text contain a
    result-type keyword (中标/成交/入围/结果/评标/开标/候选人) and the whole text
    contains "业绩".

    :param soup: parsed document exposing .text and .find_all('table')
                 (presumably a BeautifulSoup object -- confirm against callers)
    :return: always None; matching tags are detached with tag.extract()
    '''
    # Guard: only result-type notices that mention achievements are processed.
    if re.search('中标|成交|入围|结果|评标|开标|候选人', soup.text[:800]) == None or re.search('业绩', soup.text)==None:
        return None
    # p1: wording of a heading that introduces an achievement section; it is
    # matched against the text of the tag preceding each table.
    p1 = '(中标|成交)(单位|候选人)的?(企业|项目|项目负责人|\w{,5})?业绩|类似(项目)?业绩|\w{,10}业绩$|业绩(公示|情况|荣誉)'
    # Pass 1 (English rendering of the string statement below): remove a table when
    # the preceding tag matches the achievement rule, or when the table itself
    # publishes achievement information.
    '''删除前面标签 命中业绩规则；当前标签为表格且公布业绩相关信息的去除'''
    for tag in soup.find_all('table'):
        pre_text = ""
        if tag.findPreviousSibling() != None:
            pre_text = tag.findPreviousSibling().text.strip()
            if pre_text == "" and tag.findPreviousSibling().findPreviousSibling() != None: # fix: the tag right before the table is empty and the one before it carries the text
                pre_text = tag.findPreviousSibling().findPreviousSibling().text.strip()

        # Text of the first row, used to recognise an achievement header row.
        tr_text = tag.find('tr').text.strip() if tag.find('tr') != None else ""
        #     print(re.search(p1, pre_text),pre_text, len(pre_text), re.findall('序号|中标候选人名称|项目名称|工程名称|合同金额|建设单位|业主', tr_text))
        if re.search(p1, pre_text) and len(pre_text) < 20 and tag.find('tr') != None and len(tr_text)<100:
            # Count header cells of the first row that look like achievement columns;
            # two hits are enough to drop the table together with its heading.
            _count = 0
            for td in tag.find('tr').find_all('td'):
                td_text = td.text.strip()
                if len(td_text) > 25:
                    break
                if len(td_text) < 25 and re.search('中标候选人|第[一二三四五1-5]候选人|(项目|业绩|工程)名称|\w{,10}业绩$|合同金额|建设单位|采购单位|业主|甲方', td_text):
                    _count += 1
                if _count >=2:
                    # NOTE(review): this detaches the immediately preceding sibling even
                    # when pre_text was taken from the sibling before it -- confirm intended.
                    pre_tag = tag.findPreviousSibling().extract()
                    del_tag = tag.extract()
                    # print('removed achievement table content', pre_tag.text + del_tag.text)
                    break
        elif re.search('业绩名称', tr_text) and re.search('建设单位|采购单位|业主', tr_text) and len(tr_text)<100:
            del_tag = tag.extract()
            # print('removed achievement table content', del_tag.text)
    # Rows collected for removal. The list is shared by all tables of pass 2, so rows
    # gathered for an earlier table are extracted again for later ones.
    del_trs = []
    # Pass 2 (English rendering of the string statement below): remove individual
    # table rows that publish achievement information.
    '''删除表格某些行公布的业绩信息'''
    for tag in soup.find_all('table'):
        text = tag.text
        if re.search('业绩', text) == None:
            continue
        # for tr in tag.find_all('tr'):
        trs = tag.find_all('tr')
        i = 0
        while i < len(trs):
            tr = trs[i]
            if len(tr.find_all('td'))==2 and tr.td!=None and tr.td.findNextSibling()!=None:
                # Two-cell row: a short "业绩" label followed by a cell that lists at
                # least two project-like items.
                td1_text =tr.td.text
                td2_text =tr.td.findNextSibling().text
                if re.search('业绩', td1_text)!=None and len(td1_text)<10 and len(re.findall('(\d、|（\d）)?[-\w（）、]+(工程|项目|勘察|设计|施工|监理|总承包|采购|更新)', td2_text))>=2:
                    # del_tag = tr.extract()
                    # print('removed achievement table content', del_tag.text)
                    del_trs.append(tr)
            elif tr.td != None and re.search('^业绩|业绩$', tr.td.text.strip()) and len(tr.td.text.strip())<25:
                # First cell is an achievement label; its rowspan / colspan tells how
                # many following rows belong to the achievement block.
                rows = tr.td.attrs.get('rowspan', '')
                cols = tr.td.attrs.get('colspan', '')
                if rows.isdigit() and int(rows)>2:
                    # Label spans several rows: collect all of them, then skip past them.
                    for j in range(int(rows)):
                        if i+j < len(trs):
                            del_trs.append(trs[i+j])
                    i += j
                elif cols.isdigit() and int(cols)>3 and len(tr.find_all('td'))==1 and i+2 < len(trs):
                    # Label is a single full-width title row: the next row must have the
                    # same total colspan to be treated as the header of the block.
                    next_tr_cols = 0
                    td_num = 0
                    for td in trs[i+1].find_all('td'):
                        td_num += 1
                        if td.attrs.get('colspan', '').isdigit():
                            next_tr_cols += int(td.attrs.get('colspan', ''))
                    if next_tr_cols == int(cols):
                        del_trs.append(tr)
                        # Collect following rows until a single-cell row or a row with
                        # clearly fewer cells than the header row is met.
                        for j in range(1,len(trs)-i):
                            if len(trs[i+j].find_all('td')) == 1:
                                break
                            elif len(trs[i+j].find_all('td')) >= td_num-1:
                                del_trs.append(trs[i+j])
                            else:
                                break
                        # NOTE(review): after a break, j points at the row that stopped the
                        # scan, so together with the i += 1 below that row is never examined
                        # by the outer loop -- confirm this is intended.
                        i += j
            i += 1
        for tr in del_trs:
            del_tag = tr.extract()
            # print('removed achievement table content', del_tag.text)

def is_all_winner(title):
    '''
    Tell from the title whether all bidders should be taken as winners.

    Per the original note: for deposit/custody type notices every bidder is a
    winner regardless of ranking; for shortlist type notices the ranking is
    followed, and every bidder is a winner when there is no ranking.

    :param title: notice title
    :return: 1 for a deposit-type title, 2 for a shortlist / framework /
             supplier-pool type title, False otherwise
    '''
    deposit_title = '(资金|公款|存款)?竞争性存[放款]|(资金|公款|存款)存放|存放银行|存款服务|国库现金管理'
    shortlist_title = '招募|入围|框架(协议)?采购|(单位|商|机构)入库|入库供应商|集中采购'
    # Order matters: the deposit rule takes precedence over the shortlist rule.
    for code, pattern in ((1, deposit_title), (2, shortlist_title)):
        if re.search(pattern, title):
            return code
    return False

def is_deposit_project(title, name, requirement):
    '''
    Decide with a regular expression whether a project is a bank-deposit project.

    The three texts are concatenated without a separator before matching, so
    a keyword may also match across the boundary between two of them.

    :param title: notice title
    :param name: project name
    :param requirement: procurement content
    :return: True when any deposit keyword is found, otherwise False
    '''
    deposit_keywords = '(资金|公款|存款)?竞争性存[放款]|(资金|公款|存款)(（.{2,10}）)?存放|存放银行|存款(服务|业务|项目)|国库现金管理|存款账户开户|(管理|存款|合作)(定点|专户)?银行|贷款合作银行|资金监管账户|开户银行项目|专户开户银行|银行专户选择|定期存[款放]|专项债券?专用账户'
    combined = title + name + requirement
    return re.search(deposit_keywords, combined) is not None

def get_money_entity(sentence_text, found_yeji=0, in_attachment=False):
    '''
    Extract money amounts from a sentence with regular expressions.

    Four patterns are OR-ed into one regex: "cn" (amounts written with Chinese
    numerals), "key_word" (a money keyword followed by a number), "behind_m"
    (a number followed by a unit) and "front_m" (a unit followed by a number).
    For every match the named groups are sorted by the prefix of the group
    name (money / unit / text / filter / science); the candidate then goes
    through a chain of heuristics (percentages, phone numbers, missing units,
    oversized values ...), gets a note describing the kind of amount, and is
    converted to a number string with getUnifyMoney / getMultipleFactor
    (helpers defined outside this block).

    :param sentence_text: text to scan
    :param found_yeji: counter supplied by the caller; it is increased by one
                       when scanning stops because achievement ("业绩") wording
                       precedes a match
    :param in_attachment: not used inside this function body
    :return: (money_list, found_yeji); every item of money_list is a tuple
             (entity_text, start_index, end_index, unit, notes) where
             start_index is the match start shifted by the length of the text
             captured before the number and end_index is the match end
    '''
    money_list = []
    # Recognise money amounts with regular expressions.
    entity_type = "money"
    list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
                          "key_word": "((?P<text_key_word>(?:[￥¥]+，?|(中标|成交|合同|承租|投资|服务)）?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资|成本)(\d：|\d=\d[-+×]\d：)?(?:[,，\[（\(]*\s*(人民币|单位：)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\]）\)]?)\s*[，,:：]*(RMB|USD|EUR|JPY|CNY)?[:：]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[：:])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d+,\d+\.\d{2,6}|\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[（\(]?(?P<filter_>[%％‰折])*\s*，?((金额)?单位[:：])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天年月日]*))\s*[）\)]?))",
                          "front_m": "((?P<text_front_m>(?:[（\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[）\)]?)\s*[,，:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z金额价格]{,2}?))(?P<money_front_m>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:，?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
                          "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:，?)[百千]*)(?P<science_behind_m>(E-?\d+))?(人民币)?[\(（]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\)）]?)"}
    # 2021/7/19: adjusted the money and unit regexes to fix amounts that were dropped because unit extraction failed.
    # 2024/04/15: adjusted front_m to fix extraction for text such as "详见合同元，合同金额：378.8万元".

    # Alternation order is cn, key_word, behind_m, front_m.
    pattern_money = re.compile("%s|%s|%s|%s" % (
    list_money_pattern["cn"], list_money_pattern["key_word"], list_money_pattern["behind_m"],
    list_money_pattern["front_m"]))

    # sentence_text = re.sub('\d+[年月日]', '', sentence_text) # fix 560180018: in "中标价（元）：3年投标报价（元）含税6299700.00" the "3年" was taken as the amount

    # if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
    #     found_yeji += 1
    # if found_yeji >= 2:  # drop every amount after achievement wording # 2025/02/10 logic fix: the winning amount was filtered by achievement wording in an earlier sentence, case 589003579
    #     all_match = []
    # else:
    ser = re.search('((收费标准|计算[方公]?式)：|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s（）()\[\]【】\d\.%％‰\+\-*×/]{20,}[，。]?', sentence_text)  # blank out amounts that belong to a fee standard / formula
    if ser:
        # Replaced by spaces of equal length so later offsets stay valid.
        sentence_text = sentence_text.replace(ser.group(0), ' ' * len(ser.group(0)))
    all_match = re.finditer(pattern_money, sentence_text)
    # print('all_match:', all_match)
    for _match in all_match:
        # print('_match: ', _match.group())
        if re.search('^元/1\d{10}，$', _match.group(0)): # fix 495042766: "现场负责人 姚元 / 13488160460" was predicted as an amount
            continue
        if len(_match.group()) > 0:
            # print("===",_match.group())
            # # print(_match.groupdict())
            notes = ''  # 2021/7/20: note describing the kind of amount (e.g. upper-case amount, total investment, deposit)
            unit = ""
            entity_text = ""
            start_index = ""
            end_index = ""
            text_beforeMoney = ""
            filter = ""
            filter_unit = False
            notSure = False
            science = ""
            if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text[:_match.span()[0]]):  # 2021/7/21: stop at the first amount that follows achievement wording
                # print('amount follows achievement wording: ', _match.group(0))
                found_yeji += 1
                break
            # Sort the non-empty named groups by the prefix of the group name.
            # NOTE(review): filter_unit* groups also have the prefix "filter", so a
            # non-empty one sets `filter` as well and the candidate is already dropped
            # by the `filter != ""` check below.
            for k, v in _match.groupdict().items():
                if v != "" and v is not None:
                    if k == 'text_key_word':
                        notSure = True
                    if k.split("_")[0] == "money":
                        entity_text = v
                        # print(_match.group(k), 'entity_text: ', sentence_text[_match.start(k): _match.end(k)])
                        if entity_text.endswith(',00'):  # an amount cannot end with ",00"; treated as a misread decimal point and cut off
                            entity_text = entity_text[:-3]
                    if k.split("_")[0] == "unit":
                        if 'behind' in k or unit == "":  # the unit after the number wins, e.g. "预算金额(元)：160万元", "总价（万元）：最终报价：695000.00（元）"
                            unit = v
                    if k.split("_")[0] == "text":
                        text_beforeMoney = v
                    if k.split("_")[0] == "filter":
                        filter = v
                    if re.search("filter_unit", k) is not None:
                        filter_unit = True
                    if k.split("_")[0] == 'science':
                        science = v
            # print("金额：{0} ,单位：{1}, 前文：{2}, filter: {3}, filter_unit: {4}".format(entity_text,unit,text_beforeMoney,filter,filter_unit))
            # if re.search('(^\d{2,},\d{4,}万?$)|(^\d{2,},\d{2}万?$)', entity_text.strip()):  # 2021/7/19: correct a decimal point that OCR read as a comma
            #     if re.search('[幢栋号楼层]', sentence_text[max(0, _match.span()[0] - 2):_match.span()[0]]):
            #         entity_text = re.sub('\d+,', '', entity_text)
            #     else:
            #         entity_text = entity_text.replace(',', '.')
            #     # print('corrected a decimal point that OCR read as a comma')

            if filter != "":
                continue
            if len(entity_text)>30 or len(re.sub('[E-]', '', science))>2: # limit the number length; huge values (e.g. attachment of 265339018) raised decimal.InvalidOperation
                continue
            start_index, end_index = _match.span()
            start_index += len(text_beforeMoney)

            # English rendering of the string statement below: drop phone numbers taken as amounts.
            '''过滤掉手机号码作为金额'''
            if re.search('电话|手机|联系|方式|编号|编码|日期|数字|时间', text_beforeMoney):
                # print('filtered: phone number taken as amount')
                continue
            elif re.search('^1[3-9]\d{9}$', entity_text) and re.search('：\w{1,3}$', text_beforeMoney): # e.g. '13863441880' in '金额（万元）：季勇13863441880'
                # print('filtered: phone number taken as amount')
                continue
            elif re.search('^\d(.\d{1,2})?$', entity_text) and re.search('\d$', _match.group(0)) and re.search('^[、.]', sentence_text[_match.end():]): # 170756755: "控制价为：1、合理利润率上限" (list numbering)
                # print('filtered wrong amount: ', _match.group(0))
                continue

            if unit == "":  # 2021/7/21: supply a unit when the amount has clear money features, so it is not dropped
                if (re.search('(￥|¥|RMB|CNY)[:：]?$', text_beforeMoney) or re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', entity_text)):
                    if entity_text.endswith('万元'):
                        unit = '万元'
                        entity_text = entity_text[:-2]
                    else:
                        unit = '元'
                    # print('1: clear money feature, unit set to 元')
                elif re.search('USD[:：]?$', text_beforeMoney):
                    unit = '美元'
                elif re.search('EUR[:：]?$', text_beforeMoney):
                    unit = '欧元'
                elif re.search('JPY[:：]?$', text_beforeMoney):
                    unit = '日元'
                elif re.search('^[-—]+[\d,.]+万元', sentence_text[end_index:]):
                    # print('two amounts joined by a dash; the unit of the second one is used')
                    unit = '万元'
                elif re.search('^，?(价格币种：\w{2,3}，)?价格单位：万元', sentence_text[end_index:]): # fix 494731937 missing unit: "中标价格：39501.094425，价格币种：人民币，价格单位：万元，"
                    unit = '万元'
                elif re.search('万元', sentence_text[max(0, start_index-10):start_index]): # fix 511402017: "价格类型：（万元）报价：13311.1582，得分：84.46，"
                    unit = '万元'
                elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资|控制|拦标)）?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|成本)(小写)?[:：为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:  # fix
                    if re.search('^[\d，,.]+$', entity_text) and float(re.sub('[,，]', '', entity_text))<500 and re.search('万元', sentence_text):
                        unit = '万元'
                        # print('small amount and the sentence mentions 万元: unit set to 万元')
                    elif re.search('^\d{1,3}\.\d{4,6}$', entity_text) and re.search('0000$', entity_text) == None:
                        unit = '万元'
                    else:
                        unit = '元'
                        # print('amount directly after a keyword: unit set to 元')
                elif re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7}，?)$)|(^\d{,3}(,\d{3})+，?$)', entity_text):
                    unit = '元'
                    # print('3: clear money feature, unit set to 元')
                else:
                    # print('filtered amount without unit: ',entity_text)
                    continue
            elif unit == '万元':
                if end_index < len(sentence_text) and sentence_text[end_index] == '元' and re.search('\d$', entity_text):
                    unit = '元'
                elif re.search('^[5-9]\d{6,}\.\d{2}$', entity_text): # a 万元 figure of 50 billion yuan or more is treated as 元
                    unit = '元'
            if unit.find("万") >= 0 and entity_text.find("万") >= 0:  # 2021/7/19: the amount text already contains 万, so the unit multiplier is not applied
                # print('both amount and unit contain 万, amount:',entity_text, 'unit:',unit)
                unit = "元"
            if re.search('.*万元万元', entity_text):  # 2021/7/19: collapse a doubled "万元万元"
                # print('collapsed doubled 万元',entity_text)
                entity_text = entity_text.replace('万元万元', '万元')
            else:
                if filter_unit:
                    continue

            # symbol = '-' if entity_text.startswith('-') and not entity_text.startswith('--') and re.search('\d+$', sentence_text[:begin_index_temp]) == None else ''  # keep the minus sign of negative amounts, except in ranges such as "起拍价：105.29-200.46万元" or "预 算 --- 350000.0"; 2023/04/14: sign handling removed

            # Keep only digits, the decimal point and Chinese numeral / unit characters.
            entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", entity_text)
            # print('amount before conversion:', entity_text, 'unit:', unit, 'notes:',notes, 'text_beforeMoney:',text_beforeMoney)
            # Classify the amount by the wording around / inside the match; the first rule that hits wins.
            if re.search('总投资|投资总额|总预算|总概算|(投资|招标|资金|存放|操作|融资)规模|批复概算|投资额|总规模|工程造价|总金额',
                         sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]]):  # 2021/8/5: total-investment amounts; 2024/10/31: 工程造价 counts as total investment
                # print('total investment amount: ', _match.group(0))
                notes = '总投资'
            elif re.search('投资|概算|建安费|其他费用|基本预备费',
                           sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/11/18: investment amounts are not tender amounts
                notes = '投资'
            # elif re.search('工程造价',
            #                sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/12/20: 工程造价 is not a tender amount
            #     notes = '工程造价'
            elif (re.search('保证金', sentence_text[max(0, _match.span()[0] - 5):_match.span()[1]])
                  or re.search('保证金的?(缴纳)?(金额|金\?|额|\?)?[\(（]*(万?元|为?人民币|大写|调整|变更|已?修改|更改|更正)?[\)）]*[:：为]',
                               sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]])
                  or re.search('保证金由[\d.,]+.{,3}(变更|修改|更改|更正|调整?)为',
                               sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])):
                notes = '保证金'
                # print('deposit wording:', sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])
            elif re.search('成本(警戒|预警)(线|价|值)[^0-9元]{,10}',
                           sentence_text[max(0, _match.span()[0] - 10):_match.span()[0]]):
                notes = '成本警戒线'
            elif re.search('(监理|设计|勘察)(服务)?费(报价)?[约为：]|服务金额', sentence_text[_match.span()[0]:_match.span()[1]]):
                # cost_re = re.search('(监理|设计|勘察)(服务)?费', sentence_text[_match.span()[0]:_match.span()[1]])
                # notes = cost_re.group(1)
                notes = '招标或中标金额'
            elif re.search('单价|总金额', sentence_text[_match.span()[0]:_match.span()[1]]):
                notes = '单价'
            elif re.search('^[/每]', sentence_text[_match.end():]):
                # print('unit price:', _match.group(0))
                notes = '单价'
            elif re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆]', entity_text) != None:
                notes = '大写'
                if entity_text[0] == "拾":  # 2021/12/16: an upper-case amount starting with 拾 omits the leading digit; prepend 壹 so it converts correctly
                    entity_text = "壹" + entity_text
                # print('note set: notes = 大写')
            # Convert to a plain number string.
            # NOTE(review): every path of the `unit == ""` block above either sets a unit
            # or continues, so the `else` branch below and the `unit == ""` test further
            # down look unreachable -- confirm before relying on them.
            if len(unit) > 0:
                if unit.find('万') >= 0 and len(entity_text.split('.')[0]) >= 8:  # 2021/7/19: a 万元 figure with 8+ integer digits is taken as plain 元
                    # print('oversized 万元 amount corrected, amount:', entity_text, 'unit:', unit)
                    entity_text = str(
                        getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]) / 10000)
                    unit = '元'  # unit reset to 元 after the correction
                else:
                    # print('str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0])):')
                    entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]))
            else:
                if entity_text.find('万') >= 0 and entity_text.split('.')[0].isdigit() and len(
                        entity_text.split('.')[0]) >= 8:
                    entity_text = str(getUnifyMoney(entity_text) / 10000)
                    # print('oversized amount whose text contains 万 corrected')
                else:
                    entity_text = str(getUnifyMoney(entity_text))
            if science and re.search('^E-?\d+$', science):  # scientific notation
                entity_text = str(Decimal(entity_text + science)) if Decimal(entity_text + science) > 100 and Decimal(
                    entity_text + science) < 10000000000 else entity_text  # the exponent is applied only when the result is > 100 and < 10000000000 (the original note said "below 1 million")

            if float(entity_text) > 100000000000:  # 2022/3/4: the lower bound (float(entity_text)<100) was removed
                # print('filtered amount: float(entity_text)<100 or float(entity_text)>100000000000', entity_text, unit)
                continue

            if notSure and unit == "" and float(entity_text) > 100 * 10000:
                # print('filtered amount: notSure and unit=="" and float(entity_text)>100*10000:', entity_text, unit)
                continue
            # print("金额：{0} ,单位：{1}, 前文：{2}, filter: {3}, filter_unit: {4}".format(entity_text, unit, text_beforeMoney,
            #                                                                      filter, filter_unit))
            if re.search('[%％‰折]|费率|下浮率', text_beforeMoney) and float(entity_text)<1000: # drop values that are probably a rate
                # print('filtered: probably a rate')
                continue
            money_list.append((entity_text, start_index, end_index, unit, notes))
    return money_list, found_yeji

def recall(y_true, y_pred):
    '''
    Recall metric computed with the Keras backend.

    Both inputs are clipped to [0, 1] and rounded, so values are treated as
    binary decisions. K.epsilon() is added to the denominator so that a batch
    without positive labels yields 0 instead of a division by zero. This
    replaces the former Python-level `if c3 == 0` test, which returned a bare
    int and cannot be relied on when c3 is a symbolic tensor (assumption about
    graph-mode backends -- confirm for the backend in use).

    @Argus:
        y_true: ground-truth labels
        y_pred: labels predicted by the model

    @Return
        recall = true positives / (actual positives + epsilon)
    '''
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives / (actual_positives + K.epsilon())


def f1_score(y_true, y_pred):
    '''
    F1 metric computed with the Keras backend.

    Both inputs are clipped to [0, 1] and rounded, so values are treated as
    binary decisions. K.epsilon() is added to every denominator: the original
    divided by the number of predicted positives and by (precision + recall)
    without any guard, which divides by zero when nothing is predicted
    positive or when there is no true positive.

    @Argus:
        y_true: ground-truth labels
        y_pred: labels predicted by the model

    @Return
        F1 value, 2 * p * r / (p + r + epsilon)
    '''
    eps = K.epsilon()
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    # Local names deliberately differ from the sibling metric functions
    # `precision` and `recall` so those are not shadowed here.
    prec = true_positives / (predicted_positives + eps)
    rec = true_positives / (actual_positives + eps)
    return 2 * (prec * rec) / (prec + rec + eps)


def precision(y_true, y_pred):
    '''
    Precision metric computed with the Keras backend.

    Both inputs are clipped to [0, 1] and rounded, so values are treated as
    binary decisions. K.epsilon() is added to the denominator so that a batch
    in which nothing is predicted positive yields 0 instead of a division by
    zero.

    @Argus:
        y_true: ground-truth labels
        y_pred: labels predicted by the model

    @Return
        precision = true positives / (predicted positives + epsilon)
    '''
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return true_positives / (predicted_positives + K.epsilon())

# def print_metrics(history):
#     '''
#     制作每次迭代的各metrics变化图片
#
#     @Arugs:
#         history: 模型训练迭代的历史记录
#     '''
#     import matplotlib.pyplot as plt
#
#     # loss图
#     loss = history.history['loss']
#     val_loss = history.history['val_loss']
#     epochs = range(1, len(loss) + 1)
#     plt.subplot(2, 2, 1)
#     plt.plot(epochs, loss, 'bo', label='Training loss')
#     plt.plot(epochs, val_loss, 'b', label='Validation loss')
#     plt.title('Training and validation loss')
#     plt.xlabel('Epochs')
#     plt.ylabel('Loss')
#     plt.legend()
#
#     # f1图
#     f1 = history.history['f1_score']
#     val_f1 = history.history['val_f1_score']
#     plt.subplot(2, 2, 2)
#     plt.plot(epochs, f1, 'bo', label='Training f1')
#     plt.plot(epochs, val_f1, 'b', label='Validation f1')
#     plt.title('Training and validation f1')
#     plt.xlabel('Epochs')
#     plt.ylabel('F1')
#     plt.legend()
#
#     # precision图
#     prec = history.history['precision']
#     val_prec = history.history['val_precision']
#     plt.subplot(2, 2, 3)
#     plt.plot(epochs, prec, 'bo', label='Training precision')
#     plt.plot(epochs, val_prec, 'b', label='Validation pecision')
#     plt.title('Training and validation precision')
#     plt.xlabel('Epochs')
#     plt.ylabel('Precision')
#     plt.legend()
#
#     # recall图
#     recall = history.history['recall']
#     val_recall = history.history['val_recall']
#     plt.subplot(2, 2, 4)
#     plt.plot(epochs, recall, 'bo', label='Training recall')
#     plt.plot(epochs, val_recall, 'b', label='Validation recall')
#     plt.title('Training and validation recall')
#     plt.xlabel('Epochs')
#     plt.ylabel('Recall')
#     plt.legend()
#
#     plt.show()

def clean_company(entity_text):
    '''
    Clean a recognised company-name entity.

    All whitespace is removed first. Names that match a reject rule (e.g.
    "<2-3 chars>分公司" alone, names starting with xx/XX) give ''. Otherwise
    exactly one clean-up rule is applied, in this order: file-extension
    residue, list numbering, a leading noise prefix (date, index, package
    label, punctuation ...) in front of Chinese text, stray punctuation.
    Finally the name is rejected when fewer than two characters remain after
    generic organisation words are removed.

    :param entity_text: raw company name
    :return: cleaned name, or '' when the entity is not a usable company name
    '''
    entity_text = re.sub(r'\s', '', entity_text)
    if re.search(r'^(\d{4}年)?[\-\d月日份]*\w{2,3}分公司$|^\w{,6}某(部|医院)$|空间布局$', entity_text):  # reject
        # print('company entity rejected:', entity_text)
        return ''
    if re.match('xx|XX', entity_text):  # reject
        # print('company entity rejected:', entity_text)
        return ''

    # "docx?" / "xlsx?" instead of "doc|docx" / "xls|xlsx": with the shorter
    # alternative listed first the longer extension never matched in full, so
    # ".docx某某公司" was cleaned to "x某某公司".
    file_ext = r'\.?(rar|zip|pdf|df|docx?|xlsx?|jpg|png)'
    numbering = r'（\d+）|\d+\.|\s|&nbsp'
    leading_noise = (r'((\d{4}[年-])[\-\d:\s元月日份]*|\d{1,2}月[\d日.-]*(日?常?计划)?|\d{1,2}[.-]?'
                     r'|[A-Za-z](包|标段?)?|[a-zA-Z0-9]+-[a-zA-Z0-9-]*|[a-zA-Z]{1,2}|[①②③④⑤⑥⑦⑧⑨⑩]'
                     r'|\s|title\=|【[a-zA-Z0-9]+】|[^\w])[\u4e00-\u9fa5]+')

    if re.match(file_ext, entity_text):
        entity_text = re.sub(file_ext, '', entity_text)
    elif re.match(numbering, entity_text):
        entity_text = re.sub(numbering, '', entity_text)
    else:
        prefix_match = re.match(leading_noise, entity_text)
        if prefix_match:
            # NOTE(review): str.replace removes every occurrence of the prefix text,
            # not only the leading one; kept as in the original.
            entity_text = entity_text.replace(prefix_match.group(1), '')
        elif re.search(r"\]|\[|\]|[【】{}「?:∶〔·.'#~_ΓΙεⅠ]", entity_text):
            # NOTE(review): "{" and "}" trigger this branch but are not in the
            # removal set below, so braces are left in place; kept as in the
            # original -- confirm whether they should be stripped too.
            entity_text = re.sub(r"\]|\[|\]|[【】「?:∶〔·.'#~_ΓΙεⅠ]", '', entity_text)

    core = re.sub('(项目|分|有限)?公司|集团|制造部|中心|医院|学校|大学|中学|小学|幼儿园', '', entity_text)
    if len(core) < 2:
        # print('company entity rejected:', entity_text)
        return ''
    return entity_text

if __name__ == "__main__":
    # Manual check when the module is run as a script: print the result of
    # getUnifyMoney for a sample amount written with upper-case Chinese numerals.
    # print(fool_char_to_id[">"])
    sample_amount = '伍仟贰佰零壹拾伍万零捌佰壹拾元陆角伍分'
    print(getUnifyMoney(sample_amount))
    # model = getModel_w2v()
    # vocab,matrix = getVocabAndMatrix(model, Embedding_size=128)
    # save([vocab,matrix],"vocabMatrix_words.pk")