''' Created on 2018年12月20日 @author: User ''' import numpy as np import re import gensim from keras import backend as K import os,sys import time import traceback from threading import RLock # from pai_tf_predict_proto import tf_predict_pb2 import requests model_w2v = None lock_model_w2v = RLock() USE_PAI_EAS = False Lazy_load = False # API_URL = "http://192.168.2.103:8802" API_URL = "http://127.0.0.1:888" # USE_API = True USE_API = False def getCurrent_date(format="%Y-%m-%d %H:%M:%S"): _time = time.strftime(format,time.localtime()) return _time def getw2vfilepath(): filename = "wiki_128_word_embedding_new.vector" w2vfile = getFileFromSysPath(filename) if w2vfile is not None: return w2vfile return filename def getLazyLoad(): global Lazy_load return Lazy_load def getFileFromSysPath(filename): for _path in sys.path: if os.path.isdir(_path): for _file in os.listdir(_path): _abspath = os.path.join(_path,_file) if os.path.isfile(_abspath): if _file==filename: return _abspath return None model_word_file = os.path.dirname(__file__)+"/../singlew2v_model.vector" model_word = None lock_model_word = RLock() from decimal import Decimal import logging logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) import pickle import os import json #自定义jsonEncoder class MyEncoder(json.JSONEncoder): def __init__(self): import numpy as np global np def default(self, obj): if isinstance(obj, np.ndarray): return obj.tolist() elif isinstance(obj, bytes): return str(obj, encoding='utf-8') elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)): return float(obj) elif isinstance(obj,(np.int64,np.int32)): return int(obj) return json.JSONEncoder.default(self, obj) vocab_word = None vocab_words = None file_vocab_word = "vocab_word.pk" file_vocab_words = "vocab_words.pk" selffool_authorization = "NjlhMWFjMjVmNWYyNzI0MjY1OGQ1M2Y0ZmY4ZGY0Mzg3Yjc2MTVjYg==" selffool_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/selffool_gpu" selffool_seg_authorization = "OWUwM2Q0ZmE3YjYxNzU4YzFiMjliNGVkMTA3MzJkNjQ2MzJiYzBhZg==" selffool_seg_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/selffool_seg_gpu" codename_authorization = "Y2M5MDUxMzU1MTU4OGM3ZDk2ZmEzYjkxYmYyYzJiZmUyYTgwYTg5NA==" codename_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/codename_gpu" form_item_authorization = "ODdkZWY1YWY0NmNhNjU2OTI2NWY4YmUyM2ZlMDg1NTZjOWRkYTVjMw==" form_item_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/form" person_authorization = "N2I2MDU2N2Q2MGQ0ZWZlZGM3NDkyNTA1Nzc4YmM5OTlhY2MxZGU1Mw==" person_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/person" role_authorization = "OWM1ZDg5ZDEwYTEwYWI4OGNjYmRlMmQ1NzYwNWNlZGZkZmRmMjE4OQ==" role_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/role" money_authorization = "MDQyNjc2ZDczYjBhYmM4Yzc4ZGI4YjRmMjc3NGI5NTdlNzJiY2IwZA==" money_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/money" codeclasses_authorization = "MmUyNWIxZjQ2NjAzMWJlMGIzYzkxMjMzNWY5OWI3NzJlMWQ1ZjY4Yw==" codeclasses_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/codeclasses" def viterbi_decode(score, transition_params): """Decode the highest scoring sequence of tags outside of TensorFlow. This should only be used at test time. Args: score: A [seq_len, num_tags] matrix of unary potentials. transition_params: A [num_tags, num_tags] matrix of binary potentials. Returns: viterbi: A [seq_len] list of integers containing the highest scoring tag indices. viterbi_score: A float containing the score for the Viterbi sequence. """ trellis = np.zeros_like(score) backpointers = np.zeros_like(score, dtype=np.int32) trellis[0] = score[0] for t in range(1, score.shape[0]): v = np.expand_dims(trellis[t - 1], 1) + transition_params trellis[t] = score[t] + np.max(v, 0) backpointers[t] = np.argmax(v, 0) viterbi = [np.argmax(trellis[-1])] for bp in reversed(backpointers[1:]): viterbi.append(bp[viterbi[-1]]) viterbi.reverse() viterbi_score = np.max(trellis[-1]) return viterbi, viterbi_score def limitRun(sess,list_output,feed_dict,MAX_BATCH=1024): len_sample = 0 if len(feed_dict.keys())>0: len_sample = len(feed_dict[list(feed_dict.keys())[0]]) if len_sample>MAX_BATCH: list_result = [[] for _ in range(len(list_output))] _begin = 0 while(_begin")) temp_len += 1 if out_index in [0]: temp = list_append+temp else: temp = temp+list_append else: for words in list_word: temp.append(getIndexOfWords(words)) list_append = [] temp_len = len(temp) while(temp_len")) temp_len += 1 if out_index in [0,1]: temp = list_append+temp else: temp = temp+list_append result.append(temp) out_index += 1 return result def encodeInput_form(input,MAX_LEN=30): x = np.zeros([MAX_LEN]) for i in range(len(input)): if i>=MAX_LEN: break x[i] = getIndexOfWord(input[i]) return x def getVocabAndMatrix(model,Embedding_size = 60): ''' @summary:获取子向量的词典和子向量矩阵 ''' vocab = [""]+model.index2word embedding_matrix = np.zeros((len(vocab),Embedding_size)) for i in range(1,len(vocab)): embedding_matrix[i] = model[vocab[i]] return vocab,embedding_matrix def getIndexOfWord(word): global vocab_word,file_vocab_word if vocab_word is None: if os.path.exists(file_vocab_word): vocab = load(file_vocab_word) vocab_word = dict((w, i) for i, w in enumerate(np.array(vocab))) else: model = getModel_word() vocab,_ = getVocabAndMatrix(model, Embedding_size=60) vocab_word = dict((w, i) for i, w in enumerate(np.array(vocab))) save(vocab,file_vocab_word) if word in vocab_word.keys(): return vocab_word[word] else: return vocab_word[''] def changeIndexFromWordToWords(tokens,word_index): ''' @summary:转换某个字的字偏移为词偏移 ''' before_index = 0 after_index = 0 for i in range(len(tokens)): after_index = after_index+len(tokens[i]) if before_index<=word_index and after_index>word_index: return i before_index = after_index return i+1 def getIndexOfWords(words): global vocab_words,file_vocab_words if vocab_words is None: if os.path.exists(file_vocab_words): vocab = load(file_vocab_words) vocab_words = dict((w, i) for i, w in enumerate(np.array(vocab))) else: model = getModel_w2v() vocab,_ = getVocabAndMatrix(model, Embedding_size=128) vocab_words = dict((w, i) for i, w in enumerate(np.array(vocab))) save(vocab,file_vocab_words) if words in vocab_words.keys(): return vocab_words[words] else: return vocab_words[""] def log(msg): ''' @summary:打印信息 ''' logger.info(msg) def debug(msg): ''' @summary:打印信息 ''' logger.debug(msg) def save(object_to_save, path): ''' 保存对象 @Arugs: object_to_save: 需要保存的对象 @Return: 保存的路径 ''' with open(path, 'wb') as f: pickle.dump(object_to_save, f) def load(path): ''' 读取对象 @Arugs: path: 读取的路径 @Return: 读取的对象 ''' with open(path, 'rb') as f: object1 = pickle.load(f) return object1 fool_char_to_id = load(os.path.dirname(__file__)+"/fool_char_to_id.pk") def getIndexOfWord_fool(word): if word in fool_char_to_id.keys(): return fool_char_to_id[word] else: return fool_char_to_id["[UNK]"] def find_index(list_tofind,text): ''' @summary: 查找所有词汇在字符串中第一次出现的位置 @param: list_tofind:待查找词汇 text:字符串 @return: list,每个词汇第一次出现的位置 ''' result = [] for item in list_tofind: index = text.find(item) if index>=0: result.append(index) else: result.append(-1) return result def combine(list1,list2): ''' @summary:将两个list中的字符串两两拼接 @param: list1:字符串list list2:字符串list @return:拼接结果list ''' result = [] for item1 in list1: for item2 in list2: result.append(str(item1)+str(item2)) return result def getDigitsDic(unit): ''' @summary:拿到中文对应的数字 ''' DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9, "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9} return DigitsDic.get(unit) def getMultipleFactor(unit): ''' @summary:拿到单位对应的值 ''' MultipleFactor = {"兆":Decimal(1000000000000),"亿":Decimal(100000000),"万":Decimal(10000),"仟":Decimal(1000),"千":Decimal(1000),"佰":Decimal(100),"百":Decimal(100),"拾":Decimal(10),"十":Decimal(10),"元":Decimal(1),"圆":Decimal(1),"角":round(Decimal(0.1),1),"分":round(Decimal(0.01),2)} return MultipleFactor.get(unit) def getUnifyMoney(money): ''' @summary:将中文金额字符串转换为数字金额 @param: money:中文金额字符串 @return: decimal,数据金额 ''' MAX_MONEY = 1000000000000 MAX_NUM = 12 #去掉逗号 money = re.sub("[,,]","",money) money = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",money) result = Decimal(0) chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"] # chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","圆","元","角","分"] chnFactorUnits = ["兆", "亿", "万", "仟", '千', "佰", '百', "拾", '十',"圆", "元", "角", "分"] # 20240611 修复大写提取错误 '陆拾陆亿伍千柒佰零叁万肆千叁佰陆拾伍元' Decimal('11607430365') LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$") BigMoneypattern = re.compile("^零?(?P[%s])$"%("".join(chnDigits))) try: if re.search(LowMoneypattern,money) is not None: return Decimal(money) elif re.search(BigMoneypattern,money) is not None: return getDigitsDic(re.search(BigMoneypattern,money).group("BigMoney")) for factorUnit in chnFactorUnits: if re.search(re.compile(".*%s.*"%(factorUnit)),money) is not None: subMoneys = re.split(re.compile("%s(?!.*%s.*)"%(factorUnit,factorUnit)),money) if re.search(re.compile("^(\d+)(\.\d+)?$"),subMoneys[0]) is not None: if MAX_MONEY/getMultipleFactor(factorUnit)1: if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"),subMoneys[1]) is not None: result += Decimal(subMoneys[1]) elif len(subMoneys[1])==1: if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[1]) is not None: result += Decimal(getDigitsDic(subMoneys[1])) else: result += Decimal(getUnifyMoney(subMoneys[1])) break except Exception as e: # traceback.print_exc() return Decimal(0) return result def getModel_w2v(): ''' @summary:加载词向量 ''' global model_w2v,lock_model_w2v with lock_model_w2v: if model_w2v is None: model_w2v = gensim.models.KeyedVectors.load_word2vec_format(getw2vfilepath(),binary=True) return model_w2v def getModel_word(): ''' @summary:加载字向量 ''' global model_word,lock_model_w2v with lock_model_word: if model_word is None: model_word = gensim.models.KeyedVectors.load_word2vec_format(model_word_file,binary=True) return model_word # getModel_w2v() # getModel_word() def findAllIndex(substr,wholestr): ''' @summary: 找到字符串的子串的所有begin_index @param: substr:子字符串 wholestr:子串所在完整字符串 @return: list,字符串的子串的所有begin_index ''' copystr = wholestr result = [] indexappend = 0 while(True): index = copystr.find(substr) if index<0: break else: result.append(indexappend+index) indexappend += index+len(substr) copystr = copystr[index+len(substr):] return result def spanWindow(tokens,begin_index,end_index,size,center_include=False,word_flag = False,use_text = False,text = None): ''' @summary:取得某个实体的上下文词汇 @param: tokens:句子分词list begin_index:实体的开始index end_index:实体的结束index size:左右两边各取多少个词 center_include:是否包含实体 word_flag:词/字,默认是词 @return: list,实体的上下文词汇 ''' if use_text: assert text is not None length_tokens = len(tokens) if begin_index>size: begin = begin_index-size else: begin = 0 if end_index+sizesize else 0 end = end_index + size result.append(sentence_text[begin: begin_index]) if center_include: result.append(sentence_text[begin_index: end_index]) result.append(sentence_text[end_index: end]) return result #根据规则补全编号或名称两边的符号 def fitDataByRule(data): symbol_dict = {"(":")", "(":")", "[":"]", "【":"】", ")":"(", ")":"(", "]":"[", "】":"【"} leftSymbol_pattern = re.compile("[\((\[【]") rightSymbol_pattern = re.compile("[\))\]】]") leftfinds = re.findall(leftSymbol_pattern,data) rightfinds = re.findall(rightSymbol_pattern,data) result = data if len(leftfinds)+len(rightfinds)==0: return data elif len(leftfinds)==len(rightfinds): return data elif abs(len(leftfinds)-len(rightfinds))==1: if len(leftfinds)>len(rightfinds): if symbol_dict.get(data[0]) is not None: result = data[1:] else: #print(symbol_dict.get(leftfinds[0])) result = data+symbol_dict.get(leftfinds[0]) else: if symbol_dict.get(data[-1]) is not None: result = data[:-1] else: result = symbol_dict.get(rightfinds[0])+data result = re.sub("[。]","",result) return result from datetime import date # 时间合法性判断 def isValidDate(year, month, day): try: date(year, month, day) except: return False else: return True time_format_pattern = re.compile("((?P20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]?\s*(?P\d{1,2}|[一二三四五六七八九十]{1,3})?)") from BiddingKG.dl.ratio.re_ratio import getUnifyNum import calendar def get_maxday(year, month): # calendar.monthrange(year, month) 返回一个元组,其中第一个元素是那个月第一天的星期几(0-6代表周一到周日), # 第二个元素是那个月的天数。 _, last_day = calendar.monthrange(year, month) return last_day def timeFormat(_time, default_first_day=True): ''' 日期格式化:年-月-日 :param _time: :param default_first_day: True取当月第一天,否则取最后一天 :return: ''' current_year = time.strftime("%Y",time.localtime()) all_match = re.finditer(time_format_pattern,_time) for _match in all_match: if len(_match.group())>0: legal = True year = "" month = "" day = "" for k,v in _match.groupdict().items(): if k=="year": year = v if k=="month": month = v if k=="day": day = v if year!="": if re.search("^\d+$",year): if len(year)==2: year = "20"+year if int(year)-int(current_year)>10: legal = False else: _year = "" for word in year: if word == '0': _year += word else: _year += str(getDigitsDic(word)) year = _year else: legal = False if month!="": if re.search("^\d+$", month): if int(month)>12: legal = False else: month = int(getUnifyNum(month)) if month>=1 and month<=12: month = str(month) else: legal = False else: legal = False if day == None: day = "01" if (default_first_day or legal == False) else str(get_maxday(int(year), int(month))) if day!="": if re.search("^\d+$", day): if int(day)>31: legal = False else: day = int(getUnifyNum(day)) if day >= 1 and day <= 31: day = str(day) else: legal = False else: legal = False # print(year,month,day) if not isValidDate(int(year),int(month),int(day)): legal = False if legal: return "%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0")) return "" def embedding(datas,shape): ''' @summary:查找词汇对应的词向量 @param: datas:词汇的list shape:结果的shape @return: array,返回对应shape的词嵌入 ''' model_w2v = getModel_w2v() embed = np.zeros(shape) length = shape[1] out_index = 0 #print(datas) for data in datas: index = 0 for item in data: item_not_space = re.sub("\s*","",item) if index>=length: break if item_not_space in model_w2v.vocab: embed[out_index][index] = model_w2v[item_not_space] index += 1 else: #embed[out_index][index] = model_w2v['unk'] index += 1 out_index += 1 return embed def embedding_word(datas,shape): ''' @summary:查找词汇对应的词向量 @param: datas:词汇的list shape:结果的shape @return: array,返回对应shape的词嵌入 ''' model_w2v = getModel_word() embed = np.zeros(shape) length = shape[1] out_index = 0 #print(datas) for data in datas: index = 0 for item in str(data)[-shape[1]:]: if index>=length: break if item in model_w2v.vocab: embed[out_index][index] = model_w2v[item] index += 1 else: # embed[out_index][index] = model_w2v['unk'] index += 1 out_index += 1 return embed def embedding_word_forward(datas,shape): ''' @summary:查找词汇对应的词向量 @param: datas:词汇的list shape:结果的shape @return: array,返回对应shape的词嵌入 ''' model_w2v = getModel_word() embed = np.zeros(shape) length = shape[1] out_index = 0 #print(datas) for data in datas: index = 0 for item in str(data)[:shape[1]]: if index>=length: break if item in model_w2v.vocab: embed[out_index][index] = model_w2v[item] index += 1 else: # embed[out_index][index] = model_w2v['unk'] index += 1 out_index += 1 return embed def formEncoding(text,shape=(100,60),expand=False): embedding = np.zeros(shape) word_model = getModel_word() for i in range(len(text)): if i>=shape[0]: break if text[i] in word_model.vocab: embedding[i] = word_model[text[i]] if expand: embedding = np.expand_dims(embedding,0) return embedding def partMoney(entity_text,input2_shape = [7]): ''' @summary:对金额分段 @param: entity_text:数值金额 input2_shape:分类数 @return: array,分段之后的独热编码 ''' money = float(entity_text) parts = np.zeros(input2_shape) if money<100: parts[0] = 1 elif money<1000: parts[1] = 1 elif money<10000: parts[2] = 1 elif money<100000: parts[3] = 1 elif money<1000000: parts[4] = 1 elif money<10000000: parts[5] = 1 else: parts[6] = 1 return parts def uniform_num(num): d1 = {'一': '1', '二': '2', '三': '3', '四': '4', '五': '5', '六': '6', '七': '7', '八': '8', '九': '9', '十': '10'} # d2 = {'A': '1', 'B': '2', 'C': '3', 'D': '4', 'E': '5', 'F': '6', 'G': '7', 'H': '8', 'I': '9', 'J': '10'} d3 = {'Ⅰ': '1', 'Ⅱ': '2', 'Ⅲ': '3', 'Ⅳ': '4', 'Ⅴ': '5', 'Ⅵ': '6', 'Ⅶ': '7'} if num.isdigit(): if re.search('^0[\d]$', num): num = num[1:] return num elif re.search('^[一二三四五六七八九十]+$', num): _digit = re.search('^[一二三四五六七八九十]+$', num).group(0) if len(_digit) == 1: num = d1[_digit] elif len(_digit) == 2 and _digit[0] == '十': num = '1'+ d1[_digit[1]] elif len(_digit) == 2 and _digit[1] == '十': num = d1[_digit[0]] + '0' elif len(_digit) == 3 and _digit[1] == '十': num = d1[_digit[0]] + d1[_digit[2]] elif re.search('[ⅠⅡⅢⅣⅤⅥⅦ]', num): num = re.search('[ⅠⅡⅢⅣⅤⅥⅦ]', num).group(0) num = d3[num] return num def uniform_package_name(package_name): ''' 统一规范化包号。数值类型统一为阿拉伯数字,字母统一为大写,包含施工监理等抽到前面, 例 A包监理一标段 统一为 监理A1 ; 包Ⅱ 统一为 2 :param package_name: 字符串类型 包号 :return: ''' package_name_raw = package_name package_name = re.sub('pdf|doc|docs|xlsx|rar|\d{4}年', ' ', package_name) package_name = package_name.replace('标段(包)', '标段').replace('№', '') package_name = re.sub('\[|【', '', package_name) kw = re.search('(施工|监理|监测|勘察|设计|劳务)', package_name) name = "" if kw: name += kw.group(0) if re.search('^[a-zA-Z0-9-]{5,}$', package_name): # 五个字符以上编号 _digit = re.search('^[a-zA-Z0-9-]{5,}$', package_name).group(0).upper() # print('规范化包号1', _digit) name += _digit elif re.search('(?P[a-zA-Z])包[:)]?第?(?P([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))标段?', package_name): # 处理类似 A包2标段 ser = re.search('(?P[a-zA-Z])包[:)]?第?(?P([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))标段?', package_name) # print('规范化包号2', ser.group(0)) _char = ser.groupdict().get('eng') if _char: _char = _char.upper() _digit = ser.groupdict().get('num') _digit = uniform_num(_digit) name += _char.upper() + _digit elif re.search('第?(?P[0-9a-zA-Z-]{1,4})?(?P([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))(标[段号的包项]?|合同[包段]|([分子]?[包标]))', package_name): # 处理类似 A包2标段 ser = re.search('第?(?P[0-9a-zA-Z-]{1,4})?(?P([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))(标[段号的包项]?|合同[包段]|([分子]?[包标]))', package_name) # print('规范化包号3', ser.group(0)) _char = ser.groupdict().get('eng') if _char: _char = _char.upper() _digit = ser.groupdict().get('num') _digit = uniform_num(_digit) if _char: name += _char.upper() name += _digit elif re.search('(标[段号的包项]?|项目|子项目?|([分子]?包|包[组件号]))编?号?[::]?(?P[0-9a-zA-Z-]{1,4})?(?P([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))', package_name): # 数字的统一的阿拉伯数字 ser = re.search('(标[段号的包项]?|项目|子项目?|([分子]?包|包[组件号]))编?号?[::]?(?P[0-9a-zA-Z-]{1,4})?(?P([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))',package_name) # print('规范化包号4', ser.group(0)) _char = ser.groupdict().get('eng') if _char: _char = _char.upper() _digit = ser.groupdict().get('num') _digit = uniform_num(_digit) if _char: name += _char.upper() name += _digit elif re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P[a-zA-Z-]{1,5})', package_name): # 数字的统一的阿拉伯数字 _digit = re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P[a-zA-Z-]{1,5})', package_name).group('eng').upper() # print('规范化包号5', _digit) name += _digit elif re.search('(?P[a-zA-Z]{1,4})(标[段号的包项]|([分子]?[包标]|包[组件号]))', package_name): # 数字的统一的阿拉伯数字 _digit = re.search('(?P[a-zA-Z]{1,4})(标[段号的包项]|([分子]?[包标]|包[组件号]))', package_name).group('eng').upper() # print('规范化包号6', _digit) name += _digit elif re.search('^([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4})$', package_name): # 数字的统一的阿拉伯数字 _digit = re.search('^([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4})$', package_name).group(0) # print('规范化包号7', _digit) _digit = uniform_num(_digit) name += _digit elif re.search('^[a-zA-Z0-9-]+$', package_name): _char = re.search('^[a-zA-Z0-9-]+$', package_name).group(0) # print('规范化包号8', _char) name += _char.upper() if name == "": return package_name_raw else: if name.isdigit(): name = str(int(name)) # print('原始包号:%s, 处理后:%s'%(package_name, name)) return name def money_process(money_text, header): ''' 输入金额文本及金额列表头,返回统一数字化金额及金额单位 :param money_text:金额字符串 :param header:金额列表头,用于提取单位 :return: ''' money = 0 money_unit = "" # re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?[((]?万?", money_text) money_text = re.sub('\s', '', money_text) # 2024/04/19 修复 457699044 556.46751 万元 金额与单位有空格造成万漏提取 if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text) and re.search('\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[((]?万?', money_text): money_text = re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text).group(0) # 如果表格同时包含大小写金额,取大写金额,避免单位取错 463310590 790000(柒拾玖万元整) re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[((]?万?", money_text) if re_price: money_re = re_price.group(0) if (re.search('万元|[((]万[))]', header) or re.search('万元|[((]万[))]', money_text)) and '万' not in money_re: # 修复37797825 控制价(万) # 修复 460307391 万元不在表头,在数字前面 money_re += '万元' elif (re.search('亿元|[((]亿[))]', header) or re.search('亿元|[((]亿[))]', money_text)) and '亿' not in money_re: # 修复37797825 控制价(万) # 修复 460307391 万元不在表头,在数字前面 money_re += '亿元' # money = float(getUnifyMoney(money_text)) money = float(getUnifyMoney(money_re)) if money > 10000000000000: # 大于万亿的去除 money = 0 # money_unit = '万元' if '万' in money_re and re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text)==None else '元' if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text)==None: if '万' in money_re: money_unit = '万元' elif '亿' in money_re: money_unit = '亿元' else: money_unit = '元' return (money, money_unit) package_number_pattern = re.compile( '((施工|监理|监测|勘察|设计|劳务)(标段)?:?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})?[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\ |(([a-zA-Z]包[:()]?)?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|合同[包段]))\ |(([,;。、:(]|第)?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\ |((标[段包项]|品目|标段(包)|包[组件标]|[标分子(]包)(\[|【)?:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9}))\ |([,;。、:(]|^)(标的?|项目|子项目?)(\[|【)?:?([一二三四五六七八九十]+|[0-9]{1,9})\ |((([标分子(]|合同|项目|采购)包|[,。]标的|子项目|[分子]标|标[段包项]|包[组件标]?)编?号[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,9}[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{0,9})\ |[,;。、:(]?(合同|分|子)?包:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})') filter_package_pattern = 'CA标|(每个?|所有|相关|个|各|不分)[分子]?(标[段包项]?|包[组件标]?|合同包)|(质量|责任)三包|包[/每]|标段(划分|范围)|(承|压缩|软|皮|书|挂)包\ |标[识注签贴配]|[商油]标号|第X包|第[一二三四五六七八九十]+至[一二三四五六七八九十]+(标[段包项]?|包[组件标]?|合同[包段])\ |\.(docx|doc|pdf|xlsx|xls|jpg)|[一二三四五]次|五金|\d+[年月]|[\d.,]+万?元|\d+\.\d+' # 过滤错误的非包号 def find_package(content): ''' 通过正则找包和标段号 :param content: :return: ''' packages = [] content = content.replace('号,', '号:').replace(':', ':').replace('(', '(').replace(')', ')') # .replace('-包',' 包').replace('包-', '包 ').replace('-标', ' 标').replace('标段-', '标段 ').replace('-合同包', ' 合同包') # 72760191 标段:№10 content = re.sub('[一二三四五六七八九十\d](标[段包项]|包[组件标])编号', ' 标段编号', content) for it in re.finditer(filter_package_pattern, content): content = content.replace(it.group(0), ' ' * len(it.group(0))) for iter in re.finditer(package_number_pattern, content): if re.search('(业绩|信誉要求):|业绩(如下)?\d*[、:]', content[:iter.start()]): # 前面有业绩或信誉的标段去掉 continue # print('提取到标段:%s, 前后文:%s' % (iter.group(), content[iter.start() - 5:iter.end() + 5])) if re.match('\d', iter.group(0)) and re.search('\d\.$', content[:iter.start()]): # 排除2.10标段3 5.4标段划分 这种情况 # print('过滤掉错误包:', iter.group()) continue if re.search('[承每书/]包|XX|xx', iter.group(0)) or re.search('\d包[/每]\w|一包[0-9一二三四五六七八九十]+', content[ iter.start():iter.end() + 3]) or re.search( '[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{6,}', iter.group(0)): # print('过滤掉错误包:', iter.group()) continue elif iter.end() + 2 < len(content) and re.search('标准|标的物|标志|包装|划分|标书', content[iter.start():iter.end() + 2]): # print('过滤掉错误包:', iter.group()) continue elif re.search('同一(标段?|包)', content[max(0, iter.start() - 2):iter.end()]): # 不得参加同一标段 # print('过滤掉错误包:', iter.group()) continue elif re.search('三包', content[max(0, iter.start() - 2):iter.end()]) and re.search('第三包', content[max(0, iter.start() - 2):iter.end()]) == None: # 规规章和“三包”规定 # print('过滤掉错误包:', iter.group()) continue elif re.search('[1-9]\d{2,}$|\d{4,}|^[1-9]\d{2,}|合同包[A-Za-z]{2,}', iter.group(0)): # print('过滤掉错误包号5:', iter.group(0)) continue elif re.search('单位:包|1包\d|[张箱]', content[max(0, iter.start()-3): iter.end()+2]): # 处理 463166661 包号错误 钢丝,单位:包X10根。 # print('过滤掉错误包号,单位:包|1包', iter.group(0)) continue packages.append(iter) # print('提取到标段:%s, 前后文:%s' % (iter.group(), content[iter.start() - 5:iter.end() + 5])) return packages def cut_repeat_name(s): ''' 公司连续重复名称去重 :param s: :return: ''' if len(s) >= 8: n = s.count(s[-4:]) id = s.find(s[-4:]) + 4 sub_s = s[:id] if n>=2 and s == sub_s * n: s = sub_s return s def del_tabel_achievement(soup): if re.search('中标|成交|入围|结果|评标|开标|候选人', soup.text[:800]) == None or re.search('业绩', soup.text)==None: return None p1 = '(中标|成交)(单位|候选人)的?(企业|项目|项目负责人|\w{,5})?业绩|类似(项目)?业绩|\w{,10}业绩$|业绩(公示|情况|荣誉)' '''删除前面标签 命中业绩规则;当前标签为表格且公布业绩相关信息的去除''' for tag in soup.find_all('table'): pre_text = "" if tag.findPreviousSibling() != None: pre_text = tag.findPreviousSibling().text.strip() if pre_text == "" and tag.findPreviousSibling().findPreviousSibling() != None: # 修复表格前一标签没内容,再前一个才有内容情况 pre_text = tag.findPreviousSibling().findPreviousSibling().text.strip() tr_text = tag.find('tr').text.strip() if tag.find('tr') != None else "" # print(re.search(p1, pre_text),pre_text, len(pre_text), re.findall('序号|中标候选人名称|项目名称|工程名称|合同金额|建设单位|业主', tr_text)) if re.search(p1, pre_text) and len(pre_text) < 20 and tag.find('tr') != None and len(tr_text)<100: _count = 0 for td in tag.find('tr').find_all('td'): td_text = td.text.strip() if len(td_text) > 25: break if len(td_text) < 25 and re.search('中标候选人|第[一二三四五1-5]候选人|(项目|业绩|工程)名称|\w{,10}业绩$|合同金额|建设单位|采购单位|业主|甲方', td_text): _count += 1 if _count >=2: pre_tag = tag.findPreviousSibling().extract() del_tag = tag.extract() # print('删除表格业绩内容', pre_tag.text + del_tag.text) break elif re.search('业绩名称', tr_text) and re.search('建设单位|采购单位|业主', tr_text) and len(tr_text)<100: del_tag = tag.extract() # print('删除表格业绩内容', del_tag.text) del_trs = [] '''删除表格某些行公布的业绩信息''' for tag in soup.find_all('table'): text = tag.text if re.search('业绩', text) == None: continue # for tr in tag.find_all('tr'): trs = tag.find_all('tr') i = 0 while i < len(trs): tr = trs[i] if len(tr.find_all('td'))==2 and tr.td!=None and tr.td.findNextSibling()!=None: td1_text =tr.td.text td2_text =tr.td.findNextSibling().text if re.search('业绩', td1_text)!=None and len(td1_text)<10 and len(re.findall('(\d、|(\d))?[-\w()、]+(工程|项目|勘察|设计|施工|监理|总承包|采购|更新)', td2_text))>=2: # del_tag = tr.extract() # print('删除表格业绩内容', del_tag.text) del_trs.append(tr) elif tr.td != None and re.search('^业绩|业绩$', tr.td.text.strip()) and len(tr.td.text.strip())<25: rows = tr.td.attrs.get('rowspan', '') cols = tr.td.attrs.get('colspan', '') if rows.isdigit() and int(rows)>2: for j in range(int(rows)): if i+j < len(trs): del_trs.append(trs[i+j]) i += j elif cols.isdigit() and int(cols)>3 and len(tr.find_all('td'))==1 and i+2 < len(trs): next_tr_cols = 0 td_num = 0 for td in trs[i+1].find_all('td'): td_num += 1 if td.attrs.get('colspan', '').isdigit(): next_tr_cols += int(td.attrs.get('colspan', '')) if next_tr_cols == int(cols): del_trs.append(tr) for j in range(1,len(trs)-i): if len(trs[i+j].find_all('td')) == 1: break elif len(trs[i+j].find_all('td')) >= td_num-1: del_trs.append(trs[i+j]) else: break i += j i += 1 for tr in del_trs: del_tag = tr.extract() # print('删除表格业绩内容', del_tag.text) def recall(y_true, y_pred): ''' 计算召回率 @Argus: y_true: 正确的标签 y_pred: 模型预测的标签 @Return 召回率 ''' c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) c3 = K.sum(K.round(K.clip(y_true, 0, 1))) if c3 == 0: return 0 recall = c1 / c3 return recall def f1_score(y_true, y_pred): ''' 计算F1 @Argus: y_true: 正确的标签 y_pred: 模型预测的标签 @Return F1值 ''' c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) c2 = K.sum(K.round(K.clip(y_pred, 0, 1))) c3 = K.sum(K.round(K.clip(y_true, 0, 1))) precision = c1 / c2 if c3 == 0: recall = 0 else: recall = c1 / c3 f1_score = 2 * (precision * recall) / (precision + recall) return f1_score def precision(y_true, y_pred): ''' 计算精确率 @Argus: y_true: 正确的标签 y_pred: 模型预测的标签 @Return 精确率 ''' c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) c2 = K.sum(K.round(K.clip(y_pred, 0, 1))) precision = c1 / c2 return precision # def print_metrics(history): # ''' # 制作每次迭代的各metrics变化图片 # # @Arugs: # history: 模型训练迭代的历史记录 # ''' # import matplotlib.pyplot as plt # # # loss图 # loss = history.history['loss'] # val_loss = history.history['val_loss'] # epochs = range(1, len(loss) + 1) # plt.subplot(2, 2, 1) # plt.plot(epochs, loss, 'bo', label='Training loss') # plt.plot(epochs, val_loss, 'b', label='Validation loss') # plt.title('Training and validation loss') # plt.xlabel('Epochs') # plt.ylabel('Loss') # plt.legend() # # # f1图 # f1 = history.history['f1_score'] # val_f1 = history.history['val_f1_score'] # plt.subplot(2, 2, 2) # plt.plot(epochs, f1, 'bo', label='Training f1') # plt.plot(epochs, val_f1, 'b', label='Validation f1') # plt.title('Training and validation f1') # plt.xlabel('Epochs') # plt.ylabel('F1') # plt.legend() # # # precision图 # prec = history.history['precision'] # val_prec = history.history['val_precision'] # plt.subplot(2, 2, 3) # plt.plot(epochs, prec, 'bo', label='Training precision') # plt.plot(epochs, val_prec, 'b', label='Validation pecision') # plt.title('Training and validation precision') # plt.xlabel('Epochs') # plt.ylabel('Precision') # plt.legend() # # # recall图 # recall = history.history['recall'] # val_recall = history.history['val_recall'] # plt.subplot(2, 2, 4) # plt.plot(epochs, recall, 'bo', label='Training recall') # plt.plot(epochs, val_recall, 'b', label='Validation recall') # plt.title('Training and validation recall') # plt.xlabel('Epochs') # plt.ylabel('Recall') # plt.legend() # # plt.show() if __name__=="__main__": # print(fool_char_to_id[">"]) print(getUnifyMoney('伍仟贰佰零壹拾伍万零捌佰壹拾元陆角伍分')) # model = getModel_w2v() # vocab,matrix = getVocabAndMatrix(model, Embedding_size=128) # save([vocab,matrix],"vocabMatrix_words.pk")