'''
Created on 2018-12-20

@author: User
'''
import json
import logging
import os
import pickle
import re
import smtplib
import time
import traceback
from decimal import Decimal
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.utils import formataddr
from threading import RLock

import numpy as np
import gensim
from keras import backend as K
from pai_tf_predict_proto import tf_predict_pb2
import requests

# Lazily loaded word-vector model and the lock guarding its initialisation.
model_w2v = None
lock_model_w2v = RLock()

USE_PAI_EAS = False
Lazy_load = False

# Control characters that getLegal_str() strips from text.
ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')


def sendEmail(host,username,password,receivers,attachs=None):
    '''
    @summary: Send the data-export e-mail, optionally with file attachments.
    @param:
        host: SMTP server host name; the connection is made on port 25
        username: SMTP login name, also used as the sender address
        password: password for the SMTP login
        receivers: list of recipient addresses; the first one is shown in the "To" header
        attachs: optional list of file paths to attach (default: no attachments)
    @return: None. Any failure is printed, logged and swallowed (best-effort delivery).
    '''
    # A fresh list per call; a shared mutable default would leak state between calls.
    attachs = [] if attachs is None else attachs
    try:
        # Build the message and its attachments.
        msg = MIMEMultipart()
        msg["From"] = formataddr(["广州比地数据科技有限公司",username])
        msg["To"] = formataddr(["客户",receivers[0]])
        msg["Subject"] = "数据导出服务"
        for at in attachs:
            attach_name = at.split("/")[-1]
            # Close the file handle as soon as its bytes have been read.
            with open(at,"rb") as f:
                xlsfile = MIMEApplication(f.read())
            xlsfile.add_header("Content-Disposition","attachment",filename=('gbk', '', attach_name))
            log(attach_name)
            msg.attach(xlsfile)

        server = smtplib.SMTP()
        try:
            server.connect(host,25)
            server.login(username,password)
            server.sendmail(username,receivers,msg.as_string())
            log("发送邮件成功%s"%str(attachs))
        finally:
            # Release the socket even when connect/login/sendmail raised.
            server.close()
    except Exception as e:
        traceback.print_exc()
        log("发送邮件错误%s"%str(e))


def getLegal_str(_str):
    '''
    @summary: Remove the control characters matched by ILLEGAL_CHARACTERS_RE.
    @param:
        _str: any value; it is converted with str() before cleaning
    @return: the cleaned string, or None when _str is None
    '''
    if _str is None:
        return None
    return ILLEGAL_CHARACTERS_RE.sub("",str(_str))


def getRow_ots_primary(row):
    '''
    @summary: Flatten one row object into a plain dict.
    @param:
        row: object exposing attribute_columns and primary_key, each an iterable
             of sequences whose first two items are (name, value); may be None
    @return: dict of name -> value; primary-key entries overwrite attribute
             columns with the same name. Empty dict when row is None.
    '''
    _dict = dict()
    if row is None:
        return _dict
    for part in row.attribute_columns:
        _dict[part[0]] = part[1]
    for part in row.primary_key:
        _dict[part[0]] = part[1]
    return _dict


def getRow_ots(rows):
    '''
    @summary: Flatten a list of rows into a list of dicts.
    @param:
        rows: iterable of rows; every row is an iterable of parts and every part
              an iterable of sequences whose first two items are (name, value)
    @return: list with one name -> value dict per row
    '''
    list_dict = []
    for row in rows:
        _dict = dict()
        for part in row:
            for v in part:
                _dict[v[0]] = v[1]
        list_dict.append(_dict)
    return list_dict


def getw2vfilepath():
    '''
    @summary: Locate the word-embedding file.
    @return: the path one directory above this module when that file exists,
             otherwise the bare file name (resolved against the working directory)
    '''
    w2vfile = os.path.dirname(__file__)+"/../wiki_128_word_embedding_new.vector"
    if os.path.exists(w2vfile):
        return w2vfile
    return "wiki_128_word_embedding_new.vector"


def getLazyLoad():
    '''
    @summary: Return the module-level Lazy_load flag.
    '''
    return Lazy_load


def get_file_name(url, headers):
    '''
    @summary: Work out a file name for a downloaded resource.
    @param:
        url: the download URL
        headers: mapping of response headers
    @return: the name from the second ";"-separated part of Content-Disposition
             when it starts with "filename=", else the last path segment of the
             URL without its query string, else the current timestamp.
    '''
    filename = ''
    disposition = headers['Content-Disposition'] if 'Content-Disposition' in headers else None
    if disposition:
        disposition_split = disposition.split(';')
        if len(disposition_split) > 1:
            if disposition_split[1].strip().lower().startswith('filename='):
                # Split only on the first "=" so a name containing "=" is kept whole.
                file_name = disposition_split[1].split('=', 1)
                if len(file_name) > 1:
                    filename = file_name[1]
    if not filename and os.path.basename(url):
        filename = os.path.basename(url).split("?")[0]
    if not filename:
        # NOTE(review): this fallback is a float while the other paths return str;
        # kept as-is because callers may depend on it.
        return time.time()
    return filename


# Lazily loaded character-vector model and the lock guarding its initialisation.
model_word_file = os.path.dirname(__file__)+"/../singlew2v_model.vector"
model_word = None
lock_model_word = RLock()

logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


# Custom JSON encoder
class MyEncoder(json.JSONEncoder):
    '''
    @summary: JSON encoder that also serialises numpy arrays, numpy scalars and bytes.
              It relies on the inherited constructor so that it can be used as
              json.dumps(obj, cls=MyEncoder).
    '''

    def default(self, obj):
        '''
        @summary: Convert objects the standard encoder cannot handle.
        @param:
            obj: the object to convert
        @return: list for ndarray, str for bytes (decoded as UTF-8), float/int for
                 numpy floating/integer scalars; other types are delegated to the
                 base class, which raises TypeError.
        '''
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        # The abstract numpy scalar bases cover every width and do not depend on
        # aliases such as np.float_ that newer numpy releases removed.
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        return json.JSONEncoder.default(self, obj)


# Lazily built vocabulary lookups and the pickle files that cache them.
vocab_word = None
vocab_words = None

file_vocab_word = "vocab_word.pk"
file_vocab_words = "vocab_words.pk"

# Prediction-service endpoints and their authorization tokens.
# NOTE(review): credentials are hard-coded in source; consider loading them from configuration.
selffool_authorization = "NjlhMWFjMjVmNWYyNzI0MjY1OGQ1M2Y0ZmY4ZGY0Mzg3Yjc2MTVjYg=="
selffool_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/selffool_gpu"
selffool_seg_authorization = "OWUwM2Q0ZmE3YjYxNzU4YzFiMjliNGVkMTA3MzJkNjQ2MzJiYzBhZg=="
selffool_seg_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/selffool_seg_gpu"
codename_authorization = "Y2M5MDUxMzU1MTU4OGM3ZDk2ZmEzYjkxYmYyYzJiZmUyYTgwYTg5NA=="
codename_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/codename_gpu"

form_item_authorization = "ODdkZWY1YWY0NmNhNjU2OTI2NWY4YmUyM2ZlMDg1NTZjOWRkYTVjMw=="
form_item_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/form"
person_authorization = "N2I2MDU2N2Q2MGQ0ZWZlZGM3NDkyNTA1Nzc4YmM5OTlhY2MxZGU1Mw==" person_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/person" role_authorization = "OWM1ZDg5ZDEwYTEwYWI4OGNjYmRlMmQ1NzYwNWNlZGZkZmRmMjE4OQ==" role_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/role" money_authorization = "MDQyNjc2ZDczYjBhYmM4Yzc4ZGI4YjRmMjc3NGI5NTdlNzJiY2IwZA==" money_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/money" codeclasses_authorization = "MmUyNWIxZjQ2NjAzMWJlMGIzYzkxMjMzNWY5OWI3NzJlMWQ1ZjY4Yw==" codeclasses_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/codeclasses" def viterbi_decode(score, transition_params): """Decode the highest scoring sequence of tags outside of TensorFlow. This should only be used at test time. Args: score: A [seq_len, num_tags] matrix of unary potentials. transition_params: A [num_tags, num_tags] matrix of binary potentials. Returns: viterbi: A [seq_len] list of integers containing the highest scoring tag indices. viterbi_score: A float containing the score for the Viterbi sequence. 
""" trellis = np.zeros_like(score) backpointers = np.zeros_like(score, dtype=np.int32) trellis[0] = score[0] for t in range(1, score.shape[0]): v = np.expand_dims(trellis[t - 1], 1) + transition_params trellis[t] = score[t] + np.max(v, 0) backpointers[t] = np.argmax(v, 0) viterbi = [np.argmax(trellis[-1])] for bp in reversed(backpointers[1:]): viterbi.append(bp[viterbi[-1]]) viterbi.reverse() viterbi_score = np.max(trellis[-1]) return viterbi, viterbi_score import ctypes import inspect def _async_raise(tid, exctype): """raises the exception, performs cleanup if needed""" tid = ctypes.c_long(tid) if not inspect.isclass(exctype): exctype = type(exctype) res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype)) if res == 0: raise ValueError("invalid thread id") elif res != 1: ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None) raise SystemError("PyThreadState_SetAsyncExc failed") def stop_thread(thread): _async_raise(thread.ident, SystemExit) def limitRun(sess,list_output,feed_dict,MAX_BATCH=1024): len_sample = 0 if len(feed_dict.keys())>0: len_sample = len(feed_dict[list(feed_dict.keys())[0]]) if len_sample>MAX_BATCH: list_result = [[] for _ in range(len(list_output))] _begin = 0 while(_begin")) temp_len += 1 if out_index in [0]: temp = list_append+temp else: temp = temp+list_append else: for words in list_word: temp.append(getIndexOfWords(words)) list_append = [] temp_len = len(temp) while(temp_len")) temp_len += 1 if out_index in [0,1]: temp = list_append+temp else: temp = temp+list_append result.append(temp) out_index += 1 return result def encodeInput_form(input,MAX_LEN=30): x = np.zeros([MAX_LEN]) for i in range(len(input)): if i>=MAX_LEN: break x[i] = getIndexOfWord(input[i]) return x def getVocabAndMatrix(model,Embedding_size = 60): ''' @summary:获取子向量的词典和子向量矩阵 ''' vocab = [""]+model.index2word embedding_matrix = np.zeros((len(vocab),Embedding_size)) for i in range(1,len(vocab)): embedding_matrix[i] = model[vocab[i]] return 
vocab,embedding_matrix def getIndexOfWord(word): global vocab_word,file_vocab_word if vocab_word is None: if os.path.exists(file_vocab_word): vocab = load(file_vocab_word) vocab_word = dict((w, i) for i, w in enumerate(np.array(vocab))) else: model = getModel_word() vocab,_ = getVocabAndMatrix(model, Embedding_size=60) vocab_word = dict((w, i) for i, w in enumerate(np.array(vocab))) save(vocab,file_vocab_word) if word in vocab_word.keys(): return vocab_word[word] else: return vocab_word[''] def getIndexOfWords(words): global vocab_words,file_vocab_words if vocab_words is None: if os.path.exists(file_vocab_words): vocab = load(file_vocab_words) vocab_words = dict((w, i) for i, w in enumerate(np.array(vocab))) else: model = getModel_w2v() vocab,_ = getVocabAndMatrix(model, Embedding_size=128) vocab_words = dict((w, i) for i, w in enumerate(np.array(vocab))) save(vocab,file_vocab_words) if words in vocab_words.keys(): return vocab_words[words] else: return vocab_words[""] def log_tofile(filename): logging.basicConfig(filename=filename,level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) def log(msg): ''' @summary:打印信息 ''' logger.info(msg) def debug(msg): ''' @summary:打印信息 ''' logger.debug(msg) def save(object_to_save, path): ''' 保存对象 @Arugs: object_to_save: 需要保存的对象 @Return: 保存的路径 ''' with open(path, 'wb') as f: pickle.dump(object_to_save, f) def load(path): ''' 读取对象 @Arugs: path: 读取的路径 @Return: 读取的对象 ''' with open(path, 'rb') as f: object1 = pickle.load(f) return object1 def getIndexOfWord_fool(word): if word in fool_char_to_id.keys(): return fool_char_to_id[word] else: return fool_char_to_id["[UNK]"] def find_index(list_tofind,text): ''' @summary: 查找所有词汇在字符串中第一次出现的位置 @param: list_tofind:待查找词汇 text:字符串 @return: list,每个词汇第一次出现的位置 ''' result = [] for item in list_tofind: index = text.find(item) if index>=0: result.append(index) else: result.append(-1) return result def combine(list1,list2): ''' 
@summary:将两个list中的字符串两两拼接 @param: list1:字符串list list2:字符串list @return:拼接结果list ''' result = [] for item1 in list1: for item2 in list2: result.append(str(item1)+str(item2)) return result def getDigitsDic(unit): ''' @summary:拿到中文对应的数字 ''' DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9, "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9} return DigitsDic.get(unit) def getMultipleFactor(unit): ''' @summary:拿到单位对应的值 ''' MultipleFactor = {"兆":Decimal(1000000000000),"亿":Decimal(100000000),"万":Decimal(10000),"仟":Decimal(1000),"千":Decimal(1000),"佰":Decimal(100),"百":Decimal(100),"拾":Decimal(10),"十":Decimal(10),"元":Decimal(1),"角":round(Decimal(0.1),1),"分":round(Decimal(0.01),2)} return MultipleFactor.get(unit) def getUnifyMoney(money): ''' @summary:将中文金额字符串转换为数字金额 @param: money:中文金额字符串 @return: decimal,数据金额 ''' MAX_NUM = 12 #去掉逗号 money = re.sub("[,,]","",money) money = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億〇一二三四五六七八九十百千万亿元角分]","",money) result = Decimal(0) chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"] chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","元","角","分"] LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$") BigMoneypattern = re.compile("^零?(?P[%s])$"%("".join(chnDigits))) if re.search(LowMoneypattern,money) is not None: return Decimal(money) elif re.search(BigMoneypattern,money) is not None: return getDigitsDic(re.search(BigMoneypattern,money).group("BigMoney")) for factorUnit in chnFactorUnits: if re.search(re.compile(".*%s.*"%(factorUnit)),money) is not None: subMoneys = re.split(re.compile("%s(?!.*%s.*)"%(factorUnit,factorUnit)),money) if re.search(re.compile("^(\d+(,)?)+(\.\d+)?$"),subMoneys[0]) is not None: result += Decimal(subMoneys[0])*(getMultipleFactor(factorUnit)) elif len(subMoneys[0])==1: if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[0]) is not None: result += Decimal(getDigitsDic(subMoneys[0]))*(getMultipleFactor(factorUnit)) else: result += 
Decimal(getUnifyMoney(subMoneys[0]))*(getMultipleFactor(factorUnit)) if len(subMoneys)>1: if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"),subMoneys[1]) is not None: result += Decimal(subMoneys[1]) elif len(subMoneys[1])==1: if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[1]) is not None: result += Decimal(getDigitsDic(subMoneys[1])) else: result += Decimal(getUnifyMoney(subMoneys[1])) break return result def getModel_w2v(): ''' @summary:加载词向量 ''' global model_w2v,lock_model_w2v with lock_model_w2v: if model_w2v is None: model_w2v = gensim.models.KeyedVectors.load_word2vec_format(getw2vfilepath(),binary=True) return model_w2v def getModel_word(): ''' @summary:加载字向量 ''' global model_word,lock_model_w2v with lock_model_word: if model_word is None: model_word = gensim.models.KeyedVectors.load_word2vec_format(model_word_file,binary=True) return model_word # getModel_w2v() # getModel_word() def findAllIndex(substr,wholestr): ''' @summary: 找到字符串的子串的所有begin_index @param: substr:子字符串 wholestr:子串所在完整字符串 @return: list,字符串的子串的所有begin_index ''' copystr = wholestr result = [] indexappend = 0 while(True): index = copystr.find(substr) if index<0: break else: result.append(indexappend+index) indexappend += index+len(substr) copystr = copystr[index+len(substr):] return result def spanWindow(tokens,begin_index,end_index,size,center_include=False,word_flag = False,use_text = False,text = None): ''' @summary:取得某个实体的上下文词汇 @param: tokens:句子分词list begin_index:实体的开始index end_index:实体的结束index size:左右两边各取多少个词 center_include:是否包含实体 word_flag:词/字,默认是词 @return: list,实体的上下文词汇 ''' if use_text: assert text is not None length_tokens = len(tokens) if begin_index>size: begin = begin_index-size else: begin = 0 if end_index+sizelen(rightfinds): if symbol_dict.get(data[0]) is not None: result = data[1:] else: #print(symbol_dict.get(leftfinds[0])) result = data+symbol_dict.get(leftfinds[0]) else: if symbol_dict.get(data[-1]) is not None: result = data[:-1] else: result = 
symbol_dict.get(rightfinds[0])+data result = re.sub("[。]","",result) return result def embedding(datas,shape): ''' @summary:查找词汇对应的词向量 @param: datas:词汇的list shape:结果的shape @return: array,返回对应shape的词嵌入 ''' model_w2v = getModel_w2v() embed = np.zeros(shape) length = shape[1] out_index = 0 #print(datas) for data in datas: index = 0 for item in data: item_not_space = re.sub("\s*","",item) if index>=length: break if item_not_space in model_w2v.vocab: embed[out_index][index] = model_w2v[item_not_space] index += 1 else: #embed[out_index][index] = model_w2v['unk'] index += 1 out_index += 1 return embed def embedding_word(datas,shape): ''' @summary:查找词汇对应的词向量 @param: datas:词汇的list shape:结果的shape @return: array,返回对应shape的词嵌入 ''' model_w2v = getModel_word() embed = np.zeros(shape) length = shape[1] out_index = 0 #print(datas) for data in datas: index = 0 for item in str(data)[-shape[1]:]: if index>=length: break if item in model_w2v.vocab: embed[out_index][index] = model_w2v[item] index += 1 else: # embed[out_index][index] = model_w2v['unk'] index += 1 out_index += 1 return embed def formEncoding(text,shape=(100,60),expand=False): embedding = np.zeros(shape) word_model = getModel_word() for i in range(len(text)): if i>=shape[0]: break if text[i] in word_model.vocab: embedding[i] = word_model[text[i]] if expand: embedding = np.expand_dims(embedding,0) return embedding def partMoney(entity_text,input2_shape = [7]): ''' @summary:对金额分段 @param: entity_text:数值金额 input2_shape:分类数 @return: array,分段之后的独热编码 ''' money = float(entity_text) parts = np.zeros(input2_shape) if money<100: parts[0] = 1 elif money<1000: parts[1] = 1 elif money<10000: parts[2] = 1 elif money<100000: parts[3] = 1 elif money<1000000: parts[4] = 1 elif money<10000000: parts[5] = 1 else: parts[6] = 1 return parts def recall(y_true, y_pred): ''' 计算召回率 @Argus: y_true: 正确的标签 y_pred: 模型预测的标签 @Return 召回率 ''' c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) c3 = K.sum(K.round(K.clip(y_true, 0, 1))) if c3 == 0: return 0 
recall = c1 / c3 return recall def f1_score(y_true, y_pred): ''' 计算F1 @Argus: y_true: 正确的标签 y_pred: 模型预测的标签 @Return F1值 ''' c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) c2 = K.sum(K.round(K.clip(y_pred, 0, 1))) c3 = K.sum(K.round(K.clip(y_true, 0, 1))) precision = c1 / c2 if c3 == 0: recall = 0 else: recall = c1 / c3 f1_score = 2 * (precision * recall) / (precision + recall) return f1_score def precision(y_true, y_pred): ''' 计算精确率 @Argus: y_true: 正确的标签 y_pred: 模型预测的标签 @Return 精确率 ''' c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) c2 = K.sum(K.round(K.clip(y_pred, 0, 1))) precision = c1 / c2 return precision # def print_metrics(history): # ''' # 制作每次迭代的各metrics变化图片 # # @Arugs: # history: 模型训练迭代的历史记录 # ''' # import matplotlib.pyplot as plt # # # loss图 # loss = history.history['loss'] # val_loss = history.history['val_loss'] # epochs = range(1, len(loss) + 1) # plt.subplot(2, 2, 1) # plt.plot(epochs, loss, 'bo', label='Training loss') # plt.plot(epochs, val_loss, 'b', label='Validation loss') # plt.title('Training and validation loss') # plt.xlabel('Epochs') # plt.ylabel('Loss') # plt.legend() # # # f1图 # f1 = history.history['f1_score'] # val_f1 = history.history['val_f1_score'] # plt.subplot(2, 2, 2) # plt.plot(epochs, f1, 'bo', label='Training f1') # plt.plot(epochs, val_f1, 'b', label='Validation f1') # plt.title('Training and validation f1') # plt.xlabel('Epochs') # plt.ylabel('F1') # plt.legend() # # # precision图 # prec = history.history['precision'] # val_prec = history.history['val_precision'] # plt.subplot(2, 2, 3) # plt.plot(epochs, prec, 'bo', label='Training precision') # plt.plot(epochs, val_prec, 'b', label='Validation pecision') # plt.title('Training and validation precision') # plt.xlabel('Epochs') # plt.ylabel('Precision') # plt.legend() # # # recall图 # recall = history.history['recall'] # val_recall = history.history['val_recall'] # plt.subplot(2, 2, 4) # plt.plot(epochs, recall, 'bo', label='Training recall') # plt.plot(epochs, val_recall, 
#     'b', label='Validation recall')
#     plt.title('Training and validation recall')
#     plt.xlabel('Epochs')
#     plt.ylabel('Recall')
#     plt.legend()
#
#     plt.show()

import pandas as pd

# location name -> list of location records that carry this name
dict_name_locations = {}
# location id -> location record
dict_id_location = {}


def getLocationDict():
    '''
    @summary: Load the location sheet stored next to this module and fill
              dict_id_location and dict_name_locations. Each record is a dict
              with the keys "id", "cname", "parentid" and "ctype".
    '''
    df = pd.read_excel(os.path.dirname(__file__)+"/省份信息.xlsx")
    for _id,_cname,_parentid,_ctype in zip(df["id"],df["cname"],df["parentid"],df["ctype"]):
        _dict = {"id":_id,"cname":_cname,"parentid":_parentid,"ctype":_ctype}
        dict_id_location[_id] = _dict
        dict_name_locations.setdefault(_cname,[]).append(_dict)


# The lookup tables are populated once, at import time.
getLocationDict()


def getProvinceCityDistrict(loc):
    '''
    @summary: Resolve a location name to its chain of parent locations.
    @param:
        loc: location name to look up in dict_name_locations
    @return: list with one dict per record named loc; each dict maps a ctype
             value (only ctype >= 20 is kept) to the record found at that level
             while walking up through parentid. Records that yield no level are
             left out.
    '''
    list_result = []
    for _loc in dict_name_locations.get(loc,[]):
        dict_loc_parents = {}
        # Ids already visited: stops the walk if the sheet has a parent cycle.
        seen_ids = set()
        _current_loc = _loc
        while _current_loc is not None:
            _current_id = _current_loc.get("id")
            if _current_id in seen_ids:
                break
            seen_ids.add(_current_id)
            _ctype = _current_loc.get("ctype")
            if _ctype>=20:
                dict_loc_parents[_ctype] = _current_loc
            _current_loc = dict_id_location.get(_current_loc.get("parentid"))
        if dict_loc_parents:
            list_result.append(dict_loc_parents)
    return list_result


def chooseLocation(list_result):
    '''
    @summary: Pick one (province, city, district) from candidate location chains.
              The province named by most candidates wins (the first one on a
              tie); among its candidates the one with the most levels is used.
    @param:
        list_result: list of dicts as returned by getProvinceCityDistrict, where
                     keys 20, 30 and 40 hold the province, city and district
                     records; the list is not modified
    @return: tuple (province, city, district) of names, "" for a missing level
    '''
    province = ""
    city = ""
    district = ""

    # Count how many candidates name each province.
    dict_province = {}
    for _dict in list_result:
        _province = _dict.get(20,{}).get("cname","")
        if _province!="":
            dict_province[_province] = dict_province.get(_province,0)+1
    max_province = max(dict_province,key=dict_province.get,default="")

    # sorted() is stable and leaves the caller's list untouched.
    for _dict in sorted(list_result,key=len,reverse=True):
        _province = _dict.get(20,{}).get("cname","")
        if _province!=max_province:
            continue
        province = _province
        city = _dict.get(30,{}).get("cname","")
        district = _dict.get(40,{}).get("cname","")
        break
    return province,city,district


def getCurrent_date(format="%Y-%m-%d %H:%M:%S"):
    '''
    @summary: Format the current local time.
    @param:
        format: strftime pattern
    @return: the formatted time string
    '''
    _time = time.strftime(format,time.localtime())
    return _time


def getLocation(_str):
    '''
    @summary: Extract (province, city, district) from a text such as a company name.
              Location names directly followed by "分" are tried first (the
              pattern variable is named "subcompany", presumably for
              branch-office wording); if none resolves, every location name
              found in the text is used.
    @param:
        _str: text to search
    @return: tuple (province, city, district) as produced by chooseLocation
    '''
    # Escape the names so they are matched literally; skip values that cannot
    # take part in a pattern (empty strings, non-string cells from the sheet).
    list_names = [re.escape(_name) for _name in dict_name_locations
                  if isinstance(_name,str) and _name!=""]
    if not list_names:
        return chooseLocation([])

    # The named group is what the loops below read back from each match.
    name_pattern = "(?P<locations>%s)"%"|".join(list_names)
    name_pattern_subcompany = "(%s)分"%name_pattern

    list_result = []
    for _iter in re.finditer(name_pattern_subcompany,_str):
        _loc = _iter.groupdict().get("locations")
        list_result.extend(getProvinceCityDistrict(_loc))
    if len(list_result)>0:
        return chooseLocation(list_result)

    for _iter in re.finditer(name_pattern,_str):
        _loc = _iter.groupdict().get("locations")
        list_result.extend(getProvinceCityDistrict(_loc))
    return chooseLocation(list_result)


if __name__=="__main__":
    print(getLocation("佛山市顺德区顺控路桥投资有限公司"))
    # print(fool_char_to_id[">"])
    # model = getModel_w2v()
    # vocab,matrix = getVocabAndMatrix(model, Embedding_size=128)
    # save([vocab,matrix],"vocabMatrix_words.pk")