Utils.py 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031
'''
Created on 2018-12-20
@author: User
'''
  5. import numpy as np
  6. import re
  7. import gensim
  8. from keras import backend as K
  9. import os,sys
  10. import time
  11. import traceback
  12. from threading import RLock
  13. # from pai_tf_predict_proto import tf_predict_pb2
  14. import requests
# Cache slot for the word-level embedding model; filled on first use by getModel_w2v().
model_w2v = None
# Re-entrant lock taken by getModel_w2v() while the cache slot is populated.
lock_model_w2v = RLock()
# Feature flags. Only Lazy_load is read in this file (via getLazyLoad());
# the others are presumably consumed by importing modules - TODO confirm.
USE_PAI_EAS = False
Lazy_load = False
# API_URL = "http://192.168.2.103:8802"
API_URL = "http://127.0.0.1:888"
# USE_API = True
USE_API = False
  23. def getCurrent_date(format="%Y-%m-%d %H:%M:%S"):
  24. _time = time.strftime(format,time.localtime())
  25. return _time
  26. def getw2vfilepath():
  27. filename = "wiki_128_word_embedding_new.vector"
  28. w2vfile = getFileFromSysPath(filename)
  29. if w2vfile is not None:
  30. return w2vfile
  31. return filename
  32. def getLazyLoad():
  33. global Lazy_load
  34. return Lazy_load
  35. def getFileFromSysPath(filename):
  36. for _path in sys.path:
  37. if os.path.isdir(_path):
  38. for _file in os.listdir(_path):
  39. _abspath = os.path.join(_path,_file)
  40. if os.path.isfile(_abspath):
  41. if _file==filename:
  42. return _abspath
  43. return None
# Path of the character-level vector file, resolved relative to this module's directory.
model_word_file = os.path.dirname(__file__)+"/../singlew2v_model.vector"
# Cache slot for the character-level embedding model; filled on first use by getModel_word().
model_word = None
# Re-entrant lock taken by getModel_word() while the cache slot is populated.
lock_model_word = RLock()
  47. from decimal import Decimal
  48. import logging
  49. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  50. logger = logging.getLogger(__name__)
  51. import pickle
  52. import os
  53. import json
  54. #自定义jsonEncoder
  55. class MyEncoder(json.JSONEncoder):
  56. def __init__(self):
  57. import numpy as np
  58. global np
  59. def default(self, obj):
  60. if isinstance(obj, np.ndarray):
  61. return obj.tolist()
  62. elif isinstance(obj, bytes):
  63. return str(obj, encoding='utf-8')
  64. elif isinstance(obj, (np.float_, np.float16, np.float32,
  65. np.float64)):
  66. return float(obj)
  67. elif isinstance(obj,(np.int64,np.int32)):
  68. return int(obj)
  69. return json.JSONEncoder.default(self, obj)
# Lazily-built token -> index tables (see getIndexOfWord / getIndexOfWords).
vocab_word = None
vocab_words = None
# Pickle files used to cache the vocabularies; relative paths, so they resolve
# against the current working directory.
file_vocab_word = "vocab_word.pk"
file_vocab_words = "vocab_words.pk"
# Remote prediction endpoints, one (authorization, url) pair per model.
# NOTE(review): the authorization values are hard-coded in source; they look
# like access tokens and would normally belong in configuration - confirm.
selffool_authorization = "NjlhMWFjMjVmNWYyNzI0MjY1OGQ1M2Y0ZmY4ZGY0Mzg3Yjc2MTVjYg=="
selffool_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/selffool_gpu"
selffool_seg_authorization = "OWUwM2Q0ZmE3YjYxNzU4YzFiMjliNGVkMTA3MzJkNjQ2MzJiYzBhZg=="
selffool_seg_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/selffool_seg_gpu"
codename_authorization = "Y2M5MDUxMzU1MTU4OGM3ZDk2ZmEzYjkxYmYyYzJiZmUyYTgwYTg5NA=="
codename_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/codename_gpu"
form_item_authorization = "ODdkZWY1YWY0NmNhNjU2OTI2NWY4YmUyM2ZlMDg1NTZjOWRkYTVjMw=="
form_item_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/form"
person_authorization = "N2I2MDU2N2Q2MGQ0ZWZlZGM3NDkyNTA1Nzc4YmM5OTlhY2MxZGU1Mw=="
person_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/person"
role_authorization = "OWM1ZDg5ZDEwYTEwYWI4OGNjYmRlMmQ1NzYwNWNlZGZkZmRmMjE4OQ=="
role_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/role"
money_authorization = "MDQyNjc2ZDczYjBhYmM4Yzc4ZGI4YjRmMjc3NGI5NTdlNzJiY2IwZA=="
money_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/money"
codeclasses_authorization = "MmUyNWIxZjQ2NjAzMWJlMGIzYzkxMjMzNWY5OWI3NzJlMWQ1ZjY4Yw=="
codeclasses_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/codeclasses"
  90. def viterbi_decode(score, transition_params):
  91. """Decode the highest scoring sequence of tags outside of TensorFlow.
  92. This should only be used at test time.
  93. Args:
  94. score: A [seq_len, num_tags] matrix of unary potentials.
  95. transition_params: A [num_tags, num_tags] matrix of binary potentials.
  96. Returns:
  97. viterbi: A [seq_len] list of integers containing the highest scoring tag
  98. indices.
  99. viterbi_score: A float containing the score for the Viterbi sequence.
  100. """
  101. trellis = np.zeros_like(score)
  102. backpointers = np.zeros_like(score, dtype=np.int32)
  103. trellis[0] = score[0]
  104. for t in range(1, score.shape[0]):
  105. v = np.expand_dims(trellis[t - 1], 1) + transition_params
  106. trellis[t] = score[t] + np.max(v, 0)
  107. backpointers[t] = np.argmax(v, 0)
  108. viterbi = [np.argmax(trellis[-1])]
  109. for bp in reversed(backpointers[1:]):
  110. viterbi.append(bp[viterbi[-1]])
  111. viterbi.reverse()
  112. viterbi_score = np.max(trellis[-1])
  113. return viterbi, viterbi_score
  114. def limitRun(sess,list_output,feed_dict,MAX_BATCH=1024):
  115. len_sample = 0
  116. if len(feed_dict.keys())>0:
  117. len_sample = len(feed_dict[list(feed_dict.keys())[0]])
  118. if len_sample>MAX_BATCH:
  119. list_result = [[] for _ in range(len(list_output))]
  120. _begin = 0
  121. while(_begin<len_sample):
  122. new_dict = dict()
  123. for _key in feed_dict.keys():
  124. if isinstance(feed_dict[_key],(float,int,np.int32,np.float_,np.float16,np.float32,np.float64)):
  125. new_dict[_key] = feed_dict[_key]
  126. else:
  127. new_dict[_key] = feed_dict[_key][_begin:_begin+MAX_BATCH]
  128. _output = sess.run(list_output,feed_dict=new_dict)
  129. for _index in range(len(list_output)):
  130. list_result[_index].extend(_output[_index])
  131. _begin += MAX_BATCH
  132. else:
  133. list_result = sess.run(list_output,feed_dict=feed_dict)
  134. return list_result
  135. def get_values(response,output_name):
  136. """
  137. Get the value of a specified output tensor
  138. :param output_name: name of the output tensor
  139. :return: the content of the output tensor
  140. """
  141. output = response.outputs[output_name]
  142. if output.dtype == tf_predict_pb2.DT_FLOAT:
  143. _value = output.float_val
  144. elif output.dtype == tf_predict_pb2.DT_INT8 or output.dtype == tf_predict_pb2.DT_INT16 or \
  145. output.dtype == tf_predict_pb2.DT_INT32:
  146. _value = output.int_val
  147. elif output.dtype == tf_predict_pb2.DT_INT64:
  148. _value = output.int64_val
  149. elif output.dtype == tf_predict_pb2.DT_DOUBLE:
  150. _value = output.double_val
  151. elif output.dtype == tf_predict_pb2.DT_STRING:
  152. _value = output.string_val
  153. elif output.dtype == tf_predict_pb2.DT_BOOL:
  154. _value = output.bool_val
  155. return np.array(_value).reshape(response.outputs[output_name].array_shape.dim)
  156. def vpc_requests(url,authorization,request_data,list_outputs):
  157. headers = {"Authorization": authorization}
  158. dict_outputs = dict()
  159. response = tf_predict_pb2.PredictResponse()
  160. resp = requests.post(url, data=request_data, headers=headers)
  161. if resp.status_code != 200:
  162. print(resp.status_code,resp.content)
  163. log("调用pai-eas接口出错,authorization:"+str(authorization))
  164. return None
  165. else:
  166. response = tf_predict_pb2.PredictResponse()
  167. response.ParseFromString(resp.content)
  168. for _output in list_outputs:
  169. dict_outputs[_output] = get_values(response, _output)
  170. return dict_outputs
  171. def encodeInput(data,word_len,word_flag=True,userFool=False):
  172. result = []
  173. out_index = 0
  174. for item in data:
  175. if out_index in [0]:
  176. list_word = item[-word_len:]
  177. else:
  178. list_word = item[:word_len]
  179. temp = []
  180. if word_flag:
  181. for word in list_word:
  182. if userFool:
  183. temp.append(getIndexOfWord_fool(word))
  184. else:
  185. temp.append(getIndexOfWord(word))
  186. list_append = []
  187. temp_len = len(temp)
  188. while(temp_len<word_len):
  189. if userFool:
  190. list_append.append(0)
  191. else:
  192. list_append.append(getIndexOfWord("<pad>"))
  193. temp_len += 1
  194. if out_index in [0]:
  195. temp = list_append+temp
  196. else:
  197. temp = temp+list_append
  198. else:
  199. for words in list_word:
  200. temp.append(getIndexOfWords(words))
  201. list_append = []
  202. temp_len = len(temp)
  203. while(temp_len<word_len):
  204. list_append.append(getIndexOfWords("<pad>"))
  205. temp_len += 1
  206. if out_index in [0,1]:
  207. temp = list_append+temp
  208. else:
  209. temp = temp+list_append
  210. result.append(temp)
  211. out_index += 1
  212. return result
  213. def encodeInput_form(input,MAX_LEN=30):
  214. x = np.zeros([MAX_LEN])
  215. for i in range(len(input)):
  216. if i>=MAX_LEN:
  217. break
  218. x[i] = getIndexOfWord(input[i])
  219. return x
  220. def getVocabAndMatrix(model,Embedding_size = 60):
  221. '''
  222. @summary:获取子向量的词典和子向量矩阵
  223. '''
  224. vocab = ["<pad>"]+model.index2word
  225. embedding_matrix = np.zeros((len(vocab),Embedding_size))
  226. for i in range(1,len(vocab)):
  227. embedding_matrix[i] = model[vocab[i]]
  228. return vocab,embedding_matrix
  229. def getIndexOfWord(word):
  230. global vocab_word,file_vocab_word
  231. if vocab_word is None:
  232. if os.path.exists(file_vocab_word):
  233. vocab = load(file_vocab_word)
  234. vocab_word = dict((w, i) for i, w in enumerate(np.array(vocab)))
  235. else:
  236. model = getModel_word()
  237. vocab,_ = getVocabAndMatrix(model, Embedding_size=60)
  238. vocab_word = dict((w, i) for i, w in enumerate(np.array(vocab)))
  239. save(vocab,file_vocab_word)
  240. if word in vocab_word.keys():
  241. return vocab_word[word]
  242. else:
  243. return vocab_word['<pad>']
  244. def changeIndexFromWordToWords(tokens,word_index):
  245. '''
  246. @summary:转换某个字的字偏移为词偏移
  247. '''
  248. before_index = 0
  249. after_index = 0
  250. for i in range(len(tokens)):
  251. after_index = after_index+len(tokens[i])
  252. if before_index<=word_index and after_index>word_index:
  253. return i
  254. before_index = after_index
  255. def getIndexOfWords(words):
  256. global vocab_words,file_vocab_words
  257. if vocab_words is None:
  258. if os.path.exists(file_vocab_words):
  259. vocab = load(file_vocab_words)
  260. vocab_words = dict((w, i) for i, w in enumerate(np.array(vocab)))
  261. else:
  262. model = getModel_w2v()
  263. vocab,_ = getVocabAndMatrix(model, Embedding_size=128)
  264. vocab_words = dict((w, i) for i, w in enumerate(np.array(vocab)))
  265. save(vocab,file_vocab_words)
  266. if words in vocab_words.keys():
  267. return vocab_words[words]
  268. else:
  269. return vocab_words["<pad>"]
  270. def log(msg):
  271. '''
  272. @summary:打印信息
  273. '''
  274. logger.info(msg)
  275. def debug(msg):
  276. '''
  277. @summary:打印信息
  278. '''
  279. logger.debug(msg)
  280. def save(object_to_save, path):
  281. '''
  282. 保存对象
  283. @Arugs:
  284. object_to_save: 需要保存的对象
  285. @Return:
  286. 保存的路径
  287. '''
  288. with open(path, 'wb') as f:
  289. pickle.dump(object_to_save, f)
  290. def load(path):
  291. '''
  292. 读取对象
  293. @Arugs:
  294. path: 读取的路径
  295. @Return:
  296. 读取的对象
  297. '''
  298. with open(path, 'rb') as f:
  299. object1 = pickle.load(f)
  300. return object1
  301. fool_char_to_id = load(os.path.dirname(__file__)+"/fool_char_to_id.pk")
  302. def getIndexOfWord_fool(word):
  303. if word in fool_char_to_id.keys():
  304. return fool_char_to_id[word]
  305. else:
  306. return fool_char_to_id["[UNK]"]
  307. def find_index(list_tofind,text):
  308. '''
  309. @summary: 查找所有词汇在字符串中第一次出现的位置
  310. @param:
  311. list_tofind:待查找词汇
  312. text:字符串
  313. @return: list,每个词汇第一次出现的位置
  314. '''
  315. result = []
  316. for item in list_tofind:
  317. index = text.find(item)
  318. if index>=0:
  319. result.append(index)
  320. else:
  321. result.append(-1)
  322. return result
  323. def combine(list1,list2):
  324. '''
  325. @summary:将两个list中的字符串两两拼接
  326. @param:
  327. list1:字符串list
  328. list2:字符串list
  329. @return:拼接结果list
  330. '''
  331. result = []
  332. for item1 in list1:
  333. for item2 in list2:
  334. result.append(str(item1)+str(item2))
  335. return result
  336. def getDigitsDic(unit):
  337. '''
  338. @summary:拿到中文对应的数字
  339. '''
  340. DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9,
  341. "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9}
  342. return DigitsDic.get(unit)
  343. def getMultipleFactor(unit):
  344. '''
  345. @summary:拿到单位对应的值
  346. '''
  347. MultipleFactor = {"兆":Decimal(1000000000000),"亿":Decimal(100000000),"万":Decimal(10000),"仟":Decimal(1000),"千":Decimal(1000),"佰":Decimal(100),"百":Decimal(100),"拾":Decimal(10),"十":Decimal(10),"元":Decimal(1),"圆":Decimal(1),"角":round(Decimal(0.1),1),"分":round(Decimal(0.01),2)}
  348. return MultipleFactor.get(unit)
  349. def getUnifyMoney(money):
  350. '''
  351. @summary:将中文金额字符串转换为数字金额
  352. @param:
  353. money:中文金额字符串
  354. @return: decimal,数据金额
  355. '''
  356. MAX_MONEY = 1000000000000
  357. MAX_NUM = 12
  358. #去掉逗号
  359. money = re.sub("[,,]","",money)
  360. money = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",money)
  361. result = Decimal(0)
  362. chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
  363. # chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","圆","元","角","分"]
  364. chnFactorUnits = ["圆", "元","兆", "亿", "万", "仟", "佰", "拾", "角", "分", '十', '百', '千']
  365. LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$")
  366. BigMoneypattern = re.compile("^零?(?P<BigMoney>[%s])$"%("".join(chnDigits)))
  367. try:
  368. if re.search(LowMoneypattern,money) is not None:
  369. return Decimal(money)
  370. elif re.search(BigMoneypattern,money) is not None:
  371. return getDigitsDic(re.search(BigMoneypattern,money).group("BigMoney"))
  372. for factorUnit in chnFactorUnits:
  373. if re.search(re.compile(".*%s.*"%(factorUnit)),money) is not None:
  374. subMoneys = re.split(re.compile("%s(?!.*%s.*)"%(factorUnit,factorUnit)),money)
  375. if re.search(re.compile("^(\d+)(\.\d+)?$"),subMoneys[0]) is not None:
  376. if MAX_MONEY/getMultipleFactor(factorUnit)<Decimal(subMoneys[0]):
  377. return Decimal(0)
  378. result += Decimal(subMoneys[0])*(getMultipleFactor(factorUnit))
  379. elif len(subMoneys[0])==1:
  380. if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[0]) is not None:
  381. result += Decimal(getDigitsDic(subMoneys[0]))*(getMultipleFactor(factorUnit))
  382. # subMoneys[0]中无金额单位,不可再拆分
  383. elif subMoneys[0]=="":
  384. result += 0
  385. elif re.search(re.compile("[%s]"%("".join(chnFactorUnits))),subMoneys[0]) is None:
  386. # print(subMoneys)
  387. # subMoneys[0] = subMoneys[0][0]
  388. result += Decimal(getUnifyMoney(subMoneys[0])) * (getMultipleFactor(factorUnit))
  389. else:
  390. result += Decimal(getUnifyMoney(subMoneys[0]))*(getMultipleFactor(factorUnit))
  391. if len(subMoneys)>1:
  392. if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"),subMoneys[1]) is not None:
  393. result += Decimal(subMoneys[1])
  394. elif len(subMoneys[1])==1:
  395. if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[1]) is not None:
  396. result += Decimal(getDigitsDic(subMoneys[1]))
  397. else:
  398. result += Decimal(getUnifyMoney(subMoneys[1]))
  399. break
  400. except Exception as e:
  401. # traceback.print_exc()
  402. return Decimal(0)
  403. return result
  404. def getModel_w2v():
  405. '''
  406. @summary:加载词向量
  407. '''
  408. global model_w2v,lock_model_w2v
  409. with lock_model_w2v:
  410. if model_w2v is None:
  411. model_w2v = gensim.models.KeyedVectors.load_word2vec_format(getw2vfilepath(),binary=True)
  412. return model_w2v
  413. def getModel_word():
  414. '''
  415. @summary:加载字向量
  416. '''
  417. global model_word,lock_model_w2v
  418. with lock_model_word:
  419. if model_word is None:
  420. model_word = gensim.models.KeyedVectors.load_word2vec_format(model_word_file,binary=True)
  421. return model_word
  422. # getModel_w2v()
  423. # getModel_word()
  424. def findAllIndex(substr,wholestr):
  425. '''
  426. @summary: 找到字符串的子串的所有begin_index
  427. @param:
  428. substr:子字符串
  429. wholestr:子串所在完整字符串
  430. @return: list,字符串的子串的所有begin_index
  431. '''
  432. copystr = wholestr
  433. result = []
  434. indexappend = 0
  435. while(True):
  436. index = copystr.find(substr)
  437. if index<0:
  438. break
  439. else:
  440. result.append(indexappend+index)
  441. indexappend += index+len(substr)
  442. copystr = copystr[index+len(substr):]
  443. return result
  444. def spanWindow(tokens,begin_index,end_index,size,center_include=False,word_flag = False,use_text = False,text = None):
  445. '''
  446. @summary:取得某个实体的上下文词汇
  447. @param:
  448. tokens:句子分词list
  449. begin_index:实体的开始index
  450. end_index:实体的结束index
  451. size:左右两边各取多少个词
  452. center_include:是否包含实体
  453. word_flag:词/字,默认是词
  454. @return: list,实体的上下文词汇
  455. '''
  456. if use_text:
  457. assert text is not None
  458. length_tokens = len(tokens)
  459. if begin_index>size:
  460. begin = begin_index-size
  461. else:
  462. begin = 0
  463. if end_index+size<length_tokens:
  464. end = end_index+size+1
  465. else:
  466. end = length_tokens
  467. result = []
  468. if not word_flag:
  469. result.append(tokens[begin:begin_index])
  470. if center_include:
  471. if use_text:
  472. result.append(text)
  473. else:
  474. result.append(tokens[begin_index:end_index+1])
  475. result.append(tokens[end_index+1:end])
  476. else:
  477. result.append("".join(tokens[begin:begin_index]))
  478. if center_include:
  479. if use_text:
  480. result.append(text)
  481. else:
  482. result.append("".join(tokens[begin_index:end_index+1]))
  483. result.append("".join(tokens[end_index+1:end]))
  484. #print(result)
  485. return result
  486. #根据规则补全编号或名称两边的符号
  487. def fitDataByRule(data):
  488. symbol_dict = {"(":")",
  489. "(":")",
  490. "[":"]",
  491. "【":"】",
  492. ")":"(",
  493. ")":"(",
  494. "]":"[",
  495. "】":"【"}
  496. leftSymbol_pattern = re.compile("[\((\[【]")
  497. rightSymbol_pattern = re.compile("[\))\]】]")
  498. leftfinds = re.findall(leftSymbol_pattern,data)
  499. rightfinds = re.findall(rightSymbol_pattern,data)
  500. result = data
  501. if len(leftfinds)+len(rightfinds)==0:
  502. return data
  503. elif len(leftfinds)==len(rightfinds):
  504. return data
  505. elif abs(len(leftfinds)-len(rightfinds))==1:
  506. if len(leftfinds)>len(rightfinds):
  507. if symbol_dict.get(data[0]) is not None:
  508. result = data[1:]
  509. else:
  510. #print(symbol_dict.get(leftfinds[0]))
  511. result = data+symbol_dict.get(leftfinds[0])
  512. else:
  513. if symbol_dict.get(data[-1]) is not None:
  514. result = data[:-1]
  515. else:
  516. result = symbol_dict.get(rightfinds[0])+data
  517. result = re.sub("[。]","",result)
  518. return result
  519. from datetime import date
  520. # 时间合法性判断
  521. def isValidDate(year, month, day):
  522. try:
  523. date(year, month, day)
  524. except:
  525. return False
  526. else:
  527. return True
  528. time_format_pattern = re.compile("((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
  529. from BiddingKG.dl.ratio.re_ratio import getUnifyNum
  530. def timeFormat(_time):
  531. current_year = time.strftime("%Y",time.localtime())
  532. all_match = re.finditer(time_format_pattern,_time)
  533. for _match in all_match:
  534. if len(_match.group())>0:
  535. legal = True
  536. year = ""
  537. month = ""
  538. day = ""
  539. for k,v in _match.groupdict().items():
  540. if k=="year":
  541. year = v
  542. if k=="month":
  543. month = v
  544. if k=="day":
  545. day = v
  546. if year!="":
  547. if re.search("^\d+$",year):
  548. if len(year)==2:
  549. year = "20"+year
  550. if int(year)>int(current_year):
  551. legal = False
  552. else:
  553. _year = ""
  554. for word in year:
  555. if word == '0':
  556. _year += word
  557. else:
  558. _year += str(getDigitsDic(word))
  559. year = _year
  560. else:
  561. legal = False
  562. if month!="":
  563. if re.search("^\d+$", month):
  564. if int(month)>12:
  565. legal = False
  566. else:
  567. month = int(getUnifyNum(month))
  568. if month>=1 and month<=12:
  569. month = str(month)
  570. else:
  571. legal = False
  572. else:
  573. legal = False
  574. if day!="":
  575. if re.search("^\d+$", day):
  576. if int(day)>31:
  577. legal = False
  578. else:
  579. day = int(getUnifyNum(day))
  580. if day >= 1 and day <= 31:
  581. day = str(day)
  582. else:
  583. legal = False
  584. else:
  585. legal = False
  586. # print(year,month,day)
  587. if not isValidDate(int(year),int(month),int(day)):
  588. legal = False
  589. if legal:
  590. return "%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0"))
  591. return ""
  592. def embedding(datas,shape):
  593. '''
  594. @summary:查找词汇对应的词向量
  595. @param:
  596. datas:词汇的list
  597. shape:结果的shape
  598. @return: array,返回对应shape的词嵌入
  599. '''
  600. model_w2v = getModel_w2v()
  601. embed = np.zeros(shape)
  602. length = shape[1]
  603. out_index = 0
  604. #print(datas)
  605. for data in datas:
  606. index = 0
  607. for item in data:
  608. item_not_space = re.sub("\s*","",item)
  609. if index>=length:
  610. break
  611. if item_not_space in model_w2v.vocab:
  612. embed[out_index][index] = model_w2v[item_not_space]
  613. index += 1
  614. else:
  615. #embed[out_index][index] = model_w2v['unk']
  616. index += 1
  617. out_index += 1
  618. return embed
  619. def embedding_word(datas,shape):
  620. '''
  621. @summary:查找词汇对应的词向量
  622. @param:
  623. datas:词汇的list
  624. shape:结果的shape
  625. @return: array,返回对应shape的词嵌入
  626. '''
  627. model_w2v = getModel_word()
  628. embed = np.zeros(shape)
  629. length = shape[1]
  630. out_index = 0
  631. #print(datas)
  632. for data in datas:
  633. index = 0
  634. for item in str(data)[-shape[1]:]:
  635. if index>=length:
  636. break
  637. if item in model_w2v.vocab:
  638. embed[out_index][index] = model_w2v[item]
  639. index += 1
  640. else:
  641. # embed[out_index][index] = model_w2v['unk']
  642. index += 1
  643. out_index += 1
  644. return embed
  645. def embedding_word_forward(datas,shape):
  646. '''
  647. @summary:查找词汇对应的词向量
  648. @param:
  649. datas:词汇的list
  650. shape:结果的shape
  651. @return: array,返回对应shape的词嵌入
  652. '''
  653. model_w2v = getModel_word()
  654. embed = np.zeros(shape)
  655. length = shape[1]
  656. out_index = 0
  657. #print(datas)
  658. for data in datas:
  659. index = 0
  660. for item in str(data)[:shape[1]]:
  661. if index>=length:
  662. break
  663. if item in model_w2v.vocab:
  664. embed[out_index][index] = model_w2v[item]
  665. index += 1
  666. else:
  667. # embed[out_index][index] = model_w2v['unk']
  668. index += 1
  669. out_index += 1
  670. return embed
  671. def formEncoding(text,shape=(100,60),expand=False):
  672. embedding = np.zeros(shape)
  673. word_model = getModel_word()
  674. for i in range(len(text)):
  675. if i>=shape[0]:
  676. break
  677. if text[i] in word_model.vocab:
  678. embedding[i] = word_model[text[i]]
  679. if expand:
  680. embedding = np.expand_dims(embedding,0)
  681. return embedding
  682. def partMoney(entity_text,input2_shape = [7]):
  683. '''
  684. @summary:对金额分段
  685. @param:
  686. entity_text:数值金额
  687. input2_shape:分类数
  688. @return: array,分段之后的独热编码
  689. '''
  690. money = float(entity_text)
  691. parts = np.zeros(input2_shape)
  692. if money<100:
  693. parts[0] = 1
  694. elif money<1000:
  695. parts[1] = 1
  696. elif money<10000:
  697. parts[2] = 1
  698. elif money<100000:
  699. parts[3] = 1
  700. elif money<1000000:
  701. parts[4] = 1
  702. elif money<10000000:
  703. parts[5] = 1
  704. else:
  705. parts[6] = 1
  706. return parts
  707. def uniform_num(num):
  708. d1 = {'一': '1', '二': '2', '三': '3', '四': '4', '五': '5', '六': '6', '七': '7', '八': '8', '九': '9', '十': '10'}
  709. # d2 = {'A': '1', 'B': '2', 'C': '3', 'D': '4', 'E': '5', 'F': '6', 'G': '7', 'H': '8', 'I': '9', 'J': '10'}
  710. d3 = {'Ⅰ': '1', 'Ⅱ': '2', 'Ⅲ': '3', 'Ⅳ': '4', 'Ⅴ': '5', 'Ⅵ': '6', 'Ⅶ': '7'}
  711. if num.isdigit():
  712. if re.search('^0[\d]$', num):
  713. num = num[1:]
  714. return num
  715. elif re.search('^[一二三四五六七八九十]+$', num):
  716. _digit = re.search('^[一二三四五六七八九十]+$', num).group(0)
  717. if len(_digit) == 1:
  718. num = d1[_digit]
  719. elif len(_digit) == 2 and _digit[0] == '十':
  720. num = '1'+ d1[_digit[1]]
  721. elif len(_digit) == 2 and _digit[1] == '十':
  722. num = d1[_digit[0]] + '0'
  723. elif len(_digit) == 3 and _digit[1] == '十':
  724. num = d1[_digit[0]] + d1[_digit[2]]
  725. elif re.search('[ⅠⅡⅢⅣⅤⅥⅦ]', num):
  726. num = re.search('[ⅠⅡⅢⅣⅤⅥⅦ]', num).group(0)
  727. num = d3[num]
  728. return num
  729. def uniform_package_name(package_name):
  730. '''
  731. 统一规范化包号。数值类型统一为阿拉伯数字,字母统一为大写,包含施工监理等抽到前面, 例 A包监理一标段 统一为 监理A1 ; 包Ⅱ 统一为 2
  732. :param package_name: 字符串类型 包号
  733. :return:
  734. '''
  735. package_name_raw = package_name
  736. package_name = re.sub('pdf|doc|docs|xlsx|rar|\d{4}年', ' ', package_name)
  737. kw = re.search('(施工|监理|监测|勘察|设计|劳务)', package_name)
  738. name = ""
  739. if kw:
  740. name += kw.group(0)
  741. if re.search('^[a-zA-Z0-9-]{5,}$', package_name): # 五个字符以上编号
  742. _digit = re.search('^[a-zA-Z0-9-]{5,}$', package_name).group(0).upper()
  743. name += _digit
  744. elif re.search('(?P<eng>[a-zA-Z])包[:)]?第?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))标段?', package_name): # 处理类似 A包2标段
  745. ser = re.search('(?P<eng>[a-zA-Z])包[:)]?第?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))标段?', package_name)
  746. _char = ser.groupdict().get('eng')
  747. if _char:
  748. _char = _char.upper()
  749. _digit = ser.groupdict().get('num')
  750. _digit = uniform_num(_digit)
  751. name += _char.upper() + _digit
  752. elif re.search('第?(?P<eng>[a-zA-Z]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))(标[段号的包项]?|([分子]?[包标]))', package_name): # 处理类似 A包2标段
  753. ser = re.search('第?(?P<eng>[a-zA-Z]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))(标[段号的包项]?|([分子]?[包标]))', package_name)
  754. _char = ser.groupdict().get('eng')
  755. if _char:
  756. _char = _char.upper()
  757. _digit = ser.groupdict().get('num')
  758. _digit = uniform_num(_digit)
  759. if _char:
  760. name += _char.upper()
  761. name += _digit
  762. elif re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))', package_name): # 数字的统一的阿拉伯数字
  763. ser = re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))',package_name)
  764. _char = ser.groupdict().get('eng')
  765. if _char:
  766. _char = _char.upper()
  767. _digit = ser.groupdict().get('num')
  768. _digit = uniform_num(_digit)
  769. if _char:
  770. name += _char.upper()
  771. name += _digit
  772. elif re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z]{1,4})', package_name): # 数字的统一的阿拉伯数字
  773. _digit = re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z]{1,4})', package_name).group('eng').upper()
  774. name += _digit
  775. elif re.search('(?P<eng>[a-zA-Z]{1,4})(标[段号的包项]|([分子]?[包标]|包[组件号]))', package_name): # 数字的统一的阿拉伯数字
  776. _digit = re.search('(?P<eng>[a-zA-Z]{1,4})(标[段号的包项]|([分子]?[包标]|包[组件号]))', package_name).group('eng').upper()
  777. name += _digit
  778. elif re.search('^([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4})$', package_name): # 数字的统一的阿拉伯数字
  779. _digit = re.search('^([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4})$', package_name).group(0)
  780. _digit = uniform_num(_digit)
  781. name += _digit
  782. elif re.search('^[a-zA-Z0-9-]+$', package_name):
  783. _char = re.search('^[a-zA-Z0-9-]+$', package_name).group(0)
  784. name += _char.upper()
  785. if name == "":
  786. return package_name_raw
  787. else:
  788. # print('原始包号:%s, 处理后:%s'%(package_name, name))
  789. return name
  790. def money_process(money_text, header):
  791. '''
  792. 输入金额文本及金额列表头,返回统一数字化金额及金额单位
  793. :param money_text:金额字符串
  794. :param header:金额列表头,用于提取单位
  795. :return:
  796. '''
  797. money = 0
  798. money_unit = ""
  799. re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", money_text)
  800. if re_price:
  801. money_text = re_price.group(0)
  802. if '万元' in header and '万' not in money_text:
  803. money_text += '万元'
  804. money = float(getUnifyMoney(money_text))
  805. if money > 10000000000000: # 大于万亿的去除
  806. money = 0
  807. money_unit = '万元' if '万' in money_text else '元'
  808. return (money, money_unit)
  809. def recall(y_true, y_pred):
  810. '''
  811. 计算召回率
  812. @Argus:
  813. y_true: 正确的标签
  814. y_pred: 模型预测的标签
  815. @Return
  816. 召回率
  817. '''
  818. c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  819. c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
  820. if c3 == 0:
  821. return 0
  822. recall = c1 / c3
  823. return recall
  824. def f1_score(y_true, y_pred):
  825. '''
  826. 计算F1
  827. @Argus:
  828. y_true: 正确的标签
  829. y_pred: 模型预测的标签
  830. @Return
  831. F1值
  832. '''
  833. c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  834. c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
  835. c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
  836. precision = c1 / c2
  837. if c3 == 0:
  838. recall = 0
  839. else:
  840. recall = c1 / c3
  841. f1_score = 2 * (precision * recall) / (precision + recall)
  842. return f1_score
  843. def precision(y_true, y_pred):
  844. '''
  845. 计算精确率
  846. @Argus:
  847. y_true: 正确的标签
  848. y_pred: 模型预测的标签
  849. @Return
  850. 精确率
  851. '''
  852. c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  853. c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
  854. precision = c1 / c2
  855. return precision
  856. # def print_metrics(history):
  857. # '''
  858. # 制作每次迭代的各metrics变化图片
  859. #
  860. # @Arugs:
  861. # history: 模型训练迭代的历史记录
  862. # '''
  863. # import matplotlib.pyplot as plt
  864. #
  865. # # loss图
  866. # loss = history.history['loss']
  867. # val_loss = history.history['val_loss']
  868. # epochs = range(1, len(loss) + 1)
  869. # plt.subplot(2, 2, 1)
  870. # plt.plot(epochs, loss, 'bo', label='Training loss')
  871. # plt.plot(epochs, val_loss, 'b', label='Validation loss')
  872. # plt.title('Training and validation loss')
  873. # plt.xlabel('Epochs')
  874. # plt.ylabel('Loss')
  875. # plt.legend()
  876. #
  877. # # f1图
  878. # f1 = history.history['f1_score']
  879. # val_f1 = history.history['val_f1_score']
  880. # plt.subplot(2, 2, 2)
  881. # plt.plot(epochs, f1, 'bo', label='Training f1')
  882. # plt.plot(epochs, val_f1, 'b', label='Validation f1')
  883. # plt.title('Training and validation f1')
  884. # plt.xlabel('Epochs')
  885. # plt.ylabel('F1')
  886. # plt.legend()
  887. #
  888. # # precision图
  889. # prec = history.history['precision']
  890. # val_prec = history.history['val_precision']
  891. # plt.subplot(2, 2, 3)
  892. # plt.plot(epochs, prec, 'bo', label='Training precision')
  893. # plt.plot(epochs, val_prec, 'b', label='Validation pecision')
  894. # plt.title('Training and validation precision')
  895. # plt.xlabel('Epochs')
  896. # plt.ylabel('Precision')
  897. # plt.legend()
  898. #
  899. # # recall图
  900. # recall = history.history['recall']
  901. # val_recall = history.history['val_recall']
  902. # plt.subplot(2, 2, 4)
  903. # plt.plot(epochs, recall, 'bo', label='Training recall')
  904. # plt.plot(epochs, val_recall, 'b', label='Validation recall')
  905. # plt.title('Training and validation recall')
  906. # plt.xlabel('Epochs')
  907. # plt.ylabel('Recall')
  908. # plt.legend()
  909. #
  910. # plt.show()
# Manual smoke test: parse one Chinese money string when run as a script.
if __name__=="__main__":
    # print(fool_char_to_id[">"])
    print(getUnifyMoney('伍仟贰佰零壹拾伍万零捌佰壹拾元陆角伍分'))
    # model = getModel_w2v()
    # vocab,matrix = getVocabAndMatrix(model, Embedding_size=128)
    # save([vocab,matrix],"vocabMatrix_words.pk")