Utils.py 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101
  1. '''
  2. Created on 2018年12月20日
  3. @author: User
  4. '''
  5. import numpy as np
  6. import re
  7. import gensim
  8. import os
  9. from threading import RLock
  10. # from pai_tf_predict_proto import tf_predict_pb2
  11. import requests
  12. import time
  13. from bs4 import BeautifulSoup
# Lazily-initialised gensim word2vec model, guarded by its own lock.
model_w2v = None
lock_model_w2v = RLock()
# Switch for calling remote PAI-EAS inference services instead of local models.
USE_PAI_EAS = False
# When True, heavyweight models are only loaded on first use.
Lazy_load = False
# Control characters that are illegal in XML/Excel output.
ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
import smtplib
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.utils import formataddr
# NOTE(review): duplicate of ILLEGAL_CHARACTERS_RE defined a few lines above.
ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
  24. def getLegal_str(_str):
  25. if _str is not None:
  26. return ILLEGAL_CHARACTERS_RE.sub("",str(_str))
import traceback
# Cache of live SMTP connections keyed by "host-username-password".
dict_server = {}
  29. def getServer(host,username,password,reconnect=False):
  30. key = "%s-%s-%s"%(host,username,password)
  31. if key in dict_server:
  32. server = dict_server[key]
  33. if reconnect:
  34. server = smtplib.SMTP_SSL(host, 465)
  35. server.login(username,password)
  36. else:
  37. server = smtplib.SMTP_SSL(host, 465)
  38. server.login(username,password)
  39. dict_server[key] = server
  40. return server
  41. from email.mime.text import MIMEText
  42. def sendEmail(host,username,password,receivers,subject="数据导出",content="",attachs=[]):
  43. try:
  44. #处理附件
  45. msg = MIMEMultipart()
  46. msg["From"] = formataddr(["广州比地数据科技有限公司",username])
  47. msg["To"] = formataddr(["客户",receivers[0]])
  48. msg["Subject"] = subject
  49. message = MIMEText(content, 'plain', 'utf-8')
  50. for at in attachs:
  51. xlsfile = MIMEApplication(open(at,"rb").read())
  52. xlsfile.add_header("Content-Disposition","attachment",filename=('gbk', '', at.split("/")[-1]))
  53. log(at.split("/")[-1])
  54. msg.attach(xlsfile)
  55. server = getServer(host,username,password)
  56. server.sendmail(username,receivers,msg.as_string())
  57. log("发送邮件成功%s"%str(attachs))
  58. except smtplib.SMTPServerDisconnected as e:
  59. server = getServer(host,username,password,reconnect=True)
  60. server.sendmail(username,receivers,msg.as_string())
  61. log("发送邮件成功%s"%str(attachs))
  62. except Exception as e:
  63. traceback.print_exc()
  64. log("发送邮件错误%s"%str(e))
  65. finally:
  66. server.close()
  67. mobile_pattern = re.compile("^1\d{10}$")
  68. def recog_likeType(phone):
  69. if re.search(mobile_pattern,phone) is not None:
  70. return "mobile"
  71. else:
  72. return "phone"
def article_limit(soup, limit_words=30000):
    """
    Truncate an announcement HTML tree (in place) to roughly `limit_words`
    non-whitespace characters of visible text.  If the page contains an
    attachment section (div.richTextFetch), the main text and the attachment
    text are each limited to `limit_words` separately.
    :param soup: BeautifulSoup tree of the page (modified in place)
    :param limit_words: maximum number of non-whitespace characters
    :return: the truncated HTML as a string
    """
    sub_space = re.compile("\s+")
    def soup_limit(_soup, _count, max_count=30000, max_gap=500):
        """
        :param _soup: soup
        :param _count: characters counted so far
        :param max_count: hard character limit
        :param max_gap: tolerated overshoot past the limit
        :return: (new count, count - max_count, subtree to continue in or None)
        """
        _gap = _count - max_count
        _is_skip = False
        next_soup = None
        # Descend through wrapper tags that hold a single child with identical
        # text, so truncation happens at the innermost meaningful level.
        while len(_soup.find_all(recursive=False)) == 1 and \
                _soup.get_text(strip=True) == _soup.find_all(recursive=False)[0].get_text(strip=True):
            _soup = _soup.find_all(recursive=False)[0]
        if len(_soup.find_all(recursive=False)) == 0:
            # Leaf node: cut its text to the remaining budget.
            _soup.string = str(_soup.get_text())[:max_count-_count]
            _count += len(re.sub(sub_space, "", _soup.string))
            _gap = _count - max_count
            next_soup = None
        else:
            for _soup_part in _soup.find_all(recursive=False):
                if not _is_skip:
                    _count += len(re.sub(sub_space, "", _soup_part.get_text()))
                    if _count >= max_count:
                        _gap = _count - max_count
                        if _gap <= max_gap:
                            # Small overshoot: keep this part whole, drop the rest.
                            _is_skip = True
                        else:
                            # Large overshoot: recurse into this part next round.
                            _is_skip = True
                            next_soup = _soup_part
                            _count -= len(re.sub(sub_space, "", _soup_part.get_text()))
                            continue
                else:
                    # Past the limit: remove remaining siblings.
                    _soup_part.decompose()
        return _count, _gap, next_soup
    text_count = 0
    have_attachment = False
    attachment_part = None
    # Locate the attachment container div.richTextFetch, if present.
    for child in soup.find_all(recursive=True):
        if child.name == 'div' and 'class' in child.attrs:
            if "richTextFetch" in child['class']:
                # Marker used below to split main text from attachment text.
                child.insert_before("##attachment##")
                attachment_part = child
                have_attachment = True
                break
    if not have_attachment:
        # No attachment: limit the whole document.
        if len(re.sub(sub_space, "", soup.get_text())) > limit_words:
            text_count, gap, n_soup = soup_limit(soup, text_count, max_count=limit_words, max_gap=500)
            while n_soup:
                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=500)
    else:
        # With attachment: limit main text and attachment text separately.
        _text = re.sub(sub_space, "", soup.get_text())
        _text_split = _text.split("##attachment##")
        if len(_text_split[0]) > limit_words:
            main_soup = attachment_part.parent
            main_text = main_soup.find_all(recursive=False)[0]
            text_count, gap, n_soup = soup_limit(main_text, text_count, max_count=limit_words, max_gap=500)
            while n_soup:
                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=500)
        if len(_text_split[1]) > limit_words:
            # Attachment is plain text with no sub-structure.
            if len(attachment_part.find_all(recursive=False)) == 0:
                attachment_part.string = str(attachment_part.get_text())[:limit_words]
            else:
                attachment_text_nums = 0
                attachment_skip = False
                for part in attachment_part.find_all(recursive=False):
                    if not attachment_skip:
                        last_attachment_text_nums = attachment_text_nums
                        attachment_text_nums = attachment_text_nums + len(re.sub(sub_space, "", part.get_text()))
                        if attachment_text_nums >= limit_words:
                            # Trim the part that crosses the limit, skip the rest.
                            part.string = str(part.get_text())[:limit_words-last_attachment_text_nums]
                            attachment_skip = True
                    else:
                        part.decompose()
    soup = str(soup).replace("##attachment##", "")
    return soup
def soup_limit(_soup, _count, max_count=30000, max_gap=500, sub_space=re.compile("\s+")):
    """
    Module-level copy of article_limit's inner helper: truncate one subtree
    so the running character count stays near max_count.
    :param _soup: soup (modified in place)
    :param _count: characters counted so far
    :param max_count: hard character limit
    :param max_gap: tolerated overshoot past the limit
    :return: (new count, count - max_count, subtree to continue in or None)
    """
    _gap = _count - max_count
    _is_skip = False
    next_soup = None
    # Descend through wrapper tags holding a single child with identical text.
    while len(_soup.find_all(recursive=False)) == 1 and \
            _soup.get_text(strip=True) == _soup.find_all(recursive=False)[0].get_text(strip=True):
        _soup = _soup.find_all(recursive=False)[0]
    if len(_soup.find_all(recursive=False)) == 0:
        # Leaf node: cut its text to the remaining budget.
        _soup.string = str(_soup.get_text())[:max_count-_count]
        _count += len(re.sub(sub_space, "", _soup.string))
        _gap = _count - max_count
        next_soup = None
    else:
        for _soup_part in _soup.find_all(recursive=False):
            if not _is_skip:
                _count += len(re.sub(sub_space, "", _soup_part.get_text()))
                if _count >= max_count:
                    _gap = _count - max_count
                    if _gap <= max_gap:
                        # Small overshoot: keep this part whole, drop the rest.
                        _is_skip = True
                    else:
                        # Large overshoot: caller should recurse into this part.
                        _is_skip = True
                        next_soup = _soup_part
                        _count -= len(re.sub(sub_space, "", _soup_part.get_text()))
                        continue
            else:
                # Past the limit: remove remaining siblings.
                _soup_part.decompose()
    return _count, _gap, next_soup
  189. def cut_str(text_list, only_text_list, max_bytes_length=2000000):
  190. try:
  191. # 计算有格式总字节数
  192. bytes_length = 0
  193. for text in text_list:
  194. bytes_length += len(bytes(text, encoding='utf-8'))
  195. # 小于直接返回
  196. if bytes_length < max_bytes_length:
  197. return text_list
  198. # 全部文件连接,重新计算无格式字节数
  199. all_text = ""
  200. bytes_length = 0
  201. for text in only_text_list:
  202. bytes_length += len(bytes(text, encoding='utf-8'))
  203. all_text += text
  204. # 小于直接返回
  205. if bytes_length < max_bytes_length:
  206. return only_text_list
  207. # 截取字符
  208. all_text = all_text[:max_bytes_length//3]
  209. return [all_text]
  210. except Exception as e:
  211. logging.info("cut_str " + str(e))
  212. return text_list
  213. def getLegal_str(_str):
  214. if _str is not None:
  215. return ILLEGAL_CHARACTERS_RE.sub("",str(_str))
  216. def getRow_ots_primary(row):
  217. _dict = dict()
  218. if row is None:
  219. return None
  220. for part in row.attribute_columns:
  221. _dict[part[0]] = part[1]
  222. for part in row.primary_key:
  223. _dict[part[0]] = part[1]
  224. return _dict
  225. def timeAdd(_time,days,format="%Y-%m-%d",minutes=0):
  226. a = time.mktime(time.strptime(_time,format))+86400*days+60*minutes
  227. _time1 = time.strftime(format,time.localtime(a))
  228. return _time1
  229. def getRow_ots(rows):
  230. list_dict = []
  231. for row in rows:
  232. _dict = dict()
  233. for part in row:
  234. for v in part:
  235. _dict[v[0]] = v[1]
  236. list_dict.append(_dict)
  237. return list_dict
  238. def getw2vfilepath():
  239. w2vfile = os.path.dirname(__file__)+"/../wiki_128_word_embedding_new.vector"
  240. if os.path.exists(w2vfile):
  241. return w2vfile
  242. return "wiki_128_word_embedding_new.vector"
  243. def getLazyLoad():
  244. global Lazy_load
  245. return Lazy_load
  246. def get_file_name(url, headers):
  247. filename = ''
  248. if 'Content-Disposition' in headers and headers['Content-Disposition']:
  249. disposition_split = headers['Content-Disposition'].split(';')
  250. if len(disposition_split) > 1:
  251. if disposition_split[1].strip().lower().startswith('filename='):
  252. file_name = disposition_split[1].split('=')
  253. if len(file_name) > 1:
  254. filename = file_name[1]
  255. if not filename and os.path.basename(url):
  256. filename = os.path.basename(url).split("?")[0]
  257. if not filename:
  258. return time.time()
  259. return filename
# Path of the character-level (single char) word2vec model.
model_word_file = os.path.dirname(__file__)+"/../singlew2v_model.vector"
# Lazily-initialised char model plus its loading lock.
model_word = None
lock_model_word = RLock()
from decimal import Decimal
import logging
logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
import pickle
import os
import json
  270. #自定义jsonEncoder
  271. class MyEncoder(json.JSONEncoder):
  272. def __init__(self):
  273. import numpy as np
  274. global np
  275. def default(self, obj):
  276. if isinstance(obj, np.ndarray):
  277. return obj.tolist()
  278. elif isinstance(obj, bytes):
  279. return str(obj, encoding='utf-8')
  280. elif isinstance(obj, (np.float_, np.float16, np.float32,
  281. np.float64)):
  282. return float(obj)
  283. elif isinstance(obj,(np.int64,np.int32)):
  284. return int(obj)
  285. return json.JSONEncoder.default(self, obj)
# Lazily-built token->index vocabularies and their pickle cache files.
vocab_word = None
vocab_words = None
file_vocab_word = "vocab_word.pk"
file_vocab_words = "vocab_words.pk"
# NOTE(review): hard-coded PAI-EAS service endpoints and authorization tokens
# for the remote extraction models (used only when USE_PAI_EAS is enabled);
# consider moving these credentials out of source control.
selffool_authorization = "NjlhMWFjMjVmNWYyNzI0MjY1OGQ1M2Y0ZmY4ZGY0Mzg3Yjc2MTVjYg=="
selffool_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/selffool_gpu"
selffool_seg_authorization = "OWUwM2Q0ZmE3YjYxNzU4YzFiMjliNGVkMTA3MzJkNjQ2MzJiYzBhZg=="
selffool_seg_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/selffool_seg_gpu"
codename_authorization = "Y2M5MDUxMzU1MTU4OGM3ZDk2ZmEzYjkxYmYyYzJiZmUyYTgwYTg5NA=="
codename_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/codename_gpu"
form_item_authorization = "ODdkZWY1YWY0NmNhNjU2OTI2NWY4YmUyM2ZlMDg1NTZjOWRkYTVjMw=="
form_item_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/form"
person_authorization = "N2I2MDU2N2Q2MGQ0ZWZlZGM3NDkyNTA1Nzc4YmM5OTlhY2MxZGU1Mw=="
person_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/person"
role_authorization = "OWM1ZDg5ZDEwYTEwYWI4OGNjYmRlMmQ1NzYwNWNlZGZkZmRmMjE4OQ=="
role_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/role"
money_authorization = "MDQyNjc2ZDczYjBhYmM4Yzc4ZGI4YjRmMjc3NGI5NTdlNzJiY2IwZA=="
money_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/money"
codeclasses_authorization = "MmUyNWIxZjQ2NjAzMWJlMGIzYzkxMjMzNWY5OWI3NzJlMWQ1ZjY4Yw=="
codeclasses_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/codeclasses"
  306. def viterbi_decode(score, transition_params):
  307. """Decode the highest scoring sequence of tags outside of TensorFlow.
  308. This should only be used at test time.
  309. Args:
  310. score: A [seq_len, num_tags] matrix of unary potentials.
  311. transition_params: A [num_tags, num_tags] matrix of binary potentials.
  312. Returns:
  313. viterbi: A [seq_len] list of integers containing the highest scoring tag
  314. indices.
  315. viterbi_score: A float containing the score for the Viterbi sequence.
  316. """
  317. trellis = np.zeros_like(score)
  318. backpointers = np.zeros_like(score, dtype=np.int32)
  319. trellis[0] = score[0]
  320. for t in range(1, score.shape[0]):
  321. v = np.expand_dims(trellis[t - 1], 1) + transition_params
  322. trellis[t] = score[t] + np.max(v, 0)
  323. backpointers[t] = np.argmax(v, 0)
  324. viterbi = [np.argmax(trellis[-1])]
  325. for bp in reversed(backpointers[1:]):
  326. viterbi.append(bp[viterbi[-1]])
  327. viterbi.reverse()
  328. viterbi_score = np.max(trellis[-1])
  329. return viterbi, viterbi_score
  330. import ctypes
  331. import inspect
def _async_raise(tid, exctype):
    """raises the exception, performs cleanup if needed"""
    # The thread id must be passed as a C long to the CPython C-API call.
    tid = ctypes.c_long(tid)
    # PyThreadState_SetAsyncExc expects an exception class, not an instance.
    if not inspect.isclass(exctype):
        exctype = type(exctype)
    # Ask the interpreter to asynchronously raise exctype in thread `tid`.
    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))
    if res == 0:
        raise ValueError("invalid thread id")
    elif res != 1:
        # More than one thread state was affected: revoke the request,
        # otherwise the interpreter would be left in an inconsistent state.
        ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
        raise SystemError("PyThreadState_SetAsyncExc failed")
  343. def stop_thread(thread):
  344. _async_raise(thread.ident, SystemExit)
  345. def limitRun(sess,list_output,feed_dict,MAX_BATCH=1024):
  346. len_sample = 0
  347. if len(feed_dict.keys())>0:
  348. len_sample = len(feed_dict[list(feed_dict.keys())[0]])
  349. if len_sample>MAX_BATCH:
  350. list_result = [[] for _ in range(len(list_output))]
  351. _begin = 0
  352. while(_begin<len_sample):
  353. new_dict = dict()
  354. for _key in feed_dict.keys():
  355. new_dict[_key] = feed_dict[_key][_begin:_begin+MAX_BATCH]
  356. _output = sess.run(list_output,feed_dict=new_dict)
  357. for _index in range(len(list_output)):
  358. list_result[_index].extend(_output[_index])
  359. _begin += MAX_BATCH
  360. else:
  361. list_result = sess.run(list_output,feed_dict=feed_dict)
  362. return list_result
def get_values(response, output_name):
    """
    Get the value of a specified output tensor
    :param output_name: name of the output tensor
    :return: the content of the output tensor, reshaped to its declared shape
    """
    # NOTE(review): relies on tf_predict_pb2, whose import is commented out at
    # the top of the file — only usable when the PAI-EAS proto is available.
    output = response.outputs[output_name]
    # Pick the value field matching the tensor's declared dtype.
    if output.dtype == tf_predict_pb2.DT_FLOAT:
        _value = output.float_val
    elif output.dtype == tf_predict_pb2.DT_INT8 or output.dtype == tf_predict_pb2.DT_INT16 or \
            output.dtype == tf_predict_pb2.DT_INT32:
        _value = output.int_val
    elif output.dtype == tf_predict_pb2.DT_INT64:
        _value = output.int64_val
    elif output.dtype == tf_predict_pb2.DT_DOUBLE:
        _value = output.double_val
    elif output.dtype == tf_predict_pb2.DT_STRING:
        _value = output.string_val
    elif output.dtype == tf_predict_pb2.DT_BOOL:
        _value = output.bool_val
    # NOTE(review): an unrecognised dtype leaves _value unbound (NameError).
    return np.array(_value).reshape(response.outputs[output_name].array_shape.dim)
def vpc_requests(url, authorization, request_data, list_outputs):
    """
    Call a PAI-EAS service over VPC and decode the named output tensors.
    :param url: service endpoint
    :param authorization: token placed in the Authorization header
    :param request_data: serialized PredictRequest body
    :param list_outputs: tensor names to extract from the response
    :return: dict of output name -> numpy array, or None on HTTP failure
    """
    headers = {"Authorization": authorization}
    dict_outputs = dict()
    # NOTE(review): depends on tf_predict_pb2 (import commented out above).
    response = tf_predict_pb2.PredictResponse()
    resp = requests.post(url, data=request_data, headers=headers)
    if resp.status_code != 200:
        print(resp.status_code,resp.content)
        log("调用pai-eas接口出错,authorization:"+str(authorization))
        return None
    else:
        response = tf_predict_pb2.PredictResponse()
        response.ParseFromString(resp.content)
        for _output in list_outputs:
            dict_outputs[_output] = get_values(response, _output)
        return dict_outputs
def encodeInput(data, word_len, word_flag=True, userFool=False):
    """
    Convert a list of token sequences into fixed-length index sequences.
    :param data: list of sequences (chars when word_flag, words otherwise)
    :param word_len: fixed output length per sequence
    :param word_flag: True -> char-level ids, False -> word-level ids
    :param userFool: True -> use the fool model's char ids (pads with 0)
    :return: list of index lists, each exactly word_len long
    """
    result = []
    out_index = 0
    for item in data:
        # The first sequence keeps its tail (left context); the rest keep the head.
        if out_index in [0]:
            list_word = item[-word_len:]
        else:
            list_word = item[:word_len]
        temp = []
        if word_flag:
            for word in list_word:
                if userFool:
                    temp.append(getIndexOfWord_fool(word))
                else:
                    temp.append(getIndexOfWord(word))
            list_append = []
            temp_len = len(temp)
            # Pad up to word_len.
            while(temp_len<word_len):
                if userFool:
                    list_append.append(0)
                else:
                    list_append.append(getIndexOfWord("<pad>"))
                temp_len += 1
            # First sequence is left-padded, the rest right-padded.
            if out_index in [0]:
                temp = list_append+temp
            else:
                temp = temp+list_append
        else:
            for words in list_word:
                temp.append(getIndexOfWords(words))
            list_append = []
            temp_len = len(temp)
            while(temp_len<word_len):
                list_append.append(getIndexOfWords("<pad>"))
                temp_len += 1
            # Word mode: the first TWO sequences are left-padded.
            if out_index in [0,1]:
                temp = list_append+temp
            else:
                temp = temp+list_append
        result.append(temp)
        out_index += 1
    return result
  441. def encodeInput_form(input,MAX_LEN=30):
  442. x = np.zeros([MAX_LEN])
  443. for i in range(len(input)):
  444. if i>=MAX_LEN:
  445. break
  446. x[i] = getIndexOfWord(input[i])
  447. return x
  448. def getVocabAndMatrix(model,Embedding_size = 60):
  449. '''
  450. @summary:获取子向量的词典和子向量矩阵
  451. '''
  452. vocab = ["<pad>"]+model.index2word
  453. embedding_matrix = np.zeros((len(vocab),Embedding_size))
  454. for i in range(1,len(vocab)):
  455. embedding_matrix[i] = model[vocab[i]]
  456. return vocab,embedding_matrix
def getIndexOfWord(word):
    """Return the char's index in the cached char vocabulary; unknown chars
    map to the "<pad>" index.  The vocabulary is built lazily from the pickle
    cache, or from the char w2v model (and then cached) on first use."""
    global vocab_word,file_vocab_word
    if vocab_word is None:
        if os.path.exists(file_vocab_word):
            # fast path: load the pre-built vocabulary
            vocab = load(file_vocab_word)
            vocab_word = dict((w, i) for i, w in enumerate(np.array(vocab)))
        else:
            # slow path: derive the vocabulary from the char model and cache it
            model = getModel_word()
            vocab,_ = getVocabAndMatrix(model, Embedding_size=60)
            vocab_word = dict((w, i) for i, w in enumerate(np.array(vocab)))
            save(vocab,file_vocab_word)
    if word in vocab_word.keys():
        return vocab_word[word]
    else:
        return vocab_word['<pad>']
def getIndexOfWords(words):
    """Return the word's index in the cached word vocabulary; unknown words
    map to the "<pad>" index.  The vocabulary is built lazily from the pickle
    cache, or from the 128-dim w2v model (and then cached) on first use."""
    global vocab_words,file_vocab_words
    if vocab_words is None:
        if os.path.exists(file_vocab_words):
            # fast path: load the pre-built vocabulary
            vocab = load(file_vocab_words)
            vocab_words = dict((w, i) for i, w in enumerate(np.array(vocab)))
        else:
            # slow path: derive the vocabulary from the word model and cache it
            model = getModel_w2v()
            vocab,_ = getVocabAndMatrix(model, Embedding_size=128)
            vocab_words = dict((w, i) for i, w in enumerate(np.array(vocab)))
            save(vocab,file_vocab_words)
    if words in vocab_words.keys():
        return vocab_words[words]
    else:
        return vocab_words["<pad>"]
  487. def isCellphone(phone):
  488. if phone is not None and re.search("^1\d{10}$",str(phone)) is not None:
  489. return True
  490. return False
  491. def popNoneFromDict(_dict):
  492. list_pop = []
  493. for k,v in _dict.items():
  494. if v is None or v=="":
  495. list_pop.append(k)
  496. for k in list_pop:
  497. _dict.pop(k)
  498. return _dict
  499. pattern_attachment = re.compile("\.(?P<attachment>jpg|jpeg|png|swf|tif|pdf|doc|docx|xls|xlsx|zip|rar|tar|7z|wim)$")
  500. def getAttachmentTypeFromUrl(url):
  501. _match = re.search(pattern_attachment,url)
  502. if _match is not None:
  503. return _match.groupdict().get("attachment")
  504. return None
  505. def getAttachmentUrls(sourceHtml):
  506. list_urls = []
  507. _soup = BeautifulSoup(sourceHtml,"lxml")
  508. set_types = set()
  509. list_a = _soup.find_all("a")
  510. for _a in list_a:
  511. _url = _a.attrs.get("href","")
  512. _type = getAttachmentTypeFromUrl(_url)
  513. if _type is not None:
  514. list_urls.append({"url":_url,"type":_type})
  515. list_img = _soup.find_all("img")
  516. for _img in list_img:
  517. _url = _img.attrs.get("src","")
  518. _type = getAttachmentTypeFromUrl(_url)
  519. if _type is not None:
  520. list_urls.append({"url":_url,"type":_type})
  521. return list_urls
  522. def getCurrent_date(format="%Y-%m-%d %H:%M:%S"):
  523. _time = time.strftime(format,time.localtime())
  524. return _time
  525. def log_tofile(filename):
  526. logging.basicConfig(filename=filename,level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  527. logger = logging.getLogger(__name__)
  528. def log(msg):
  529. '''
  530. @summary:打印信息
  531. '''
  532. logger.info(msg)
  533. def debug(msg):
  534. '''
  535. @summary:打印信息
  536. '''
  537. logger.debug(msg)
  538. def save(object_to_save, path):
  539. '''
  540. 保存对象
  541. @Arugs:
  542. object_to_save: 需要保存的对象
  543. @Return:
  544. 保存的路径
  545. '''
  546. with open(path, 'wb') as f:
  547. pickle.dump(object_to_save, f)
  548. def load(path):
  549. '''
  550. 读取对象
  551. @Arugs:
  552. path: 读取的路径
  553. @Return:
  554. 读取的对象
  555. '''
  556. with open(path, 'rb') as f:
  557. object1 = pickle.load(f)
  558. return object1
  559. def getIndexOfWord_fool(word):
  560. if word in fool_char_to_id.keys():
  561. return fool_char_to_id[word]
  562. else:
  563. return fool_char_to_id["[UNK]"]
  564. def find_index(list_tofind,text):
  565. '''
  566. @summary: 查找所有词汇在字符串中第一次出现的位置
  567. @param:
  568. list_tofind:待查找词汇
  569. text:字符串
  570. @return: list,每个词汇第一次出现的位置
  571. '''
  572. result = []
  573. for item in list_tofind:
  574. index = text.find(item)
  575. if index>=0:
  576. result.append(index)
  577. else:
  578. result.append(-1)
  579. return result
  580. def combine(list1,list2):
  581. '''
  582. @summary:将两个list中的字符串两两拼接
  583. @param:
  584. list1:字符串list
  585. list2:字符串list
  586. @return:拼接结果list
  587. '''
  588. result = []
  589. for item1 in list1:
  590. for item2 in list2:
  591. result.append(str(item1)+str(item2))
  592. return result
  593. def getDigitsDic(unit):
  594. '''
  595. @summary:拿到中文对应的数字
  596. '''
  597. DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9,
  598. "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9}
  599. return DigitsDic.get(unit)
  600. def getMultipleFactor(unit):
  601. '''
  602. @summary:拿到单位对应的值
  603. '''
  604. MultipleFactor = {"兆":Decimal(1000000000000),"亿":Decimal(100000000),"万":Decimal(10000),"仟":Decimal(1000),"千":Decimal(1000),"佰":Decimal(100),"百":Decimal(100),"拾":Decimal(10),"十":Decimal(10),"元":Decimal(1),"角":round(Decimal(0.1),1),"分":round(Decimal(0.01),2)}
  605. return MultipleFactor.get(unit)
def getUnifyMoney(money):
    '''
    @summary: convert a Chinese money string to a numeric amount
    @param:
        money: Chinese money string, e.g. "壹拾万元" or "1,000.50"
    @return: Decimal amount
    '''
    MAX_NUM = 12  # NOTE(review): unused constant
    # strip full/half-width thousands separators
    money = re.sub("[,,]","",money)
    # drop everything except digits, dots and Chinese numerals / units
    money = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億〇一二三四五六七八九十百千万亿元角分]","",money)
    result = Decimal(0)
    chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
    chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","元","角","分"]
    LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$")
    BigMoneypattern = re.compile("^零?(?P<BigMoney>[%s])$"%("".join(chnDigits)))
    # Plain arabic number: convert directly.
    if re.search(LowMoneypattern,money) is not None:
        return Decimal(money)
    # A single financial digit (optionally preceded by 零).
    elif re.search(BigMoneypattern,money) is not None:
        return getDigitsDic(re.search(BigMoneypattern,money).group("BigMoney"))
    # Otherwise: split on the largest unit present and recurse on both halves.
    for factorUnit in chnFactorUnits:
        if re.search(re.compile(".*%s.*"%(factorUnit)),money) is not None:
            # split on the LAST occurrence of this unit
            subMoneys = re.split(re.compile("%s(?!.*%s.*)"%(factorUnit,factorUnit)),money)
            # left half: number-of-units (digits, single Chinese digit, or recursive)
            if re.search(re.compile("^(\d+(,)?)+(\.\d+)?$"),subMoneys[0]) is not None:
                result += Decimal(subMoneys[0])*(getMultipleFactor(factorUnit))
            elif len(subMoneys[0])==1:
                if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[0]) is not None:
                    result += Decimal(getDigitsDic(subMoneys[0]))*(getMultipleFactor(factorUnit))
            else:
                result += Decimal(getUnifyMoney(subMoneys[0]))*(getMultipleFactor(factorUnit))
            # right half: the remainder below this unit
            if len(subMoneys)>1:
                if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"),subMoneys[1]) is not None:
                    result += Decimal(subMoneys[1])
                elif len(subMoneys[1])==1:
                    if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[1]) is not None:
                        result += Decimal(getDigitsDic(subMoneys[1]))
                else:
                    result += Decimal(getUnifyMoney(subMoneys[1]))
            break
    return result
  646. def getModel_w2v():
  647. '''
  648. @summary:加载词向量
  649. '''
  650. global model_w2v,lock_model_w2v
  651. with lock_model_w2v:
  652. if model_w2v is None:
  653. model_w2v = gensim.models.KeyedVectors.load_word2vec_format(getw2vfilepath(),binary=True)
  654. return model_w2v
  655. def getModel_word():
  656. '''
  657. @summary:加载字向量
  658. '''
  659. global model_word,lock_model_w2v
  660. with lock_model_word:
  661. if model_word is None:
  662. model_word = gensim.models.KeyedVectors.load_word2vec_format(model_word_file,binary=True)
  663. return model_word
  664. # getModel_w2v()
  665. # getModel_word()
  666. def formatArea(area):
  667. if area is not None and len(area)>=3:
  668. return re.sub("[省市区县]","",area)
  669. return area
  670. def findAllIndex(substr,wholestr):
  671. '''
  672. @summary: 找到字符串的子串的所有begin_index
  673. @param:
  674. substr:子字符串
  675. wholestr:子串所在完整字符串
  676. @return: list,字符串的子串的所有begin_index
  677. '''
  678. copystr = wholestr
  679. result = []
  680. indexappend = 0
  681. while(True):
  682. index = copystr.find(substr)
  683. if index<0:
  684. break
  685. else:
  686. result.append(indexappend+index)
  687. indexappend += index+len(substr)
  688. copystr = copystr[index+len(substr):]
  689. return result
  690. def spanWindow(tokens,begin_index,end_index,size,center_include=False,word_flag = False,use_text = False,text = None):
  691. '''
  692. @summary:取得某个实体的上下文词汇
  693. @param:
  694. tokens:句子分词list
  695. begin_index:实体的开始index
  696. end_index:实体的结束index
  697. size:左右两边各取多少个词
  698. center_include:是否包含实体
  699. word_flag:词/字,默认是词
  700. @return: list,实体的上下文词汇
  701. '''
  702. if use_text:
  703. assert text is not None
  704. length_tokens = len(tokens)
  705. if begin_index>size:
  706. begin = begin_index-size
  707. else:
  708. begin = 0
  709. if end_index+size<length_tokens:
  710. end = end_index+size+1
  711. else:
  712. end = length_tokens
  713. result = []
  714. if not word_flag:
  715. result.append(tokens[begin:begin_index])
  716. if center_include:
  717. if use_text:
  718. result.append(text)
  719. else:
  720. result.append(tokens[begin_index:end_index+1])
  721. result.append(tokens[end_index+1:end])
  722. else:
  723. result.append("".join(tokens[begin:begin_index]))
  724. if center_include:
  725. if use_text:
  726. result.append(text)
  727. else:
  728. result.append("".join(tokens[begin_index:end_index+1]))
  729. result.append("".join(tokens[end_index+1:end]))
  730. #print(result)
  731. return result
  732. #根据规则补全编号或名称两边的符号
  733. def fitDataByRule(data):
  734. symbol_dict = {"(":")",
  735. "(":")",
  736. "[":"]",
  737. "【":"】",
  738. ")":"(",
  739. ")":"(",
  740. "]":"[",
  741. "】":"【"}
  742. leftSymbol_pattern = re.compile("[\((\[【]")
  743. rightSymbol_pattern = re.compile("[\))\]】]")
  744. leftfinds = re.findall(leftSymbol_pattern,data)
  745. rightfinds = re.findall(rightSymbol_pattern,data)
  746. result = data
  747. if len(leftfinds)+len(rightfinds)==0:
  748. return data
  749. elif len(leftfinds)==len(rightfinds):
  750. return data
  751. elif abs(len(leftfinds)-len(rightfinds))==1:
  752. if len(leftfinds)>len(rightfinds):
  753. if symbol_dict.get(data[0]) is not None:
  754. result = data[1:]
  755. else:
  756. #print(symbol_dict.get(leftfinds[0]))
  757. result = data+symbol_dict.get(leftfinds[0])
  758. else:
  759. if symbol_dict.get(data[-1]) is not None:
  760. result = data[:-1]
  761. else:
  762. result = symbol_dict.get(rightfinds[0])+data
  763. result = re.sub("[。]","",result)
  764. return result
def embedding(datas, shape):
    '''
    @summary: look up word vectors for a batch of token sequences
    @param:
        datas: list of token lists
        shape: output array shape — presumably (batch, seq_len, embed_dim); TODO confirm
    @return: array of the given shape filled with word embeddings
    '''
    model_w2v = getModel_w2v()
    embed = np.zeros(shape)
    length = shape[1]
    out_index = 0
    for data in datas:
        index = 0
        for item in data:
            # strip whitespace before the vocabulary lookup
            item_not_space = re.sub("\s*","",item)
            if index>=length:
                break
            if item_not_space in model_w2v.vocab:
                embed[out_index][index] = model_w2v[item_not_space]
                index += 1
            else:
                # unknown tokens keep their zero vector
                #embed[out_index][index] = model_w2v['unk']
                index += 1
        out_index += 1
    return embed
  792. def embedding_word(datas,shape):
  793. '''
  794. @summary:查找词汇对应的词向量
  795. @param:
  796. datas:词汇的list
  797. shape:结果的shape
  798. @return: array,返回对应shape的词嵌入
  799. '''
  800. model_w2v = getModel_word()
  801. embed = np.zeros(shape)
  802. length = shape[1]
  803. out_index = 0
  804. #print(datas)
  805. for data in datas:
  806. index = 0
  807. for item in str(data)[-shape[1]:]:
  808. if index>=length:
  809. break
  810. if item in model_w2v.vocab:
  811. embed[out_index][index] = model_w2v[item]
  812. index += 1
  813. else:
  814. # embed[out_index][index] = model_w2v['unk']
  815. index += 1
  816. out_index += 1
  817. return embed
  818. def formEncoding(text,shape=(100,60),expand=False):
  819. embedding = np.zeros(shape)
  820. word_model = getModel_word()
  821. for i in range(len(text)):
  822. if i>=shape[0]:
  823. break
  824. if text[i] in word_model.vocab:
  825. embedding[i] = word_model[text[i]]
  826. if expand:
  827. embedding = np.expand_dims(embedding,0)
  828. return embedding
  829. def partMoney(entity_text,input2_shape = [7]):
  830. '''
  831. @summary:对金额分段
  832. @param:
  833. entity_text:数值金额
  834. input2_shape:分类数
  835. @return: array,分段之后的独热编码
  836. '''
  837. money = float(entity_text)
  838. parts = np.zeros(input2_shape)
  839. if money<100:
  840. parts[0] = 1
  841. elif money<1000:
  842. parts[1] = 1
  843. elif money<10000:
  844. parts[2] = 1
  845. elif money<100000:
  846. parts[3] = 1
  847. elif money<1000000:
  848. parts[4] = 1
  849. elif money<10000000:
  850. parts[5] = 1
  851. else:
  852. parts[6] = 1
  853. return parts
  854. def recall(y_true, y_pred):
  855. from keras import backend as K
  856. '''
  857. 计算召回率
  858. @Argus:
  859. y_true: 正确的标签
  860. y_pred: 模型预测的标签
  861. @Return
  862. 召回率
  863. '''
  864. c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  865. c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
  866. if c3 == 0:
  867. return 0
  868. recall = c1 / c3
  869. return recall
  870. def f1_score(y_true, y_pred):
  871. from keras import backend as K
  872. '''
  873. 计算F1
  874. @Argus:
  875. y_true: 正确的标签
  876. y_pred: 模型预测的标签
  877. @Return
  878. F1值
  879. '''
  880. c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  881. c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
  882. c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
  883. precision = c1 / c2
  884. if c3 == 0:
  885. recall = 0
  886. else:
  887. recall = c1 / c3
  888. f1_score = 2 * (precision * recall) / (precision + recall)
  889. return f1_score
  890. def precision(y_true, y_pred):
  891. from keras import backend as K
  892. '''
  893. 计算精确率
  894. @Argus:
  895. y_true: 正确的标签
  896. y_pred: 模型预测的标签
  897. @Return
  898. 精确率
  899. '''
  900. c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  901. c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
  902. precision = c1 / c2
  903. return precision
  904. # def print_metrics(history):
  905. # '''
  906. # 制作每次迭代的各metrics变化图片
  907. #
  908. # @Arugs:
  909. # history: 模型训练迭代的历史记录
  910. # '''
  911. # import matplotlib.pyplot as plt
  912. #
  913. # # loss图
  914. # loss = history.history['loss']
  915. # val_loss = history.history['val_loss']
  916. # epochs = range(1, len(loss) + 1)
  917. # plt.subplot(2, 2, 1)
  918. # plt.plot(epochs, loss, 'bo', label='Training loss')
  919. # plt.plot(epochs, val_loss, 'b', label='Validation loss')
  920. # plt.title('Training and validation loss')
  921. # plt.xlabel('Epochs')
  922. # plt.ylabel('Loss')
  923. # plt.legend()
  924. #
  925. # # f1图
  926. # f1 = history.history['f1_score']
  927. # val_f1 = history.history['val_f1_score']
  928. # plt.subplot(2, 2, 2)
  929. # plt.plot(epochs, f1, 'bo', label='Training f1')
  930. # plt.plot(epochs, val_f1, 'b', label='Validation f1')
  931. # plt.title('Training and validation f1')
  932. # plt.xlabel('Epochs')
  933. # plt.ylabel('F1')
  934. # plt.legend()
  935. #
  936. # # precision图
  937. # prec = history.history['precision']
  938. # val_prec = history.history['val_precision']
  939. # plt.subplot(2, 2, 3)
  940. # plt.plot(epochs, prec, 'bo', label='Training precision')
  941. # plt.plot(epochs, val_prec, 'b', label='Validation pecision')
  942. # plt.title('Training and validation precision')
  943. # plt.xlabel('Epochs')
  944. # plt.ylabel('Precision')
  945. # plt.legend()
  946. #
  947. # # recall图
  948. # recall = history.history['recall']
  949. # val_recall = history.history['val_recall']
  950. # plt.subplot(2, 2, 4)
  951. # plt.plot(epochs, recall, 'bo', label='Training recall')
  952. # plt.plot(epochs, val_recall, 'b', label='Validation recall')
  953. # plt.title('Training and validation recall')
  954. # plt.xlabel('Epochs')
  955. # plt.ylabel('Recall')
  956. # plt.legend()
  957. #
  958. # plt.show()
if __name__=="__main__":
    # Manual smoke test: prints the id of the ">" character from the
    # module-level fool_char_to_id mapping (defined earlier in this file).
    print(fool_char_to_id[">"])
    # model = getModel_w2v()
    # vocab,matrix = getVocabAndMatrix(model, Embedding_size=128)
    # save([vocab,matrix],"vocabMatrix_words.pk")