Utils.py 68 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525
  1. '''
  2. Created on 2018年12月20日
  3. @author: User
  4. '''
  5. import numpy as np
  6. import re
  7. import gensim
  8. from keras import backend as K
  9. import os,sys
  10. import time
  11. import traceback
  12. from threading import RLock
  13. # from pai_tf_predict_proto import tf_predict_pb2
  14. import requests
# Cache for the word2vec model; getModel_w2v() fills it on first use.
model_w2v = None
# Lock held by getModel_w2v() while it checks/loads model_w2v.
lock_model_w2v = RLock()
# Module-level switches; nothing in the visible part of this file reads USE_PAI_EAS.
USE_PAI_EAS = False
# Value returned by getLazyLoad().
Lazy_load = False
# API_URL = "http://192.168.2.103:8802"
API_URL = "http://127.0.0.1:888"
# USE_API = True
USE_API = False
  23. def getCurrent_date(format="%Y-%m-%d %H:%M:%S"):
  24. _time = time.strftime(format,time.localtime())
  25. return _time
  26. def getw2vfilepath():
  27. filename = "wiki_128_word_embedding_new.vector"
  28. w2vfile = getFileFromSysPath(filename)
  29. if w2vfile is not None:
  30. return w2vfile
  31. return filename
  32. def getLazyLoad():
  33. global Lazy_load
  34. return Lazy_load
  35. def getFileFromSysPath(filename):
  36. for _path in sys.path:
  37. if os.path.isdir(_path):
  38. for _file in os.listdir(_path):
  39. _abspath = os.path.join(_path,_file)
  40. if os.path.isfile(_abspath):
  41. if _file==filename:
  42. return _abspath
  43. return None
# Path of the single-character vector file, built relative to this module's directory.
model_word_file = os.path.dirname(__file__)+"/../singlew2v_model.vector"
# Cache for the character-vector model; getModel_word() fills it on first use.
model_word = None
# Lock held by getModel_word() while it checks/loads model_word.
lock_model_word = RLock()
  47. from decimal import Decimal
  48. import logging
# Configure logging at INFO level (timestamp - logger name - level - message)
# and create the logger used by log() and debug() in this module.
logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
  51. import pickle
  52. import os
  53. import json
  54. #自定义jsonEncoder
  55. class MyEncoder(json.JSONEncoder):
  56. def __init__(self):
  57. import numpy as np
  58. global np
  59. def default(self, obj):
  60. if isinstance(obj, np.ndarray):
  61. return obj.tolist()
  62. elif isinstance(obj, bytes):
  63. return str(obj, encoding='utf-8')
  64. elif isinstance(obj, (np.float_, np.float16, np.float32,
  65. np.float64)):
  66. return float(obj)
  67. elif isinstance(obj,(np.int64,np.int32)):
  68. return int(obj)
  69. return json.JSONEncoder.default(self, obj)
# Lazily built token -> index dictionaries (see getIndexOfWord / getIndexOfWords).
vocab_word = None
vocab_words = None
# Pickle files used to cache the two vocabularies.
file_vocab_word = "vocab_word.pk"
file_vocab_words = "vocab_words.pk"
# Authorization value / URL pairs for remote prediction endpoints.
# NOTE(review): these authorization values are hard-coded in source; consider
# loading them from configuration or the environment instead.
selffool_authorization = "NjlhMWFjMjVmNWYyNzI0MjY1OGQ1M2Y0ZmY4ZGY0Mzg3Yjc2MTVjYg=="
selffool_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/selffool_gpu"
selffool_seg_authorization = "OWUwM2Q0ZmE3YjYxNzU4YzFiMjliNGVkMTA3MzJkNjQ2MzJiYzBhZg=="
selffool_seg_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/selffool_seg_gpu"
codename_authorization = "Y2M5MDUxMzU1MTU4OGM3ZDk2ZmEzYjkxYmYyYzJiZmUyYTgwYTg5NA=="
codename_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/codename_gpu"
form_item_authorization = "ODdkZWY1YWY0NmNhNjU2OTI2NWY4YmUyM2ZlMDg1NTZjOWRkYTVjMw=="
form_item_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/form"
person_authorization = "N2I2MDU2N2Q2MGQ0ZWZlZGM3NDkyNTA1Nzc4YmM5OTlhY2MxZGU1Mw=="
person_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/person"
role_authorization = "OWM1ZDg5ZDEwYTEwYWI4OGNjYmRlMmQ1NzYwNWNlZGZkZmRmMjE4OQ=="
role_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/role"
money_authorization = "MDQyNjc2ZDczYjBhYmM4Yzc4ZGI4YjRmMjc3NGI5NTdlNzJiY2IwZA=="
money_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/money"
codeclasses_authorization = "MmUyNWIxZjQ2NjAzMWJlMGIzYzkxMjMzNWY5OWI3NzJlMWQ1ZjY4Yw=="
codeclasses_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/codeclasses"
  90. def viterbi_decode(score, transition_params):
  91. """Decode the highest scoring sequence of tags outside of TensorFlow.
  92. This should only be used at test time.
  93. Args:
  94. score: A [seq_len, num_tags] matrix of unary potentials.
  95. transition_params: A [num_tags, num_tags] matrix of binary potentials.
  96. Returns:
  97. viterbi: A [seq_len] list of integers containing the highest scoring tag
  98. indices.
  99. viterbi_score: A float containing the score for the Viterbi sequence.
  100. """
  101. trellis = np.zeros_like(score)
  102. backpointers = np.zeros_like(score, dtype=np.int32)
  103. trellis[0] = score[0]
  104. for t in range(1, score.shape[0]):
  105. v = np.expand_dims(trellis[t - 1], 1) + transition_params
  106. trellis[t] = score[t] + np.max(v, 0)
  107. backpointers[t] = np.argmax(v, 0)
  108. viterbi = [np.argmax(trellis[-1])]
  109. for bp in reversed(backpointers[1:]):
  110. viterbi.append(bp[viterbi[-1]])
  111. viterbi.reverse()
  112. viterbi_score = np.max(trellis[-1])
  113. return viterbi, viterbi_score
  114. def limitRun(sess,list_output,feed_dict,MAX_BATCH=1024):
  115. len_sample = 0
  116. if len(feed_dict.keys())>0:
  117. len_sample = len(feed_dict[list(feed_dict.keys())[0]])
  118. if len_sample>MAX_BATCH:
  119. list_result = [[] for _ in range(len(list_output))]
  120. _begin = 0
  121. while(_begin<len_sample):
  122. new_dict = dict()
  123. for _key in feed_dict.keys():
  124. if isinstance(feed_dict[_key],(float,int,np.int32,np.float_,np.float16,np.float32,np.float64)):
  125. new_dict[_key] = feed_dict[_key]
  126. else:
  127. new_dict[_key] = feed_dict[_key][_begin:_begin+MAX_BATCH]
  128. _output = sess.run(list_output,feed_dict=new_dict)
  129. for _index in range(len(list_output)):
  130. list_result[_index].extend(_output[_index])
  131. _begin += MAX_BATCH
  132. else:
  133. list_result = sess.run(list_output,feed_dict=feed_dict)
  134. return list_result
  135. def get_values(response,output_name):
  136. """
  137. Get the value of a specified output tensor
  138. :param output_name: name of the output tensor
  139. :return: the content of the output tensor
  140. """
  141. output = response.outputs[output_name]
  142. if output.dtype == tf_predict_pb2.DT_FLOAT:
  143. _value = output.float_val
  144. elif output.dtype == tf_predict_pb2.DT_INT8 or output.dtype == tf_predict_pb2.DT_INT16 or \
  145. output.dtype == tf_predict_pb2.DT_INT32:
  146. _value = output.int_val
  147. elif output.dtype == tf_predict_pb2.DT_INT64:
  148. _value = output.int64_val
  149. elif output.dtype == tf_predict_pb2.DT_DOUBLE:
  150. _value = output.double_val
  151. elif output.dtype == tf_predict_pb2.DT_STRING:
  152. _value = output.string_val
  153. elif output.dtype == tf_predict_pb2.DT_BOOL:
  154. _value = output.bool_val
  155. return np.array(_value).reshape(response.outputs[output_name].array_shape.dim)
  156. def vpc_requests(url,authorization,request_data,list_outputs):
  157. headers = {"Authorization": authorization}
  158. dict_outputs = dict()
  159. response = tf_predict_pb2.PredictResponse()
  160. resp = requests.post(url, data=request_data, headers=headers)
  161. if resp.status_code != 200:
  162. print(resp.status_code,resp.content)
  163. log("调用pai-eas接口出错,authorization:"+str(authorization))
  164. return None
  165. else:
  166. response = tf_predict_pb2.PredictResponse()
  167. response.ParseFromString(resp.content)
  168. for _output in list_outputs:
  169. dict_outputs[_output] = get_values(response, _output)
  170. return dict_outputs
  171. def encodeInput(data,word_len,word_flag=True,userFool=False):
  172. result = []
  173. out_index = 0
  174. for item in data:
  175. if out_index in [0]:
  176. list_word = item[-word_len:]
  177. else:
  178. list_word = item[:word_len]
  179. temp = []
  180. if word_flag:
  181. for word in list_word:
  182. if userFool:
  183. temp.append(getIndexOfWord_fool(word))
  184. else:
  185. temp.append(getIndexOfWord(word))
  186. list_append = []
  187. temp_len = len(temp)
  188. while(temp_len<word_len):
  189. if userFool:
  190. list_append.append(0)
  191. else:
  192. list_append.append(getIndexOfWord("<pad>"))
  193. temp_len += 1
  194. if out_index in [0]:
  195. temp = list_append+temp
  196. else:
  197. temp = temp+list_append
  198. else:
  199. for words in list_word:
  200. temp.append(getIndexOfWords(words))
  201. list_append = []
  202. temp_len = len(temp)
  203. while(temp_len<word_len):
  204. list_append.append(getIndexOfWords("<pad>"))
  205. temp_len += 1
  206. if out_index in [0,1]:
  207. temp = list_append+temp
  208. else:
  209. temp = temp+list_append
  210. result.append(temp)
  211. out_index += 1
  212. return result
  213. def encodeInput_form(input,MAX_LEN=30):
  214. x = np.zeros([MAX_LEN])
  215. for i in range(len(input)):
  216. if i>=MAX_LEN:
  217. break
  218. x[i] = getIndexOfWord(input[i])
  219. return x
  220. def getVocabAndMatrix(model,Embedding_size = 60):
  221. '''
  222. @summary:获取子向量的词典和子向量矩阵
  223. '''
  224. vocab = ["<pad>"]+model.index2word
  225. embedding_matrix = np.zeros((len(vocab),Embedding_size))
  226. for i in range(1,len(vocab)):
  227. embedding_matrix[i] = model[vocab[i]]
  228. return vocab,embedding_matrix
  229. def getIndexOfWord(word):
  230. global vocab_word,file_vocab_word
  231. if vocab_word is None:
  232. if os.path.exists(file_vocab_word):
  233. vocab = load(file_vocab_word)
  234. vocab_word = dict((w, i) for i, w in enumerate(np.array(vocab)))
  235. else:
  236. model = getModel_word()
  237. vocab,_ = getVocabAndMatrix(model, Embedding_size=60)
  238. vocab_word = dict((w, i) for i, w in enumerate(np.array(vocab)))
  239. save(vocab,file_vocab_word)
  240. if word in vocab_word.keys():
  241. return vocab_word[word]
  242. else:
  243. return vocab_word['<pad>']
  244. def changeIndexFromWordToWords(tokens,word_index):
  245. '''
  246. @summary:转换某个字的字偏移为词偏移
  247. '''
  248. before_index = 0
  249. after_index = 0
  250. for i in range(len(tokens)):
  251. after_index = after_index+len(tokens[i])
  252. if before_index<=word_index and after_index>word_index:
  253. return i
  254. before_index = after_index
  255. return i+1
  256. def getIndexOfWords(words):
  257. global vocab_words,file_vocab_words
  258. if vocab_words is None:
  259. if os.path.exists(file_vocab_words):
  260. vocab = load(file_vocab_words)
  261. vocab_words = dict((w, i) for i, w in enumerate(np.array(vocab)))
  262. else:
  263. model = getModel_w2v()
  264. vocab,_ = getVocabAndMatrix(model, Embedding_size=128)
  265. vocab_words = dict((w, i) for i, w in enumerate(np.array(vocab)))
  266. save(vocab,file_vocab_words)
  267. if words in vocab_words.keys():
  268. return vocab_words[words]
  269. else:
  270. return vocab_words["<pad>"]
  271. def log(msg):
  272. '''
  273. @summary:打印信息
  274. '''
  275. logger.info(msg)
  276. def debug(msg):
  277. '''
  278. @summary:打印信息
  279. '''
  280. logger.debug(msg)
  281. def save(object_to_save, path):
  282. '''
  283. 保存对象
  284. @Arugs:
  285. object_to_save: 需要保存的对象
  286. @Return:
  287. 保存的路径
  288. '''
  289. with open(path, 'wb') as f:
  290. pickle.dump(object_to_save, f)
  291. def load(path):
  292. '''
  293. 读取对象
  294. @Arugs:
  295. path: 读取的路径
  296. @Return:
  297. 读取的对象
  298. '''
  299. with open(path, 'rb') as f:
  300. object1 = pickle.load(f)
  301. return object1
  302. fool_char_to_id = load(os.path.dirname(__file__)+"/fool_char_to_id.pk")
  303. def getIndexOfWord_fool(word):
  304. if word in fool_char_to_id.keys():
  305. return fool_char_to_id[word]
  306. else:
  307. return fool_char_to_id["[UNK]"]
  308. def find_index(list_tofind,text):
  309. '''
  310. @summary: 查找所有词汇在字符串中第一次出现的位置
  311. @param:
  312. list_tofind:待查找词汇
  313. text:字符串
  314. @return: list,每个词汇第一次出现的位置
  315. '''
  316. result = []
  317. for item in list_tofind:
  318. index = text.find(item)
  319. if index>=0:
  320. result.append(index)
  321. else:
  322. result.append(-1)
  323. return result
  324. def combine(list1,list2):
  325. '''
  326. @summary:将两个list中的字符串两两拼接
  327. @param:
  328. list1:字符串list
  329. list2:字符串list
  330. @return:拼接结果list
  331. '''
  332. result = []
  333. for item1 in list1:
  334. for item2 in list2:
  335. result.append(str(item1)+str(item2))
  336. return result
  337. def getDigitsDic(unit):
  338. '''
  339. @summary:拿到中文对应的数字
  340. '''
  341. DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9,
  342. "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9}
  343. return DigitsDic.get(unit)
  344. def getMultipleFactor(unit):
  345. '''
  346. @summary:拿到单位对应的值
  347. '''
  348. MultipleFactor = {"兆":Decimal(1000000000000),"亿":Decimal(100000000),"万":Decimal(10000),"仟":Decimal(1000),"千":Decimal(1000),"佰":Decimal(100),"百":Decimal(100),"拾":Decimal(10),"十":Decimal(10),"元":Decimal(1),"圆":Decimal(1),"角":round(Decimal(0.1),1),"分":round(Decimal(0.01),2)}
  349. return MultipleFactor.get(unit)
def getUnifyMoney(money):
    '''
    @summary: convert a money string (Arabic digits and/or Chinese numerals
        with units) into a numeric amount
    @param:
        money: the money string
    @return: Decimal amount; Decimal(0) when any exception is raised while
        parsing or when a numeric prefix times its unit exceeds MAX_MONEY
    '''
    MAX_MONEY = 1000000000000
    # NOTE(review): MAX_NUM is assigned but never read in this function.
    MAX_NUM = 12
    # Remove commas
    money = re.sub("[,,]","",money)
    # Drop every character that is not a digit, a dot, or a listed Chinese digit/unit.
    money = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",money)
    result = Decimal(0)
    chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
    # chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","圆","元","角","分"]
    chnFactorUnits = ["兆", "亿", "万", "仟", '千', "佰", '百', "拾", '十',"圆", "元", "角", "分"]  # 2024-06-11: fix wrong extraction of formal-numeral amounts, e.g. '陆拾陆亿伍千柒佰零叁万肆千叁佰陆拾伍元' gave Decimal('11607430365')
    # Plain number (digits with an optional decimal part).
    LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$")
    # A single formal Chinese digit, optionally preceded by 零.
    BigMoneypattern = re.compile("^零?(?P<BigMoney>[%s])$"%("".join(chnDigits)))
    try:
        if re.search(LowMoneypattern,money) is not None:
            return Decimal(money)
        elif re.search(BigMoneypattern,money) is not None:
            # Returns the value from getDigitsDic directly.
            return getDigitsDic(re.search(BigMoneypattern,money).group("BigMoney"))
        # Units are tried from largest to smallest; only the first unit
        # found in the string is processed (see the break below).
        for factorUnit in chnFactorUnits:
            if re.search(re.compile(".*%s.*"%(factorUnit)),money) is not None:
                # Split on the LAST occurrence of the unit: subMoneys[0] is the
                # part multiplied by the unit, subMoneys[1] the remainder.
                subMoneys = re.split(re.compile("%s(?!.*%s.*)"%(factorUnit,factorUnit)),money)
                if re.search(re.compile("^(\d+)(\.\d+)?$"),subMoneys[0]) is not None:
                    # Reject amounts whose numeric prefix times the unit exceeds MAX_MONEY.
                    if MAX_MONEY/getMultipleFactor(factorUnit)<Decimal(subMoneys[0]):
                        return Decimal(0)
                    result += Decimal(subMoneys[0])*(getMultipleFactor(factorUnit))
                elif len(subMoneys[0])==1:
                    if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[0]) is not None:
                        result += Decimal(getDigitsDic(subMoneys[0]))*(getMultipleFactor(factorUnit))
                # subMoneys[0] contains no money unit and cannot be split further
                elif subMoneys[0]=="":
                    result += 0
                elif re.search(re.compile("[%s]"%("".join(chnFactorUnits))),subMoneys[0]) is None:
                    # print(subMoneys)
                    # subMoneys[0] = subMoneys[0][0]
                    result += Decimal(getUnifyMoney(subMoneys[0])) * (getMultipleFactor(factorUnit))
                else:
                    # The prefix still contains smaller units: parse it recursively.
                    result += Decimal(getUnifyMoney(subMoneys[0]))*(getMultipleFactor(factorUnit))
                if len(subMoneys)>1:
                    if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"),subMoneys[1]) is not None:
                        result += Decimal(subMoneys[1])
                    elif len(subMoneys[1])==1:
                        if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[1]) is not None:
                            result += Decimal(getDigitsDic(subMoneys[1]))
                    else:
                        # The remainder is parsed recursively and added as-is.
                        result += Decimal(getUnifyMoney(subMoneys[1]))
                break
    except Exception as e:
        # Any parsing failure is reported as an amount of zero.
        # traceback.print_exc()
        return Decimal(0)
    return result
  405. def getModel_w2v():
  406. '''
  407. @summary:加载词向量
  408. '''
  409. global model_w2v,lock_model_w2v
  410. with lock_model_w2v:
  411. if model_w2v is None:
  412. model_w2v = gensim.models.KeyedVectors.load_word2vec_format(getw2vfilepath(),binary=True)
  413. return model_w2v
  414. def getModel_word():
  415. '''
  416. @summary:加载字向量
  417. '''
  418. global model_word,lock_model_w2v
  419. with lock_model_word:
  420. if model_word is None:
  421. model_word = gensim.models.KeyedVectors.load_word2vec_format(model_word_file,binary=True)
  422. return model_word
  423. # getModel_w2v()
  424. # getModel_word()
  425. def findAllIndex(substr,wholestr):
  426. '''
  427. @summary: 找到字符串的子串的所有begin_index
  428. @param:
  429. substr:子字符串
  430. wholestr:子串所在完整字符串
  431. @return: list,字符串的子串的所有begin_index
  432. '''
  433. copystr = wholestr
  434. result = []
  435. indexappend = 0
  436. while(True):
  437. index = copystr.find(substr)
  438. if index<0:
  439. break
  440. else:
  441. result.append(indexappend+index)
  442. indexappend += index+len(substr)
  443. copystr = copystr[index+len(substr):]
  444. return result
  445. def spanWindow(tokens,begin_index,end_index,size,center_include=False,word_flag = False,use_text = False,text = None):
  446. '''
  447. @summary:取得某个实体的上下文词汇
  448. @param:
  449. tokens:句子分词list
  450. begin_index:实体的开始index
  451. end_index:实体的结束index
  452. size:左右两边各取多少个词
  453. center_include:是否包含实体
  454. word_flag:词/字,默认是词
  455. @return: list,实体的上下文词汇
  456. '''
  457. if use_text:
  458. assert text is not None
  459. length_tokens = len(tokens)
  460. if begin_index>size:
  461. begin = begin_index-size
  462. else:
  463. begin = 0
  464. if end_index+size<length_tokens:
  465. end = end_index+size+1
  466. else:
  467. end = length_tokens
  468. result = []
  469. if not word_flag:
  470. result.append(tokens[begin:begin_index])
  471. if center_include:
  472. if use_text:
  473. result.append(text)
  474. else:
  475. result.append(tokens[begin_index:end_index+1])
  476. result.append(tokens[end_index+1:end])
  477. else:
  478. result.append("".join(tokens[begin:begin_index]))
  479. if center_include:
  480. if use_text:
  481. result.append(text)
  482. else:
  483. result.append("".join(tokens[begin_index:end_index+1]))
  484. result.append("".join(tokens[end_index+1:end]))
  485. #print(result)
  486. return result
  487. def get_context(sentence_text, begin_index, end_index, size=20, center_include=False):
  488. '''
  489. 返回实体上下文信息
  490. :param sentence_text: 句子文本
  491. :param begin_index: 实体字开始位置
  492. :param end_index: 实体字结束位置
  493. :param size: 字偏移量
  494. :param center_include:
  495. :return:
  496. '''
  497. result = []
  498. begin = begin_index - size if begin_index>size else 0
  499. end = end_index + size
  500. result.append(sentence_text[begin: begin_index])
  501. if center_include:
  502. result.append(sentence_text[begin_index: end_index])
  503. result.append(sentence_text[end_index: end])
  504. return result
# Complete the brackets around a code or name according to simple rules
def fitDataByRule(data):
    # Bracket -> counterpart lookup, in both directions.
    symbol_dict = {"(":")",
                   "(":")",
                   "[":"]",
                   "【":"】",
                   ")":"(",
                   ")":"(",
                   "]":"[",
                   "】":"【"}
    leftSymbol_pattern = re.compile("[\((\[【]")
    rightSymbol_pattern = re.compile("[\))\]】]")
    leftfinds = re.findall(leftSymbol_pattern,data)
    rightfinds = re.findall(rightSymbol_pattern,data)
    result = data
    if len(leftfinds)+len(rightfinds)==0:
        # No brackets at all: nothing to do.
        return data
    elif len(leftfinds)==len(rightfinds):
        # Same number of opening and closing brackets: returned unchanged.
        return data
    elif abs(len(leftfinds)-len(rightfinds))==1:
        if len(leftfinds)>len(rightfinds):
            # One extra opening bracket: drop the first character if it is a
            # bracket, otherwise append the counterpart of the first opener.
            if symbol_dict.get(data[0]) is not None:
                result = data[1:]
            else:
                #print(symbol_dict.get(leftfinds[0]))
                result = data+symbol_dict.get(leftfinds[0])
        else:
            # One extra closing bracket: drop the last character if it is a
            # bracket, otherwise prepend the counterpart of the first closer.
            if symbol_dict.get(data[-1]) is not None:
                result = data[:-1]
            else:
                result = symbol_dict.get(rightfinds[0])+data
    # NOTE(review): the nesting level of this statement is inferred (the
    # original indentation is not visible); confirm against the real file.
    # Remove Chinese full stops from the result.
    result = re.sub("[。]","",result)
    return result
  538. from datetime import date
  539. # 时间合法性判断
  540. def isValidDate(year, month, day):
  541. try:
  542. date(year, month, day)
  543. except:
  544. return False
  545. else:
  546. return True
# Date pattern with named groups: ``year`` (20xx, two digits, or a Chinese-numeral
# year starting with 二), ``month`` and an optional ``day`` (Arabic digits or
# Chinese numerals), separated by -, /, ., 年 or 月.
time_format_pattern = re.compile("((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]?\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3})?)")
  548. from BiddingKG.dl.ratio.re_ratio import getUnifyNum
  549. import calendar
  550. def get_maxday(year, month):
  551. # calendar.monthrange(year, month) 返回一个元组,其中第一个元素是那个月第一天的星期几(0-6代表周一到周日),
  552. # 第二个元素是那个月的天数。
  553. _, last_day = calendar.monthrange(year, month)
  554. return last_day
def timeFormat(_time, default_first_day=True):
    '''
    Format a date found in a string as year-month-day.
    :param _time: text to search with time_format_pattern
    :param default_first_day: when the day is missing, True uses the first
        day of the month, otherwise the last day
    :return: the first legal date as "YYYY-MM-DD", or "" when none is found
    '''
    current_year = time.strftime("%Y",time.localtime())
    all_match = re.finditer(time_format_pattern,_time)
    for _match in all_match:
        if len(_match.group())>0:
            legal = True
            year = ""
            month = ""
            day = ""
            # Pull the named groups out of the match (day may be None).
            for k,v in _match.groupdict().items():
                if k=="year":
                    year = v
                if k=="month":
                    month = v
                if k=="day":
                    day = v
            if year!="":
                if re.search("^\d+$",year):
                    # Two-digit years are taken to be 20xx.
                    if len(year)==2:
                        year = "20"+year
                    # Years more than 10 years past the current year are rejected.
                    # NOTE(review): nesting of this check relative to the
                    # two-digit branch is inferred; confirm against the real file.
                    if int(year)-int(current_year)>10:
                        legal = False
                else:
                    # Chinese-numeral year: convert character by character.
                    _year = ""
                    for word in year:
                        if word == '0':
                            _year += word
                        else:
                            _year += str(getDigitsDic(word))
                    year = _year
            else:
                legal = False
            if month!="":
                if re.search("^\d+$", month):
                    if int(month)>12:
                        legal = False
                else:
                    # Non-digit month: converted with getUnifyNum.
                    month = int(getUnifyNum(month))
                    if month>=1 and month<=12:
                        month = str(month)
                    else:
                        legal = False
            else:
                legal = False
            if day == None:
                # Missing day: first day, or the month's last day when requested and still legal.
                day = "01" if (default_first_day or legal == False) else str(get_maxday(int(year), int(month)))
            if day!="":
                if re.search("^\d+$", day):
                    if int(day)>31:
                        legal = False
                else:
                    # Non-digit day: converted with getUnifyNum.
                    day = int(getUnifyNum(day))
                    if day >= 1 and day <= 31:
                        day = str(day)
                    else:
                        legal = False
            else:
                legal = False
            # print(year,month,day)
            # Final calendar check (e.g. rejects 31 February).
            if not isValidDate(int(year),int(month),int(day)):
                legal = False
            if legal:
                return "%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0"))
    return ""
  625. def embedding(datas,shape):
  626. '''
  627. @summary:查找词汇对应的词向量
  628. @param:
  629. datas:词汇的list
  630. shape:结果的shape
  631. @return: array,返回对应shape的词嵌入
  632. '''
  633. model_w2v = getModel_w2v()
  634. embed = np.zeros(shape)
  635. length = shape[1]
  636. out_index = 0
  637. #print(datas)
  638. for data in datas:
  639. index = 0
  640. for item in data:
  641. item_not_space = re.sub("\s*","",item)
  642. if index>=length:
  643. break
  644. if item_not_space in model_w2v.vocab:
  645. embed[out_index][index] = model_w2v[item_not_space]
  646. index += 1
  647. else:
  648. #embed[out_index][index] = model_w2v['unk']
  649. index += 1
  650. out_index += 1
  651. return embed
  652. def embedding_word(datas,shape):
  653. '''
  654. @summary:查找词汇对应的词向量
  655. @param:
  656. datas:词汇的list
  657. shape:结果的shape
  658. @return: array,返回对应shape的词嵌入
  659. '''
  660. model_w2v = getModel_word()
  661. embed = np.zeros(shape)
  662. length = shape[1]
  663. out_index = 0
  664. #print(datas)
  665. for data in datas:
  666. index = 0
  667. for item in str(data)[-shape[1]:]:
  668. if index>=length:
  669. break
  670. if item in model_w2v.vocab:
  671. embed[out_index][index] = model_w2v[item]
  672. index += 1
  673. else:
  674. # embed[out_index][index] = model_w2v['unk']
  675. index += 1
  676. out_index += 1
  677. return embed
  678. def embedding_word_forward(datas,shape):
  679. '''
  680. @summary:查找词汇对应的词向量
  681. @param:
  682. datas:词汇的list
  683. shape:结果的shape
  684. @return: array,返回对应shape的词嵌入
  685. '''
  686. model_w2v = getModel_word()
  687. embed = np.zeros(shape)
  688. length = shape[1]
  689. out_index = 0
  690. #print(datas)
  691. for data in datas:
  692. index = 0
  693. for item in str(data)[:shape[1]]:
  694. if index>=length:
  695. break
  696. if item in model_w2v.vocab:
  697. embed[out_index][index] = model_w2v[item]
  698. index += 1
  699. else:
  700. # embed[out_index][index] = model_w2v['unk']
  701. index += 1
  702. out_index += 1
  703. return embed
  704. def formEncoding(text,shape=(100,60),expand=False):
  705. embedding = np.zeros(shape)
  706. word_model = getModel_word()
  707. for i in range(len(text)):
  708. if i>=shape[0]:
  709. break
  710. if text[i] in word_model.vocab:
  711. embedding[i] = word_model[text[i]]
  712. if expand:
  713. embedding = np.expand_dims(embedding,0)
  714. return embedding
  715. def partMoney(entity_text,input2_shape = [7]):
  716. '''
  717. @summary:对金额分段
  718. @param:
  719. entity_text:数值金额
  720. input2_shape:分类数
  721. @return: array,分段之后的独热编码
  722. '''
  723. money = float(entity_text)
  724. parts = np.zeros(input2_shape)
  725. if money<100:
  726. parts[0] = 1
  727. elif money<1000:
  728. parts[1] = 1
  729. elif money<10000:
  730. parts[2] = 1
  731. elif money<100000:
  732. parts[3] = 1
  733. elif money<1000000:
  734. parts[4] = 1
  735. elif money<10000000:
  736. parts[5] = 1
  737. else:
  738. parts[6] = 1
  739. return parts
  740. def uniform_num(num):
  741. d1 = {'一': '1', '二': '2', '三': '3', '四': '4', '五': '5', '六': '6', '七': '7', '八': '8', '九': '9', '十': '10'}
  742. # d2 = {'A': '1', 'B': '2', 'C': '3', 'D': '4', 'E': '5', 'F': '6', 'G': '7', 'H': '8', 'I': '9', 'J': '10'}
  743. d3 = {'Ⅰ': '1', 'Ⅱ': '2', 'Ⅲ': '3', 'Ⅳ': '4', 'Ⅴ': '5', 'Ⅵ': '6', 'Ⅶ': '7'}
  744. if num.isdigit():
  745. if re.search('^0[\d]$', num):
  746. num = num[1:]
  747. return num
  748. elif re.search('^[一二三四五六七八九十]+$', num):
  749. _digit = re.search('^[一二三四五六七八九十]+$', num).group(0)
  750. if len(_digit) == 1:
  751. num = d1[_digit]
  752. elif len(_digit) == 2 and _digit[0] == '十':
  753. num = '1'+ d1[_digit[1]]
  754. elif len(_digit) == 2 and _digit[1] == '十':
  755. num = d1[_digit[0]] + '0'
  756. elif len(_digit) == 3 and _digit[1] == '十':
  757. num = d1[_digit[0]] + d1[_digit[2]]
  758. elif re.search('[ⅠⅡⅢⅣⅤⅥⅦ]', num):
  759. num = re.search('[ⅠⅡⅢⅣⅤⅥⅦ]', num).group(0)
  760. num = d3[num]
  761. return num
  762. def uniform_package_name(package_name):
  763. '''
  764. 统一规范化包号。数值类型统一为阿拉伯数字,字母统一为大写,包含施工监理等抽到前面, 例 A包监理一标段 统一为 监理A1 ; 包Ⅱ 统一为 2
  765. :param package_name: 字符串类型 包号
  766. :return:
  767. '''
  768. package_name_raw = package_name
  769. package_name = re.sub('pdf|doc|docs|xlsx|rar|\d{4}年', ' ', package_name)
  770. package_name = package_name.replace('标段(包)', '标段').replace('№', '')
  771. package_name = re.sub('\[|【', '', package_name)
  772. kw = re.search('(施工|监理|监测|勘察|设计|劳务)', package_name)
  773. name = ""
  774. if kw:
  775. name += kw.group(0)
  776. if re.search('^[a-zA-Z0-9-]{5,}$', package_name): # 五个字符以上编号
  777. _digit = re.search('^[a-zA-Z0-9-]{5,}$', package_name).group(0).upper()
  778. # print('规范化包号1', _digit)
  779. name += _digit
  780. elif re.search('(?P<eng>[a-zA-Z])包[:)]?第?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))标段?', package_name): # 处理类似 A包2标段
  781. ser = re.search('(?P<eng>[a-zA-Z])包[:)]?第?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))标段?', package_name)
  782. # print('规范化包号2', ser.group(0))
  783. _char = ser.groupdict().get('eng')
  784. if _char:
  785. _char = _char.upper()
  786. _digit = ser.groupdict().get('num')
  787. _digit = uniform_num(_digit)
  788. name += _char.upper() + _digit
  789. elif re.search('第?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))(标[段号的包项]?|合同[包段]|([分子]?[包标]))', package_name): # 处理类似 A包2标段
  790. ser = re.search('第?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))(标[段号的包项]?|合同[包段]|([分子]?[包标]))', package_name)
  791. # print('规范化包号3', ser.group(0))
  792. _char = ser.groupdict().get('eng')
  793. if _char:
  794. _char = _char.upper()
  795. _digit = ser.groupdict().get('num')
  796. _digit = uniform_num(_digit)
  797. if _char:
  798. name += _char.upper()
  799. name += _digit
  800. elif re.search('(标[段号的包项]?|项目|子项目?|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))', package_name): # 数字的统一的阿拉伯数字
  801. ser = re.search('(标[段号的包项]?|项目|子项目?|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))',package_name)
  802. # print('规范化包号4', ser.group(0))
  803. _char = ser.groupdict().get('eng')
  804. if _char:
  805. _char = _char.upper()
  806. _digit = ser.groupdict().get('num')
  807. _digit = uniform_num(_digit)
  808. if _char:
  809. name += _char.upper()
  810. name += _digit
  811. elif re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z-]{1,5})', package_name): # 数字的统一的阿拉伯数字
  812. _digit = re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z-]{1,5})', package_name).group('eng').upper()
  813. # print('规范化包号5', _digit)
  814. name += _digit
  815. elif re.search('(?P<eng>[a-zA-Z]{1,4})(标[段号的包项]|([分子]?[包标]|包[组件号]))', package_name): # 数字的统一的阿拉伯数字
  816. _digit = re.search('(?P<eng>[a-zA-Z]{1,4})(标[段号的包项]|([分子]?[包标]|包[组件号]))', package_name).group('eng').upper()
  817. # print('规范化包号6', _digit)
  818. name += _digit
  819. elif re.search('^([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4})$', package_name): # 数字的统一的阿拉伯数字
  820. _digit = re.search('^([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4})$', package_name).group(0)
  821. # print('规范化包号7', _digit)
  822. _digit = uniform_num(_digit)
  823. name += _digit
  824. elif re.search('^[a-zA-Z0-9-]+$', package_name):
  825. _char = re.search('^[a-zA-Z0-9-]+$', package_name).group(0)
  826. # print('规范化包号8', _char)
  827. name += _char.upper()
  828. if name == "":
  829. return package_name_raw
  830. else:
  831. if name.isdigit():
  832. name = str(int(name))
  833. # print('原始包号:%s, 处理后:%s'%(package_name, name))
  834. return name
  835. def money_process(money_text, header):
  836. '''
  837. 输入金额文本及金额列表头,返回统一数字化金额及金额单位
  838. :param money_text:金额字符串
  839. :param header:金额列表头,用于提取单位
  840. :return:
  841. '''
  842. money = 0
  843. money_unit = ""
  844. moneys, _ = get_money_entity('%s:%s' % (header, money_text))
  845. if len(moneys) == 1:
  846. money = float(moneys[0][0])
  847. money_unit = moneys[0][3]
  848. elif len(moneys) == 2 and moneys[0][0]==moneys[1][0]:
  849. money = float(moneys[0][0])
  850. money_unit = moneys[0][3]
  851. # # re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?[((]?万?", money_text)
  852. # money_text = re.sub('\s', '', money_text) # 2024/04/19 修复 457699044 556.46751 万元 金额与单位有空格造成万漏提取
  853. # if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text) and re.search('\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[((]?万?', money_text):
  854. # money_text = re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text).group(0) # 如果表格同时包含大小写金额,取大写金额,避免单位取错 463310590 790000(柒拾玖万元整)
  855. # re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[((]?万?", money_text)
  856. # if re_price:
  857. # money_re = re_price.group(0)
  858. # if (re.search('万元|[((]万[))]', header) or re.search('万元|[((]万[))]', money_text)) and '万' not in money_re: # 修复37797825 控制价(万) # 修复 460307391 万元不在表头,在数字前面
  859. # money_re += '万元'
  860. # elif (re.search('亿元|[((]亿[))]', header) or re.search('亿元|[((]亿[))]', money_text)) and '亿' not in money_re: # 修复37797825 控制价(万) # 修复 460307391 万元不在表头,在数字前面
  861. # money_re += '亿元'
  862. # # money = float(getUnifyMoney(money_text))
  863. # money = float(getUnifyMoney(money_re))
  864. # if money > 10000000000000: # 大于万亿的去除
  865. # money = 0
  866. # # money_unit = '万元' if '万' in money_re and re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text)==None else '元'
  867. # if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text)==None:
  868. # if '万' in money_re:
  869. # money_unit = '万元'
  870. # elif '亿' in money_re:
  871. # money_unit = '亿元'
  872. # else:
  873. # money_unit = '元'
  874. return (money, money_unit)
  875. package_number_pattern = re.compile(
  876. '((施工|监理|监测|勘察|设计|劳务)(标段)?:?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})?[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
  877. |(([a-zA-Z]包[:()]?)?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|合同[包段]))\
  878. |(([,;。、:(]|第)?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
  879. |((标[段包项]|品目|标段(包)|包[组件标]|[标分子(]包)(\[|【)?:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9}))\
  880. |([,;。、:(]|^)(标的?|(招标|采购)?项目|子项目?)(\[|【)?:?([一二三四五六七八九十]+|[0-9]{1,9})\
  881. |((([标分子(]|合同|项目|采购)包|[,。]标的|子项目|[分子]标|标[段包项]|包[组件标]?)编?号[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,9}[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{0,9})\
  882. |[,;。、:(]?(合同|分|子)?包:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})')
  883. filter_package_pattern = 'CA标|(每个?|所有|相关|个|各|不分)[分子]?(标[段包项]?|包[组件标]?|合同包)|(质量|责任)三包|包[/每]|标段(划分|范围)|(承|压缩|软|皮|书|挂)包\
  884. |标[识注签贴配]|[商油]标号|第X包|第[一二三四五六七八九十]+至[一二三四五六七八九十]+(标[段包项]?|包[组件标]?|合同[包段])\
  885. |\.(docx|doc|pdf|xlsx|xls|jpg)|[一二三四五]次|五金|\d+[年月]|[\d.,]+万?元|\d+\.\d+' # 过滤错误的非包号
  886. def find_package(content):
  887. '''
  888. 通过正则找包和标段号
  889. :param content:
  890. :return:
  891. '''
  892. packages = []
  893. content = content.replace('号,', '号:').replace(':', ':').replace('(', '(').replace(')', ')')
  894. # .replace('-包',' 包').replace('包-', '包 ').replace('-标', ' 标').replace('标段-', '标段 ').replace('-合同包', ' 合同包') # 72760191 标段:№10
  895. content = re.sub('[一二三四五六七八九十\d](标[段包项]|包[组件标])编号', ' 标段编号', content)
  896. for it in re.finditer(filter_package_pattern, content):
  897. content = content.replace(it.group(0), ' ' * len(it.group(0)))
  898. for iter in re.finditer(package_number_pattern, content):
  899. if re.search('(业绩|信誉要求):|业绩(如下)?\d*[、:]', content[:iter.start()]): # 前面有业绩或信誉的标段去掉
  900. continue
  901. # print('提取到标段:%s, 前后文:%s' % (iter.group(), content[iter.start() - 5:iter.end() + 5]))
  902. if re.match('\d', iter.group(0)) and re.search('\d\.$', content[:iter.start()]): # 排除2.10标段3 5.4标段划分 这种情况
  903. # print('过滤掉错误包:', iter.group())
  904. continue
  905. if re.search('[承每书/]包|XX|xx', iter.group(0)) or re.search('\d包[/每]\w|一包[0-9一二三四五六七八九十]+', content[
  906. iter.start():iter.end() + 3]) or re.search(
  907. '[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{6,}', iter.group(0)):
  908. # print('过滤掉错误包:', iter.group())
  909. continue
  910. elif iter.end() + 2 < len(content) and re.search('标的物|包装|划分|标(书|准|志|记|识|签|贴|帜|本|底|价|量)',
  911. content[iter.start():iter.end() + 2]):
  912. # print('过滤掉错误包:', iter.group())
  913. continue
  914. elif re.search('同一(标段?|包)', content[max(0, iter.start() - 2):iter.end()]): # 不得参加同一标段
  915. # print('过滤掉错误包:', iter.group())
  916. continue
  917. elif re.search('三包', content[max(0, iter.start() - 2):iter.end()]) and re.search('第三包', content[max(0,
  918. iter.start() - 2):iter.end()]) == None: # 规规章和“三包”规定
  919. # print('过滤掉错误包:', iter.group())
  920. continue
  921. elif re.search('[1-9]\d{2,}$|\d{4,}|^[1-9]\d{2,}|合同包[A-Za-z]{2,}', iter.group(0)):
  922. # print('过滤掉错误包号5:', iter.group(0))
  923. continue
  924. elif re.search('单位:包|1包\d|[张箱]', content[max(0, iter.start()-3): iter.end()+2]): # 处理 463166661 包号错误 钢丝,单位:包X10根。
  925. # print('过滤掉错误包号,单位:包|1包', iter.group(0))
  926. continue
  927. elif iter.group(0) == '劳务分包': # 20241203 修复562534840劳务分包作包号
  928. continue
  929. packages.append(iter)
  930. # print('提取到标段:%s, 前后文:%s' % (iter.group(), content[iter.start() - 5:iter.end() + 5]))
  931. return packages
  932. def cut_repeat_name(s):
  933. '''
  934. 公司连续重复名称去重
  935. :param s:
  936. :return:
  937. '''
  938. if len(s) >= 8:
  939. n = s.count(s[-4:])
  940. id = s.find(s[-4:]) + 4
  941. sub_s = s[:id]
  942. if n>=2 and s == sub_s * n:
  943. s = sub_s
  944. return s
def del_tabel_achievement(soup):
    """Strip tables and table rows that publish past-performance (业绩) information.

    NOTE(review): `soup` is presumably a BeautifulSoup document (the code uses
    .text, find_all, findPreviousSibling and extract) -- confirm against callers.

    Nothing is done unless the first 800 characters of the text contain a
    result-type keyword (中标, 成交, 入围, ...) and the text contains 业绩.
    Matching tags are removed by calling .extract() on them; the function
    itself always returns None.

    :param soup: parsed document
    :return: None
    """
    if re.search('中标|成交|入围|结果|评标|开标|候选人', soup.text[:800]) == None or re.search('业绩', soup.text)==None:
        return None
    # Heading pattern that announces a performance section right before a table.
    p1 = '(中标|成交)(单位|候选人)的?(企业|项目|项目负责人|\w{,5})?业绩|类似(项目)?业绩|\w{,10}业绩$|业绩(公示|情况|荣誉)'
    # The string statement below reads: "remove the preceding tag when it hits the
    # performance rule; remove the current tag when it is a table publishing performance info".
    '''删除前面标签 命中业绩规则;当前标签为表格且公布业绩相关信息的去除'''
    for tag in soup.find_all('table'):
        pre_text = ""
        if tag.findPreviousSibling() != None:
            pre_text = tag.findPreviousSibling().text.strip()
            if pre_text == "" and tag.findPreviousSibling().findPreviousSibling() != None: # the tag right before the table is empty: fall back to the one before it
                pre_text = tag.findPreviousSibling().findPreviousSibling().text.strip()
        # Text of the first row, used as the table header.
        tr_text = tag.find('tr').text.strip() if tag.find('tr') != None else ""
        # print(re.search(p1, pre_text),pre_text, len(pre_text), re.findall('序号|中标候选人名称|项目名称|工程名称|合同金额|建设单位|业主', tr_text))
        if re.search(p1, pre_text) and len(pre_text) < 20 and tag.find('tr') != None and len(tr_text)<100:
            # Count header cells that look like performance-table columns.
            _count = 0
            for td in tag.find('tr').find_all('td'):
                td_text = td.text.strip()
                if len(td_text) > 25:
                    break
                if len(td_text) < 25 and re.search('中标候选人|第[一二三四五1-5]候选人|(项目|业绩|工程)名称|\w{,10}业绩$|合同金额|建设单位|采购单位|业主|甲方', td_text):
                    _count += 1
                if _count >=2:
                    # Two such columns: drop the immediately preceding sibling and the table.
                    pre_tag = tag.findPreviousSibling().extract()
                    del_tag = tag.extract()
                    # print('removed performance table', pre_tag.text + del_tag.text)
                    break
        elif re.search('业绩名称', tr_text) and re.search('建设单位|采购单位|业主', tr_text) and len(tr_text)<100:
            del_tag = tag.extract()
            # print('removed performance table', del_tag.text)
    # Rows collected here are extracted only after all tables have been scanned.
    del_trs = []
    # The string statement below reads: "remove the table rows that publish performance info".
    '''删除表格某些行公布的业绩信息'''
    for tag in soup.find_all('table'):
        text = tag.text
        if re.search('业绩', text) == None:
            continue
        # for tr in tag.find_all('tr'):
        trs = tag.find_all('tr')
        i = 0
        while i < len(trs):
            tr = trs[i]
            if len(tr.find_all('td'))==2 and tr.td!=None and tr.td.findNextSibling()!=None:
                # Two-cell row: short label containing 业绩 + a cell listing at least two projects.
                td1_text =tr.td.text
                td2_text =tr.td.findNextSibling().text
                if re.search('业绩', td1_text)!=None and len(td1_text)<10 and len(re.findall('(\d、|(\d))?[-\w()、]+(工程|项目|勘察|设计|施工|监理|总承包|采购|更新)', td2_text))>=2:
                    # del_tag = tr.extract()
                    # print('removed performance row', del_tag.text)
                    del_trs.append(tr)
            elif tr.td != None and re.search('^业绩|业绩$', tr.td.text.strip()) and len(tr.td.text.strip())<25:
                rows = tr.td.attrs.get('rowspan', '')
                cols = tr.td.attrs.get('colspan', '')
                if rows.isdigit() and int(rows)>2:
                    # First cell spans more than two rows: collect every row it spans.
                    for j in range(int(rows)):
                        if i+j < len(trs):
                            del_trs.append(trs[i+j])
                    # j holds the last offset of the loop; together with the
                    # `i += 1` below, scanning resumes after the spanned rows.
                    i += j
                elif cols.isdigit() and int(cols)>3 and len(tr.find_all('td'))==1 and i+2 < len(trs):
                    # Single wide heading cell: compare its colspan with the summed colspans of the next row.
                    next_tr_cols = 0
                    td_num = 0
                    for td in trs[i+1].find_all('td'):
                        td_num += 1
                        if td.attrs.get('colspan', '').isdigit():
                            next_tr_cols += int(td.attrs.get('colspan', ''))
                    if next_tr_cols == int(cols):
                        del_trs.append(tr)
                        # Collect following rows until a single-cell row or a row with too few cells.
                        for j in range(1,len(trs)-i):
                            if len(trs[i+j].find_all('td')) == 1:
                                break
                            elif len(trs[i+j].find_all('td')) >= td_num-1:
                                del_trs.append(trs[i+j])
                            else:
                                break
                        # NOTE(review): after a `break`, the row at i+j is not collected, and
                        # `i += j` followed by `i += 1` also skips examining it -- confirm this is intended.
                        i += j
            i += 1
    for tr in del_trs:
        del_tag = tr.extract()
        # print('removed performance row', del_tag.text)
  1021. def is_all_winner(title):
  1022. '''
  1023. 是否提取所有投标人作为中标人,存管类不分排名都作中标人;入围类按排名,无排名都做中标人
  1024. :param title: 标题
  1025. :return:
  1026. '''
  1027. if re.search('(资金|公款|存款)?竞争性存[放款]|(资金|公款|存款)存放|存放银行|存款服务|国库现金管理', title):
  1028. return 1
  1029. elif re.search('招募|入围|框架(协议)?采购|(单位|商|机构)入库|入库供应商|集中采购', title):
  1030. return 2
  1031. return False
  1032. def is_deposit_project(title, name, requirement):
  1033. '''
  1034. 通过正则判断项目是否为银行存款类项目
  1035. :param title: 标题
  1036. :param name: 项目名称
  1037. :param requirement: 采购内容
  1038. :return:
  1039. '''
  1040. if re.search('(资金|公款|存款)?竞争性存[放款]|(资金|公款|存款)((.{2,10}))?存放|存放银行|存款(服务|业务|项目)|国库现金管理|存款账户开户|(管理|存款|合作)(定点|专户)?银行|贷款合作银行|资金监管账户|开户银行项目|专户开户银行|银行专户选择|定期存[款放]|专项债券?专用账户', title+name+requirement):
  1041. return True
  1042. return False
  1043. def get_money_entity(sentence_text, found_yeji=0, in_attachment=False):
  1044. money_list = []
  1045. # 使用正则识别金额
  1046. entity_type = "money"
  1047. list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
  1048. "key_word": "((?P<text_key_word>(?:[¥¥]+,?|(中标|成交|合同|承租|投资|服务))?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资|成本)(\d:|\d=\d[-+×]\d:)?(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d+,\d+\.\d{2,6}|\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[(\(]?(?P<filter_>[%%‰折])*\s*,?((金额)?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天年月日]*))\s*[)\)]?))",
  1049. "front_m": "((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z金额价格]{,2}?))(?P<money_front_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
  1050. "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_behind_m>(E-?\d+))?(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
  1051. # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。 20240415 调整front_m 修复 详见合同元,合同金额:378.8万元 提取
  1052. pattern_money = re.compile("%s|%s|%s|%s" % (
  1053. list_money_pattern["cn"], list_money_pattern["key_word"], list_money_pattern["behind_m"],
  1054. list_money_pattern["front_m"]))
  1055. # sentence_text = re.sub('\d+[年月日]', '', sentence_text) # 修复560180018 中标价(元):3年投标报价(元)含税6299700.00 3年作为金额
  1056. # if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
  1057. # found_yeji += 1
  1058. # if found_yeji >= 2: # 过滤掉业绩后面的所有金额 # 20250210修复逻辑错误,中标金额被前面句子业绩表达过滤 评分因素:业绩(9分),评分标准:提供2021年1月1日以来类似项目业绩, 589003579
  1059. # all_match = []
  1060. # else:
  1061. ser = re.search('((收费标准|计算[方公]?式):|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s()()\[\]【】\d\.%%‰\+\-*×/]{20,}[,。]?', sentence_text) # 过滤掉收费标准里面的金额
  1062. if ser:
  1063. sentence_text = sentence_text.replace(ser.group(0), ' ' * len(ser.group(0)))
  1064. all_match = re.finditer(pattern_money, sentence_text)
  1065. # print('all_match:', all_match)
  1066. for _match in all_match:
  1067. # print('_match: ', _match.group())
  1068. if re.search('^元/1\d{10},$', _match.group(0)): # 修复 495042766 现场负责人 姚元 / 13488160460 预测为金额
  1069. continue
  1070. if len(_match.group()) > 0:
  1071. # print("===",_match.group())
  1072. # # print(_match.groupdict())
  1073. notes = '' # 2021/7/20 新增备注金额大写或金额单位 if 金额大写 notes=大写 elif 单位 notes=单位
  1074. unit = ""
  1075. entity_text = ""
  1076. start_index = ""
  1077. end_index = ""
  1078. text_beforeMoney = ""
  1079. filter = ""
  1080. filter_unit = False
  1081. notSure = False
  1082. science = ""
  1083. if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text[:_match.span()[0]]): # 2021/7/21过滤掉业绩后面金额
  1084. # print('金额在业绩后面: ', _match.group(0))
  1085. found_yeji += 1
  1086. break
  1087. for k, v in _match.groupdict().items():
  1088. if v != "" and v is not None:
  1089. if k == 'text_key_word':
  1090. notSure = True
  1091. if k.split("_")[0] == "money":
  1092. entity_text = v
  1093. # print(_match.group(k), 'entity_text: ', sentence_text[_match.start(k): _match.end(k)])
  1094. if entity_text.endswith(',00'): # 金额逗号后面不可能为两个0结尾,应该小数点识别错,直接去掉
  1095. entity_text = entity_text[:-3]
  1096. if k.split("_")[0] == "unit":
  1097. if 'behind' in k or unit == "": # 优先后面单位 预算金额(元):160万元 总价(万元):最终报价:695000.00(元)
  1098. unit = v
  1099. if k.split("_")[0] == "text":
  1100. text_beforeMoney = v
  1101. if k.split("_")[0] == "filter":
  1102. filter = v
  1103. if re.search("filter_unit", k) is not None:
  1104. filter_unit = True
  1105. if k.split("_")[0] == 'science':
  1106. science = v
  1107. # print("金额:{0} ,单位:{1}, 前文:{2}, filter: {3}, filter_unit: {4}".format(entity_text,unit,text_beforeMoney,filter,filter_unit))
  1108. # if re.search('(^\d{2,},\d{4,}万?$)|(^\d{2,},\d{2}万?$)', entity_text.strip()): # 2021/7/19 修正OCR识别小数点为逗号
  1109. # if re.search('[幢栋号楼层]', sentence_text[max(0, _match.span()[0] - 2):_match.span()[0]]):
  1110. # entity_text = re.sub('\d+,', '', entity_text)
  1111. # else:
  1112. # entity_text = entity_text.replace(',', '.')
  1113. # # print(' 修正OCR识别小数点为逗号')
  1114. if filter != "":
  1115. continue
  1116. if len(entity_text)>30 or len(re.sub('[E-]', '', science))>2: # 限制数字长度,避免类似265339018附件金额错误,数值超大报错 decimal.InvalidOperation
  1117. continue
  1118. start_index, end_index = _match.span()
  1119. start_index += len(text_beforeMoney)
  1120. '''过滤掉手机号码作为金额'''
  1121. if re.search('电话|手机|联系|方式|编号|编码|日期|数字|时间', text_beforeMoney):
  1122. # print('过滤掉手机号码作为金额')
  1123. continue
  1124. elif re.search('^1[3-9]\d{9}$', entity_text) and re.search(':\w{1,3}$', text_beforeMoney): # 过滤掉类似 '13863441880', '金额(万元):季勇13863441880'
  1125. # print('过滤掉手机号码作为金额')
  1126. continue
  1127. elif re.search('^\d(.\d{1,2})?$', entity_text) and re.search('\d$', _match.group(0)) and re.search('^[、.]', sentence_text[_match.end():]): # 170756755 控制价为:1、合理利润率上限
  1128. # print('过滤错误金额:', _match.group(0))
  1129. continue
  1130. if unit == "": # 2021/7/21 有明显金额特征的补充单位,避免被过滤
  1131. if (re.search('(¥|¥|RMB|CNY)[::]?$', text_beforeMoney) or re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', entity_text)):
  1132. if entity_text.endswith('万元'):
  1133. unit = '万元'
  1134. entity_text = entity_text[:-2]
  1135. else:
  1136. unit = '元'
  1137. # print('1明显金额特征补充单位 元')
  1138. elif re.search('USD[::]?$', text_beforeMoney):
  1139. unit = '美元'
  1140. elif re.search('EUR[::]?$', text_beforeMoney):
  1141. unit = '欧元'
  1142. elif re.search('JPY[::]?$', text_beforeMoney):
  1143. unit = '日元'
  1144. elif re.search('^[-—]+[\d,.]+万元', sentence_text[end_index:]):
  1145. # print('两个金额连接后面的有单位,用后面单位')
  1146. unit = '万元'
  1147. elif re.search('^,?(价格币种:\w{2,3},)?价格单位:万元', sentence_text[end_index:]): # 修复494731937金额单位缺漏 中标价格:39501.094425,价格币种:人民币,价格单位:万元,
  1148. unit = '万元'
  1149. elif re.search('万元', sentence_text[max(0, start_index-10):start_index]): #修复511402017 价格类型:(万元)报价:13311.1582,得分:84.46,
  1150. unit = '万元'
  1151. elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资|控制|拦标))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|成本)(小写)?[::为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None: # 修复
  1152. if re.search('^[\d,,.]+$', entity_text) and float(re.sub('[,,]', '', entity_text))<500 and re.search('万元', sentence_text):
  1153. unit = '万元'
  1154. # print('金额较小且句子中有万元的,补充单位为万元')
  1155. elif re.search('^\d{1,3}\.\d{4,6}$', entity_text) and re.search('0000$', entity_text) == None:
  1156. unit = '万元'
  1157. else:
  1158. unit = '元'
  1159. # print('金额前面紧接关键词的补充单位 元')
  1160. elif re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7},?)$)|(^\d{,3}(,\d{3})+,?$)', entity_text):
  1161. unit = '元'
  1162. # print('3明显金额特征补充单位 元')
  1163. else:
  1164. # print('过滤掉没单位金额: ',entity_text)
  1165. continue
  1166. elif unit == '万元':
  1167. if end_index < len(sentence_text) and sentence_text[end_index] == '元' and re.search('\d$', entity_text):
  1168. unit = '元'
  1169. elif re.search('^[5-9]\d{6,}\.\d{2}$', entity_text): # 五百亿以上的万元改为元
  1170. unit = '元'
  1171. if unit.find("万") >= 0 and entity_text.find("万") >= 0: # 2021/7/19修改为金额文本有万,不计算单位
  1172. # print('修正金额及单位都有万, 金额:',entity_text, '单位:',unit)
  1173. unit = "元"
  1174. if re.search('.*万元万元', entity_text): # 2021/7/19 修正两个万元
  1175. # print(' 修正两个万元',entity_text)
  1176. entity_text = entity_text.replace('万元万元', '万元')
  1177. else:
  1178. if filter_unit:
  1179. continue
  1180. # symbol = '-' if entity_text.startswith('-') and not entity_text.startswith('--') and re.search('\d+$', sentence_text[:begin_index_temp]) == None else '' # 负值金额前面保留负号 ,后面这些不作为负金额 起拍价:105.29-200.46万元 预 算 --- 350000.0 2023/04/14 取消符号
  1181. entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", entity_text)
  1182. # print('转换前金额:', entity_text, '单位:', unit, '备注:',notes, 'text_beforeMoney:',text_beforeMoney)
  1183. if re.search('总投资|投资总额|总预算|总概算|(投资|招标|资金|存放|操作|融资)规模|批复概算|投资额|总规模|工程造价|总金额',
  1184. sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]]): # 2021/8/5过滤掉总投资金额 20241031工程造价作总投资
  1185. # print('总投资金额: ', _match.group(0))
  1186. notes = '总投资'
  1187. elif re.search('投资|概算|建安费|其他费用|基本预备费',
  1188. sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]): # 2021/11/18 投资金额不作为招标金额
  1189. notes = '投资'
  1190. # elif re.search('工程造价',
  1191. # sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]): # 2021/12/20 工程造价不作为招标金额
  1192. # notes = '工程造价'
  1193. elif (re.search('保证金', sentence_text[max(0, _match.span()[0] - 5):_match.span()[1]])
  1194. or re.search('保证金的?(缴纳)?(金额|金\?|额|\?)?[\((]*(万?元|为?人民币|大写|调整|变更|已?修改|更改|更正)?[\))]*[::为]',
  1195. sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]])
  1196. or re.search('保证金由[\d.,]+.{,3}(变更|修改|更改|更正|调整?)为',
  1197. sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])):
  1198. notes = '保证金'
  1199. # print('保证金信息:', sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])
  1200. elif re.search('成本(警戒|预警)(线|价|值)[^0-9元]{,10}',
  1201. sentence_text[max(0, _match.span()[0] - 10):_match.span()[0]]):
  1202. notes = '成本警戒线'
  1203. elif re.search('(监理|设计|勘察)(服务)?费(报价)?[约为:]|服务金额', sentence_text[_match.span()[0]:_match.span()[1]]):
  1204. # cost_re = re.search('(监理|设计|勘察)(服务)?费', sentence_text[_match.span()[0]:_match.span()[1]])
  1205. # notes = cost_re.group(1)
  1206. notes = '招标或中标金额'
  1207. elif re.search('单价|总金额', sentence_text[_match.span()[0]:_match.span()[1]]):
  1208. notes = '单价'
  1209. elif re.search('^[/每]', sentence_text[_match.end():]):
  1210. # print('单价:', _match.group(0))
  1211. notes = '单价'
  1212. elif re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆]', entity_text) != None:
  1213. notes = '大写'
  1214. if entity_text[0] == "拾": # 2021/12/16 修正大写金额省略了数字转换错误问题
  1215. entity_text = "壹" + entity_text
  1216. # print("补充备注:notes = 大写")
  1217. if len(unit) > 0:
  1218. if unit.find('万') >= 0 and len(entity_text.split('.')[0]) >= 8: # 2021/7/19 修正万元金额过大的情况
  1219. # print('修正单位万元金额过大的情况 金额:', entity_text, '单位:', unit)
  1220. entity_text = str(
  1221. getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]) / 10000)
  1222. unit = '元' # 修正金额后单位 重置为元
  1223. else:
  1224. # print('str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0])):')
  1225. entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]))
  1226. else:
  1227. if entity_text.find('万') >= 0 and entity_text.split('.')[0].isdigit() and len(
  1228. entity_text.split('.')[0]) >= 8:
  1229. entity_text = str(getUnifyMoney(entity_text) / 10000)
  1230. # print('修正金额字段含万 过大的情况')
  1231. else:
  1232. entity_text = str(getUnifyMoney(entity_text))
  1233. if science and re.search('^E-?\d+$', science): # 科学计数
  1234. entity_text = str(Decimal(entity_text + science)) if Decimal(entity_text + science) > 100 and Decimal(
  1235. entity_text + science) < 10000000000 else entity_text # 结果大于100及小于100万才使用科学计算
  1236. if float(entity_text) > 100000000000: # float(entity_text)<100 or 2022/3/4 取消最小金额限制
  1237. # print('过滤掉金额:float(entity_text)<100 or float(entity_text)>100000000000', entity_text, unit)
  1238. continue
  1239. if notSure and unit == "" and float(entity_text) > 100 * 10000:
  1240. # print('过滤掉金额 notSure and unit=="" and float(entity_text)>100*10000:', entity_text, unit)
  1241. continue
  1242. # print("金额:{0} ,单位:{1}, 前文:{2}, filter: {3}, filter_unit: {4}".format(entity_text, unit, text_beforeMoney,
  1243. # filter, filter_unit))
  1244. if re.search('[%%‰折]|费率|下浮率', text_beforeMoney) and float(entity_text)<1000: # 过滤掉可能是费率的金额
  1245. # print('过滤掉可能是费率的金额')
  1246. continue
  1247. money_list.append((entity_text, start_index, end_index, unit, notes))
  1248. return money_list, found_yeji
  1249. def recall(y_true, y_pred):
  1250. '''
  1251. 计算召回率
  1252. @Argus:
  1253. y_true: 正确的标签
  1254. y_pred: 模型预测的标签
  1255. @Return
  1256. 召回率
  1257. '''
  1258. c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  1259. c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
  1260. if c3 == 0:
  1261. return 0
  1262. recall = c1 / c3
  1263. return recall
  1264. def f1_score(y_true, y_pred):
  1265. '''
  1266. 计算F1
  1267. @Argus:
  1268. y_true: 正确的标签
  1269. y_pred: 模型预测的标签
  1270. @Return
  1271. F1值
  1272. '''
  1273. c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  1274. c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
  1275. c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
  1276. precision = c1 / c2
  1277. if c3 == 0:
  1278. recall = 0
  1279. else:
  1280. recall = c1 / c3
  1281. f1_score = 2 * (precision * recall) / (precision + recall)
  1282. return f1_score
  1283. def precision(y_true, y_pred):
  1284. '''
  1285. 计算精确率
  1286. @Argus:
  1287. y_true: 正确的标签
  1288. y_pred: 模型预测的标签
  1289. @Return
  1290. 精确率
  1291. '''
  1292. c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  1293. c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
  1294. precision = c1 / c2
  1295. return precision
  1296. # def print_metrics(history):
  1297. # '''
  1298. # 制作每次迭代的各metrics变化图片
  1299. #
  1300. # @Arugs:
  1301. # history: 模型训练迭代的历史记录
  1302. # '''
  1303. # import matplotlib.pyplot as plt
  1304. #
  1305. # # loss图
  1306. # loss = history.history['loss']
  1307. # val_loss = history.history['val_loss']
  1308. # epochs = range(1, len(loss) + 1)
  1309. # plt.subplot(2, 2, 1)
  1310. # plt.plot(epochs, loss, 'bo', label='Training loss')
  1311. # plt.plot(epochs, val_loss, 'b', label='Validation loss')
  1312. # plt.title('Training and validation loss')
  1313. # plt.xlabel('Epochs')
  1314. # plt.ylabel('Loss')
  1315. # plt.legend()
  1316. #
  1317. # # f1图
  1318. # f1 = history.history['f1_score']
  1319. # val_f1 = history.history['val_f1_score']
  1320. # plt.subplot(2, 2, 2)
  1321. # plt.plot(epochs, f1, 'bo', label='Training f1')
  1322. # plt.plot(epochs, val_f1, 'b', label='Validation f1')
  1323. # plt.title('Training and validation f1')
  1324. # plt.xlabel('Epochs')
  1325. # plt.ylabel('F1')
  1326. # plt.legend()
  1327. #
  1328. # # precision图
  1329. # prec = history.history['precision']
  1330. # val_prec = history.history['val_precision']
  1331. # plt.subplot(2, 2, 3)
  1332. # plt.plot(epochs, prec, 'bo', label='Training precision')
  1333. # plt.plot(epochs, val_prec, 'b', label='Validation pecision')
  1334. # plt.title('Training and validation precision')
  1335. # plt.xlabel('Epochs')
  1336. # plt.ylabel('Precision')
  1337. # plt.legend()
  1338. #
  1339. # # recall图
  1340. # recall = history.history['recall']
  1341. # val_recall = history.history['val_recall']
  1342. # plt.subplot(2, 2, 4)
  1343. # plt.plot(epochs, recall, 'bo', label='Training recall')
  1344. # plt.plot(epochs, val_recall, 'b', label='Validation recall')
  1345. # plt.title('Training and validation recall')
  1346. # plt.xlabel('Epochs')
  1347. # plt.ylabel('Recall')
  1348. # plt.legend()
  1349. #
  1350. # plt.show()
  1351. def clean_company(entity_text):
  1352. '''
  1353. 清洗公司名称
  1354. :param entity_text:
  1355. :return:
  1356. '''
  1357. entity_text = re.sub('\s', '', entity_text)
  1358. if re.search('^(\d{4}年)?[\-\d月日份]*\w{2,3}分公司$|^\w{,6}某(部|医院)$|空间布局$', entity_text): # 删除
  1359. # print('公司实体不符合规范:', entity_text)
  1360. return ''
  1361. elif re.match('xx|XX', entity_text): # 删除
  1362. # print('公司实体不符合规范:', entity_text)
  1363. return ''
  1364. elif re.match('\.?(rar|zip|pdf|df|doc|docx|xls|xlsx|jpg|png)', entity_text):
  1365. entity_text = re.sub('\.?(rar|zip|pdf|df|doc|docx|xls|xlsx|jpg|png)', '', entity_text)
  1366. elif re.match('(\d+)|\d+\.|\s|&nbsp', entity_text):
  1367. entity_text = re.sub('(\d+)|\d+\.|\s|&nbsp', '', entity_text)
  1368. elif re.match(
  1369. '((\d{4}[年-])[\-\d:\s元月日份]*|\d{1,2}月[\d日.-]*(日?常?计划)?|\d{1,2}[.-]?|[A-Za-z](包|标段?)?|[a-zA-Z0-9]+-[a-zA-Z0-9-]*|[a-zA-Z]{1,2}|[①②③④⑤⑥⑦⑧⑨⑩]|\s|title\=|【[a-zA-Z0-9]+】|[^\w])[\u4e00-\u9fa5]+',
  1370. entity_text):
  1371. filter = re.match(
  1372. '((\d{4}[年-])[\-\d:\s元月日份]*|\d{1,2}月[\d日.-]*(日?常?计划)?|\d{1,2}[.-]?|[A-Za-z](包|标段?)?|[a-zA-Z0-9]+-[a-zA-Z0-9-]*|[a-zA-Z]{1,2}|[①②③④⑤⑥⑦⑧⑨⑩]|\s|title\=|【[a-zA-Z0-9]+】|[^\w])[\u4e00-\u9fa5]+',
  1373. entity_text).group(1)
  1374. entity_text = entity_text.replace(filter, '')
  1375. elif re.search('\]|\[|\]|[【】{}「?:∶〔·.\'#~_ΓΙεⅠ]', entity_text):
  1376. entity_text = re.sub('\]|\[|\]|[【】「?:∶〔·.\'#~_ΓΙεⅠ]', '', entity_text)
  1377. if len(re.sub('(项目|分|有限)?公司|集团|制造部|中心|医院|学校|大学|中学|小学|幼儿园', '', entity_text)) < 2:
  1378. # print('公司实体不符合规范:', entity_text)
  1379. return ''
  1380. return entity_text
  1381. if __name__=="__main__":
  1382. # print(fool_char_to_id[">"])
  1383. print(getUnifyMoney('伍仟贰佰零壹拾伍万零捌佰壹拾元陆角伍分'))
  1384. # model = getModel_w2v()
  1385. # vocab,matrix = getVocabAndMatrix(model, Embedding_size=128)
  1386. # save([vocab,matrix],"vocabMatrix_words.pk")