predictor.py

'''
Created on 2018-12-26
@author: User
'''
import os
import sys
from BiddingKG.dl.common.nerUtils import *
sys.path.append(os.path.abspath("../.."))
# from keras.engine import topology
# from keras import models
# from keras import layers
# from keras_contrib.layers.crf import CRF
# from keras.preprocessing.sequence import pad_sequences
# from keras import optimizers,losses,metrics
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.interface.modelFactory import *
import tensorflow as tf
from BiddingKG.dl.product.data_util import decode, process_data
from BiddingKG.dl.interface.Entitys import Entity
from BiddingKG.dl.complaint.punish_predictor import Punish_Extract
from bs4 import BeautifulSoup
import copy
import calendar
import datetime
from threading import RLock

# Registry of lazily-created predictor singletons; each entry carries its own
# lock so that concurrent callers initialize a predictor exactly once.
dict_predictor = {"codeName": {"predictor": None, "Lock": RLock()},
                  "prem": {"predictor": None, "Lock": RLock()},
                  "epc": {"predictor": None, "Lock": RLock()},
                  "roleRule": {"predictor": None, "Lock": RLock()},
                  "form": {"predictor": None, "Lock": RLock()},
                  "time": {"predictor": None, "Lock": RLock()},
                  "punish": {"predictor": None, "Lock": RLock()},
                  "product": {"predictor": None, "Lock": RLock()},
                  "product_attrs": {"predictor": None, "Lock": RLock()},
                  "channel": {"predictor": None, "Lock": RLock()},
                  "deposit_payment_way": {"predictor": None, "Lock": RLock()}}
def getPredictor(_type):
    if _type in dict_predictor:
        with dict_predictor[_type]["Lock"]:
            if dict_predictor[_type]["predictor"] is None:
                if _type == "codeName":
                    dict_predictor[_type]["predictor"] = CodeNamePredict()
                if _type == "prem":
                    dict_predictor[_type]["predictor"] = PREMPredict()
                if _type == "epc":
                    dict_predictor[_type]["predictor"] = EPCPredict()
                if _type == "roleRule":
                    dict_predictor[_type]["predictor"] = RoleRulePredictor()
                if _type == "form":
                    dict_predictor[_type]["predictor"] = FormPredictor()
                if _type == "time":
                    dict_predictor[_type]["predictor"] = TimePredictor()
                if _type == "punish":
                    dict_predictor[_type]["predictor"] = Punish_Extract()
                if _type == "product":
                    dict_predictor[_type]["predictor"] = ProductPredictor()
                if _type == "product_attrs":
                    dict_predictor[_type]["predictor"] = ProductAttributesPredictor()
                if _type == "channel":
                    dict_predictor[_type]["predictor"] = DocChannel()
                if _type == 'deposit_payment_way':
                    dict_predictor[_type]["predictor"] = DepositPaymentWay()
            return dict_predictor[_type]["predictor"]
    raise NameError("no such predictor type: %s" % _type)
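# Usage sketch (illustrative, not part of the original module): predictors are
# fetched by key and cached, so repeated calls return the same instance; the
# sentence/entity arguments below are hypothetical placeholders.
#
#   codename_predictor = getPredictor("codeName")
#   results = codename_predictor.predict(list_sentences, list_entitys)
#
# Unknown keys raise NameError, so callers fail fast on typos.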
# Project code & name model
class CodeNamePredict():

    def __init__(self, EMBED_DIM=None, BiRNN_UNITS=None, lazyLoad=getLazyLoad()):
        self.model = None
        self.MAX_LEN = None
        self.model_code = None
        if EMBED_DIM is None:
            self.EMBED_DIM = 60
        else:
            self.EMBED_DIM = EMBED_DIM
        if BiRNN_UNITS is None:
            self.BiRNN_UNITS = 200
        else:
            self.BiRNN_UNITS = BiRNN_UNITS
        self.filepath = os.path.dirname(__file__)+"/../projectCode/models/model_project_"+str(self.EMBED_DIM)+"_"+str(self.BiRNN_UNITS)+".hdf5"
        #self.filepath = "../projectCode/models/model_project_60_200_200ep017-loss6.456-val_loss7.852-val_acc0.969.hdf5"
        self.filepath_code = os.path.dirname(__file__)+"/../projectCode/models/model_code.hdf5"
        vocabpath = os.path.dirname(__file__)+"/codename_vocab.pk"
        classlabelspath = os.path.dirname(__file__)+"/codename_classlabels.pk"
        self.vocab = load(vocabpath)
        self.class_labels = load(classlabelspath)
        # Build the regexes that extract code (PC) and name (PN) spans from the
        # decoded tag-id sequence.
        id_PC_B = self.class_labels.index("PC_B")
        id_PC_M = self.class_labels.index("PC_M")
        id_PC_E = self.class_labels.index("PC_E")
        id_PN_B = self.class_labels.index("PN_B")
        id_PN_M = self.class_labels.index("PN_M")
        id_PN_E = self.class_labels.index("PN_E")
        self.PC_pattern = re.compile(str(id_PC_B)+str(id_PC_M)+"*"+str(id_PC_E))
        self.PN_pattern = re.compile(str(id_PN_B)+str(id_PN_M)+"*"+str(id_PN_E))
        print("pc", self.PC_pattern)
        print("pn", self.PN_pattern)
        self.word2index = dict((w, i) for i, w in enumerate(np.array(self.vocab)))
        self.inputs = None
        self.outputs = None
        self.sess_codename = tf.Session(graph=tf.Graph())
        self.sess_codesplit = tf.Session(graph=tf.Graph())
        self.inputs_code = None
        self.outputs_code = None
        if not lazyLoad:
            self.getModel()
            self.getModel_code()
    def getModel(self):
        '''
        @summary: load the project code/name tagging model (TF SavedModel)
        '''
        if self.inputs is None:
            log("get model of codename")
            with self.sess_codename.as_default():
                with self.sess_codename.graph.as_default():
                    meta_graph_def = tf.saved_model.loader.load(self.sess_codename, ["serve"], export_dir=os.path.dirname(__file__)+"/codename_savedmodel_tf")
                    signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
                    signature_def = meta_graph_def.signature_def
                    self.inputs = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs"].name)
                    self.inputs_length = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs_length"].name)
                    self.keepprob = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["keepprob"].name)
                    self.logits = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["logits"].name)
                    self.trans = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["trans"].name)
                    return self.inputs, self.inputs_length, self.keepprob, self.logits, self.trans
        else:
            return self.inputs, self.inputs_length, self.keepprob, self.logits, self.trans
        '''
        if self.model is None:
            self.model = self.getBiLSTMCRFModel(self.MAX_LEN, self.vocab, self.EMBED_DIM, self.BiRNN_UNITS, self.class_labels, weights=None)
            self.model.load_weights(self.filepath)
        return self.model
        '''
    def getModel_code(self):
        if self.inputs_code is None:
            log("get model of code")
            with self.sess_codesplit.as_default():
                with self.sess_codesplit.graph.as_default():
                    meta_graph_def = tf.saved_model.loader.load(self.sess_codesplit, ["serve"], export_dir=os.path.dirname(__file__)+"/codesplit_savedmodel")
                    signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
                    signature_def = meta_graph_def.signature_def
                    self.inputs_code = []
                    self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name))
                    self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name))
                    self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name))
                    self.outputs_code = self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
                    self.sess_codesplit.graph.finalize()
                    return self.inputs_code, self.outputs_code
        else:
            return self.inputs_code, self.outputs_code
        '''
        if self.model_code is None:
            log("get model of model_code")
            with self.sess_codesplit.as_default():
                with self.sess_codesplit.graph.as_default():
                    self.model_code = models.load_model(self.filepath_code, custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
        return self.model_code
        '''
    # NOTE: depends on the keras/keras_contrib imports commented out at the top
    # of this file; kept for retraining reference.
    def getBiLSTMCRFModel(self, MAX_LEN, vocab, EMBED_DIM, BiRNN_UNITS, chunk_tags, weights):
        '''
        model = models.Sequential()
        model.add(layers.Embedding(len(vocab), EMBED_DIM, mask_zero=True))  # Random embedding
        model.add(layers.Bidirectional(layers.LSTM(BiRNN_UNITS // 2, return_sequences=True)))
        crf = CRF(len(chunk_tags), sparse_target=True)
        model.add(crf)
        model.summary()
        model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
        return model
        '''
        input = layers.Input(shape=(None,))
        if weights is not None:
            embedding = layers.embeddings.Embedding(len(vocab), EMBED_DIM, mask_zero=True, weights=[weights], trainable=True)(input)
        else:
            embedding = layers.embeddings.Embedding(len(vocab), EMBED_DIM, mask_zero=True)(input)
        bilstm = layers.Bidirectional(layers.LSTM(BiRNN_UNITS//2, return_sequences=True))(embedding)
        bilstm_dense = layers.TimeDistributed(layers.Dense(len(chunk_tags)))(bilstm)
        crf = CRF(len(chunk_tags), sparse_target=True)
        crf_out = crf(bilstm_dense)
        model = models.Model(input=[input], output=[crf_out])
        model.summary()
        model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])
        return model
    # Rule-based completion of unbalanced brackets around a code or name
    def fitDataByRule(self, data):
        symbol_dict = {"(": ")",
                       "(": ")",
                       "[": "]",
                       "【": "】",
                       ")": "(",
                       ")": "(",
                       "]": "[",
                       "】": "【"}
        leftSymbol_pattern = re.compile("[\((\[【]")
        rightSymbol_pattern = re.compile("[\))\]】]")
        leftfinds = re.findall(leftSymbol_pattern, data)
        rightfinds = re.findall(rightSymbol_pattern, data)
        result = data
        if len(leftfinds)+len(rightfinds) == 0:
            return data
        elif len(leftfinds) == len(rightfinds):
            return data
        elif abs(len(leftfinds)-len(rightfinds)) == 1:
            if len(leftfinds) > len(rightfinds):
                if symbol_dict.get(data[0]) is not None:
                    result = data[1:]
                else:
                    #print(symbol_dict.get(leftfinds[0]))
                    result = data+symbol_dict.get(leftfinds[0])
            else:
                if symbol_dict.get(data[-1]) is not None:
                    result = data[:-1]
                else:
                    result = symbol_dict.get(rightfinds[0])+data
        return result
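    # Behavior sketch for fitDataByRule (illustrative examples, following the
    # rules above): balanced or bracket-free input is returned unchanged;
    #   "ABC(123"  -> "ABC(123)"   matching right bracket appended
    #   "123)ABC"  -> "(123)ABC"   matching left bracket prepended
    #   "(123"     -> "123"        a dangling bracket at the string edge is stripped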
    def decode(self, logits, trans, sequence_lengths, tag_num):
        viterbi_sequences = []
        for logit, length in zip(logits, sequence_lengths):
            score = logit[:length]
            viterbi_seq, viterbi_score = viterbi_decode(score, trans)
            viterbi_sequences.append(viterbi_seq)
        return viterbi_sequences
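    # Shape sketch (an assumption about the SavedModel outputs, for reference):
    # logits is [batch, max_len, tag_num], trans is the [tag_num, tag_num] CRF
    # transition matrix, and sequence_lengths holds each sentence's unpadded
    # length, so every Viterbi path is decoded only over real tokens.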
    def predict(self, list_sentences, list_entitys=None, MAX_AREA=5000):
        # @summary: extract the project code(s) and name of each document
        pattern_score = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")
        result = []
        index_unk = self.word2index.get("<unk>")
        # index_pad = self.word2index.get("<pad>")
        if list_entitys is None:
            list_entitys = [[] for _ in range(len(list_sentences))]
        for list_sentence, list_entity in zip(list_sentences, list_entitys):
            if len(list_sentence) == 0:
                result.append([{"code": [], "name": ""}])
                continue
            doc_id = list_sentence[0].doc_id
            # sentences = []
            # for sentence in list_sentence:
            #     if len(sentence.sentence_text)>MAX_AREA:
            #         for _sentence_comma in re.split("[;;,\n]",sentence):
            #             _comma_index = 0
            #             while(_comma_index<len(_sentence_comma)):
            #                 sentences.append(_sentence_comma[_comma_index:_comma_index+MAX_AREA])
            #                 _comma_index += MAX_AREA
            #     else:
            #         sentences.append(sentence+"。")
            list_sentence.sort(key=lambda x: len(x.sentence_text), reverse=True)
            _begin_index = 0
            item = {"code": [], "name": ""}
            code_set = set()
            dict_name_freq_score = dict()
            while True:
                # Batch sentences so that each batch holds roughly MAX_AREA characters.
                MAX_LEN = len(list_sentence[_begin_index].sentence_text)
                if MAX_LEN > MAX_AREA:
                    MAX_LEN = MAX_AREA
                _LEN = MAX_AREA//MAX_LEN
                # predict
                x = [[self.word2index.get(word, index_unk) for word in sentence.sentence_text[:MAX_AREA]] for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
                # x = [[getIndexOfWord(word) for word in sentence.sentence_text[:MAX_AREA]] for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
                x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x]
                x = pad_sequences(x, maxlen=MAX_LEN, padding="post", truncating="post")
                if USE_PAI_EAS:
                    request = tf_predict_pb2.PredictRequest()
                    request.inputs["inputs"].dtype = tf_predict_pb2.DT_INT32
                    request.inputs["inputs"].array_shape.dim.extend(np.shape(x))
                    request.inputs["inputs"].int_val.extend(np.array(x, dtype=np.int32).reshape(-1))
                    request_data = request.SerializeToString()
                    list_outputs = ["outputs"]
                    _result = vpc_requests(codename_url, codename_authorization, request_data, list_outputs)
                    if _result is not None:
                        predict_y = _result["outputs"]
                    else:
                        # Fall back to the local model; the original unpacked
                        # getModel() into two values, which no longer matches
                        # its 5-tuple return, so mirror the non-EAS path below.
                        with self.sess_codename.as_default():
                            t_input, t_input_length, t_keepprob, t_logits, t_trans = self.getModel()
                            _logits, _trans = self.sess_codename.run([t_logits, t_trans], feed_dict={t_input: x,
                                                                                                     t_input_length: x_len,
                                                                                                     t_keepprob: 1.0})
                            predict_y = self.decode(_logits, _trans, x_len, 7)
                else:
                    with self.sess_codename.as_default():
                        t_input, t_input_length, t_keepprob, t_logits, t_trans = self.getModel()
                        _logits, _trans = self.sess_codename.run([t_logits, t_trans], feed_dict={t_input: x,
                                                                                                 t_input_length: x_len,
                                                                                                 t_keepprob: 1.0})
                        predict_y = self.decode(_logits, _trans, x_len, 7)
                # print('==========',_logits)
                '''
                for item11 in np.argmax(predict_y,-1):
                    print(item11)
                print(predict_y)
                '''
                # print(predict_y)
                for sentence, predict in zip(list_sentence[_begin_index:_begin_index+_LEN], np.array(predict_y)):
                    pad_sentence = sentence.sentence_text[:MAX_LEN]
                    join_predict = "".join([str(s) for s in predict])
                    # print(pad_sentence)
                    # print(join_predict)
                    code_x = []
                    code_text = []
                    temp_entitys = []
                    for iter in re.finditer(self.PC_pattern, join_predict):
                        get_len = 40
                        if iter.span()[0] < get_len:
                            begin = 0
                        else:
                            begin = iter.span()[0]-get_len
                        end = iter.span()[1]+get_len
                        code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]], pad_sentence[iter.span()[0]:iter.span()[1]], pad_sentence[iter.span()[1]:end]], shape=(3, get_len, 60)))
                        code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]])
                        _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (sentence.doc_id, sentence.sentence_index, iter.span()[0], iter.span()[1]), entity_text=pad_sentence[iter.span()[0]:iter.span()[1]], entity_type="code", sentence_index=sentence.sentence_index, begin_index=0, end_index=0, wordOffset_begin=iter.span()[0], wordOffset_end=iter.span()[1])
                        temp_entitys.append(_entity)
                    #print("code",code_text)
                    if len(code_x) > 0:
                        code_x = np.transpose(np.array(code_x, dtype=np.float32), (1, 0, 2, 3))
                        if USE_PAI_EAS:
                            request = tf_predict_pb2.PredictRequest()
                            request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
                            request.inputs["input0"].array_shape.dim.extend(np.shape(code_x[0]))
                            request.inputs["input0"].float_val.extend(np.array(code_x[0], dtype=np.float64).reshape(-1))
                            request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
                            request.inputs["input1"].array_shape.dim.extend(np.shape(code_x[1]))
                            request.inputs["input1"].float_val.extend(np.array(code_x[1], dtype=np.float64).reshape(-1))
                            request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
                            request.inputs["input2"].array_shape.dim.extend(np.shape(code_x[2]))
                            request.inputs["input2"].float_val.extend(np.array(code_x[2], dtype=np.float64).reshape(-1))
                            request_data = request.SerializeToString()
                            list_outputs = ["outputs"]
                            _result = vpc_requests(codeclasses_url, codeclasses_authorization, request_data, list_outputs)
                            if _result is not None:
                                predict_code = _result["outputs"]
                            else:
                                with self.sess_codesplit.as_default():
                                    with self.sess_codesplit.graph.as_default():
                                        predict_code = self.getModel_code().predict([code_x[0], code_x[1], code_x[2]])
                        else:
                            with self.sess_codesplit.as_default():
                                with self.sess_codesplit.graph.as_default():
                                    inputs_code, outputs_code = self.getModel_code()
                                    predict_code = limitRun(self.sess_codesplit, [outputs_code], feed_dict={inputs_code[0]: code_x[0], inputs_code[1]: code_x[1], inputs_code[2]: code_x[2]}, MAX_BATCH=2)[0]
                                    #predict_code = self.sess_codesplit.run(outputs_code,feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]})
                                    #predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
                        for h in range(len(predict_code)):
                            if predict_code[h][0] > 0.5:
                                the_code = self.fitDataByRule(code_text[h])
                                # add code to entitys
                                list_entity.append(temp_entitys[h])
                                if the_code not in code_set:
                                    code_set.add(the_code)
                                item['code'] = list(code_set)
                    for iter in re.finditer(self.PN_pattern, join_predict):
                        _name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
                        # add name to entitys
                        _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (sentence.doc_id, sentence.sentence_index, iter.span()[0], iter.span()[1]), entity_text=_name, entity_type="name", sentence_index=sentence.sentence_index, begin_index=0, end_index=0, wordOffset_begin=iter.span()[0], wordOffset_end=iter.span()[1])
                        list_entity.append(_entity)
                        # Weight a name higher when it directly follows an explicit
                        # "project name:" style label.
                        w = 1 if re.search('(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]', pad_sentence[max(0, iter.span()[0]-10):iter.span()[0]]) is not None else 0.5
                        if _name not in dict_name_freq_score:
                            # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05)*w]
                        else:
                            dict_name_freq_score[_name][0] += 1
                    '''
                    for iter in re.finditer(self.PN_pattern,join_predict):
                        print("name-",self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]))
                    if item[1]['name']=="":
                        for iter in re.finditer(self.PN_pattern,join_predict):
                            #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
                            item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
                            break
                    '''
                if _begin_index+_LEN >= len(list_sentence):
                    break
                _begin_index += _LEN
            list_name_freq_score = []
            # 2020/11/23 rule adjustment for large websites: if the model found
            # no name, fall back to a labeled-field regex.
            if len(dict_name_freq_score) == 0:
                name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]'
                for sentence in list_sentence:
                    # pad_sentence = sentence.sentence_text
                    othername = re.search(name_re1, sentence.sentence_text)
                    if othername is not None:
                        project_name = othername.group(3)
                        beg = find_index([project_name], sentence.sentence_text)[0]
                        end = beg + len(project_name)
                        _name = self.fitDataByRule(sentence.sentence_text[beg:end])
                        # add name to entitys
                        _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
                            sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name,
                            entity_type="name", sentence_index=sentence.sentence_index, begin_index=0,
                            end_index=0, wordOffset_begin=beg, wordOffset_end=end)
                        list_entity.append(_entity)
                        w = 1
                        if _name not in dict_name_freq_score:
                            # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w]
                        else:
                            dict_name_freq_score[_name][0] += 1
                    # othername = re.search(name_re1, sentence.sentence_text)
                    # if othername != None:
                    #     _name = othername.group(3)
                    #     if _name not in dict_name_freq_score:
                    #         dict_name_freq_score[_name] = [1, len(re.findall(pattern_score, _name)) + len(_name) * 0.1]
                    #     else:
                    #         dict_name_freq_score[_name][0] += 1
            for _name in dict_name_freq_score.keys():
                list_name_freq_score.append([_name, dict_name_freq_score[_name]])
            # print(list_name_freq_score)
            if len(list_name_freq_score) > 0:
                # Rank candidate names by frequency * score and keep the best one.
                list_name_freq_score.sort(key=lambda x: x[1][0]*x[1][1], reverse=True)
                item['name'] = list_name_freq_score[0][0]
                # if list_name_freq_score[0][1][0]>1:
                #     item[1]['name'] = list_name_freq_score[0][0]
                # else:
                #     list_name_freq_score.sort(key=lambda x:x[1][1],reverse=True)
                #     item[1]["name"] = list_name_freq_score[0][0]
            # Regex fallback to pick up project codes the model fails to recognize.
            if item['code'] == []:
                for sentence in list_sentence:
                    # othercode = re.search('(采购计划编号|询价编号)[\))]?[::]?([\[\]a-zA-Z0-9\-]{5,30})', sentence.sentence_text)
                    # if othercode != None:
                    #     item[1]['code'].append(othercode.group(2))
                    # 2020/11/23 rule adjustment for large websites
                    othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价单|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告)(单号|编号|标号|编码|代码|备案号|号)[::\s]+([^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。]', sentence.sentence_text)
                    if othercode is not None:
                        item['code'].append(othercode.group(3))
            item['code'].sort(key=lambda x: len(x), reverse=True)
            result.append(item)
            # Restore the original sentence order before returning.
            list_sentence.sort(key=lambda x: x.sentence_index, reverse=False)
        return result
    '''
    # raises an error when the input volume is too large
    def predict(self,articles,MAX_LEN = None):
        sentences = []
        for article in articles:
            for sentence in article.content.split("。"):
                sentences.append([sentence,article.id])
        if MAX_LEN is None:
            sent_len = [len(sentence[0]) for sentence in sentences]
            MAX_LEN = max(sent_len)
        #print(MAX_LEN)
        # if empty, return an empty result directly
        result = []
        if MAX_LEN==0:
            for article in articles:
                result.append([article.id,{"code":[],"name":""}])
            return result
        index_unk = self.word2index.get("<unk>")
        index_pad = self.word2index.get("<pad>")
        x = [[self.word2index.get(word,index_unk) for word in sentence[0]] for sentence in sentences]
        x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")
        predict_y = self.getModel().predict(x)
        last_doc_id = ""
        item = []
        for sentence,predict in zip(sentences,np.argmax(predict_y,-1)):
            pad_sentence = sentence[0][:MAX_LEN]
            doc_id = sentence[1]
            join_predict = "".join([str(s) for s in predict])
            if doc_id!=last_doc_id:
                if last_doc_id!="":
                    result.append(item)
                item = [doc_id,{"code":[],"name":""}]
                code_set = set()
            code_x = []
            code_text = []
            for iter in re.finditer(self.PC_pattern,join_predict):
                get_len = 40
                if iter.span()[0]<get_len:
                    begin = 0
                else:
                    begin = iter.span()[0]-get_len
                end = iter.span()[1]+get_len
                code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]],pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
                code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]])
            if len(code_x)>0:
                code_x = np.transpose(np.array(code_x),(1,0,2,3))
                predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
                for h in range(len(predict_code)):
                    if predict_code[h][0]>0.5:
                        the_code = self.fitDataByRule(code_text[h])
                        if the_code not in code_set:
                            code_set.add(the_code)
                        item[1]['code'] = list(code_set)
            if item[1]['name']=="":
                for iter in re.finditer(self.PN_pattern,join_predict):
                    #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
                    item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
                    break
            last_doc_id = doc_id
        result.append(item)
        return result
    '''
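# Usage sketch for CodeNamePredict (illustrative; assumes the SavedModel
# directories exist next to this file and that list_sentences/list_entitys
# follow the structures used throughout this module):
#
#   predictor = getPredictor("codeName")
#   list_entitys = [[] for _ in list_sentences]
#   for item in predictor.predict(list_sentences, list_entitys):
#       print(item["code"], item["name"])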
# Role & money model
class PREMPredict():

    def __init__(self):
        #self.model_role_file = os.path.abspath("../role/models/model_role.model.hdf5")
        self.model_role_file = os.path.dirname(__file__)+"/../role/log/new_biLSTM-ep012-loss0.028-val_loss0.040-f10.954.h5"
        self.model_role = Model_role_classify_word()
        self.model_money = Model_money_classify()
        return

    def search_role_data(self, list_sentences, list_entitys):
        '''
        @summary: build the role model's input data from the sentence and entity lists
        @param:
            list_sentences: the document's sentences
            list_entitys: the document's entities
        @return: the input data for the role model
        '''
        text_list = []
        data_x = []
        points_entitys = []
        for list_entity, list_sentence in zip(list_entitys, list_sentences):
            list_entity.sort(key=lambda x: x.sentence_index)
            list_sentence.sort(key=lambda x: x.sentence_index)
            p_entitys = 0
            p_sentences = 0
            while p_entitys < len(list_entity):
                entity = list_entity[p_entitys]
                if entity.entity_type in ['org', 'company']:
                    while p_sentences < len(list_sentence):
                        sentence = list_sentence[p_sentences]
                        if entity.doc_id == sentence.doc_id and entity.sentence_index == sentence.sentence_index:
                            text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin-10):entity.wordOffset_end+10])
                            #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_ROLE_INPUT_SHAPE[1]),shape=settings.MODEL_ROLE_INPUT_SHAPE)
                            item_x = self.model_role.encode(tokens=sentence.tokens, begin_index=entity.begin_index, end_index=entity.end_index, entity_text=entity.entity_text)
                            data_x.append(item_x)
                            points_entitys.append(entity)
                            break
                        p_sentences += 1
                p_entitys += 1
        if len(points_entitys) == 0:
            return None
        return [data_x, points_entitys, text_list]
    def search_money_data(self, list_sentences, list_entitys):
        '''
        @summary: build the money model's input data from the sentence and entity lists
        @param:
            list_sentences: the document's sentences
            list_entitys: the document's entities
        @return: the input data for the money model
        '''
        text_list = []
        data_x = []
        points_entitys = []
        for list_entity, list_sentence in zip(list_entitys, list_sentences):
            list_entity.sort(key=lambda x: x.sentence_index)
            list_sentence.sort(key=lambda x: x.sentence_index)
            p_entitys = 0
            while p_entitys < len(list_entity):
                entity = list_entity[p_entitys]
                if entity.entity_type == "money":
                    p_sentences = 0
                    while p_sentences < len(list_sentence):
                        sentence = list_sentence[p_sentences]
                        if entity.doc_id == sentence.doc_id and entity.sentence_index == sentence.sentence_index:
                            text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin - 8):entity.wordOffset_end])
                            #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_MONEY_INPUT_SHAPE[1]),shape=settings.MODEL_MONEY_INPUT_SHAPE)
                            #item_x = embedding_word(spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index, end_index=entity.end_index, size=10, center_include=True, word_flag=True),shape=settings.MODEL_MONEY_INPUT_SHAPE)
                            item_x = self.model_money.encode(tokens=sentence.tokens, begin_index=entity.begin_index, end_index=entity.end_index)
                            data_x.append(item_x)
                            points_entitys.append(entity)
                            break
                        p_sentences += 1
                p_entitys += 1
        if len(points_entitys) == 0:
            return None
        return [data_x, points_entitys, text_list]
    def predict_role(self, list_sentences, list_entitys):
        datas = self.search_role_data(list_sentences, list_entitys)
        if datas is None:
            return
        points_entitys = datas[1]
        text_list = datas[2]
        if USE_PAI_EAS:
            _data = datas[0]
            _data = np.transpose(np.array(_data), (1, 0, 2))
            request = tf_predict_pb2.PredictRequest()
            request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input0"].array_shape.dim.extend(np.shape(_data[0]))
            request.inputs["input0"].float_val.extend(np.array(_data[0], dtype=np.float64).reshape(-1))
            request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input1"].array_shape.dim.extend(np.shape(_data[1]))
            request.inputs["input1"].float_val.extend(np.array(_data[1], dtype=np.float64).reshape(-1))
            request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input2"].array_shape.dim.extend(np.shape(_data[2]))
            request.inputs["input2"].float_val.extend(np.array(_data[2], dtype=np.float64).reshape(-1))
            request_data = request.SerializeToString()
            list_outputs = ["outputs"]
            _result = vpc_requests(role_url, role_authorization, request_data, list_outputs)
            if _result is not None:
                predict_y = _result["outputs"]
            else:
                predict_y = self.model_role.predict(datas[0])
        else:
            predict_y = self.model_role.predict(np.array(datas[0], dtype=np.float64))
        for i in range(len(predict_y)):
            entity = points_entitys[i]
            label = np.argmax(predict_y[i])
            values = predict_y[i]
            text = text_list[i]
            # Rule-based correction: demote a predicted "winner" label when the
            # surrounding text indicates the entity is actually the tenderee.
            if label == 2:
                if re.search('中标单位和.{,25}签订合同', text):
                    label = 0
                    values[label] = 0.501
                elif re.search('尊敬的供应商:.{,25}我公司', text):
                    label = 0
                    values[label] = 0.801
            entity.set_Role(label, values)
    def predict_money(self, list_sentences, list_entitys):
        datas = self.search_money_data(list_sentences, list_entitys)
        if datas is None:
            return
        points_entitys = datas[1]
        _data = datas[0]
        text_list = datas[2]
        if USE_PAI_EAS:
            _data = np.transpose(np.array(_data), (1, 0, 2, 3))
            request = tf_predict_pb2.PredictRequest()
            request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input0"].array_shape.dim.extend(np.shape(_data[0]))
            request.inputs["input0"].float_val.extend(np.array(_data[0], dtype=np.float64).reshape(-1))
            request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input1"].array_shape.dim.extend(np.shape(_data[1]))
            request.inputs["input1"].float_val.extend(np.array(_data[1], dtype=np.float64).reshape(-1))
            request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input2"].array_shape.dim.extend(np.shape(_data[2]))
            request.inputs["input2"].float_val.extend(np.array(_data[2], dtype=np.float64).reshape(-1))
            request_data = request.SerializeToString()
            list_outputs = ["outputs"]
            _result = vpc_requests(money_url, money_authorization, request_data, list_outputs)
            if _result is not None:
                predict_y = _result["outputs"]
            else:
                predict_y = self.model_money.predict(_data)
        else:
            predict_y = self.model_money.predict(_data)
        for i in range(len(predict_y)):
            entity = points_entitys[i]
            label = np.argmax(predict_y[i])
            values = predict_y[i]
            text = text_list[i]
            # Rule-based correction: lower the confidence when the surrounding
            # text contradicts the predicted money label.
            if label == 1 and re.search('[::,。](总金额|总价|单价)', text):
                values[label] = 0.49
            elif label == 0 and entity.notes in ["投资", "工程造价"]:
                values[label] = 0.49
            entity.set_Money(label, values)

    def predict(self, list_sentences, list_entitys):
        self.predict_role(list_sentences, list_entitys)
        self.predict_money(list_sentences, list_entitys)
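# Usage sketch for PREMPredict (illustrative; argument structures are those
# used throughout this module):
#
#   prem = getPredictor("prem")
#   prem.predict(list_sentences, list_entitys)  # labels entities in place
#
# Entities are annotated via set_Role/set_Money rather than returned.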
# Contact-person model
class EPCPredict():

    def __init__(self):
        self.model_person = Model_person_classify()

    def search_person_data(self, list_sentences, list_entitys):
        '''
        @summary: build the contact-person model's input data from the sentence and entity lists
        @param:
            list_sentences: the document's sentences
            list_entitys: the document's entities
        @return: the input data for the contact-person model
        '''
        data_x = []
        points_entitys = []
        for list_entity, list_sentence in zip(list_entitys, list_sentences):
            p_entitys = 0
            dict_index_sentence = {}
            for _sentence in list_sentence:
                dict_index_sentence[_sentence.sentence_index] = _sentence
            _list_entity = [entity for entity in list_entity if entity.entity_type == "person"]
            while p_entitys < len(_list_entity):
                entity = _list_entity[p_entitys]
                if entity.entity_type == "person":
                    sentence = dict_index_sentence[entity.sentence_index]
                    item_x = self.model_person.encode(tokens=sentence.tokens, begin_index=entity.begin_index, end_index=entity.end_index)
                    data_x.append(item_x)
                    points_entitys.append(entity)
                p_entitys += 1
        if len(points_entitys) == 0:
            return None
        # return [data_x,points_entitys,dianhua]
        return [data_x, points_entitys]
    def predict_person(self, list_sentences, list_entitys):
        datas = self.search_person_data(list_sentences, list_entitys)
        if datas is None:
            return
        points_entitys = datas[1]
        # phone = datas[2]
        if USE_PAI_EAS:
            _data = datas[0]
            _data = np.transpose(np.array(_data), (1, 0, 2, 3))
            request = tf_predict_pb2.PredictRequest()
            request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input0"].array_shape.dim.extend(np.shape(_data[0]))
            request.inputs["input0"].float_val.extend(np.array(_data[0], dtype=np.float64).reshape(-1))
            request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input1"].array_shape.dim.extend(np.shape(_data[1]))
            request.inputs["input1"].float_val.extend(np.array(_data[1], dtype=np.float64).reshape(-1))
            request_data = request.SerializeToString()
            list_outputs = ["outputs"]
            _result = vpc_requests(person_url, person_authorization, request_data, list_outputs)
            if _result is not None:
                predict_y = _result["outputs"]
            else:
                predict_y = self.model_person.predict(datas[0])
        else:
            predict_y = self.model_person.predict(datas[0])
        # assert len(predict_y)==len(points_entitys)==len(phone)
        assert len(predict_y) == len(points_entitys)
        for i in range(len(predict_y)):
            entity = points_entitys[i]
            label = np.argmax(predict_y[i])
            values = []
            for item in predict_y[i]:
                values.append(item)
            # phone_number = phone[i]
            # entity.set_Person(label,values,phone_number)
            entity.set_Person(label, values, None)
        # match phone numbers to contact persons
        # self.person_search_phone(list_sentences, list_entitys)
    def person_search_phone(self, list_sentences, list_entitys):
        def phoneFromList(phones):
            # for phone in phones:
            #     if len(phone)==11:
            #         return re.sub('电话[:|:]|联系方式[:|:]','',phone)
            return re.sub('电话[:|:]|联系方式[:|:]', '', phones[0])

        for list_entity, list_sentence in zip(list_entitys, list_sentences):
            # Legacy sliding-window phone matching, kept for reference:
            # p_entitys = 0
            # p_sentences = 0
            #
            # key_word = re.compile('电话[:|:].{0,4}\d{7,12}|联系方式[:|:].{0,4}\d{7,12}')
            # # phone = re.compile('1[3|4|5|7|8][0-9][-—-]?\d{4}[-—-]?\d{4}|\d{3,4}[-—-]\d{7,8}/\d{3,8}|\d{3,4}[-—-]\d{7,8}转\d{1,4}|\d{3,4}[-—-]\d{7,8}|[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}')  # contact phone
            # # 2020/11/25 added newly observed number ranges
            # phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-]?\d{4}[-—-]?\d{4}|'
            #                    '\d{3,4}[-—-][1-9]\d{6,7}/\d{3,8}|'
            #                    '\d{3,4}[-—-]\d{7,8}转\d{1,4}|'
            #                    '\d{3,4}[-—-]?[1-9]\d{6,7}|'
            #                    '[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|'
            #                    '[1-9]\d{6,7}')  # contact phone
            # dict_index_sentence = {}
            # for _sentence in list_sentence:
            #     dict_index_sentence[_sentence.sentence_index] = _sentence
            #
            # dict_context_itemx = {}
            # last_person = "####****++++$$^"
            # last_person_phone = "####****++++$^"
            # _list_entity = [entity for entity in list_entity if entity.entity_type == "person"]
            # while (p_entitys < len(_list_entity)):
            #     entity = _list_entity[p_entitys]
            #     if entity.entity_type == "person" and entity.label in [1,2,3]:
            #         sentence = dict_index_sentence[entity.sentence_index]
            #         # item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_PERSON_INPUT_SHAPE[1]),shape=settings.MODEL_PERSON_INPUT_SHAPE)
            #         # s = spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=20)
            #         # 2021/5/8 use context sentences to fix the sentence-splitting of tables
            #         left_sentence = dict_index_sentence.get(entity.sentence_index - 1)
            #         left_sentence_tokens = left_sentence.tokens if left_sentence else []
            #         right_sentence = dict_index_sentence.get(entity.sentence_index + 1)
            #         right_sentence_tokens = right_sentence.tokens if right_sentence else []
            #         entity_beginIndex = entity.begin_index + len(left_sentence_tokens)
            #         entity_endIndex = entity.end_index + len(left_sentence_tokens)
            #         context_sentences_tokens = left_sentence_tokens + sentence.tokens + right_sentence_tokens
            #         s = spanWindow(tokens=context_sentences_tokens, begin_index=entity_beginIndex,
            #                        end_index=entity_endIndex, size=20)
            #         _key = "".join(["".join(x) for x in s])
            #         if _key in dict_context_itemx:
            #             _dianhua = dict_context_itemx[_key][0]
            #         else:
            #             s1 = ''.join(s[1])
            #             # s1 = re.sub(',)', '-', s1)
            #             s1 = re.sub('\s', '', s1)
            #             have_key = re.findall(key_word, s1)
            #             have_phone = re.findall(phone, s1)
            #             s0 = ''.join(s[0])
            #             # s0 = re.sub(',)', '-', s0)
            #             s0 = re.sub('\s', '', s0)
            #             have_key2 = re.findall(key_word, s0)
            #             have_phone2 = re.findall(phone, s0)
            #             s3 = ''.join(s[1])
            #             # s0 = re.sub(',)', '-', s0)
            #             s3 = re.sub(',|,|\s', '', s3)
            #             have_key3 = re.findall(key_word, s3)
            #             have_phone3 = re.findall(phone, s3)
            #             s4 = ''.join(s[0])
            #             # s0 = re.sub(',)', '-', s0)
            #             s4 = re.sub(',|,|\s', '', s0)
            #             have_key4 = re.findall(key_word, s4)
            #             have_phone4 = re.findall(phone, s4)
            #             _dianhua = ""
            #             if have_phone:
            #                 if entity.entity_text != last_person and s0.find(last_person) != -1 and s1.find(last_person_phone) != -1:
            #                     if len(have_phone) > 1:
            #                         _dianhua = phoneFromList(have_phone[1:])
            #                 else:
            #                     _dianhua = phoneFromList(have_phone)
            #             elif have_key:
            #                 if entity.entity_text != last_person and s0.find(last_person) != -1 and s1.find(last_person_phone) != -1:
            #                     if len(have_key) > 1:
            #                         _dianhua = phoneFromList(have_key[1:])
            #                 else:
            #                     _dianhua = phoneFromList(have_key)
            #             elif have_phone2:
            #                 if entity.entity_text != last_person and s0.find(last_person) != -1 and s0.find(last_person_phone) != -1:
            #                     if len(have_phone2) > 1:
            #                         _dianhua = phoneFromList(have_phone2[1:])
            #                 else:
            #                     _dianhua = phoneFromList(have_phone2)
            #             elif have_key2:
            #                 if entity.entity_text != last_person and s0.find(last_person) != -1 and s0.find(last_person_phone) != -1:
            #                     if len(have_key2) > 1:
            #                         _dianhua = phoneFromList(have_key2[1:])
            #                 else:
            #                     _dianhua = phoneFromList(have_key2)
            #             elif have_phone3:
            #                 if entity.entity_text != last_person and s4.find(last_person) != -1 and s3.find(last_person_phone) != -1:
            #                     if len(have_phone3) > 1:
            #                         _dianhua = phoneFromList(have_phone3[1:])
            #                 else:
            #                     _dianhua = phoneFromList(have_phone3)
            #             elif have_key3:
            #                 if entity.entity_text != last_person and s4.find(last_person) != -1 and s3.find(last_person_phone) != -1:
            #                     if len(have_key3) > 1:
            #                         _dianhua = phoneFromList(have_key3[1:])
            #                 else:
            #                     _dianhua = phoneFromList(have_key3)
            #             elif have_phone4:
            #                 if entity.entity_text != last_person and s4.find(last_person) != -1 and s4.find(last_person_phone) != -1:
            #                     if len(have_phone4) > 1:
            #                         _dianhua = phoneFromList(have_phone4)
            #                 else:
            #                     _dianhua = phoneFromList(have_phone4)
            #             elif have_key4:
            #                 if entity.entity_text != last_person and s4.find(last_person) != -1 and s4.find(last_person_phone) != -1:
            #                     if len(have_key4) > 1:
            #                         _dianhua = phoneFromList(have_key4)
            #                 else:
            #                     _dianhua = phoneFromList(have_key4)
            #             else:
            #                 _dianhua = ""
            #             # dict_context_itemx[_key] = [item_x, _dianhua]
            #             dict_context_itemx[_key] = [_dianhua]
            #         # points_entitys.append(entity)
            #         # dianhua.append(_dianhua)
            #         last_person = entity.entity_text
            #         if _dianhua:
            #             # update the person entity's phone (person_phone)
            #             entity.person_phone = _dianhua
            #             last_person_phone = _dianhua
            #         else:
            #             last_person_phone = "####****++++$^"
            #     p_entitys += 1
            from scipy.optimize import linear_sum_assignment
            from BiddingKG.dl.interface.Entitys import Match

            def dispatch(match_list):
                main_roles = list(set([match.main_role for match in match_list]))
                attributes = list(set([match.attribute for match in match_list]))
                label = np.zeros(shape=(len(main_roles), len(attributes)))
                for match in match_list:
                    main_role = match.main_role
                    attribute = match.attribute
                    value = match.value
                    label[main_roles.index(main_role), attributes.index(attribute)] = value + 10000
                # print(label)
                graph = -label
                # Kuhn-Munkres (Hungarian) assignment on the negated score matrix
                row, col = linear_sum_assignment(graph)
                max_dispatch = [(i, j) for i, j, value in zip(row, col, graph[row, col]) if value]
                return [Match(main_roles[row], attributes[col]) for row, col in max_dispatch]
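            # Mini example of the assignment step above (illustrative): with a
            # score matrix label = [[10000.5, 0], [0, 10000.2]],
            # linear_sum_assignment on -label pairs person 0 with phone 0 and
            # person 1 with phone 1, so each contact gets at most one phone
            # (and vice versa) while the total match score is maximized; pairs
            # whose entry stayed 0 are filtered out by the `if value` test.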
            # Candidate phone-number extraction
            key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)(\d{7,12})')
            phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
                               '\+86.?1[3|4|5|6|7|8|9]\d{9}|'
                               '0\d{2,3}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
                               '0\d{2,3}[-—-―]\d{7,8}转\d{1,4}|'
                               '0\d{2,3}[-—-―]?[1-9]\d{6,7}|'
                               '[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|'
                               '[1-9]\d{6,7}')
            phone_entitys = []
            for _sentence in list_sentence:
                sentence_text = _sentence.sentence_text
                res_set = set()
                for i in re.finditer(phone, sentence_text):
                    res_set.add((i.group(), i.start(), i.end()))
                for i in re.finditer(key_word, sentence_text):
                    res_set.add((i.group(2), i.start()+len(i.group(1)), i.end()))
                for item in list(res_set):
                    phone_left = sentence_text[max(0, item[1]-10):item[1]]
                    phone_right = sentence_text[item[2]:item[2]+8]
                    # Exclude fax numbers and other false positives
                    if re.search("传,?真|信,?箱|邮,?箱", phone_left):
                        if not re.search("电,?话", phone_left):
                            continue
                    if re.search("帐,?号|编,?号|报,?价|证,?号|价,?格|[\((]万?元[\))]", phone_left):
                        continue
                    if re.search("[.,]\d{2,}", phone_right):
                        continue
                    _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, None, None, item[1], item[2])
                    phone_entitys.append(_entity)
            person_entitys = []
            for entity in list_entity:
                if entity.entity_type == "person":
                    entity.person_phone = ""
                    person_entitys.append(entity)
            _list_entity = phone_entitys + person_entitys
            _list_entity = sorted(_list_entity, key=lambda x: (x.sentence_index, x.wordOffset_begin))
            # Cumulative character offset of each sentence, so that distances
            # can be computed across sentence boundaries.
            words_num_dict = dict()
            last_words_num = 0
            list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index)
            for sentence in list_sentence:
                _index = sentence.sentence_index
                if _index == 0:
                    words_num_dict[_index] = 0
                else:
                    words_num_dict[_index] = words_num_dict[_index - 1] + last_words_num
                last_words_num = len(sentence.sentence_text)
            match_list = []
            for index in range(len(_list_entity)):
                entity = _list_entity[index]
                if entity.entity_type == "person" and entity.label in [1, 2, 3]:
                    match_nums = 0
                    # Look forward over the next few entities for phone candidates
                    for after_index in range(index + 1, min(len(_list_entity), index + 5)):
                        after_entity = _list_entity[after_index]
                        if after_entity.entity_type == "phone":
                            sentence_distance = after_entity.sentence_index - entity.sentence_index
                            distance = (words_num_dict[after_entity.sentence_index] + after_entity.wordOffset_begin) - (
                                words_num_dict[entity.sentence_index] + entity.wordOffset_end)
                            if sentence_distance < 2 and distance < 50:
                                value = (-1 / 2 * (distance ** 2)) / 10000
                                match_list.append(Match(entity, after_entity, value))
                                match_nums += 1
                            else:
                                break
                        if after_entity.entity_type == "person":
                            if after_entity.label not in [1, 2, 3]:
                                break
                    if not match_nums:
                        # No forward candidate: look backward
                        for previous_index in range(index-1, max(0, index-5), -1):
                            previous_entity = _list_entity[previous_index]
                            if previous_entity.entity_type == "phone":
                                sentence_distance = entity.sentence_index - previous_entity.sentence_index
                                distance = (words_num_dict[entity.sentence_index] + entity.wordOffset_begin) - (
                                    words_num_dict[previous_entity.sentence_index] + previous_entity.wordOffset_end)
                                if sentence_distance < 1 and distance < 30:
                                    # backward matches are not scaled by /10000
                                    value = (-1 / 2 * (distance ** 2))
                                    match_list.append(Match(entity, previous_entity, value))
                                else:
                                    break
            result = dispatch(match_list)
            for match in result:
                entity = match.main_role
                # write the matched phone back into list_entity
                entity_index = list_entity.index(entity)
                list_entity[entity_index].person_phone = match.attribute.entity_text
    def predict(self, list_sentences, list_entitys):
        self.predict_person(list_sentences, list_entitys)
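# Usage sketch for EPCPredict (illustrative): person entities receive their
# label/values via set_Person, and person_search_phone can then attach a phone
# number to each labeled contact through the assignment matching above.
#
#   epc = getPredictor("epc")
#   epc.predict(list_sentences, list_entitys)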
# Form prediction
class FormPredictor():

    def __init__(self, lazyLoad=getLazyLoad()):
        self.model_file_line = os.path.dirname(__file__)+"/../form/model/model_form.model_line.hdf5"
        self.model_file_item = os.path.dirname(__file__)+"/../form/model/model_form.model_item.hdf5"
        self.model_form_item = Model_form_item()
        self.model_form_context = Model_form_context()
        self.model_dict = {"line": [None, self.model_file_line]}

    def getModel(self, type):
        if type == "item":
            return self.model_form_item
        elif type == "context":
            return self.model_form_context
        else:
            # The original recursed into itself here and never terminated for
            # unknown types; fail fast instead.
            raise NameError("no such form model type: %s" % type)

    def encode(self, data, **kwargs):
        return encodeInput([data], word_len=50, word_flag=True, userFool=False)[0]
        # unreachable legacy encoding, kept for reference:
        # return encodeInput_form(data)

    def predict(self, form_datas, type):
        if type == "item":
            return self.model_form_item.predict(form_datas)
        elif type == "context":
            return self.model_form_context.predict(form_datas)
        else:
            return self.getModel(type).predict(form_datas)
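# Usage sketch for FormPredictor (illustrative; "item" and "context" are the
# two supported model types, anything else now raises NameError):
#
#   form = getPredictor("form")
#   labels = form.predict(form_datas, type="item")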
# Role rules
# Use regexes to assign a role to every entity that has none, at the lowest
# probability, equal to the threshold
class RoleRulePredictor():
    def __init__(self):
        self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|采购|招标|项目|竞价|议价|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|比选|委托|询价)(?:人|公司|单位|组织|用户|业主|方|部门)|文章来源|需方)(名称)?(是|为|信息|:|:|\s*)$)"
        self.pattern_tenderee_center = "(?P<tenderee_center>(受.{,20}委托))"
        self.pattern_tenderee_right = "(?P<tenderee_right>^(\((以下简称)?[\"”]?(招标|采购)(人|单位|机构)\)?))"  # |(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
        self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|集采机构|招标机构)(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{,20}委托))"
        self.pattern_agency_right = "(?P<agency_right>^(\((以下简称)?[\"”]?(代理)(人|单位|机构)\))|受.{,15}委托)"
        # 2020/11/24 large-site rules: winner keywords extended with 选定单位|指定的中介服务机构
        self.pattern_winTenderer_left = "(?P<winTenderer_left>((中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|各?供应商|方|公司|厂商|商)[::是为]+$|(选定单位|指定的中介服务机构))[::是为,]+$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))[::是为]+$|((评审结果|名次|排名)[::]第?[一1]名?)$|单一来源(采购)?方式向$|((中标|成交)(结果|信息))(是|为|:|:)$|(单一来源采购(供应商|供货商|服务商))$|[^候选]((分包|标包){,5}供应商|供货商|服务商|供应商名称|服务机构|供方)[::]$)"
        # self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
        self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
        self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|谈判结果:由.{5,20}供货)|中标通知书.{,15}你方"  # 2020/11/24 large-site rules: winner keyword added 谈判结果:由.{5,20}供货
        # self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[::]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)"
        self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))[::是为]+$)|((评审结果|名次|排名)[::]第?[二2]名?,?投标商名称[::]+$))"
        self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
        self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))[::是为]+$|((评审结果|名次|排名)[::]第?[三3]名?,?投标商名称[::]+$))"
        self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
        self.dict_list_pattern = {"0": [["L", self.pattern_tenderee_left],
                                        ["C", self.pattern_tenderee_center],
                                        ["R", self.pattern_tenderee_right]],
                                  "1": [["L", self.pattern_agency_left],
                                        ["R", self.pattern_agency_right]],
                                  "2": [["L", self.pattern_winTenderer_left],
                                        # ["C", self.pattern_winTenderer_center],
                                        ["R", self.pattern_winTenderer_right],
                                        ["W", self.pattern_winTenderer_whole]],
                                  "3": [["L", self.pattern_secondTenderer_left],
                                        ["R", self.pattern_secondTenderer_right]],
                                  "4": [["L", self.pattern_thirdTenderer_left],
                                        ["R", self.pattern_thirdTenderer_right]]}
        self.pattern_whole = []
        for _k, _v in self.dict_list_pattern.items():
            for _d, _p in _v:
                self.pattern_whole.append(_p)
        # self.pattern_whole = "|".join(list_pattern)
        self.SET_NOT_TENDERER = set(["人民政府", "人民法院", "中华人民共和国", "人民检察院", "评标委员会", "中国政府", "中国海关", "中华人民共和国政府"])
        self.pattern_money_tenderee = re.compile("投标最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|采购(单位|人)委托价|限价|拦标价|预算金额")
        self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收)[)\)]?(总?金额|结果|[单报]?价))|总价|标的基本情况")
        self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
        self.pattern_money_other = re.compile("代理费|服务费")
        self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"

    def _check_input(self, text, ignore=False):
        if not text:
            return []
        if not isinstance(text, list):
            text = [text]
        null_index = [i for i, t in enumerate(text) if not t]
        if null_index and not ignore:
            raise Exception("null text in input ")
        return text
    def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5):
        for article, list_entity, list_sentence, list_codename in zip(list_articles, list_entitys, list_sentences, list_codenames):
            list_name = list_codename["name"]
            list_name = self._check_input(list_name) + [article.title]
            for p_entity in list_entity:
                if p_entity.entity_type in ["org", "company"]:
                    # pull the probability of a label-0 entity whose context contains the title
                    # toward 0.6, since an entity in the title is not necessarily the tenderee
                    if str(p_entity.label) == "0":
                        find_flag = False
                        for _sentence in list_sentence:
                            if _sentence.sentence_index == p_entity.sentence_index:
                                _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index, end_index=p_entity.end_index, size=20, center_include=True, word_flag=True, text=p_entity.entity_text)
                                for _name in list_name:
                                    if _name != "" and str(_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0:
                                        find_flag = True
                                        if p_entity.values[0] > on_value:
                                            p_entity.values[0] = 0.6 + (p_entity.values[0] - 0.6) / 10
                        if find_flag:
                            continue
                    # only re-label entities that have no role or whose probability is below the threshold
                    if p_entity.label is None:
                        continue
                    role_prob = float(p_entity.values[int(p_entity.label)])
                    if role_prob < on_value or str(p_entity.label) == "5":
                        # entities appearing in the title default to tenderee
                        _list_name = self._check_input(list_name, ignore=True)
                        find_flag = False
                        for _name in _list_name:
                            if str(_name).find(p_entity.entity_text) >= 0:
                                find_flag = True
                                _label = 0
                                p_entity.label = _label
                                p_entity.values[int(_label)] = on_value
                                break
                        # if the entity is in the title it defaults to tenderee; skip the rule matching below
                        if find_flag:
                            continue
                        for s_index in range(len(list_sentence)):
                            if p_entity.doc_id == list_sentence[s_index].doc_id and p_entity.sentence_index == list_sentence[s_index].sentence_index:
                                tokens = list_sentence[s_index].tokens
                                begin_index = p_entity.begin_index
                                end_index = p_entity.end_index
                                size = 15
                                spans = spanWindow(tokens, begin_index, end_index, size, center_include=True, word_flag=True, use_text=False)
                                # distance per role
                                list_distance = [100, 100, 100, 100, 100]
                                _flag = False
                                # resolve conflicts with regex + distance
                                # 2021/6/11 update center: spans[1] --> spans[0][-30:]+spans[1]
                                list_spans = [spans[0][-30:], spans[0][-10:] + spans[1] + spans[2][:10], spans[2]]
                                for _i_span in range(len(list_spans)):
                                    # print(list_spans[_i_span], p_entity.entity_text)
                                    for _pattern in self.pattern_whole:
                                        for _iter in re.finditer(_pattern, list_spans[_i_span]):
                                            for _group, _v_group in _iter.groupdict().items():
                                                if _v_group is not None and _v_group != "":
                                                    _role = _group.split("_")[0]
                                                    _direct = _group.split("_")[1]
                                                    _label = {"tenderee": 0, "agency": 1, "winTenderer": 2, "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
                                                    if _i_span == 0 and _direct == "left" and '各供应商' not in _v_group:  # 2021/12/22 fix wrong winner recall, example 208668937
                                                        _flag = True
                                                        _distance = abs((len(list_spans[_i_span]) - _iter.span()[1]))
                                                        list_distance[int(_label)] = min(_distance, list_distance[int(_label)])
                                                    if _i_span == 1 and _direct == "center":
                                                        _flag = True
                                                        _distance = abs((len(list_spans[_i_span]) - _iter.span()[1]))
                                                        list_distance[int(_label)] = min(_distance, list_distance[int(_label)])
                                                    if _i_span == 2 and _direct == "right":
                                                        _flag = True
                                                        _distance = _iter.span()[0]
                                                        list_distance[int(_label)] = min(_distance, list_distance[int(_label)])
                                # print(list_distance)
                                # legacy per-direction matching, superseded by pattern_whole above:
                                # for _key in self.dict_list_pattern.keys():
                                #     for pattern in self.dict_list_pattern[_key]:
                                #         if pattern[0]=="L":
                                #             for _iter in re.finditer(pattern[1], spans[0][-30:]):
                                #                 _flag = True
                                #                 if len(spans[0])-_iter.span()[1]<list_distance[int(_key)]:
                                #                     list_distance[int(_key)] = len(spans[0])-_iter.span()[1]-(_iter.span()[1]-_iter.span()[0])
                                #         if pattern[0]=="C":
                                #             if re.search(pattern[1],spans[0]) is None and re.search(pattern[1],spans[2]) is None and re.search(pattern[1],spans[0]+spans[1]+spans[2]) is not None:
                                #                 _flag = True
                                #                 list_distance[int(_key)] = 0
                                #         if pattern[0]=="R":
                                #             for _iter in re.finditer(pattern[1], spans[2][:30]):
                                #                 _flag = True
                                #                 if _iter.span()[0]<list_distance[int(_key)]:
                                #                     list_distance[int(_key)] = _iter.span()[0]
                                #         if pattern[0]=="W":
                                #             spans = spanWindow(tokens, begin_index, end_index, size=20, center_include=True, word_flag=True, use_text=False)
                                #             for _iter in re.finditer(pattern[1], "".join(spans)):
                                #                 _flag = True
                                #                 if _iter.span()[0]<list_distance[int(_key)]:
                                #                     list_distance[int(_key)] = _iter.span()[0]
                                # print("==", list_distance)
                                # derive the result
                                _label = np.argmin(list_distance)
                                if _flag:
                                    # if _label==2 and min(list_distance[3:])<100:
                                    #     _label += np.argmin(list_distance[3:])+1
                                    if _label in [2, 3, 4]:
                                        if p_entity.entity_type in ["company", "org"]:
                                            p_entity.label = _label
                                            p_entity.values[int(_label)] = on_value + p_entity.values[int(_label)] / 10
                                    else:
                                        p_entity.label = _label
                                        p_entity.values[int(_label)] = on_value + p_entity.values[int(_label)] / 10
                # if p_entity.entity_type=="location":
                #     for _sentence in list_sentence:
                #         if _sentence.sentence_index==p_entity.sentence_index:
                #             _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=5,center_include=True,word_flag=True,text=p_entity.entity_text)
                #             if re.search(self.pattern_winTenderer_location,_span[0][-10:]) is not None and re.search("地址|地点",_span[0]) is None:
                #                 p_entity.entity_type="company"
                #                 _label = "2"
                #                 p_entity.label = _label
                #                 p_entity.values = [0]*6
                #                 p_entity.values[int(_label)] = on_value
                # high-certainty special-case adjustments
                if p_entity.entity_type in ["company", "org"]:
                    for s_index in range(len(list_sentence)):
                        if p_entity.doc_id == list_sentence[s_index].doc_id and p_entity.sentence_index == list_sentence[s_index].sentence_index:
                            tokens = list_sentence[s_index].tokens
                            begin_index = p_entity.begin_index
                            end_index = p_entity.end_index
                            size = 15
                            spans = spanWindow(tokens, begin_index, end_index, size, center_include=True, word_flag=True, use_text=False)
                            # distance per role
                            list_distance = [100, 100, 100, 100, 100]
                            _flag = False
                            for _key in self.dict_list_pattern.keys():
                                for pattern in self.dict_list_pattern[_key]:
                                    if pattern[0] == "W":
                                        spans = spanWindow(tokens, begin_index, end_index, size=30, center_include=True, word_flag=True, use_text=False)
                                        for _iter in re.finditer(pattern[1], spans[0][-10:] + spans[1] + spans[2]):
                                            _flag = True
                                            if _iter.span()[0] < list_distance[int(_key)]:
                                                list_distance[int(_key)] = _iter.span()[0]
                            # derive the result
                            _label = np.argmin(list_distance)
                            if _flag:
                                if _label == 2 and min(list_distance[3:]) < 100:
                                    _label += np.argmin(list_distance[3:]) + 1
                                if _label in [2, 3, 4]:
                                    p_entity.label = _label
                                    p_entity.values[int(_label)] = on_value + p_entity.values[int(_label)] / 10
                                else:
                                    p_entity.label = _label
                                    p_entity.values[int(_label)] = on_value + p_entity.values[int(_label)] / 10
                if p_entity.entity_type in ["money"]:
                    if str(p_entity.label) == "2":
                        for _sentence in list_sentence:
                            if _sentence.sentence_index == p_entity.sentence_index:
                                _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index, end_index=p_entity.end_index, size=20, center_include=True, word_flag=True, text=p_entity.entity_text)
                                if re.search(self.pattern_money_tenderee, _span[0]) is not None and re.search(self.pattern_money_other, _span[0]) is None:
                                    p_entity.values[0] = 0.8 + p_entity.values[0] / 10
                                    p_entity.label = 0
                                if re.search(self.pattern_money_tenderer, _span[0]) is not None:
                                    if re.search(self.pattern_money_other, _span[0]) is not None:
                                        if re.search(self.pattern_money_tenderer, _span[0]).span()[1] > re.search(self.pattern_money_other, _span[0]).span()[1]:
                                            p_entity.values[1] = 0.8 + p_entity.values[1] / 10
                                            p_entity.label = 1
                                    else:
                                        p_entity.values[1] = 0.8 + p_entity.values[1] / 10
                                        p_entity.label = 1
                                if re.search(self.pattern_money_tenderer_whole, "".join(_span)) is not None and re.search(self.pattern_money_other, _span[0]) is None:
                                    p_entity.values[1] = 0.8 + p_entity.values[1] / 10
                                    p_entity.label = 1
            # tender-budget extension: a tender budget followed by consecutive unlabeled
            # amounts that all carry lot/package info marks those amounts as tender budgets too
            list_p = []
            state = 0
            for p_entity in list_entity:
                for _sentence in list_sentence:
                    if _sentence.sentence_index == p_entity.sentence_index:
                        _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index, end_index=p_entity.end_index, size=20, center_include=True, word_flag=True, text=p_entity.entity_text)
                        if state == 2:
                            for _p in list_p[1:]:
                                _p.values[0] = 0.8 + _p.values[0] / 10
                                _p.label = 0
                            state = 0
                            list_p = []
                        if state == 0:
                            if p_entity.entity_type in ["money"]:
                                if str(p_entity.label) == "0" and re.search(self.pattern_pack, _span[0] + "-" + _span[2]) is not None:
                                    state = 1
                                    list_p.append(p_entity)
                        elif state == 1:
                            if p_entity.entity_type in ["money"]:
                                if str(p_entity.label) in ["0", "2"] and re.search(self.pattern_pack, _span[0] + "-" + _span[2]) is not None and re.search(self.pattern_money_other, _span[0] + "-" + _span[2]) is None and p_entity.sentence_index == list_p[0].sentence_index:
                                    list_p.append(p_entity)
                                else:
                                    state = 2
            if len(list_p) > 1:
                for _p in list_p[1:]:
                    # print("==", _p.entity_text, _p.sentence_index, _p.label)
                    _p.values[0] = 0.8 + _p.values[0] / 10
                    _p.label = 0
            state = 0
            list_p = []
            for p_entity in list_entity:
                # entities in the blocked set can never be winners; clear their role
                if p_entity.entity_text in self.SET_NOT_TENDERER:
                    p_entity.label = 5
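# --- illustrative sketch (not part of the original pipeline) ---
# Each rule regex wraps its match in a named group "<role>_<direction>", so one
# finditer pass recovers both the role and which side of the entity the cue sits
# on. A self-contained trace of that decoding (uses this module's re import):
def _demo_role_group_decoding():
    pattern = "(?P<tenderee_left>(招标|采购)(人|单位)[::]$)"
    span_left = "本项目采购单位:"  # text to the left of an entity
    for _iter in re.finditer(pattern, span_left):
        for _group, _v_group in _iter.groupdict().items():
            if _v_group:
                _role, _direct = _group.split("_")
                label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                         "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
                return _role, _direct, label  # ("tenderee", "left", 0)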
# Time category
class TimePredictor():
    def __init__(self):
        self.sess = tf.Session(graph=tf.Graph())
        self.inputs_code = None
        self.outputs_code = None
        self.input_shape = (2, 40, 128)
        self.load_model()

    def load_model(self):
        model_path = os.path.dirname(__file__) + '/timesplit_model'
        if self.inputs_code is None:
            log("get model of time")
            with self.sess.as_default():
                with self.sess.graph.as_default():
                    meta_graph_def = tf.saved_model.loader.load(self.sess, tags=["serve"], export_dir=model_path)
                    signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
                    signature_def = meta_graph_def.signature_def
                    self.inputs_code = []
                    self.inputs_code.append(
                        self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name))
                    self.inputs_code.append(
                        self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name))
                    self.outputs_code = self.sess.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
                    return self.inputs_code, self.outputs_code
        else:
            return self.inputs_code, self.outputs_code

    def search_time_data(self, list_sentences, list_entitys):
        data_x = []
        points_entitys = []
        for list_sentence, list_entity in zip(list_sentences, list_entitys):
            p_entitys = 0
            p_sentences = 0
            list_sentence.sort(key=lambda x: x.sentence_index)
            while p_entitys < len(list_entity):
                entity = list_entity[p_entitys]
                if entity.entity_type in ['time']:
                    while p_sentences < len(list_sentence):
                        sentence = list_sentence[p_sentences]
                        if entity.doc_id == sentence.doc_id and entity.sentence_index == sentence.sentence_index:
                            # left = sentence.sentence_text[max(0, entity.wordOffset_begin - self.input_shape[1]):entity.wordOffset_begin]
                            # right = sentence.sentence_text[entity.wordOffset_end:entity.wordOffset_end + self.input_shape[1]]
                            s = spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index, end_index=entity.end_index, size=self.input_shape[1])
                            left = s[0]
                            right = s[1]
                            context = [left, right]
                            x = self.embedding_words(context, shape=self.input_shape)
                            data_x.append(x)
                            points_entitys.append(entity)
                            break
                        p_sentences += 1
                p_entitys += 1
        if len(points_entitys) == 0:
            return None
        data_x = np.transpose(np.array(data_x), (1, 0, 2, 3))
        return [data_x, points_entitys]

    def embedding_words(self, datas, shape):
        '''
        @summary: look up the word vector for each token
        @param:
            datas: list of token lists
            shape: shape of the result
        @return: array of word embeddings with the given shape
        '''
        model_w2v = getModel_w2v()
        embed = np.zeros(shape)
        length = shape[1]
        out_index = 0
        for data in datas:
            index = 0
            for item in data:
                item_not_space = re.sub("\s*", "", item)
                if index >= length:
                    break
                if item_not_space in model_w2v.vocab:
                    embed[out_index][index] = model_w2v[item_not_space]
                    index += 1
                else:
                    embed[out_index][index] = model_w2v['unk']
                    index += 1
            out_index += 1
        return embed

    def predict(self, list_sentences, list_entitys):
        datas = self.search_time_data(list_sentences, list_entitys)
        if datas is None:
            return
        points_entitys = datas[1]
        with self.sess.as_default():
            predict_y = limitRun(self.sess, [self.outputs_code], feed_dict={self.inputs_code[0]: datas[0][0],
                                                                            self.inputs_code[1]: datas[0][1]})[0]
            for i in range(len(predict_y)):
                entity = points_entitys[i]
                label = np.argmax(predict_y[i])
                values = []
                for item in predict_y[i]:
                    values.append(item)
                if label != 0:
                    if not timeFormat(entity.entity_text):
                        label = 0
                        values[0] = 0.5
                entity.set_Role(label, values)
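# --- illustrative sketch (not part of the original pipeline) ---
# search_time_data stacks one (2, 40, 128) context array per entity and then
# transposes so the left/right channel comes first: datas[0][0] is the batch of
# left contexts and datas[0][1] the batch of right contexts (uses this module's
# numpy import, np):
def _demo_time_batch_layout():
    batch = np.stack([np.zeros((2, 40, 128)) for _ in range(3)])  # (3, 2, 40, 128)
    data_x = np.transpose(batch, (1, 0, 2, 3))                    # (2, 3, 40, 128)
    assert data_x[0].shape == (3, 40, 128)  # left contexts, one row per entity
    return data_x.shape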
# Product field extraction
class ProductPredictor():
    def __init__(self):
        self.sess = tf.Session(graph=tf.Graph())
        self.load_model()

    def load_model(self):
        model_path = os.path.dirname(__file__) + '/product_savedmodel/product.pb'
        with self.sess.as_default():
            with self.sess.graph.as_default():
                output_graph_def = tf.GraphDef()
                with open(model_path, 'rb') as f:
                    output_graph_def.ParseFromString(f.read())
                tf.import_graph_def(output_graph_def, name='')
                self.sess.run(tf.global_variables_initializer())
                self.char_input = self.sess.graph.get_tensor_by_name('CharInputs:0')
                self.length = self.sess.graph.get_tensor_by_name("Sum:0")
                self.dropout = self.sess.graph.get_tensor_by_name("Dropout:0")
                self.logit = self.sess.graph.get_tensor_by_name("logits/Reshape:0")
                self.tran = self.sess.graph.get_tensor_by_name("crf_loss/transitions:0")

    def predict(self, list_sentences, list_entitys=None, MAX_AREA=5000):
        '''
        Predict product entities. Each sentence keeps at most MAX_AREA characters; longer ones are truncated.
        :param list_sentences: sentence lists of multiple notices, [[sentences of notice 1], [sentences of notice 2]]
        :param list_entitys: entity lists of multiple notices
        :param MAX_AREA: maximum number of characters kept per sentence
        :return: the predicted products, also appended to the entity lists as Entity objects
        '''
        with self.sess.as_default() as sess:
            with self.sess.graph.as_default():
                result = []
                if list_entitys is None:
                    list_entitys = [[] for _ in range(len(list_sentences))]
                for list_sentence, list_entity in zip(list_sentences, list_entitys):
                    if len(list_sentence) == 0:
                        result.append({"product": []})
                        continue
                    list_sentence.sort(key=lambda x: len(x.sentence_text), reverse=True)
                    _begin_index = 0
                    item = {"product": []}
                    temp_list = []
                    while True:
                        MAX_LEN = len(list_sentence[_begin_index].sentence_text)
                        if MAX_LEN > MAX_AREA:
                            MAX_LEN = MAX_AREA
                        _LEN = MAX_AREA // MAX_LEN
                        chars = process_data([sentence.sentence_text[:MAX_LEN] for sentence in list_sentence[_begin_index:_begin_index + _LEN]])
                        lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran],
                                                          feed_dict={
                                                              self.char_input: np.asarray(chars),
                                                              self.dropout: 1.0
                                                          })
                        batch_paths = decode(scores, lengths, tran_)
                        for sentence, path, length in zip(list_sentence[_begin_index:_begin_index + _LEN], batch_paths, lengths):
                            tags = ''.join([str(it) for it in path[:length]])
                            for it in re.finditer("12*3", tags):
                                start = it.start()
                                end = it.end()
                                _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
                                    sentence.doc_id, sentence.sentence_index, start, end),
                                                 entity_text=sentence.sentence_text[start:end],
                                                 entity_type="product", sentence_index=sentence.sentence_index,
                                                 begin_index=0, end_index=0, wordOffset_begin=start,
                                                 wordOffset_end=end)
                                list_entity.append(_entity)
                                temp_list.append(sentence.sentence_text[start:end])
                        # item["product"] = list(set(temp_list))
                        # result.append(item)
                        if _begin_index + _LEN >= len(list_sentence):
                            break
                        _begin_index += _LEN
                    item["product"] = list(set(temp_list))
                    result.append(item)  # bug fix: collect once per notice, after the while loop
                return result
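# --- illustrative sketch (not part of the original pipeline) ---
# The decoded CRF path is serialized to a digit string and products are cut out
# with the tag pattern "12*3" (begin, zero or more inside, end). A standalone
# trace with a made-up tagging:
def _demo_tag_span_extraction():
    sentence_text = "采购台式计算机一批"
    tags = "001223000"  # per-character tags from a decoded path
    spans = [sentence_text[m.start():m.end()] for m in re.finditer("12*3", tags)]
    return spans  # ["台式计算"] for this made-up tagging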
# Extraction of product quantity/unit price/brand/specs  # 2021/11/10 added extraction of project, demand, budget and time fields from tables
class ProductAttributesPredictor():
    def __init__(self,):
        self.p1 = '(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\))]?(名称|内容|描述)'
        self.p2 = '设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品|项目|品名|菜名|内容|名称'
        with open(os.path.dirname(__file__) + '/header_set.pkl', 'rb') as f:
            self.header_set = pickle.load(f)

    def isTrueTable(self, table):
        '''Rules for real vs. fake tables:
        1. containing <caption> or <th> tags -> real
        2. containing many links, forms, images or nested tables -> fake
        3. too small -> fake
        4. an outer <table> nesting a child <table>: usually the child is real and the outer fake'''
        if table.find_all(['caption', 'th']) != []:
            return True
        elif len(table.find_all(['form', 'a', 'img'])) > 5:
            return False
        elif len(table.find_all(['tr'])) < 2:
            return False
        elif len(table.find_all(['table'])) >= 1:
            return False
        else:
            return True

    def getTrs(self, tbody):
        # collect all tr rows
        trs = []
        objs = tbody.find_all(recursive=False)
        for obj in objs:
            if obj.name == "tr":
                trs.append(obj)
            if obj.name == "tbody":
                for tr in obj.find_all("tr", recursive=False):
                    trs.append(tr)
        return trs

    def getTable(self, tbody):
        trs = self.getTrs(tbody)
        inner_table = []
        if len(trs) < 2:
            return inner_table
        for tr in trs:
            tr_line = []
            tds = tr.findChildren(['td', 'th'], recursive=False)
            if len(tds) < 2:
                continue
            for td in tds:
                td_text = re.sub('\s', '', td.get_text())
                tr_line.append(td_text)
            inner_table.append(tr_line)
        return inner_table

    def fixSpan(self, tbody):
        # fill in missing cells according to colspan/rowspan attributes
        trs = self.getTrs(tbody)
        ths_len = 0
        ths = list()
        trs_set = set()
        # do column completion before row completion, otherwise the parsed table may get scrambled
        # iterate over every tr
        for indtr, tr in enumerate(trs):
            ths_tmp = tr.findChildren('th', recursive=False)
            # do not complete trs that contain nested tables
            if len(tr.findChildren('table')) > 0:
                continue
            if len(ths_tmp) > 0:
                ths_len = ths_len + len(ths_tmp)
                for th in ths_tmp:
                    ths.append(th)
                trs_set.add(tr)
            # iterate over the elements of the row
            tds = tr.findChildren(recursive=False)
            if len(tds) < 3:
                continue  # too few columns, skip completion
            for indtd, td in enumerate(tds):
                # if colspan is set, duplicate the cell into the following positions of the same row
                if 'colspan' in td.attrs and str(re.sub("[^0-9]", "", str(td['colspan']))) != "":
                    col = int(re.sub("[^0-9]", "", str(td['colspan'])))
                    if col < 10 and len(td.get_text()) < 500:
                        td['colspan'] = 1
                        for i in range(1, col, 1):
                            td.insert_after(copy.copy(td))
        for indtr, tr in enumerate(trs):
            ths_tmp = tr.findChildren('th', recursive=False)
            # do not complete trs that contain nested tables
            if len(tr.findChildren('table')) > 0:
                continue
            if len(ths_tmp) > 0:
                ths_len = ths_len + len(ths_tmp)
                for th in ths_tmp:
                    ths.append(th)
                trs_set.add(tr)
            # iterate over the elements of the row
            tds = tr.findChildren(recursive=False)
            same_span = 0
            if len(tds) > 1 and 'rowspan' in tds[0].attrs:
                span0 = tds[0].attrs['rowspan']
                for td in tds:
                    if 'rowspan' in td.attrs and td.attrs['rowspan'] == span0:
                        same_span += 1
                if same_span == len(tds):
                    continue
            for indtd, td in enumerate(tds):
                # if rowspan is set, duplicate the cell into the same position of following rows
                if 'rowspan' in td.attrs and str(re.sub("[^0-9]", "", str(td['rowspan']))) != "":
                    row = int(re.sub("[^0-9]", "", str(td['rowspan'])))
                    td['rowspan'] = 1
                    for i in range(1, row, 1):
                        # get all tds of the next row and insert at the matching position
                        if indtr + i < len(trs):
                            tds1 = trs[indtr + i].findChildren(['td', 'th'], recursive=False)
                            if len(tds1) >= (indtd) and len(tds1) > 0:
                                if indtd > 0:
                                    tds1[indtd - 1].insert_after(copy.copy(td))
                                else:
                                    tds1[0].insert_before(copy.copy(td))
                            elif len(tds1) > 0 and len(tds1) == indtd - 1:
                                tds1[indtd - 2].insert_after(copy.copy(td))
    def get_monthlen(self, year, month):
        '''Given year and month (int-like), return the number of days in that month as a string.'''
        try:
            weekday, num = calendar.monthrange(int(year), int(month))
        except:
            num = 30
        return str(num)

    def fix_time(self, text, html, page_time):
        '''Normalize a raw date field into (order_begin, order_end) "YYYY-MM-DD" strings.'''
        for it in [('十二', '12'), ('十一', '11'), ('十', '10'), ('九', '9'), ('八', '8'), ('七', '7'),
                   ('六', '6'), ('五', '5'), ('四', '4'), ('三', '3'), ('二', '2'), ('一', '1')]:
            if it[0] in text:
                text = text.replace(it[0], it[1])
        if re.search('^\d{1,2}月$', text):
            m = re.search('^(\d{1,2})月$', text).group(1)
            if len(m) < 2:
                m = '0' + m
            year = re.search('(\d{4})年(.{,12}采购意向)?', html)
            if year:
                y = year.group(1)
                num = self.get_monthlen(y, m)
                if len(num) < 2:
                    num = '0' + num
                order_begin = "%s-%s-01" % (y, m)
                order_end = "%s-%s-%s" % (y, m, num)
            elif page_time != "":
                year = re.search('\d{4}', page_time)
                if year:
                    y = year.group(0)
                    num = self.get_monthlen(y, m)
                    if len(num) < 2:
                        num = '0' + num
                    order_begin = "%s-%s-01" % (y, m)
                    order_end = "%s-%s-%s" % (y, m, num)
                else:
                    y = str(datetime.datetime.now().year)
                    num = self.get_monthlen(y, m)
                    if len(num) < 2:
                        num = '0' + num
                    order_begin = "%s-%s-01" % (y, m)
                    order_end = "%s-%s-%s" % (y, m, num)
            else:
                y = str(datetime.datetime.now().year)
                num = self.get_monthlen(y, m)
                if len(num) < 2:
                    num = '0' + num
                order_begin = "%s-%s-01" % (y, m)
                order_end = "%s-%s-%s" % (y, m, num)
            return order_begin, order_end
        t1 = re.search('^(\d{4})(年|/|.|-)(\d{1,2})月?$', text)
        if t1:
            year = t1.group(1)
            month = t1.group(3)
            num = self.get_monthlen(year, month)
            if len(month) < 2:
                month = '0' + month
            if len(num) < 2:
                num = '0' + num
            order_begin = "%s-%s-01" % (year, month)
            order_end = "%s-%s-%s" % (year, month, num)
            return order_begin, order_end
        t2 = re.search('^(\d{4})(年|/|.|-)(\d{1,2})(月|/|.|-)(\d{1,2})日?$', text)
        if t2:
            y = t2.group(1)
            m = t2.group(3)
            d = t2.group(5)
            m = '0' + m if len(m) < 2 else m
            d = '0' + d if len(d) < 2 else d
            order_begin = order_end = "%s-%s-%s" % (y, m, d)
            return order_begin, order_end
        all_match = re.finditer('^(?P<y1>\d{4})(年|/|.)(?P<m1>\d{1,2})(?:(月|/|.)(?:(?P<d1>\d{1,2})日)?)?'
                                '(到|至|-)(?:(?P<y2>\d{4})(年|/|.))?(?P<m2>\d{1,2})(?:(月|/|.)'
                                '(?:(?P<d2>\d{1,2})日)?)?$', text)
        y1 = m1 = d1 = y2 = m2 = d2 = ""
        found_match = False
        for _match in all_match:
            if len(_match.group()) > 0:
                found_match = True
                for k, v in _match.groupdict().items():
                    if v != "" and v is not None:
                        if k == 'y1':
                            y1 = v
                        elif k == 'm1':
                            m1 = v
                        elif k == 'd1':
                            d1 = v
                        elif k == 'y2':
                            y2 = v
                        elif k == 'm2':
                            m2 = v
                        elif k == 'd2':
                            d2 = v
        if not found_match:
            return "", ""
        y2 = y1 if y2 == "" else y2
        d1 = '1' if d1 == "" else d1
        d2 = self.get_monthlen(y2, m2) if d2 == "" else d2
        m1 = '0' + m1 if len(m1) < 2 else m1
        m2 = '0' + m2 if len(m2) < 2 else m2
        d1 = '0' + d1 if len(d1) < 2 else d1
        d2 = '0' + d2 if len(d2) < 2 else d2
        order_begin = "%s-%s-%s" % (y1, m1, d1)
        order_end = "%s-%s-%s" % (y2, m2, d2)
        return order_begin, order_end
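    # --- illustrative traces (hand-derived from the regexes above, not executed) ---
    # fix_time('十一月', html='...2021年采购意向...', page_time='')  -> ('2021-11-01', '2021-11-30')
    # fix_time('2021/3', html='', page_time='')                      -> ('2021-03-01', '2021-03-31')
    # fix_time('2021年5月1日', html='', page_time='')                -> ('2021-05-01', '2021-05-01')
    # fix_time('2021年1月至3月', html='', page_time='')              -> ('2021-01-01', '2021-03-31')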
    def find_header(self, items, p1, p2):
        '''
        Regex-check a row of inner_table for a header; if found, return the
        header column indices and header contents.
        :param items: list of the td texts in the row
        :param p1: primary header regex
        :param p2: secondary header regex
        :return: dict of header column indices, whether this row is a header, header contents
        '''
        flag = False
        header_dic = {'名称': '', '数量': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': ''}
        product = ""  # product
        quantity = ""  # quantity
        unitPrice = ""  # unit price
        brand = ""  # brand
        specs = ""  # specifications
        demand = ""  # procurement demand
        budget = ""  # budget amount
        order_time = ""  # procurement time
        for i in range(min(4, len(items))):
            it = items[i]
            if len(it) < 15 and re.search(p1, it) != None:
                flag = True
                product = it
                header_dic['名称'] = i
                break
        if not flag:
            for i in range(min(4, len(items))):
                it = items[i]
                if len(it) < 15 and re.search(p2, it) and re.search(
                        '编号|编码|号|情况|报名|单位|位置|地址|数量|单价|价格|金额|品牌|规格类型|型号|公司|中标人|企业|供应商|候选人', it) == None:
                    flag = True
                    product = it
                    header_dic['名称'] = i
                    break
        if flag:
            for j in range(i + 1, len(items)):
                if len(items[j]) > 20 and len(re.sub('[\((].*[)\)]|[^\u4e00-\u9fa5]', '', items[j])) > 10:
                    continue
                if re.search('数量', items[j]):
                    header_dic['数量'] = j
                    quantity = items[j]
                elif re.search('单价', items[j]):
                    header_dic['单价'] = j
                    unitPrice = items[j]
                elif re.search('品牌', items[j]):
                    header_dic['品牌'] = j
                    brand = items[j]
                elif re.search('规格', items[j]):
                    header_dic['规格'] = j
                    specs = items[j]
                elif re.search('需求', items[j]):
                    header_dic['需求'] = j
                    demand = items[j]
                elif re.search('预算', items[j]):
                    header_dic['预算'] = j
                    budget = items[j]
                elif re.search('时间|采购实施月份|采购月份', items[j]):
                    header_dic['时间'] = j
                    order_time = items[j]
            if header_dic.get('名称', "") != "":
                num = 0
                for it in (quantity, unitPrice, brand, specs, product, demand, budget, order_time):
                    if it != "":
                        num += 1
                if num >= 2:
                    return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time)
            flag = False
        return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time)
    def predict(self, docid='', html='', page_time=""):
        '''
        Regex search inside tables for product-related info.
        :param html: raw HTML of the notice
        :return: product, quantity, unit price, brand and specs found in tables, plus headers and header columns
        '''
        soup = BeautifulSoup(html, 'lxml')
        flag_yx = True if re.search('采购意向', html) else False
        tables = soup.find_all(['table'])
        headers = []
        headers_demand = []
        header_col = []
        product_link = []
        demand_link = []
        for i in range(len(tables) - 1, -1, -1):
            table = tables[i]
            if table.parent.name == 'td' and len(table.find_all('td')) <= 3:
                table.string = table.get_text()
                table.name = 'turntable'
                continue
            if not self.isTrueTable(table):
                continue
            self.fixSpan(table)
            inner_table = self.getTable(table)
            i = 0
            found_header = False
            header_colnum = 0
            if flag_yx:
                col0_l = []
                col1_l = []
                for tds in inner_table:
                    if len(tds) == 2:
                        col0_l.append(re.sub(':', '', tds[0]))
                        col1_l.append(tds[1])
                if len(set(col0_l) & self.header_set) > len(col0_l) * 0.2:
                    header_list2 = []
                    product = demand = budget = order_begin = order_end = ""
                    for i in range(len(col0_l)):
                        if re.search('项目名称', col0_l[i]):
                            header_list2.append(col0_l[i])
                            product = col1_l[i]
                        elif re.search('采购需求|需求概况', col0_l[i]):
                            header_list2.append(col0_l[i])
                            demand = col1_l[i]
                        elif re.search('采购预算|预算金额', col0_l[i]):
                            header_list2.append(col0_l[i])
                            budget = col1_l[i]
                            if '万元' in col0_l[i] and '万' not in budget:
                                budget += '万元'
                            budget = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", budget)
                            budget = str(getUnifyMoney(budget))
                        elif re.search('采购时间|采购实施月份|采购月份', col0_l[i]):
                            header_list2.append(col0_l[i])
                            order_time = col1_l[i].strip()
                            order_begin, order_end = self.fix_time(order_time, html, page_time)
                    if product != "" and demand != "" and budget != "" and order_begin != "":
                        link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
                                'order_begin': order_begin, 'order_end': order_end}
                        if link not in demand_link:
                            demand_link.append(link)
                        headers_demand.append('_'.join(header_list2))
                    continue
            while i < (len(inner_table)):
                tds = inner_table[i]
                not_empty = [it for it in tds if it != ""]
                if len(set(not_empty)) < len(not_empty) * 0.5 or len(tds) < 2:
                    i += 1
                    continue
                product = ""  # product
                quantity = ""  # quantity
                unitPrice = ""  # unit price
                brand = ""  # brand
                specs = ""  # specifications
                demand = ""  # procurement demand
                budget = ""  # budget amount
                order_time = ""  # procurement time
                order_begin = ""
                order_end = ""
                if len(set(tds) & self.header_set) > len(tds) * 0.2:
                    header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p1, self.p2)
                    if found_header:
                        headers.append('_'.join(header_list))
                        headers_demand.append('_'.join(header_list2))
                        header_colnum = len(tds)
                        header_col.append('_'.join(tds))
                    i += 1
                    continue
                elif found_header:
                    if len(tds) != header_colnum:  # skip rows whose column count differs from the header
                        i += 1
                        continue
                    id1 = header_dic.get('名称', "")
                    id2 = header_dic.get('数量', "")
                    id3 = header_dic.get('单价', "")
                    id4 = header_dic.get('品牌', "")
                    id5 = header_dic.get('规格', "")
                    id6 = header_dic.get('需求', "")
                    id7 = header_dic.get('预算', "")
                    id8 = header_dic.get('时间', "")
                    if re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \
                            re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', tds[id1]) == None:
                        product = tds[id1]
                        if id2 != "":
                            if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
                                quantity = tds[id2]
                            else:
                                quantity = ""
                        if id3 != "":
                            if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
                                unitPrice = tds[id3]
                                if '万元' in header_list[2] and '万' not in unitPrice:
                                    unitPrice += '万元'
                                unitPrice = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", unitPrice)
                                unitPrice = str(getUnifyMoney(unitPrice))
                            else:
                                unitPrice = ""
                        if id4 != "":
                            if re.search('\w', tds[id4]):
                                brand = tds[id4]
                            else:
                                brand = ""
                        if id5 != "":
                            if re.search('\w', tds[id5]):
                                specs = tds[id5]
                            else:
                                specs = ""
                        if id6 != "":
                            if re.search('\w', tds[id6]):
                                demand = tds[id6]
                            else:
                                demand = ""
                        if id7 != "":
                            if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id7]):
                                budget = tds[id7]
                                if '万元' in header_list2[2] and '万' not in budget:
                                    budget += '万元'
                                budget = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", budget)
                                budget = str(getUnifyMoney(budget))
                            else:
                                budget = ""
                        if id8 != "":
                            if re.search('\w', tds[id8]):
                                order_time = tds[id8].strip()
                                order_begin, order_end = self.fix_time(order_time, html, page_time)
                        if quantity != "" or unitPrice != "" or brand != "" or specs != "":
                            link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice,
                                    'brand': brand[:50], 'specs': specs}
                            if link not in product_link:
                                product_link.append(link)
                        if budget != "" and order_time != "":
                            link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget, 'order_begin': order_begin, 'order_end': order_end}
                            if link not in demand_link:
                                demand_link.append(link)
                    i += 1
                else:
                    i += 1
        if len(product_link) > 0:
            attr_dic = {'product_attrs': {'data': product_link, 'header': headers, 'header_col': header_col}}
        else:
            attr_dic = {'product_attrs': {'data': [], 'header': [], 'header_col': []}}
        if len(demand_link) > 0:
            demand_dic = {'demand_info': {'data': demand_link, 'header': headers_demand, 'header_col': header_col}}
        else:
            demand_dic = {'demand_info': {'data': [], 'header': [], 'header_col': []}}
        return [attr_dic, demand_dic]
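# --- illustrative sketch (not part of the original pipeline) ---
# A row is treated as a candidate header when more than 20% of its cells appear
# in the known header vocabulary. A standalone trace with a made-up header set:
def _demo_header_row_test():
    header_set = {"名称", "数量", "单价", "品牌", "规格"}
    tds = ["名称", "数量", "单价", "交货期"]
    return len(set(tds) & header_set) > len(tds) * 0.2  # 3 > 0.8 -> True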
# docchannel type extraction
class DocChannel():
    def __init__(self, life_model='/channel_savedmodel/channel.pb', type_model='/channel_savedmodel/doctype.pb'):
        self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax, \
            self.mask, self.mask_title = self.load_life(life_model)
        self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax, \
            self.type_mask, self.type_mask_title = self.load_type(type_model)
        self.sequen_len = 200  # 150 200
        self.title_len = 30
        self.sentence_num = 10
        self.kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
        lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
        lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
        self.id2type = {k: v for k, v in enumerate(lb_type)}
        self.id2life = {k: v for k, v in enumerate(lb_life)}

    def load_life(self, life_model):
        with tf.Graph().as_default() as graph:
            output_graph_def = graph.as_graph_def()
            with open(os.path.dirname(__file__) + life_model, 'rb') as f:
                output_graph_def.ParseFromString(f.read())
                tf.import_graph_def(output_graph_def, name='')
                print("%d ops in the final graph" % len(output_graph_def.node))
            del output_graph_def
            sess = tf.Session(graph=graph)
            sess.run(tf.global_variables_initializer())
            inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
            prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
            title = sess.graph.get_tensor_by_name('inputs/title:0')
            mask = sess.graph.get_tensor_by_name('inputs/mask:0')
            mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
            # logit = sess.graph.get_tensor_by_name('output/logit:0')
            softmax = sess.graph.get_tensor_by_name('output/softmax:0')
            return sess, title, inputs, prob, softmax, mask, mask_title

    def load_type(self, type_model):
        with tf.Graph().as_default() as graph:
            output_graph_def = graph.as_graph_def()
            with open(os.path.dirname(__file__) + type_model, 'rb') as f:
                output_graph_def.ParseFromString(f.read())
                tf.import_graph_def(output_graph_def, name='')
                print("%d ops in the final graph" % len(output_graph_def.node))
            del output_graph_def
            sess = tf.Session(graph=graph)
            sess.run(tf.global_variables_initializer())
            inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
            prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
            title = sess.graph.get_tensor_by_name('inputs/title:0')
            mask = sess.graph.get_tensor_by_name('inputs/mask:0')
            mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
            # logit = sess.graph.get_tensor_by_name('output/logit:0')
            softmax = sess.graph.get_tensor_by_name('output/softmax:0')
            return sess, title, inputs, prob, softmax, mask, mask_title

    def predict_process(self, docid='', doctitle='', dochtmlcon=''):
        # print('starting preprocessing')
        def get_kw_senten(s, span=10):
            doc_sens = []
            tmp = 0
            num = 0
            end_idx = 0
            for it in re.finditer(self.kws, s):  # '|'.join(keywordset)
                left = s[end_idx:it.end()].split()
                right = s[it.end():].split()
                tmp_seg = s[tmp:it.start()].split()
                if len(tmp_seg) > span or tmp == 0:
                    doc_sens.append(' '.join(left[-span:] + right[:span]))
                    end_idx = it.end() + 1 + len(' '.join(right[:span]))
                    tmp = it.end()
                    num += 1
                    if num >= self.sentence_num:
                        break
            if doc_sens == []:
                doc_sens.append(s)
            return doc_sens

        def word2id(wordlist, max_len=self.sequen_len):
            ids = [getIndexOfWords(w) for w in wordlist]
            ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
            assert len(ids) == max_len
            return ids

        cost_time = dict()
        datas = []
        datas_title = []
        try:
            segword_title = ' '.join(selffool.cut(doctitle)[0])
            segword_content = dochtmlcon
        except:
            segword_content = ''
            segword_title = ''
        if isinstance(segword_content, float):
            segword_content = ''
        if isinstance(segword_title, float):
            segword_title = ''
        segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
            replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
            replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
        segword_title = re.sub('[^\s\u4e00-\u9fa5]', '', segword_title)
        segword_content = re.sub('[^\s\u4e00-\u9fa5]', '', segword_content)
        doc_word_list = segword_content.split()
        if len(doc_word_list) > self.sequen_len / 2:
            doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
            doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
        else:
            doc_sens = ' '.join(doc_word_list[:self.sequen_len])
        # print('title:', segword_title)
        # print('content:', segword_content)
        datas.append(doc_sens.split())
        datas_title.append(segword_title.split())
        # print('preprocessing done')
        return datas, datas_title

    def is_houxuan(self, title, content):
        '''
        Judge from the title and the Chinese body text whether the notice is a winner-candidate publicity.
        :param title: notice title
        :param content: notice body text
        :return: 1 if candidate publicity, 0 otherwise
        '''
        if re.search('候选人的?公示|评标结果|评审结果|中标公示', title):  # (中标|成交|中选|入围)
            if re.search('变更公告|更正公告|废标|终止|答疑|澄清', title):
                return 0
            return 1
        if re.search('候选人的?公示', content[:100]):
            if re.search('公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清', content[:100]):
                return 0
            return 1
        else:
            return 0

    def predict(self, title='', content=''):
        # print('starting prediction')
        if isinstance(content, list):
            token_l = [it.tokens for it in content]
            tokens = [it for l in token_l for it in l]
            content = ' '.join(tokens[:500])
        title = re.sub('[^\u4e00-\u9fa5]', '', title)
        if len(title) > 50:
            title = title[:20] + title[-30:]
        data_content, data_title = self.predict_process(docid='', doctitle=title[-50:], dochtmlcon=content)  # keep at most 50 title characters
        text_len = len(data_content[0]) if len(data_content[0]) < self.sequen_len else self.sequen_len
        title_len = len(data_title[0]) if len(data_title[0]) < self.title_len else self.title_len
        array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128))
        array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128))
        pred = self.type_sess.run(self.type_softmax,
                                  feed_dict={
                                      self.type_title: array_title,
                                      self.type_content: array_content,
                                      self.type_mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
                                      self.type_mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
                                      self.type_prob: 1}
                                  )
        id = np.argmax(pred, axis=1)[0]
        prob = pred[0][id]
        # print('notice type:', self.id2type[id], 'prob:', prob)
        if id == 0:
            pred = self.lift_sess.run(self.lift_softmax,
                                      feed_dict={
                                          self.lift_title: array_title,
                                          self.lift_content: array_content,
                                          self.mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
                                          self.mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
                                          self.lift_prob: 1}
                                      )
            id = np.argmax(pred, axis=1)[0]
            prob = pred[0][id]
            # print('life cycle:', self.id2life[id], 'prob:', prob)
            if id == 6:
                if self.is_houxuan(''.join([it for it in title if it.isalpha()]), ''.join([it for it in content if it.isalpha()])):
                    # return '候选人公示', prob
                    return [{'docchannel': '候选人公示'}]
            # return self.id2life[id], prob
            return [{'docchannel': self.id2life[id]}]
        else:
            # return self.id2type[id], prob
            return [{'docchannel': self.id2type[id]}]
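# --- illustrative sketch (not part of the original pipeline) ---
# Both channel models receive a padding mask built from the clipped length:
# 0 over real tokens, 1 over padding. A standalone trace:
def _demo_channel_mask(text_len=3, sequen_len=8):
    mask = [[0] * text_len + [1] * (sequen_len - text_len)]
    return mask  # [[0, 0, 0, 1, 1, 1, 1, 1]]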
# Deposit payment method extraction
class DepositPaymentWay():
    def __init__(self,):
        self.pt = '(保证金的?(交纳|缴纳|应按下列|入账|支付)方式)[::]*([^,。]{,60})'
        self.pt2 = '保证金(必?须以|必?须?通过|以)(.{,8})方式'
        kws = ['银行转账', '公?对公方?式?转账', '对公转账', '柜台转账', '(线上|网上)自?行?(缴纳|交纳|缴退|收退)',
               '网上银行支付', '现金存入', '直接缴纳', '支票', '汇票', '本票', '电汇', '转账', '汇款', '随机码',
               '入账', '基本账户转出', '基本账户汇入', '诚信库中登记的账户转出',
               '银行保函', '电子保函', '担保函', '保证保险', '合法担保机构出具的担保', '金融机构、担保机构出具的保函']
        # longest keywords first, so the regex alternation prefers the most specific match
        self.kws = sorted(kws, key=lambda x: len(x), reverse=True)

    def predict(self, content):
        pay_way = {'deposit_patment_way': ''}  # key spelling kept as-is for downstream consumers
        result = []
        pay = re.search(self.pt, content)
        if pay:
            # print(pay.group(0))
            pay = pay.group(3)
            for it in re.finditer('|'.join(self.kws), pay):
                result.append(it.group(0))
            pay_way['deposit_patment_way'] = ';'.join(result)
            return pay_way
        pay = re.search(self.pt2, content)
        if pay:
            # print(pay.group(0))
            pay = pay.group(2)
            for it in re.finditer('|'.join(self.kws), pay):
                result.append(it.group(0))
            pay_way['deposit_patment_way'] = ';'.join(result)
            return pay_way
        else:
            return pay_way
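# --- illustrative sketch (not part of the original pipeline) ---
# A standalone trace of the deposit extraction, reusing the first pattern above
# and a shortened keyword list:
def _demo_deposit_payment_way():
    content = "投标保证金的缴纳方式:银行转账或电子保函"
    m = re.search('(保证金的?(交纳|缴纳|应按下列|入账|支付)方式)[::]*([^,。]{,60})', content)
    kws = sorted(['银行转账', '转账', '电子保函'], key=len, reverse=True)
    found = [it.group(0) for it in re.finditer('|'.join(kws), m.group(3))]
    return ';'.join(found)  # '银行转账;电子保函'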
def getSavedModel():
    # predictor = FormPredictor()
    graph = tf.Graph()
    with graph.as_default():
        model = tf.keras.models.load_model("../form/model/model_form.model_item.hdf5", custom_objects={"precision": precision, "recall": recall, "f1_score": f1_score})
        # print(tf.graph_util.remove_training_nodes(model))
        tf.saved_model.simple_save(
            tf.keras.backend.get_session(),
            "./h5_savedmodel/",
            inputs={"image": model.input},
            outputs={"scores": model.output}
        )

def getBiLSTMCRFModel(MAX_LEN, vocab, EMBED_DIM, BiRNN_UNITS, chunk_tags, weights):
    '''
    model = models.Sequential()
    model.add(layers.Embedding(len(vocab), EMBED_DIM, mask_zero=True))  # Random embedding
    model.add(layers.Bidirectional(layers.LSTM(BiRNN_UNITS // 2, return_sequences=True)))
    crf = CRF(len(chunk_tags), sparse_target=True)
    model.add(crf)
    model.summary()
    model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
    return model
    '''
    input = layers.Input(shape=(None,), dtype="int32")
    if weights is not None:
        embedding = layers.embeddings.Embedding(len(vocab), EMBED_DIM, mask_zero=True, weights=[weights], trainable=True)(input)
    else:
        embedding = layers.embeddings.Embedding(len(vocab), EMBED_DIM, mask_zero=True)(input)
    bilstm = layers.Bidirectional(layers.LSTM(BiRNN_UNITS // 2, return_sequences=True))(embedding)
    bilstm_dense = layers.TimeDistributed(layers.Dense(len(chunk_tags)))(bilstm)
    crf = CRF(len(chunk_tags), sparse_target=True)
    crf_out = crf(bilstm_dense)
    model = models.Model(input=[input], output=[crf_out])
    model.summary()
    model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])
    return model
import h5py

def h5_to_graph(sess, graph, h5file):
    f = h5py.File(h5file, 'r')  # open the h5 file

    def getValue(v):
        _value = f["model_weights"]
        list_names = str(v.name).split("/")
        for _index in range(len(list_names)):
            print(v.name)
            if _index == 1:
                _value = _value[list_names[0]]
            _value = _value[list_names[_index]]
        return _value.value

    def _load_attributes_from_hdf5_group(group, name):
        """Loads attributes of the specified name from the HDF5 group.
        This method deals with an inherent problem
        of HDF5 file which is not able to store
        data larger than HDF5_OBJECT_HEADER_LIMIT bytes.
        # Arguments
            group: A pointer to a HDF5 group.
            name: A name of the attributes to load.
        # Returns
            data: Attributes data.
        """
        if name in group.attrs:
            data = [n.decode('utf8') for n in group.attrs[name]]
        else:
            data = []
            chunk_id = 0
            while ('%s%d' % (name, chunk_id)) in group.attrs:
                data.extend([n.decode('utf8')
                             for n in group.attrs['%s%d' % (name, chunk_id)]])
                chunk_id += 1
        return data

    def readGroup(gr, parent_name, data):
        for subkey in gr:
            print(subkey)
            if parent_name != subkey:
                if parent_name == "":
                    _name = subkey
                else:
                    _name = parent_name + "/" + subkey
            else:
                _name = parent_name
            if str(type(gr[subkey])) == "<class 'h5py._hl.group.Group'>":
                readGroup(gr[subkey], _name, data)
            else:
                data.append([_name, gr[subkey].value])
                print(_name, gr[subkey].shape)

    layer_names = _load_attributes_from_hdf5_group(f["model_weights"], 'layer_names')
    list_name_value = []
    readGroup(f["model_weights"], "", list_name_value)
    '''
    for k, name in enumerate(layer_names):
        g = f["model_weights"][name]
        weight_names = _load_attributes_from_hdf5_group(g, 'weight_names')
        # weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
        for weight_name in weight_names:
            list_name_value.append([weight_name, np.asarray(g[weight_name])])
    '''
    for name_value in list_name_value:
        name = name_value[0]
        '''
        if re.search("dense", name) is not None:
            name = name[:7] + "_1" + name[7:]
        '''
        value = name_value[1]
        print(name, graph.get_tensor_by_name(name), np.shape(value))
        sess.run(tf.assign(graph.get_tensor_by_name(name), value))

def initialize_uninitialized(sess):
    global_vars = tf.global_variables()
    is_not_initialized = sess.run([tf.is_variable_initialized(var) for var in global_vars])
    not_initialized_vars = [v for (v, f) in zip(global_vars, is_not_initialized) if not f]
    adam_vars = []
    for _vars in not_initialized_vars:
        if re.search("Adam", _vars.name) is not None:
            adam_vars.append(_vars)
    print([str(i.name) for i in adam_vars])  # only for testing
    if len(adam_vars):
        sess.run(tf.variables_initializer(adam_vars))
def save_codename_model():
    # filepath = "../projectCode/models/model_project_" + str(60) + "_" + str(200) + ".hdf5"
    filepath = "../projectCode/models_tf/59-L0.471516189943-F0.8802154826344823-P0.8789179683459191-R0.8815168335321886/model.ckpt"
    vocabpath = "../projectCode/models/vocab.pk"
    classlabelspath = "../projectCode/models/classlabels.pk"
    # vocab = load(vocabpath)
    # class_labels = load(classlabelspath)
    w2v_matrix = load('codename_w2v_matrix.pk')
    graph = tf.get_default_graph()
    with graph.as_default() as g:
        # model = getBiLSTMCRFModel(None, vocab, 60, 200, class_labels, weights=None)
        # model = models.load_model(filepath, custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score, "CRF": CRF, "loss": CRF.loss_function})
        sess = tf.Session(graph=g)
        # sess = tf.keras.backend.get_session()
        char_input, logits, target, keepprob, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
        # with sess.as_default():
        sess.run(tf.global_variables_initializer())
        # print(sess.run("time_distributed_1/kernel:0"))
        # model.load_weights(filepath)
        saver = tf.train.Saver()
        saver.restore(sess, filepath)
        # print("logits", sess.run(logits))
        # print("#", sess.run("time_distributed_1/kernel:0"))
        # x = load("codename_x.pk")
        # y = model.predict(x)
        # y = sess.run(model.output, feed_dict={model.input: x})
        # for item in np.argmax(y, -1):
        #     print(item)
        tf.saved_model.simple_save(
            sess,
            "./codename_savedmodel_tf/",
            inputs={"inputs": char_input,
                    "inputs_length": length,
                    'keepprob': keepprob},
            outputs={"logits": logits,
                     "trans": trans}
        )

def save_role_model():
    '''
    @summary: save the model as a SavedModel for deployment on the PAI platform
    '''
    model_role = PREMPredict().model_role
    with model_role.graph.as_default():
        model = model_role.getModel()
        sess = tf.Session(graph=model_role.graph)
        print(type(model.input))
        sess.run(tf.global_variables_initializer())
        h5_to_graph(sess, model_role.graph, model_role.model_role_file)
        model = model_role.getModel()
        tf.saved_model.simple_save(sess,
                                   "./role_savedmodel/",
                                   inputs={"input0": model.input[0],
                                           "input1": model.input[1],
                                           "input2": model.input[2]},
                                   outputs={"outputs": model.output}
                                   )

def save_money_model():
    model_file = os.path.dirname(__file__) + "/../money/models/model_money_word.h5"
    graph = tf.Graph()
    with graph.as_default():
        sess = tf.Session(graph=graph)
        with sess.as_default():
            # model = model_money.getModel()
            # model.summary()
            # sess.run(tf.global_variables_initializer())
            # h5_to_graph(sess, model_money.graph, model_money.model_money_file)
            model = models.load_model(model_file, custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
            model.summary()
            print(model.weights)
            tf.saved_model.simple_save(sess,
                                       "./money_savedmodel2/",
                                       inputs={"input0": model.input[0],
                                               "input1": model.input[1],
                                               "input2": model.input[2]},
                                       outputs={"outputs": model.output}
                                       )

def save_person_model():
    model_person = EPCPredict().model_person
    with model_person.graph.as_default():
        x = load("person_x.pk")
        _data = np.transpose(np.array(x), (1, 0, 2, 3))
        model = model_person.getModel()
        sess = tf.Session(graph=model_person.graph)
        with sess.as_default():
            sess.run(tf.global_variables_initializer())
            model_person.load_weights()
            # h5_to_graph(sess, model_person.graph, model_person.model_person_file)
            predict_y = sess.run(model.output, feed_dict={model.input[0]: _data[0], model.input[1]: _data[1]})
            # predict_y = model.predict([_data[0], _data[1]])
            print(np.argmax(predict_y, -1))
            tf.saved_model.simple_save(sess,
                                       "./person_savedmodel/",
                                       inputs={"input0": model.input[0],
                                               "input1": model.input[1]},
                                       outputs={"outputs": model.output})

def save_form_model():
    model_form = FormPredictor()
    with model_form.graph.as_default():
        model = model_form.getModel("item")
        sess = tf.Session(graph=model_form.graph)
        sess.run(tf.global_variables_initializer())
        h5_to_graph(sess, model_form.graph, model_form.model_file_item)
        tf.saved_model.simple_save(sess,
                                   "./form_savedmodel/",
                                   inputs={"inputs": model.input},
                                   outputs={"outputs": model.output})

def save_codesplit_model():
    filepath_code = "../projectCode/models/model_code.hdf5"
    graph = tf.Graph()
    with graph.as_default():
        model_code = models.load_model(filepath_code, custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        h5_to_graph(sess, graph, filepath_code)
        tf.saved_model.simple_save(sess,
                                   "./codesplit_savedmodel/",
                                   inputs={"input0": model_code.input[0],
                                           "input1": model_code.input[1],
                                           "input2": model_code.input[2]},
                                   outputs={"outputs": model_code.output})

def save_timesplit_model():
    filepath = '../time/model_label_time_classify.model.hdf5'
    with tf.Graph().as_default() as graph:
        time_model = models.load_model(filepath, custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            h5_to_graph(sess, graph, filepath)
            tf.saved_model.simple_save(sess,
                                       "./timesplit_model/",
                                       inputs={"input0": time_model.input[0],
                                               "input1": time_model.input[1]},
                                       outputs={"outputs": time_model.output})

if __name__ == "__main__":
    # save_role_model()
    # save_codename_model()
    # save_money_model()
    # save_person_model()
    # save_form_model()
    # save_codesplit_model()
    # save_timesplit_model()
    '''
    # with tf.Session(graph=tf.Graph()) as sess:
    #     from tensorflow.python.saved_model import tag_constants
    #     meta_graph_def = tf.saved_model.loader.load(sess, [tag_constants.SERVING], "./person_savedModel")
    #     graph = tf.get_default_graph()
    #     signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
    #     signature = meta_graph_def.signature_def
    #     input0 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input0"].name)
    #     input1 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input1"].name)
    #     outputs = sess.graph.get_tensor_by_name(signature[signature_key].outputs["outputs"].name)
    #     x = load("person_x.pk")
    #     _data = np.transpose(x, [1, 0, 2, 3])
    #     y = sess.run(outputs, feed_dict={input0: _data[0], input1: _data[1]})
    #     print(np.argmax(y, -1))
    '''