''' Created on 2018年12月26日 @author: User ''' import os import sys from BiddingKG.dl.common.nerUtils import * sys.path.append(os.path.abspath("../..")) # from keras.engine import topology # from keras import models # from keras import layers # from keras_contrib.layers.crf import CRF # from keras.preprocessing.sequence import pad_sequences # from keras import optimizers,losses,metrics from BiddingKG.dl.common.Utils import * from BiddingKG.dl.interface.modelFactory import * import tensorflow as tf import pandas as pd from BiddingKG.dl.product.data_util import decode, process_data from BiddingKG.dl.interface.Entitys import Entity from BiddingKG.dl.complaint.punish_predictor import Punish_Extract from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money from bs4 import BeautifulSoup import copy import calendar import datetime from BiddingKG.dl.entityLink.entityLink import get_business_data # import fool # 统一用 selffool ,阿里云上只有selffool 包 cpu_num = int(os.environ.get("CPU_NUM",0)) sess_config = tf.ConfigProto( inter_op_parallelism_threads = cpu_num, intra_op_parallelism_threads = cpu_num, log_device_placement=True) sess_config = None file = os.path.dirname(__file__) + '/agency_set.pkl' with open(file, 'rb') as f: agency_set = pickle.load(f) def is_agency(entity_text): if re.search('(招投?标|采购|代理|咨询|管理|物资|事务所?|顾问|监理|拍卖)[()\w]{,4}(有限)?(责任)?公司|(采购|招投?标|交易|代理|咨询)[()\w]{,4}(中心|服务所)|法院$', entity_text) or entity_text in agency_set: return True return False from threading import RLock dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()}, "prem":{"predictor":None,"Lock":RLock()}, "epc":{"predictor":None,"Lock":RLock()}, "roleRule":{"predictor":None,"Lock":RLock()}, "roleRuleFinal":{"predictor":None,"Lock":RLock()}, "tendereeRuleRecall":{"predictor":None,"Lock":RLock()}, "form":{"predictor":None,"Lock":RLock()}, "time":{"predictor":None,"Lock":RLock()}, "punish":{"predictor":None,"Lock":RLock()}, "product":{"predictor":None,"Lock":RLock()}, 
"product_attrs":{"predictor":None,"Lock":RLock()}, "channel": {"predictor": None, "Lock": RLock()}, "deposit_payment_way": {"predictor": None, "Lock": RLock()}, "total_unit_money": {"predictor": None, "Lock": RLock()}, "industry": {"predictor": None, "Lock": RLock()}, "rolegrade": {"predictor": None, "Lock": RLock()}, "moneygrade": {"predictor": None, "Lock": RLock()}, "district": {"predictor": None, "Lock": RLock()}, 'tableprem': {"predictor": None, "Lock": RLock()}, 'candidate': {"predictor": None, "Lock": RLock()}, 'websource_tenderee': {"predictor": None, "Lock": RLock()}, 'project_label': {"predictor": None, "Lock": RLock()} } def getPredictor(_type): if _type in dict_predictor: with dict_predictor[_type]["Lock"]: if dict_predictor[_type]["predictor"] is None: if _type == "codeName": dict_predictor[_type]["predictor"] = CodeNamePredict(config=sess_config) if _type == "prem": dict_predictor[_type]["predictor"] = PREMPredict(config=sess_config) if _type == "epc": dict_predictor[_type]["predictor"] = EPCPredict(config=sess_config) if _type == "roleRule": dict_predictor[_type]["predictor"] = RoleRulePredictor() if _type == "roleRuleFinal": dict_predictor[_type]["predictor"] = RoleRuleFinalAdd() if _type == "tendereeRuleRecall": dict_predictor[_type]["predictor"] = TendereeRuleRecall() if _type == "form": dict_predictor[_type]["predictor"] = FormPredictor(config=sess_config) if _type == "time": dict_predictor[_type]["predictor"] = TimePredictor(config=sess_config) if _type == "punish": dict_predictor[_type]["predictor"] = Punish_Extract() if _type == "product": dict_predictor[_type]["predictor"] = ProductPredictor(config=sess_config) if _type == "product_attrs": dict_predictor[_type]["predictor"] = ProductAttributesPredictor() if _type == "channel": dict_predictor[_type]["predictor"] = DocChannel(config=sess_config) if _type == 'deposit_payment_way': dict_predictor[_type]["predictor"] = DepositPaymentWay() if _type == 'total_unit_money': 
dict_predictor[_type]["predictor"] = TotalUnitMoney() if _type == 'industry': dict_predictor[_type]["predictor"] = IndustryPredictor() if _type == 'rolegrade': dict_predictor[_type]["predictor"] = RoleGrade() if _type == 'moneygrade': dict_predictor[_type]["predictor"] = MoneyGrade() if _type == 'district': dict_predictor[_type]["predictor"] = DistrictPredictor() if _type == 'tableprem': dict_predictor[_type]["predictor"] = TablePremExtractor() if _type == 'candidate': dict_predictor[_type]["predictor"] = CandidateExtractor() if _type == 'websource_tenderee': dict_predictor[_type]['predictor'] = WebsourceTenderee() if _type == 'project_label': dict_predictor[_type]['predictor'] = ProjectLabel() return dict_predictor[_type]["predictor"] raise NameError("no this type of predictor") # 编号名称模型 class CodeNamePredict(): def __init__(self,EMBED_DIM=None,BiRNN_UNITS=None,lazyLoad=getLazyLoad(),config=None): self.model = None self.MAX_LEN = None self.model_code = None if EMBED_DIM is None: self.EMBED_DIM = 60 else: self.EMBED_DIM = EMBED_DIM if BiRNN_UNITS is None: self.BiRNN_UNITS = 200 else: self.BiRNN_UNITS = BiRNN_UNITS self.filepath = os.path.dirname(__file__)+"/../projectCode/models/model_project_"+str(self.EMBED_DIM)+"_"+str(self.BiRNN_UNITS)+".hdf5" #self.filepath = "../projectCode/models/model_project_60_200_200ep017-loss6.456-val_loss7.852-val_acc0.969.hdf5" self.filepath_code = os.path.dirname(__file__)+"/../projectCode/models/model_code.hdf5" vocabpath = os.path.dirname(__file__)+"/codename_vocab.pk" classlabelspath = os.path.dirname(__file__)+"/codename_classlabels.pk" self.vocab = load(vocabpath) self.class_labels = load(classlabelspath) #生成提取编号和名称的正则 id_PC_B = self.class_labels.index("PC_B") id_PC_M = self.class_labels.index("PC_M") id_PC_E = self.class_labels.index("PC_E") id_PN_B = self.class_labels.index("PN_B") id_PN_M = self.class_labels.index("PN_M") id_PN_E = self.class_labels.index("PN_E") self.PC_pattern = 
re.compile(str(id_PC_B)+str(id_PC_M)+"*"+str(id_PC_E)) self.PN_pattern = re.compile(str(id_PN_B)+str(id_PN_M)+"*"+str(id_PN_E)) # print("pc",self.PC_pattern) # print("pn",self.PN_pattern) self.word2index = dict((w,i) for i,w in enumerate(np.array(self.vocab))) self.inputs = None self.outputs = None self.sess_codename = tf.Session(graph=tf.Graph(),config=config) self.sess_codesplit = tf.Session(graph=tf.Graph(),config=config) self.inputs_code = None self.outputs_code = None if not lazyLoad: self.getModel() self.getModel_code() def getModel(self): ''' @summary: 取得编号和名称模型 ''' if self.inputs is None: log("get model of codename") with self.sess_codename.as_default(): with self.sess_codename.graph.as_default(): meta_graph_def = tf.saved_model.loader.load(self.sess_codename, ["serve"], export_dir=os.path.dirname(__file__)+"/codename_savedmodel_tf") signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY signature_def = meta_graph_def.signature_def self.inputs = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs"].name) self.inputs_length = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs_length"].name) self.keepprob = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["keepprob"].name) self.logits = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["logits"].name) self.trans = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["trans"].name) return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans else: return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans ''' if self.model is None: self.model = self.getBiLSTMCRFModel(self.MAX_LEN, self.vocab, self.EMBED_DIM, self.BiRNN_UNITS, self.class_labels,weights=None) self.model.load_weights(self.filepath) return self.model ''' def getModel_code(self): if self.inputs_code is None: log("get model of 
code") with self.sess_codesplit.as_default(): with self.sess_codesplit.graph.as_default(): meta_graph_def = tf.saved_model.loader.load(self.sess_codesplit, ["serve"], export_dir=os.path.dirname(__file__)+"/codesplit_savedmodel") signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY signature_def = meta_graph_def.signature_def self.inputs_code = [] self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name)) self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name)) self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name)) self.outputs_code = self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name) self.sess_codesplit.graph.finalize() return self.inputs_code,self.outputs_code else: return self.inputs_code,self.outputs_code ''' if self.model_code is None: log("get model of model_code") with self.sess_codesplit.as_default(): with self.sess_codesplit.graph.as_default(): self.model_code = models.load_model(self.filepath_code, custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score}) return self.model_code ''' def getBiLSTMCRFModel(self,MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights): ''' model = models.Sequential() model.add(layers.Embedding(len(vocab), EMBED_DIM, mask_zero=True)) # Random embedding model.add(layers.Bidirectional(layers.LSTM(BiRNN_UNITS // 2, return_sequences=True))) crf = CRF(len(chunk_tags), sparse_target=True) model.add(crf) model.summary() model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy]) return model ''' input = layers.Input(shape=(None,)) if weights is not None: embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True,weights=[weights],trainable=True)(input) else: embedding = 
layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True)(input) bilstm = layers.Bidirectional(layers.LSTM(BiRNN_UNITS//2,return_sequences=True))(embedding) bilstm_dense = layers.TimeDistributed(layers.Dense(len(chunk_tags)))(bilstm) crf = CRF(len(chunk_tags),sparse_target=True) crf_out = crf(bilstm_dense) model = models.Model(input=[input],output = [crf_out]) model.summary() model.compile(optimizer = 'adam', loss = crf.loss_function, metrics = [crf.accuracy]) return model #根据规则补全编号或名称两边的符号 def fitDataByRule(self,data): symbol_dict = {"(":")", "(":")", "[":"]", "【":"】", ")":"(", ")":"(", "]":"[", "】":"【"} leftSymbol_pattern = re.compile("[\((\[【]") rightSymbol_pattern = re.compile("[\))\]】]") leftfinds = re.findall(leftSymbol_pattern,data) rightfinds = re.findall(rightSymbol_pattern,data) result = data if len(leftfinds)+len(rightfinds)==0: return data elif len(leftfinds)==len(rightfinds): return data elif abs(len(leftfinds)-len(rightfinds))==1: if len(leftfinds)>len(rightfinds): if symbol_dict.get(data[0]) is not None: result = data[1:] else: #print(symbol_dict.get(leftfinds[0])) result = data+symbol_dict.get(leftfinds[0]) else: if symbol_dict.get(data[-1]) is not None: result = data[:-1] else: result = symbol_dict.get(rightfinds[0])+data return result def decode(self,logits, trans, sequence_lengths, tag_num): viterbi_sequences = [] for logit, length in zip(logits, sequence_lengths): score = logit[:length] viterbi_seq, viterbi_score = viterbi_decode(score, trans) viterbi_sequences.append(viterbi_seq) return viterbi_sequences def predict(self,list_sentences,list_entitys=None,MAX_AREA = 5000): #@summary: 获取每篇文章的code和name # pattern_score = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店") pattern_score = 
re.compile('建设项目|服务项目|工程项目|工程施工|建设工程|服务中心|基础设施|物业管理|工程设计|妇幼保健|咨询服务|管理系统|管理中心|改建工程|配套工程|公安局|幼儿园|管理局|使用权|办公楼|教育局|管理处|图书馆|经营权|项目|采购|工程|改造|服务|设备|中心|医院|系统|建设|监理|施工|维修|学院|安装|设计|关于|标段|招标|技术|询价|管理|学校|小学|中学|平台|提升|设施|检测|整治|社区|装修|政府|绿化|物资|租赁|地块|医疗|编制|公开|规划|监控|教育|维护|校区|治理|升级|安置|竞价|购置|评估|勘察|承包|实验|大学|材料|生产|耗材|招租|硬化|维保|用地|消防|审计|拍卖|物业|入围|养护|机关|企业|用房|出让|资产|分局|验收|宣传|处置|校园|研究|咨询|修缮|更换|装饰|劳务|保养|物流|出租|局|院') result = [] index_unk = self.word2index.get("") # index_pad = self.word2index.get("") if list_entitys is None: list_entitys = [[] for _ in range(len(list_sentences))] for list_sentence,list_entity in zip(list_sentences,list_entitys): if len(list_sentence)==0: result.append([{"code":[],"name":""}]) continue doc_id = list_sentence[0].doc_id # sentences = [] # for sentence in list_sentence: # if len(sentence.sentence_text)>MAX_AREA: # for _sentence_comma in re.split("[;;,\n]",sentence): # _comma_index = 0 # while(_comma_indexMAX_AREA: MAX_LEN = MAX_AREA _LEN = MAX_AREA//MAX_LEN #预测 x = [[self.word2index.get(word,index_unk)for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]] # x = [[getIndexOfWord(word) for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]] x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x] x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post") if USE_API: requests_result = requests.post(API_URL + "/predict_codeName", json={"inouts": x.tolist(), "inouts_len": x_len},verify=True) predict_y = json.loads(requests_result.text)['result'] # print("cost_time:", json.loads(requests_result.text)['cost_time']) # print(MAX_LEN,_LEN,_begin_index) else: with self.sess_codename.as_default(): t_input,t_input_length,t_keepprob,t_logits,t_trans = self.getModel() _logits,_trans = self.sess_codename.run([t_logits,t_trans],feed_dict={t_input:x, t_input_length:x_len, t_keepprob:1.0}) predict_y = self.decode(_logits,_trans,x_len,7) # 
print('==========',_logits) ''' for item11 in np.argmax(predict_y,-1): print(item11) print(predict_y) ''' # print(predict_y) for sentence,predict in zip(list_sentence[_begin_index:_begin_index+_LEN],np.array(predict_y)): pad_sentence = sentence.sentence_text[:MAX_LEN] join_predict = "".join([str(s) for s in predict]) # print(pad_sentence) # print(join_predict) code_x = [] code_text = [] pre_text = [] temp_entitys = [] for iter in re.finditer(self.PC_pattern,join_predict): get_len = 40 if iter.span()[0]0: code_x = np.transpose(np.array(code_x,dtype=np.float32),(1,0,2,3)) if USE_PAI_EAS: request = tf_predict_pb2.PredictRequest() request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT request.inputs["input0"].array_shape.dim.extend(np.shape(code_x[0])) request.inputs["input0"].float_val.extend(np.array(code_x[0],dtype=np.float64).reshape(-1)) request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT request.inputs["input1"].array_shape.dim.extend(np.shape(code_x[1])) request.inputs["input1"].float_val.extend(np.array(code_x[1],dtype=np.float64).reshape(-1)) request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT request.inputs["input2"].array_shape.dim.extend(np.shape(code_x[2])) request.inputs["input2"].float_val.extend(np.array(code_x[2],dtype=np.float64).reshape(-1)) request_data = request.SerializeToString() list_outputs = ["outputs"] _result = vpc_requests(codeclasses_url, codeclasses_authorization, request_data, list_outputs) if _result is not None: predict_code = _result["outputs"] else: with self.sess_codesplit.as_default(): with self.sess_codesplit.graph.as_default(): predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]]) else: with self.sess_codesplit.as_default(): with self.sess_codesplit.graph.as_default(): inputs_code,outputs_code = self.getModel_code() predict_code = limitRun(self.sess_codesplit,[outputs_code],feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]})[0] #predict_code = 
self.sess_codesplit.run(outputs_code,feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]}) #predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]]) for h in range(len(predict_code)): if predict_code[h][0]>0.5: the_code = self.fitDataByRule(code_text[h]) # print(the_code) #add code to entitys list_entity.append(temp_entitys[h]) if re.search(',|/|;|、|,', the_code) and len(the_code)>25: for it in re.split(',|/|;|、|,', the_code): if len(it) > 8: if it not in code_set: code_set.add(it) # item['code'].append(it) if re.search("(项目编号|招标编号):?$", pre_text[h]): item['code'].append((it, 0)) elif re.search('采购(计划)?编号:?$', pre_text[h]): item['code'].append((it, 1)) elif re.search('(询价|合同)编号:?$', pre_text[h]): item['code'].append((it, 2)) else: item['code'].append((it, 3)) elif len(item['code']) > 0: new_it = item['code'][-1][0] + re.search(',|/|;|、|,', the_code).group(0) + it if new_it not in code_set: code_set.add(new_it) # item['code'][-1] = new_it if re.search("(项目编号|招标编号):?$", pre_text[h]): item['code'][-1] = (new_it, 0) elif re.search('采购(计划)?编号:?$', pre_text[h]): item['code'][-1] = (new_it, 1) elif re.search('(询价|合同)编号:?$', pre_text[h]): item['code'][-1] = (new_it, 2) else: item['code'][-1] = (new_it, 3) else: if the_code not in code_set: code_set.add(the_code) # item['code'].append(the_code) if re.search("(项目编号|招标编号):?$", pre_text[h]): item['code'].append((the_code, 0)) elif re.search('采购(计划)?编号:?$', pre_text[h]): item['code'].append((the_code, 1)) elif re.search('(询价|合同)编号:?$', pre_text[h]): item['code'].append((the_code, 2)) else: item['code'].append((the_code, 3)) break elif the_code not in code_set: code_set.add(the_code) # item['code'].append(the_code) if re.search("(项目编号|招标编号):?$", pre_text[h]): item['code'].append((the_code, 0)) elif re.search('采购(计划)?编号:?$', pre_text[h]): item['code'].append((the_code, 1)) elif re.search('(询价|合同)编号:?$', pre_text[h]): item['code'].append((the_code, 2)) else: 
item['code'].append((the_code, 3)) # if the_code not in code_set: # code_set.add(the_code) # item['code'] = list(code_set) for iter in re.finditer(self.PN_pattern,join_predict): _name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]) if len(_name)>200: # 避免模型预测类似 202750503 这种很长重复字很多的错误项目名称 continue elif '公司:你单位在' in _name: # 避免类似 339900030 这种作为项目名称,导致中标角色作为招标角色 continue #add name to entitys _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment) list_entity.append(_entity) # w = 1 if re.search('(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5 w = 1 if re.search('(项目|工程|招标|采购|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题|项目)[::\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5 if _name not in dict_name_freq_score: # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1] len_name = len(_name) if len(_name) <50 else 100-len(_name) # 2023/03/02 超出50长度的逐渐递减 dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len_name * 0.05)*w+(5-sentence.sentence_index)*0.2] else: dict_name_freq_score[_name][0] += 1 ''' for iter in re.finditer(self.PN_pattern,join_predict): print("name-",self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])) if item[1]['name']=="": for iter in re.finditer(self.PN_pattern,join_predict): #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]) item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]) break ''' if _begin_index+_LEN>=len(list_sentence): break _begin_index += _LEN list_name_freq_score = [] # print('模型预测项目名称:', dict_name_freq_score) # 
2020/11/23 大网站规则调整 if len(dict_name_freq_score) == 0: # name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]' name_re1 = '(项目|工程|招标|采购(条目)?|合同|标项|标的|计划|询价|询价单|询价通知书|申购单|申购)(名称|标名|标题|主题)[::\s]+(?P[^,。:;]{2,60})[,。]' for sentence in list_sentence: # pad_sentence = sentence.sentence_text othername = re.search(name_re1, sentence.sentence_text) if othername != None: project_name = othername.group('name') if re.search('[\u4e00-\u9fa5]+', project_name) == None: # 没有中文的项目名称去除 # log('没有中文的项目名称去除') continue beg = find_index([project_name], sentence.sentence_text)[0] end = beg + len(project_name) _name = self.fitDataByRule(sentence.sentence_text[beg:end]) # print('规则召回项目名称:', _name) # add name to entitys _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % ( sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name, entity_type="name", sentence_index=sentence.sentence_index, begin_index=0, end_index=0, wordOffset_begin=beg, wordOffset_end=end,in_attachment=sentence.in_attachment) list_entity.append(_entity) w = 1 if _name not in dict_name_freq_score: # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1] dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w+(5-sentence.sentence_index)*0.2] else: dict_name_freq_score[_name][0] += 1 # othername = re.search(name_re1, sentence.sentence_text) # if othername != None: # _name = othername.group(3) # if _name not in dict_name_freq_score: # dict_name_freq_score[_name] = [1, len(re.findall(pattern_score, _name)) + len(_name) * 0.1] # else: # dict_name_freq_score[_name][0] += 1 for _name in dict_name_freq_score.keys(): list_name_freq_score.append([_name,dict_name_freq_score[_name]]) # print(list_name_freq_score) if len(list_name_freq_score)>0: list_name_freq_score.sort(key=lambda x:x[1][0]*x[1][1],reverse=True) item['name'] = list_name_freq_score[0][0] # for it in list_name_freq_score: # 
print('项目名称及分值:',it[0],it[1], it[1][0]*it[1][1]) # if list_name_freq_score[0][1][0]>1: # item[1]['name'] = list_name_freq_score[0][0] # else: # list_name_freq_score.sort(key=lambda x:x[1][1],reverse=True) # item[1]["name"] = list_name_freq_score[0][0] #下面代码加上去用正则添加某些识别不到的项目编号 if item['code'] == []: for sentence in list_sentence: # othercode = re.search('(采购计划编号|询价编号)[\))]?[::]?([\[\]a-zA-Z0-9\-]{5,30})', sentence.sentence_text) # if othercode != None: # item[1]['code'].append(othercode.group(2)) # 2020/11/23 大网站规则调整 othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价[单书]|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告|工程|寻源|标书|包件|谈判|申购)(单据?号|编号|标号|编码|代码|备案号|号)[::\s]+(?P[^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。\u4e00-\u9fa5]', sentence.sentence_text) if othercode != None: # item['code'].append(othercode.group('code')) if re.search("(项目编号|招标编号):?$", othercode.group(0)): item['code'].append((othercode.group('code'), 0)) elif re.search('采购(计划)?编号:?$', othercode.group(0)): item['code'].append((othercode.group('code'), 1)) elif re.search('(询价|合同)编号:?$', othercode.group(0)): item['code'].append((othercode.group('code'), 2)) else: item['code'].append((othercode.group('code'), 3)) # print('规则召回项目编号:', othercode.group('code')) # item['code'] = [code for code in item['code'] if len(code)<500] # item['code'].sort(key=lambda x:len(x),reverse=True) item['code'] = [code for code in item['code'] if len(code[0]) < 500] item['code'].sort(key=lambda x: x[1]) item['code'] = [it[0] for it in item['code']] result.append(item) list_sentence.sort(key=lambda x: x.sentence_index,reverse=False) return result ''' #当数据量过大时会报错 def predict(self,articles,MAX_LEN = None): sentences = [] for article in articles: for sentence in article.content.split("。"): sentences.append([sentence,article.id]) if MAX_LEN is None: sent_len = [len(sentence[0]) for sentence in sentences] MAX_LEN = max(sent_len) #print(MAX_LEN) #若为空,则直接返回空 result = [] if MAX_LEN==0: for article in articles: 
result.append([article.id,{"code":[],"name":""}]) return result index_unk = self.word2index.get("") index_pad = self.word2index.get("") x = [[self.word2index.get(word,index_unk)for word in sentence[0]]for sentence in sentences] x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post") predict_y = self.getModel().predict(x) last_doc_id = "" item = [] for sentence,predict in zip(sentences,np.argmax(predict_y,-1)): pad_sentence = sentence[0][:MAX_LEN] doc_id = sentence[1] join_predict = "".join([str(s) for s in predict]) if doc_id!=last_doc_id: if last_doc_id!="": result.append(item) item = [doc_id,{"code":[],"name":""}] code_set = set() code_x = [] code_text = [] for iter in re.finditer(self.PC_pattern,join_predict): get_len = 40 if iter.span()[0]0: code_x = np.transpose(np.array(code_x),(1,0,2,3)) predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]]) for h in range(len(predict_code)): if predict_code[h][0]>0.5: the_code = self.fitDataByRule(code_text[h]) if the_code not in code_set: code_set.add(the_code) item[1]['code'] = list(code_set) if item[1]['name']=="": for iter in re.finditer(self.PN_pattern,join_predict): #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]) item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]) break last_doc_id = doc_id result.append(item) return result ''' #角色金额模型 class PREMPredict(): def __init__(self,config=None): #self.model_role_file = os.path.abspath("../role/models/model_role.model.hdf5") # self.model_role_file = os.path.dirname(__file__)+"/../role/log/new_biLSTM-ep012-loss0.028-val_loss0.040-f10.954.h5" self.model_role = Model_role_classify_word(config=config) self.model_money = Model_money_classify(config=config) # self.role_file = open('/data/python/lsm/role_model_predict.txt', 'a', encoding='utf-8') # self.money_file = open('/data/python/lsm/money_model_predict.txt', 'a', encoding='utf-8') return def 
search_role_data(self,list_sentences,list_entitys): ''' @summary:根据句子list和实体list查询角色模型的输入数据 @param: list_sentences:文章的sentences list_entitys:文章的entitys @return:角色模型的输入数据 ''' text_list = [] data_x = [] points_entitys = [] for list_entity,list_sentence in zip(list_entitys,list_sentences): list_entity.sort(key=lambda x:x.sentence_index) list_sentence.sort(key=lambda x:x.sentence_index) p_entitys = 0 p_sentences = 0 while(p_entitys 1: # _dianhua = phoneFromList(have_phone[1:]) # else: # _dianhua = phoneFromList(have_phone) # elif have_key: # if entity.entity_text != last_person and s0.find(last_person) != -1 and s1.find( # last_person_phone) != -1: # if len(have_key) > 1: # _dianhua = phoneFromList(have_key[1:]) # else: # _dianhua = phoneFromList(have_key) # elif have_phone2: # if entity.entity_text != last_person and s0.find(last_person) != -1 and s0.find( # last_person_phone) != -1: # if len(have_phone2) > 1: # _dianhua = phoneFromList(have_phone2[1:]) # else: # _dianhua = phoneFromList(have_phone2) # elif have_key2: # if entity.entity_text != last_person and s0.find(last_person) != -1 and s0.find( # last_person_phone) != -1: # if len(have_key2) > 1: # _dianhua = phoneFromList(have_key2[1:]) # else: # _dianhua = phoneFromList(have_key2) # elif have_phone3: # if entity.entity_text != last_person and s4.find(last_person) != -1 and s3.find( # last_person_phone) != -1: # if len(have_phone3) > 1: # _dianhua = phoneFromList(have_phone3[1:]) # else: # _dianhua = phoneFromList(have_phone3) # elif have_key3: # if entity.entity_text != last_person and s4.find(last_person) != -1 and s3.find( # last_person_phone) != -1: # if len(have_key3) > 1: # _dianhua = phoneFromList(have_key3[1:]) # else: # _dianhua = phoneFromList(have_key3) # elif have_phone4: # if entity.entity_text != last_person and s4.find(last_person) != -1 and s4.find( # last_person_phone) != -1: # if len(have_phone4) > 1: # _dianhua = phoneFromList(have_phone4) # else: # _dianhua = phoneFromList(have_phone4) # elif 
have_key4: # if entity.entity_text != last_person and s4.find(last_person) != -1 and s4.find( # last_person_phone) != -1: # if len(have_key4) > 1: # _dianhua = phoneFromList(have_key4) # else: # _dianhua = phoneFromList(have_key4) # else: # _dianhua = "" # # dict_context_itemx[_key] = [item_x, _dianhua] # dict_context_itemx[_key] = [_dianhua] # # points_entitys.append(entity) # # dianhua.append(_dianhua) # last_person = entity.entity_text # if _dianhua: # # 更新联系人entity联系方式(person_phone) # entity.person_phone = _dianhua # last_person_phone = _dianhua # else: # last_person_phone = "####****++++$^" # p_entitys += 1 from scipy.optimize import linear_sum_assignment from BiddingKG.dl.interface.Entitys import Match def dispatch(match_list): main_roles = list(set([match.main_role for match in match_list])) attributes = list(set([match.attribute for match in match_list])) label = np.zeros(shape=(len(main_roles), len(attributes))) for match in match_list: main_role = match.main_role attribute = match.attribute value = match.value label[main_roles.index(main_role), attributes.index(attribute)] = value + 10000 # print(label) gragh = -label # km算法 row, col = linear_sum_assignment(gragh) max_dispatch = [(i, j) for i, j, value in zip(row, col, gragh[row, col]) if value] return [Match(main_roles[row], attributes[col]) for row, col in max_dispatch] # km算法 key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)(\d{7,12})') phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|' '\+86.?1[3|4|5|6|7|8|9]\d{9}|' '0\d{2,3}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|' '0\d{2,3}[-—-―]\d{7,8}转\d{1,4}|' '0\d{2,3}[-—-―]?[1-9]\d{6,7}|' '[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|' '[1-9]\d{6,7}') phone_entitys = [] for _sentence in list_sentence: sentence_text = _sentence.sentence_text res_set = set() for i in re.finditer(phone,sentence_text): res_set.add((i.group(),i.start(),i.end())) for i in re.finditer(key_word,sentence_text): res_set.add((i.group(2),i.start()+len(i.group(1)),i.end())) for item in 
list(res_set): phone_left = sentence_text[max(0,item[1]-10):item[1]] phone_right = sentence_text[item[2]:item[2]+8] # 排除传真号 和 其它错误项 if re.search("传,?真|信,?箱|邮,?箱",phone_left): if not re.search("电,?话",phone_left): continue if re.search("帐,?号|编,?号|报,?价|证,?号|价,?格|[\((]万?元[\))]",phone_left): continue if re.search("[.,]\d{2,}",phone_right): continue _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, None, None,item[1], item[2],in_attachment=_sentence.in_attachment) phone_entitys.append(_entity) person_entitys = [] for entity in list_entity: if entity.entity_type == "person": entity.person_phone = "" person_entitys.append(entity) _list_entity = phone_entitys + person_entitys _list_entity = sorted(_list_entity,key=lambda x:(x.sentence_index,x.wordOffset_begin)) words_num_dict = dict() last_words_num = 0 list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index) for sentence in list_sentence: _index = sentence.sentence_index if _index == 0: words_num_dict[_index] = 0 else: words_num_dict[_index] = words_num_dict[_index - 1] + last_words_num last_words_num = len(sentence.sentence_text) match_list = [] for index in range(len(_list_entity)): entity = _list_entity[index] if entity.entity_type=="person" and entity.label in [1,2,3]: match_nums = 0 for after_index in range(index + 1, min(len(_list_entity), index + 5)): after_entity = _list_entity[after_index] if after_entity.entity_type=="phone": sentence_distance = after_entity.sentence_index - entity.sentence_index distance = (words_num_dict[after_entity.sentence_index] + after_entity.wordOffset_begin) - ( words_num_dict[entity.sentence_index] + entity.wordOffset_end) if sentence_distance < 2 and distance < 50: value = (-1 / 2 * (distance ** 2)) / 10000 match_list.append(Match(entity, after_entity, value)) match_nums += 1 else: break if after_entity.entity_type=="person": if after_entity.label not in [1,2,3]: break if not match_nums: for previous_index in range(index-1, 
max(0,index-5), -1): previous_entity = _list_entity[previous_index] if previous_entity.entity_type == "phone": sentence_distance = entity.sentence_index - previous_entity.sentence_index distance = (words_num_dict[entity.sentence_index] + entity.wordOffset_begin) - ( words_num_dict[previous_entity.sentence_index] + previous_entity.wordOffset_end) if sentence_distance < 1 and distance<30: # 前向 没有 /10000 value = (-1 / 2 * (distance ** 2)) match_list.append(Match(entity, previous_entity, value)) else: break result = dispatch(match_list) for match in result: entity = match.main_role # 更新 list_entity entity_index = list_entity.index(entity) list_entity[entity_index].person_phone = match.attribute.entity_text def predict(self,list_sentences,list_entitys): self.predict_person(list_sentences,list_entitys) #表格预测 class FormPredictor(): def __init__(self,lazyLoad=getLazyLoad(),config=None): self.model_file_line = os.path.dirname(__file__)+"/../form/model/model_form.model_line.hdf5" self.model_file_item = os.path.dirname(__file__)+"/../form/model/model_form.model_item.hdf5" self.model_form_item = Model_form_item(config=config) self.model_dict = {"line":[None,self.model_file_line]} self.model_form_context = Model_form_context(config=config) def getModel(self,type): if type=="item": return self.model_form_item elif type=="context": return self.model_form_context else: return self.getModel(type) def encode(self,data,**kwargs): return encodeInput([data], word_len=50, word_flag=True,userFool=False)[0] return encodeInput_form(data) def predict(self,form_datas,type): if type=="item": return self.model_form_item.predict(form_datas) elif type=="context": return self.model_form_context.predict(form_datas) else: return self.getModel(type).predict(form_datas) #角色规则 #依据正则给所有无角色的实体赋予角色,给予等于阈值的最低概率 class RoleRulePredictor(): def __init__(self): # (?P 正则组名 后面的 w1 为概率权重关键词 self.pattern_tenderee_left_55 = 
"(?P((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|甲方?|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包|最终|建设|业主|竞卖|申购|公选)" \ "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂)|需求?方|买方|业主|权属人|甲方当事人|询价书企业|比选发起人|采购(执行|实施)单位)"\ "[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)" self.pattern_tenderee_left_60 = "(?P(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|甲|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包)" \ "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂))"\ "[))]?(信息|联系方式|概况)?[,,。::]?([((]?(1|2|1.1|1.2)[))]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)" # 367784094 隆道-大企业采购平台 采购商:C5石油树脂-中国建材集团有限公司-四川省/成都市/市辖区 self.pattern_tenderee_left_50 = "(?P((所需|需[用求]|购货|征集|发布|交易发起|开户|申报|填报|开票|收货)" \ "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂)|[转流]出方|文章来源|委托机构|产权所有人|承包权人|结算单位|收货地址)" \ "[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$|(采购商|招标人):(\w{2,10}-)?$)" self.pattern_tenderee_center = "(?P(受.{5,20}的?委托|现将[\w()()]{5,20}[\d年月季度至()]+采购意向|尊敬的供应商(伙伴)?:\w{5,20}(以下简称“\w{2,5}”)))" self.pattern_tenderee_right = "(?P^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束)|^([拟须需]|计划)(采购|招标|购置|购买)|^须购[买置]一批|作为(采购|招标)(人|单位))" #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价) self.pattern_tendereeORagency_right = "(?P(^拟对|^现?就|^现对))" self.pattern_agency_left = "(?P((代理|拍卖)(?:人|机构|公司|企业|单位|组织)|专业采购机构|集中采购机构|招标组织机构|交易机构|集采机构|[招议))]+标机构|(采购|招标)代理)(名称|.{,4}名,?称|全称)?(是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))" self.pattern_agency_right = "(?P^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)" # |^受托 会与 受托生产等冲突,代理表达一般会在后面有逗号 # 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构 self.pattern_winTenderer_left_50 = "(?P" \ "(乙|竞得|受让|买受|签约|施工|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租((包))?|入围|入选|竞买)(候选|投标)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?)(:?单位名称|:?名称|盖章)?[::是为]+$" \ 
"|(选定单位|指定的中介服务机构|实施主体|中标银行|中标通知书,致|征集结果|选择中介|选择结果|成交对象|勘察人|(,|审计|处置|勘察|设计)服务单位|受托[人方])[::是为]+$" \ "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|成交供应商信息[,:]?(序号1)?:?|供应商名称$" \ "|单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$" \ "|现(公布|宣布|公示)中标单位如下:$|现将中标单位(公布|公示)如下:$|现宣布以下(企业|单位|公司)中标:$)" # 承办单位:不作为中标 83914772 self.pattern_winTenderer_left_60 = "(?P" \ "(,|。|:|^)((中标(投标)?|[拟预]中标|中选|中价|中签|成交)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?)|(中标候选人)?第?[一1]名|第[一1](中标|中选|成交)?候选人|服务机构)" \ "(:?单位名称|:?名称|盖章)?[,,]?([((]按综合排名排序[))]|:择优选取)?[::,,]$)" # 解决表头识别不到加逗号情况,需前面为,。空 self.pattern_winTenderer_left_55 = "(?P(中标(投标)?|[拟预]中标|中选|中价|中签|成交|入选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?)" \ "(:?单位名称|:?名称|盖章)?([((]按综合排名排序[))]|:择优选取)?[::是为]+$" \ "|结果公示如下:摇出球号:\d+号,中介机构:$|直购企业:$)" # 取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系 # 中标候选人不能作为中标 self.pattern_winTenderer_right = "(?P(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选|排序)?(人|单位|机构|供应商|公司|企业|厂商)))|" \ "^((报价|价格)最低,|以\w{5,10}|\w{,20})?(确定|成|作)?为[\w“”()]{3,25}((成交|中选|中标|服务)(人|单位|供应商|企业|公司)|供货单位|供应商|第一中标候选人)[,。]" \ "|^:贵公司参与|^:?你方于|^(胜出)?中标。|^取得中标(单位)?资格" \ "|^通过(挂牌|拍卖)方式(以[\d.,]+万?元)?竞得|^[((](中标|成交|承包)人名?称?[))]))" self.pattern_winTenderer_whole = "(?P(贵公司|由).{,15}以\w{,15}中标|确定[\w()]{5,20}为[^,。;]{5,50}的?中标单位" \ "|选定报价最低的[“”\w()]{5,25}为[^,。;]{5,50}的?(服务|中标|成交)单位" \ "|拟邀请[\w()]{5,20}(进行)?单一来源谈判|(承办单位|报价人|投标人|中介机构)(名称)?:[\w()]{5,20},(中标|承办|中选)价格" \ "|(谈判结果:|结果|最终|确定|决定)[以由为][^,。;]{5,25}(向我单位)?(供货|承担|承接|中标|竞买成功)|中标通知书.{,15}你方|单一来源方?式?[从向][()\w]{5,20}采购)" # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货 self.pattern_secondTenderer_left = "(?P((第[二2]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$)|((评审结果|名次|排名|排序)[::]第?[二2]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))" self.pattern_secondTenderer_right = "(?P^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))" self.pattern_thirdTenderer_left = "(?P(第[三3]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$|((评审结果|名次|排名|排序)[::]第?[三3]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))" 
self.pattern_thirdTenderer_right = "(?P^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))" self.condadate_left = "(?P(((中标|成交|入围|入选)候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?)|服务单位)(:?单位名称|:?名称|全称|(?盖\w{,5}章)?|如下|:?牵头人)?[::是为]+$)" self.pattern_left = [ self.pattern_tenderee_left_60, self.pattern_tenderee_left_55, self.pattern_tenderee_left_50, self.pattern_agency_left, self.pattern_secondTenderer_left, self.pattern_thirdTenderer_left, self.pattern_winTenderer_left_60, self.pattern_winTenderer_left_55, self.pattern_winTenderer_left_50, ] self.pattern_whole = [ self.pattern_winTenderer_whole, self.pattern_tenderee_center, ] self.pattern_right = [ self.pattern_thirdTenderer_right, self.pattern_secondTenderer_right, self.pattern_agency_right, self.pattern_tendereeORagency_right, self.pattern_tenderee_right, self.pattern_winTenderer_right, ] self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"]) self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?为\w{2,4}资金|采购成本价") # |建安费用 不作为招标金额 self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(综合)?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):") # 单写 总价 不能作为中标金额,很多表格有单价、总价 self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标") self.pattern_money_other = re.compile("代理费|服务费") self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)" # self.role_file = open('/data/python/lsm/role_rule_predict.txt', 'a', encoding='utf-8') def _check_input(self,text, ignore=False): if not text: return [] if not isinstance(text, list): text = [text] null_index = [i for i, t in enumerate(text) if not t] if null_index and not ignore: raise Exception("null text in input ") return text def ser_role(self, pattern_list, text, entity_text): for _pattern in pattern_list: for _iter in 
re.finditer(_pattern, text):
                for _group, _v_group in _iter.groupdict().items():
                    if _v_group is not None and _v_group != "":
                        # group name encodes role_direction[_weight], e.g. tenderee_left_55
                        _role = _group.split("_")[0]
                        if _role == "tendereeORagency":  # 2022/3/9 ambiguous tenderee-vs-agency check
                            if is_agency(entity_text):
                                _role = 'tenderee'
                            else:
                                _role = "agency"
                        _direct = _group.split("_")[1]
                        # numeric suffix is the recall probability (percent); default 0.55
                        prob = int(_group.split("_")[2])/100 if len(_group.split("_")) == 3 else 0.55
                        _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                  "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
                        return (_label, prob, _iter.group(0))
        return (5, 0.5, '')

    def rule_predict(self, before, center, after, entity_text):
        """Rule-match left/whole/right context; return (label, prob, matched_flag, keyword)."""
        _label, _prob, keyword = self.ser_role(self.pattern_left, before, entity_text)  # left-context match
        keyword = "left_" + keyword if keyword!="" else keyword
        if _label == 2 and re.search(
                '各.{,5}供应商|尊敬的供应商|[^\w]候选供应商|业绩|拟招|(交易|采购|招标|建设)服务(单位|机构)|第[四五六七4567]|是否中标:否|序号:\d+,\w{,2}候选|(排名|排序|名次):([4-9]|\d{2,})|未(中[标选]|入围)|不得确定为|(响应|参[加与]报价|通过资格审查)的?供应商',
                # 135463002: e.g. "拟招一家供应商为..." must not be a winner
                before) != None:
            _label = 5
        elif _label == 2 and re.search('为$', before) and re.match('\w', after):
            # left context ends with 为 but right side is not punctuation, e.g. 353824459
            _label = 5
        elif _label == 2 and re.search('评委|未中标', after[:5]):  # 397194341 false winner recall
            _label = 5
        if _label == 5:
            _label, _prob, keyword = self.ser_role(self.pattern_whole, before + center + after, entity_text)  # whole-context match
            keyword = 'whole_'+ keyword[:keyword.find(entity_text)] if keyword!="" else keyword
            if _label == 2 and re.search('以[^,。;]{10,30}为准', before + center + after)!=None:
                _label = 5
            # NOTE(review): `a and b or c` precedence here means (a and b) or c —
            # looks suspicious; confirm the intended grouping.
            if _label != 5 and self.ser_role(self.pattern_whole, before, entity_text)[0] != 5 or \
                    self.ser_role(self.pattern_whole, after, entity_text)[0] != 5:
                _label = 5
        if _label == 5:
            _label, _prob, keyword = self.ser_role(self.pattern_right, after, entity_text)  # right-context match
            keyword = "right_" + keyword if keyword!="" else keyword
        # NOTE(review): the pattern below has an unbalanced ')' — corrupted in
        # this source; it will raise re.error if reached.
        if _label==5 and re.search('(中标|中选|成交)?)(结果)?(公告|公示|通知书?),', before) and re.match(':', after):
            _label = 2
            _prob = 0.5
        _flag = False if _label==5 else True
        return (_label, _prob, _flag, keyword)

    def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5):
        """Rule-based role recall over all org/company/money entities of each article."""
        for article, list_entity, list_sentence, list_codename in zip(list_articles, list_entitys, list_sentences, list_codenames):
            list_sentence.sort(key=lambda x: x.sentence_index)  # 2022/1/5 sort by sentence order
            # list_name = list_codename["name"]
            list_name = []  # 2022/1/5: use all project-name entities instead of codenames
            name_entitys = []  # 2023/6/30: keep name entities to test role-inside-name by position
            candidates = []  # candidates whose rank could not be determined, 2023/04/14
            notfound_tenderer = True  # no top-3 winner/candidate found yet
            for entity in list_entity:
                if entity.entity_type == 'name':
                    list_name.append(entity.entity_text)
                    name_entitys.append(entity)
            list_name = self._check_input(list_name) + [article.title]
            for p_entity in list_entity:
                if p_entity.entity_type in ["org", "company"]:
                    # only relabel entities with no role or a below-threshold probability
                    if p_entity.label is None:
                        continue
                    # entities whose context includes the title get prob 0.6 — a title
                    # entity is not necessarily the tenderee
                    if str(p_entity.label) == "0":
                        find_flag = False
                        for _sentence in list_sentence:
                            if _sentence.sentence_index == p_entity.sentence_index:
                                _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
                                                   end_index=p_entity.end_index, size=20, center_include=True,
                                                   word_flag=True, use_text=True,
                                                   text=re.sub(")", ")", re.sub("(", "(", p_entity.entity_text)))
                                if re.search(self.pattern_tenderee_left_50, _span[0]) or re.search(self.pattern_tenderee_left_55, _span[0]):
                                    # a left keyword exempts the entity from the project-name test
                                    find_flag = True
                                    break
                                if re.search('(项目|工程|招标|采购(条目)?|合同|标项|标的|计划|询价|询价单|询价通知书|申购单|申购)(名称|标名|标题|主题):$', _span[0]):
                                    find_flag = True
                                    if re.search('(局|院|府|学|处|站|会|所|校|馆|队|厅|室|司|心|园|厂)$', p_entity.entity_text):
                                        p_entity.values[0] = 0.6 if p_entity.values[0]>0.6 else 0.55
                                    else:
                                        p_entity.values[0] = on_value  # entity inside project name -> lowest prob
                                    break
                                for _name in name_entitys:
                                    if _name.sentence_index == p_entity.sentence_index and p_entity.wordOffset_begin >=_name.wordOffset_begin and p_entity.wordOffset_end < _name.wordOffset_end:
                                        find_flag = True
                                        if re.search('(局|院|府|学|处|站|会|所|校|馆|队|厅|室|司|心|园|厂)$', p_entity.entity_text):
                                            p_entity.values[0] = 0.6 if p_entity.values[0] > 0.6 else 0.55
                                        else:
                                            p_entity.values[0] = on_value  # entity inside project name -> lowest prob
                                        break
                                # (legacy commented-out probability adjustment / list_name
                                #  matching block from 2022/03/08 removed for readability)
                        if find_flag:
                            continue
                    # regex-recall roles for below-threshold or "no role" entities
                    role_prob = float(p_entity.values[int(p_entity.label)])
                    if role_prob < on_value or str(p_entity.label) == "5":
                        # entities appearing in the title/project name default to tenderee
                        _list_name = self._check_input(list_name, ignore=True)
                        find_flag = False
                        for _name in _list_name:  # 2022/1/5: every occurrence of the project name is tagged
                            if str(_name).find(re.sub(")", ")", re.sub("(", "(", p_entity.entity_text))) >= 0 and p_entity.sentence_index < 4:
                                for _sentence in list_sentence:
                                    if _sentence.sentence_index == p_entity.sentence_index:
                                        _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
                                                           end_index=p_entity.end_index, size=20, center_include=True,
                                                           word_flag=True, use_text=True,
                                                           text=re.sub(")", ")", re.sub("(", "(", p_entity.entity_text)))
                                        if _span[2].startswith(":"):
                                            # 368122675: colon after entity -> not a tenderee
                                            break
                                        if str(_span[0][-len(str(_name)):]+_span[1] + _span[2][:len(str(_name))]).find(
                                                _name) >= 0:
                                            if is_agency(p_entity.entity_text):  # 2024/3/29 unified agency check
                                                find_flag = True
                                                _label = 1
                                                p_entity.label = _label
                                                p_entity.values[int(_label)] = on_value
                                                break
                                            else:
                                                find_flag = True
                                                _label = 0
                                                p_entity.label = _label
                                                p_entity.values[int(_label)] = on_value
                                                # FIXME(review): the next line is corrupted in this
                                                # source (a '<...>' span was stripped); the original
                                                # condition is unrecoverable here.
                                                if 6= 4: break
                                if find_flag:
                                    break
                        # entity found in the title: default tenderee, skip the rule matching below
                        if find_flag:
                            continue
                        for s_index in range(len(list_sentence)):
                            if p_entity.doc_id == list_sentence[s_index].doc_id and p_entity.sentence_index == \
                                    list_sentence[s_index].sentence_index:
                                tokens = list_sentence[s_index].tokens
                                begin_index = p_entity.begin_index
                                end_index = p_entity.end_index
                                size = 40  # 15
                                spans = spanWindow(tokens, begin_index, end_index, size, center_include=True, word_flag=True, use_text=False)
                                # special handling for 中标通知书 (award notice) documents
                                try:
                                    if s_index == 0 and re.search('中标通知书.{,30}[,:]%s:'%p_entity.entity_text.replace('(', '').replace(')', ''), list_sentence[s_index].sentence_text.replace('(', '').replace(')', '')[:100]):
                                        p_entity.label = 2
                                        p_entity.values[2] = 0.5
                                        notfound_tenderer = False
                                        break
                                except Exception as e:
                                    print('正则报错:', e)
                                before, center, after = spans[0], spans[1], spans[2]
                                entity_text = p_entity.entity_text
                                _label, _prob, _flag, kw = self.rule_predict(before, center, after, entity_text)
                                # (legacy role_file logging removed)
                                # apply the rule result
                                if _flag:
                                    if _label in [2, 3, 4]:
                                        notfound_tenderer = False
                                    p_entity.label = _label
                                    # p_entity.values[int(_label)] = on_value + p_entity.values[int(_label)] / 10
                                    p_entity.values[_label] = _prob + p_entity.values[int(_label)] / 10
                                    break
                                if re.search(self.condadate_left, before) and re.search('尊敬的|各', before[-10:])==None:
                                    candidates.append(p_entity)
                                # (large legacy commented-out whole-pattern + distance conflict
                                #  resolution block, 2021/6/11–2022/3/9, removed for readability)
                    elif str(p_entity.label) in ['2', '3', '4']:
                        notfound_tenderer = False
                # money entities labelled "other" may be recalled as tenderee/winner amounts
                if p_entity.entity_type in ["money"]:
                    if str(p_entity.label) == "2":
                        for _sentence in list_sentence:
                            if _sentence.sentence_index == p_entity.sentence_index:
                                _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index, end_index=p_entity.end_index, size=10, center_include=True, word_flag=True, text=p_entity.entity_text)
                                if re.search('(含|在|包括)(\d+)?$', _span[0]):
                                    continue
                                if re.search(',\w{2,}', _span[0]):
                                    # avoid misjudging several adjacent prices
                                    _span[0] = _span[0].split(',')[-1] if len(_span[0].split(',')[-1])>4 else _span[0][-8:]
                                if re.search(self.pattern_money_tenderee, _span[0]) is not None and re.search(
                                        self.pattern_money_other, _span[0]) is None:
                                    p_entity.values[0] = 0.8 + p_entity.values[0] / 10
                                    p_entity.label = 0
                                # print('规则召回预算金额:', p_entity.entity_text, _span[0])
                                if re.search(self.pattern_money_tenderer, _span[0]) is not None:
                                    if re.search(self.pattern_money_other, _span[0]) is not None:
                                        # both winner- and other-money keywords: the later match wins
                                        if re.search(self.pattern_money_tenderer, _span[0]).span()[1] > \
                                                re.search(self.pattern_money_other, _span[0]).span()[1]:
                                            p_entity.values[1] = 0.8 + p_entity.values[1] / 10
                                            p_entity.label = 1
                                    else:
                                        p_entity.values[1] = 0.8 + p_entity.values[1] / 10
                                        p_entity.label = 1
                                if re.search(self.pattern_money_tenderer_whole,
                                             "".join(_span)) is not None and re.search(self.pattern_money_other, _span[0]) is None:
                                    p_entity.values[1] = 0.8 + p_entity.values[1] / 10
                                    p_entity.label = 1
                                # adjacent per-lot amounts, e.g. 191705231.
                                # NOTE(review): this pattern has an unbalanced ')' — corrupted
                                # in this source; it will raise re.error if reached.
                                elif re.search('(预算金额|最高(投标)?上?限[价额]?格?|招标控制价))?:?([\d.,]+万?元[,(]其中)?(第?[一二三四五0-9](标[段|包]|[分子]包):?[\d.,]+万?元,)*第?[一二三四五0-9](标[段|包]|[分子]包):?$' , _sentence.sentence_text[:p_entity.wordOffset_begin]):
                                    p_entity.values[0] = 0.8 + p_entity.values[0] / 10
                                    p_entity.label = 0
                                    # print('规则召回预算金额2:', p_entity.entity_text, _sentence.sentence_text[:p_entity.wordOffset_begin])
            # exactly one distinct undetermined candidate + a result-type title/lead:
            # promote that candidate to winner
            if notfound_tenderer and len(set([ent.entity_text for ent in candidates])) == 1 and re.search(
                    '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|磋商|交易)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书',
                    article.title+article.content[:100]):
                for p_entity in candidates:
                    p_entity.label = 2
                    p_entity.values[2] = on_value
            # extend tenderee money: a tenderee amount followed by consecutive
            # unlabeled amounts that all carry lot/pack markers becomes tenderee money
            list_p = []
            state = 0
            for p_entity in list_entity:
                for _sentence in list_sentence:
                    if _sentence.sentence_index == p_entity.sentence_index:
                        _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index, end_index=p_entity.end_index, size=20, center_include=True, word_flag=True, text=p_entity.entity_text)
                        if state == 2:
                            # flush: promote everything after the seed amount
                            for _p in list_p[1:]:
                                _p.values[0] = 0.8 + _p.values[0] / 10
                                _p.label = 0
                            state = 0
                            list_p = []
                        if state == 0:
                            if p_entity.entity_type in ["money"]:
                                if str(p_entity.label) == "0" and re.search(self.pattern_pack, _span[0] + "-" + _span[2]) is not None:
                                    state = 1
                                    list_p.append(p_entity)
                        elif state == 1:
                            if p_entity.entity_type in ["money"]:
                                if str(p_entity.label) in ["0", "2"] and re.search(self.pattern_pack, _span[0] + "-" + _span[
                                    2]) is not None and re.search(
                                        self.pattern_money_other, _span[0] + "-" + _span[2]) is None and p_entity.sentence_index == list_p[
                                        0].sentence_index:
                                    list_p.append(p_entity)
                                else:
                                    state = 2
            if len(list_p) > 1:
                for _p in list_p[1:]:
                    _p.values[0] = 0.8 + _p.values[0] / 10
                    _p.label = 0
                state = 0
                list_p = []
            for p_entity in list_entity:
                # blacklisted entity texts can never hold a role
                if p_entity.entity_text in self.SET_NOT_TENDERER:
                    p_entity.label = 5


'''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
class RoleRuleFinalAdd():
    def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
        '''
        Final rule-based role recall: the "entity-name + date" signature at the
        end of the announcement is taken as tenderee or agency.
        :param list_articles:
        :param list_sentences:
        :param list_entitys:
        :param list_codenames:
        :return:
        '''
        # text_end = list_articles[0].content.split('##attachment##')[0][-40:]
        main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
        if len(list_sentences[0])>0 and list_sentences[0][-1].in_attachment:
            main_sentences = list_sentences[0][-1:] + main_sentences[-2:]
        if len(main_sentences)==0:
            return 0
        # 402073799: last five sentences, scanned back to front, looking for
        # the trailing role + date signature
        for sentence in main_sentences[-5:][::-1]:
            text_end = "".join(sentence.tokens)
            # strip URLs
            text_end = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", '', text_end)
            # strip trailing org-suffix / attachment noise, e.g. 285264698
            # "传真:0512-...,苏州卫生职业技术学院,国有资产管理处,2022年11月24日"
            text_end = re.sub(',?(招标办|招投标管理中心|国有资产管理处|采办共享中心|采购与招标管理办公室|附件\d*:[^附件,。]{5,100}\.(docx|doc|rar|xlsx|xls|jpg|pdf)|附件\d*:.{,100})', '', text_end)[-200:]
            # sear_ent = 
re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end) sear_ent = re.search('([,。;]|^)(?P[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,8})?),?\s*(公告日期:)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end) if sear_ent: b, e = sear_ent.span() if re.search('报价记录|竞价成交', text_end[max(b-10, 0):b] + text_end[e:]): sear_ent = None break if sear_ent == None: text_end = list_articles[0].content[-100:] sear_ent = re.search( '([,。;]|^)(?P[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,8})?),?\s*(公告日期:)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end) if sear_ent: b, e = sear_ent.span() if re.search('报价记录|竞价成交', text_end[max(b-10, 0):b] + text_end[e:]): sear_ent = None sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000]) sear_ent2 = re.search('[,:](户名|开户名称|发票抬头|单位名称|名称)[::](?P[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000]) if sear_ent2 and sear_ent2.group(1) in ['单位名称','名称'] and re.search('投标报价|(中标|成交|结果|候选人|评标|开标)(公告|公示)', list_articles[0].content[:5000]): # 排除 341354479 这种作为招标人 sear_ent2 = None sear_ent3 = re.search('(买家信息|所有权人|土地权属单位|报名咨询|[收送交]货地点)[,:](?P[\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000]) sear_ent4 = re.search('(发布(?:人|单位|机构|企业)|项目业主|所属公司|寻源单位)[,::][A-Za-z0-9_]*(?P[\u4e00-\u9fa5()()]{4,20})[,。]', list_articles[0].content[:5000]) sear_list = [sear_ent4 , sear_ent3 , sear_ent2 ,sear_ent1, sear_ent] tenderee_notfound = True agency_notfound = True tenderee_list = [] agency_list = [] ents = [] for ent in list_entitys[0]: if ent.entity_type in ['org', 'company']: if ent.label == 0 and ent.values[ent.label]>0.55: if '公共资源交易中心' in ent.entity_text: # 公共资源交易中心不算招标或代理,只算平台 # ent.label = 5 ent.values[ent.label] = 0.6 if ent.values[ent.label]>0.6 else 0.5 # 改为降低概率,不改类别,防止 382573066 明显招标人表达不提取 continue tenderee_list.append(ent.entity_text) tenderee_notfound = False 
elif ent.label == 1 and ent.values[ent.label]>0.55: agency_list.append(ent.entity_text) agency_notfound = False elif ent.label == 5: if '公共资源交易中心' in ent.entity_text: continue ents.append(ent) if sear_ent or sear_ent1 or sear_ent2 or sear_ent3 or sear_ent4: for _sear_ent in [_sear for _sear in sear_list if _sear]: ent_re = _sear_ent.group('entity') ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")") if tenderee_notfound or agency_notfound: n = 0 for i in range(len(ents) - 1, -1, -1): if not ents[i].in_attachment: n += 1 if n > 3 and _sear_ent==sear_ent: # 文章末尾角色加日期这种只找后三个实体 break elif _sear_ent==sear_ent and ents[i].label != 5: # 后面有角色的实体的停止继续往前 break if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and re.search('(大学|中学|小学|幼儿园|医院)$', ents[i].entity_text)) or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6): if agency_notfound and is_agency(ents[i].entity_text) and ents[i].entity_text not in tenderee_list: ents[i].label = 1 ents[i].values[1] = 0.51 # 修改为比标题概率略高 agency_notfound = False elif tenderee_notfound and not is_agency(ents[i].entity_text) and ents[i].entity_text not in agency_list: ents[i].label = 0 ents[i].values[0] = 0.51 # 修改为比标题概率略高 tenderee_notfound = False # log('正则最后补充实体: %s'%(ent_re)) break if not tenderee_notfound: break # 招标人角色召回规则 class TendereeRuleRecall(): def __init__(self): # self.tenderee_left = re.compile("(发布(人|单位|机构)|需求方(信息[,:])?(单位|公司)?名称|购买主体|收货单位|项目申请单位|发起组织|联系单位|" # "询价(机构|企业)|联系(人|方式),?(单位|公司)(名称)?|联系(人|方式),名称)[::是为][^。;,]{,5}$") # self.tenderee_left_1 = re.compile("采购商公司|询价单位|项目法人单位|项目法人|项目业主名称|申购单位|预算单位|预算单位名称|预算单位单位名称|买方单位|需求公司|寻源单位|项目业主|采购商|业主单位咨询电话|需用单位|采购工厂|征集单位") self.tenderee_left_1 = re.compile("((?:采购商|项目法人|项目业主)(名称)?|(?:采购商|询价|项目法人|项目业主|申购|预算|买方|需求|寻源|需用|征集)(单位|公司)((?:单位|公司)?名称)?|询价企业|" "业主单位咨询电话|购买主体|采购工厂|需求方(信息[,:])?(单位|公司)?名称|采购单位[\((].{1,6}[\))])[::是为][^。;,]{,2}$") self.tenderee_left_2 = 
re.compile("(招标承办单位|交易人(?:名称)?|招标人代表|(采购|招标)联系人|交易单位|发起(单位|组织)|收货单位|使用方|买家信息)[::是为][^。;,]{,2}$") self.tenderee_left_3 = re.compile("[本我](?:公司|单位)[\(\[(【]?$") # self.tenderee_left_4 = re.compile("(采购机构|组织机构|组织方|执行单位|采购组织单位|招标组织单位|招标组织部门|采购执行方|采购执行单位|询价执行组织|组织单位|联系单位|联系部门)[::是为][^。;,]{,2}$") self.tenderee_left_4 = re.compile("(采购机构|(?:采购|招标|询价)?(组织|执行)(机构|方|单位|部门|组织)|联系(单位|部门)|联系(人|方式),?(单位|公司)(名称)?|联系(人|方式),名称)[::是为][^。;,]{,2}$") self.tenderee_left_5 = re.compile("(撰写单位|发布(?:人|单位|机构|公司|部门|企业))[^。;,]{,2}$") self.tenderee_right = re.compile("^[^。;::]{,5}[((](以?下简?称)?,?[,\"“]*[我本][\u4e00-\u9fa5]{1,2}[,\"”]*[))]|" "^[\((][^。;::\))]{,5}称(?:招标|采购)(?:人|单位)|" "^[^。;::]{,10}[对就][^。;,]+,?[^。;,]{,20}进行[^。;,]*(采购|询比?价|遴选|招投?标|征集)|" "^[^。;::]{,10}关于[^。;,]+,?[^。;,]{,20}的[^。;,]{,20}公告|" "^[^。;,::]{,10}的[^。;,]+,?[^。;,]{,20}正在[^。;,]{,5}进行|" "^[^。;,::]{,10}的[^。;,]+,?[^。,;]{,20}已?[^。;,]{,20}批准|" "^[^。;,::]{,15}(选定|选取|征集|遴选)[^。;,]{,20}(供应商|(代理|咨询|设计)[^。;,]{,5}机构|代理人)") self.tenderee_right2 = re.compile("^[^。;,::]{,10}(招标办|采购部|办事处|采购小?组)") self.tenderee_right3 = re.compile("^[^。;,::]{,10}(对|就|关于|的)(?P[^。;,?!::]{4,40})") # 公告主语判断规则 self.subject = re.compile("[我本][院校局]") # 未识别实体召回正则 self.unrecognized1 = re.compile("(?P((遴选|采购|招标|竞价|议价|比选|委托|询比?价|评选|谈判|邀标|邀请|洽谈|约谈)" \ "(人|商|公司|单位|组织|用户|业主|主体|方|部门))" \ "(信息[,:]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|:|:)+)(?P[^,。::;]+)[,。;::]") self.unrecognized2 = re.compile("(?P((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|选取|抽取|抽选|出售|标卖|比价|处置)" \ "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\ "[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|:|:)+)(?P[^,。::;]+)[,。;::]") # 未识别实体尾部判断 # self.unrecognized_end1 = re.compile( # "^[\u4e00-\u9fa5]{2,}?(?:公司|医院|学校|学院|大学|中学|小学|幼儿园|政府|指挥部|办公室|项目部|业主大会|监狱|教育局|委员会|研究所|招标办|采购部|办事处|水利局|公墓|中心|联合社|合作社)") # self.unrecognized_end2 = re.compile("^[\u4e00-\u9fa5]{4,}(?:署|局|厅|处|室|科|部|站|所|股|行|园)") def predict(self, list_articles,list_sentences, list_entitys, 
list_codenames): self.get_tenderee = False ents = [] list_name = [] agency_set = set() for ent in list_entitys[0]: if ent.entity_type == 'name': list_name.append(ent.entity_text) if ent.entity_type in ['org', 'company']: if ent.label == 0 and ent.values[ent.label]>=0.5: self.get_tenderee = True break elif ent.label == 1: if ent.values[ent.label]>0.5: agency_set.add(ent.entity_text) elif ent.label == 5: if len(ent.entity_text)>=4: ents.append(ent) if not self.get_tenderee: self.entity_context_rule(ents,list_name,list_sentences,list(agency_set)) if not self.get_tenderee: self.subject_rule(ents,list_articles,list_sentences) # if not self.get_tenderee: # self.unrecognized_entity_rule(self.unrecognized1,list_sentences,list_entitys,0.55) # if not self.get_tenderee: # self.unrecognized_entity_rule(self.unrecognized2,list_sentences,list_entitys,0.5) #entity上下文正则判断 def entity_context_rule(self,entitys,list_name,list_sentences,list_agency): list_sentences[0].sort(key=lambda x:x.sentence_index) entity_data = [] for ent in entitys: _sentence = list_sentences[0][ent.sentence_index] _span = spanWindow(tokens=_sentence.tokens, begin_index=ent.begin_index, end_index=ent.end_index, size=40, center_include=True, word_flag=True, use_text=True, text=re.sub(")", ")", re.sub("(", "(", ent.entity_text))) entity_data.append((ent,_span)) if not self.get_tenderee: for _data in entity_data: ent = _data[0] _span = _data[1] if re.search(self.tenderee_left_1,_span[0]): ent.label = 0 ent.values[0] = 0.5 + ent.values[0] / 10 self.get_tenderee = True if not self.get_tenderee: for _data in entity_data: ent = _data[0] _span = _data[1] if re.search(self.tenderee_left_2,_span[0]): ent.label = 0 ent.values[0] = 0.5 + ent.values[0] / 10 self.get_tenderee = True if not self.get_tenderee: for _data in entity_data: ent = _data[0] _span = _data[1] if re.search(self.tenderee_left_3,_span[0]): ent.label = 0 ent.values[0] = 0.5 + ent.values[0] / 10 self.get_tenderee = True if not self.get_tenderee: for _data 
in entity_data: ent = _data[0] _span = _data[1] if re.search(self.tenderee_left_4,_span[0]): if len(list_agency)>0: _same = False for agency in list_agency: if ent.entity_text in agency or agency in ent.entity_text: _same = True break if not _same: ent.label = 0 ent.values[0] = 0.5 + ent.values[0] / 10 self.get_tenderee = True else: if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent.entity_text ) or not re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent.entity_text) or re.search("自行.?采购",list_sentences[0][ent.sentence_index].sentence_text): ent.label = 0 ent.values[0] = 0.5 + ent.values[0] / 10 self.get_tenderee = True if not self.get_tenderee: for _data in entity_data: ent = _data[0] _span = _data[1] if re.search(self.tenderee_left_5,_span[0]): if len(list_agency)>0: _same = False for agency in list_agency: if ent.entity_text in agency or agency in ent.entity_text: _same = True break if not _same: ent.label = 0 ent.values[0] = 0.5 + ent.values[0] / 10 self.get_tenderee = True else: if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent.entity_text ) or not re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent.entity_text): ent.label = 0 ent.values[0] = 0.5 + ent.values[0] / 10 self.get_tenderee = True if not self.get_tenderee: for _data in entity_data: ent = _data[0] _span = _data[1] if re.search(self.tenderee_right, _span[2]): ent.label = 0 ent.values[0] = 0.5 + ent.values[0] / 10 self.get_tenderee = True if not self.get_tenderee: for _data in entity_data: ent = _data[0] _span = _data[1] if re.search(self.tenderee_right2, _span[2]): ent.label = 0 ent.values[0] = 0.5 + ent.values[0] / 10 self.get_tenderee = True if not self.get_tenderee: if list_name: for _data in entity_data: ent = _data[0] _span = _data[1] pj_name = re.search(self.tenderee_right3, _span[2]) if pj_name: pj_name = pj_name.groupdict()["project"] for _name in list_name: if _name in pj_name: ent.label = 0 ent.values[0] = 0.5 self.get_tenderee = True break # for _data in entity_data: # 
ent = _data[0] # _span = _data[1] # if re.search(self.tenderee_left,_span[0]): # ent.label = 0 # ent.values[0] = 0.5 + ent.values[0] / 10 # self.get_tenderee = True # elif re.search(self.tenderee_right,_span[2]): # ent.label = 0 # ent.values[0] = 0.5 + ent.values[0] / 10 # self.get_tenderee = True # elif re.search(self.tenderee_right2, _span[2]): # ent.label = 0 # ent.values[0] = 0.5 + ent.values[0] / 10 # self.get_tenderee = True # elif list_name: # pj_name = re.search(self.tenderee_right3, _span[2]) # if pj_name: # pj_name = pj_name.groupdict()["project"] # for _name in list_name: # if _name in pj_name: # ent.label = 0 # ent.values[0] = 0.5 # self.get_tenderee = True # break # 公告主语判断 def subject_rule(self, entitys,list_articles,list_sentences): content = list_articles[0].content.split('##attachment##')[0] if re.search(self.subject,content): _subject = re.search(self.subject,content).group() for ent in entitys: if re.search("院",_subject) and re.search("医院|学院",ent.entity_text): ent.label = 0 ent.values[0] = 0.5 + ent.values[0] / 10 self.get_tenderee = True elif re.search("校",_subject) and re.search("学校|学院|大学|高中|初中|中学|小学",ent.entity_text): ent.label = 0 ent.values[0] = 0.5 + ent.values[0] / 10 self.get_tenderee = True elif re.search("局", _subject) and re.search("局", ent.entity_text): _sentence = list_sentences[0][ent.sentence_index] _span = spanWindow(tokens=_sentence.tokens, begin_index=ent.begin_index, end_index=ent.end_index, size=20, center_include=True, word_flag=True, use_text=True, text=re.sub(")", ")", re.sub("(", "(", ent.entity_text))) if not re.search("监督|投诉",_span[0][-10:]): ent.label = 0 ent.values[0] = 0.5 + ent.values[0] / 10 self.get_tenderee = True # 正则召回未识别实体 # def unrecognized_entity_rule(self,pattern,list_sentences,list_entitys,on_value=0.5): # list_sentence = list_sentences[0] # for in_attachment in [False,True]: # for sentence in [sentence for sentence in list_sentence if sentence.in_attachment==in_attachment]: # sentence_text = 
# NOTE(review): the `(?P` fragments below are extraction damage — the original
# patterns were `(?P<role_direction_prob>...)` named groups (e.g.
# `(?P<tenderee_left_9>...)`); RoleGrade.predict() recovers the group name via
# `pattern.split('>')[0].replace('(?P<', '')` and splits it into
# (role, direction, probability-tier), so these string literals MUST be
# restored from VCS for the class to work at all.
# Line 35 also carries the tail of the commented-out unrecognized_entity_rule.
sentence.sentence_text # tokens = sentence.tokens # doc_id = sentence.doc_id # in_attachment = sentence.in_attachment # list_tokenbegin = [] # begin = 0 # for i in range(0, len(tokens)): # list_tokenbegin.append(begin) # begin += len(str(tokens[i])) # list_tokenbegin.append(begin + 1) # for _match in re.finditer(pattern,sentence_text): # _groupdict = _match.groupdict() # _match_text = _match.group() # _unrecognized_text = _groupdict["unrecognized"] # _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text) # if not _unrecognized: # _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text) # if _unrecognized: # _unrecognized = _unrecognized.group() # else: # continue # # print(_unrecognized) # if re.search("某|乙方|代理",_unrecognized) or len(_unrecognized)>15: # continue # begin_index_temp = _match.start()+len(_groupdict['tenderee_left']) # for j in range(len(list_tokenbegin)): # if list_tokenbegin[j] == begin_index_temp: # begin_index = j # break # elif list_tokenbegin[j] > begin_index_temp: # begin_index = j - 1 # break # index = begin_index_temp + len(_unrecognized) # end_index_temp = index # for j in range(begin_index, len(list_tokenbegin)): # if list_tokenbegin[j] >= index: # end_index = j - 1 # break # entity_id = "%s_%d_%d_%d" % (doc_id, sentence.sentence_index, begin_index, end_index) # entity_text = _unrecognized # new_entity = Entity(doc_id, entity_id, entity_text, 'company', sentence.sentence_index, begin_index, end_index, # begin_index_temp, end_index_temp, in_attachment=in_attachment) # new_entity.label = 0 # new_entity.values = [on_value,0,0,0,0,0] # list_entitys[0].append(new_entity) # self.get_tenderee = True # if self.get_tenderee: # list_entitys[0] = sorted(list_entitys[0], key=lambda x: (x.sentence_index, x.begin_index)) # break class RoleGrade(): def __init__(self): self.tenderee_left_9 = "(?P(招标|采购|遴选|寻源|竞价|议价|比选|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|方|单位))" self.tenderee_center_8 = "(?P受.{5,20}委托)" self.tenderee_left_8 =
# RoleGrade.predict: keyword rules re-grade the model's role probabilities
# into tiers (suffix _9/_8/_6/_5 of the group name -> 0.9/0.8/0.6/0.5); the
# pattern's 'left'/'right'/'center' token picks which context window (before /
# after / around the entity, `span` chars) the rule is matched against.
"(?P(尊敬的供应商|(需求|最终|发包|征集|甲|转让|出租|处置)(人|方|单位|组织|用户|业主|主体|部门|公司)))" self.tenderee_left_6 = "(?P(业主|建设|委托)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|业主|买方)" self.tenderee_left_5 = "(?P(发布)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|买方|发布机构)" self.agency_left_9 = "(?P代理)" self.winTenderer_left_9 = "(?P(中标|中选|中价|成交|竞得)|第[1一]名|排[名序]:1|名次:1)" self.winTenderer_left_8 = "(?P(入选供应商|供货商|乙方|最[终后]选[择取]))" # 229435497 最后选择西平,县中原彩印有限公司,作为此项目中标供应商, self.winTenderer_left_6 = "(?P(入围|承[接建包修做制担租销]))" self.secondTenderer_left_9 = "(?P(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排[名序]:2|名次:2))" self.thirdTenderer_left_9 = "(?P(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排[名序]:3|名次:3))" self.pattern_list = [self.tenderee_left_9,self.tenderee_center_8, self.tenderee_left_8,self.tenderee_left_6,self.tenderee_left_5,self.agency_left_9, self.winTenderer_left_9, self.winTenderer_left_8,self.winTenderer_left_6, self.secondTenderer_left_9, self.thirdTenderer_left_9] def predict(self, list_sentences, list_entitys, span=15, min_prob=0.7): ''' 根据规则给角色分配不同等级概率;分三级:0.9-1,0.8-0.9,0.7-0.8;附件0.7-0.8,0.6-0.7,0.5-0.6 修改概率小于0.6的且在大数据代理集合里面的招标人为代理人 :param list_articles: :param list_sentences: :param list_entitys: :param codeName: :return: ''' sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index) role2id = {"tenderee": 0, "agency": 1, "winTenderer": 2, "secondTenderer": 3, "thirdTenderer": 4} org_winner = [] company_winner = [] org_tenderee = [] agency_l = [] agency_like_tenderee = [] # 类似招标人的代理人实体列表 low_prob_agency = [] low_prob_tenderee = [] for entity in list_entitys[0]: if entity.entity_type in ['org', 'company'] and entity.label in [0, 1, 2, 3, 4] and entity.values[entity.label]> min_prob: text = sentences[entity.sentence_index].sentence_text in_att = sentences[entity.sentence_index].in_attachment pre_prob = entity.values[entity.label] # 模型预测角色概率 b = entity.wordOffset_begin e = entity.wordOffset_end not_found = 1 if re.search('(乙方:甲方:|甲方:乙方:)$', text[max(0, b-span):b]): entity.label = 0 if entity.entity_type == 'org'
# Adjustments inside the matcher below: attachment entities are penalized by
# 0.1; when the model's own probability was below the keyword tier the tier is
# capped at 0.65; names shorter than 6 chars (unless 大学/医院) and entities in
# an address/contact context each lose a further 0.05.
# NOTE(review): `(?或` in the second regex below looks like a stripped
# `(?:或...` non-capturing group — confirm against VCS.
else 2 entity.values[entity.label] = 0.55 continue elif re.search('(采购|招标)人(?或(采购|招标)?代理机构)?:$', text[max(0, b-span):b]): entity.label = 1 if is_agency(entity.entity_text) else 0 entity.values[entity.label] = 0.8 continue elif re.search('(采购|招标|询比?价|遴选|寻源|比选)机构[是为:]+', text[max(0, b-span):b]) and not is_agency(entity.entity_text): agency_like_tenderee.append(entity) for pattern in self.pattern_list: if 'left' in pattern: context = text[max(0, b-span):b] elif 'right' in pattern: context = text[e:e+span] elif 'center' in pattern: context = text[max(0, b-span):e+span] else: print('规则错误', pattern) ser = re.search(pattern, context) if ser: groupdict = pattern.split('>')[0].replace('(?P<', '') _role, _direct, _prob = groupdict.split('_') _label = role2id.get(_role) if _label != entity.label: continue _prob = int(_prob)*0.1 # print('规则修改角色概率前:', entity.entity_text, entity.label, entity.values) if in_att: _prob = _prob - 0.1 # 0.2 if pre_prob < _prob: # 如果模型预测概率小于关键词概率 _prob = 0.65 if len(entity.entity_text) < 6 and re.search('大学|医院', entity.entity_text)==None: # 如果实体名称小于6个字,概率再降0.05 _prob -= 0.05 if re.search('(地址|联系方式):$', context): # 地址结尾的概率 概率降低 _prob -= 0.05 entity.values[_label] = _prob + entity.values[_label] / 20 not_found = 0 # print('规则修改角色概率后:', entity.entity_text, entity.label, entity.values) break if not_found and entity.values[entity.label]> min_prob: _prob = min_prob - 0.1 if in_att else min_prob entity.values[entity.label] = _prob + entity.values[entity.label] / 20 # print('找不到规则修改角色概率:', entity.entity_text, entity.label, entity.values) if entity.label == 2 and entity.values[entity.label]> min_prob: if entity.entity_type == 'org': org_winner.append(entity) elif entity.entity_type == 'company': company_winner.append(entity) # 保存中标人实体 if entity.label == 0 and entity.values[entity.label]> min_prob: org_tenderee.append(entity.entity_text) # 保存所有招标人名称 elif entity.label == 1 and entity.values[entity.label]> min_prob: agency_l.append(entity.entity_text) # if
# Tail of RoleGrade.predict: post-pass cross-checks —
#   * entities with 0.5<=p<0.6 graded agency but named like a known tenderee
#     (and vice versa) get their role swapped at 0.6;
#   * if no tenderee was found, "procurement agency"-context entities that are
#     not in the agency set are promoted to tenderee;
#   * an org winner that is also a tenderee, or an org winner coexisting with a
#     company winner, has its winner probability demoted to 0.6.
# Then class MoneyGrade: the same tier idea for money entities
# (tenderee budget/ceiling vs tenderer bid/contract amounts).
# NOTE(review): the `(?P` fragments are stripped named groups, originally
# `(?P<tenderee_money_left_9>...)` etc. — predict() parses role/direction/tier
# from the group name; restore from VCS.
entity.entity_type in ['org', 'company'] and entity.label == 0 and entity.entity_text in agency_set and entity.values[entity.label]<0.6: # 修改概率小于0.6的且在大数据代理集合里面的招标人为代理人 # # log('修改概率小于0.6的且在大数据代理集合里面的招标人为代理人%s:'%entity.entity_text) # entity.label = 1 # entity.values[entity.label] = 0.5 elif entity.entity_type in ['org', 'company'] and entity.label in [1, 0] and 0.5<=entity.values[entity.label]<0.6: if entity.label == 1: low_prob_agency.append(entity) else: low_prob_tenderee.append(entity) if org_tenderee == [] and agency_like_tenderee: for entity in agency_like_tenderee: entity.label = 0 entity.values[entity.label] = 0.6 for entity in low_prob_agency: if entity.entity_text in org_tenderee: entity.label = 0 entity.values[entity.label] = 0.6 for entity in low_prob_tenderee: if entity.entity_text in agency_l: entity.label = 1 entity.values[entity.label] = 0.6 if org_winner != []: flag = 0 if org_tenderee != []: for ent in org_winner: if ent.entity_text in org_tenderee: # log('如果org中标人同时为招标人角色,降低中标概率:%s, %s' % (ent.entity_text, ent.label)) ent.values[2] = 0.6 flag = 1 if flag == 0 and company_winner != []: for ent in org_winner: if ent.label == 2 and ent.values[2] > 0.6: # log('如果同时包含org和company中标人,降低org中标人概率为0.6:%s, %s' % (ent.entity_text, ent.values[2])) ent.values[2] = 0.6 class MoneyGrade(): def __init__(self): self.tenderee_money_left_9 = "(?P最高(投标)?限价)|控制价|拦标价" self.tenderee_money_left_8 = "(?P预算|限价|起始|起拍|底价|标底)" self.tenderer_money_left_9 = "(?P(中标|成交|合同|总报价))" self.tenderer_money_left_8 = "(?P(投标|总价))" self.pattern_list = [self.tenderee_money_left_8, self.tenderer_money_left_8, self.tenderee_money_left_9, self.tenderer_money_left_9] def predict(self, list_sentences, list_entitys, span=10, min_prob=0.7): sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index) role2id = {"tenderee": 0, "tenderer": 1} for entity in list_entitys[0]: if entity.entity_type in ['money'] and entity.label in [0, 1] and entity.values[entity.label]> 0.6: text =
# Tail of MoneyGrade.predict: "unit price" contexts, "(minimum/risk) control
# price" mentions, and amounts below 100 are demoted to the 0.6 tier;
# attachment amounts are penalized. Then class TimePredictor: a TF1
# SavedModel-backed classifier that re-types time entities; loaded once into a
# dedicated tf.Session/graph via tf.saved_model.loader.load(tags=["serve"]).
sentences[entity.sentence_index].sentence_text in_att = sentences[entity.sentence_index].in_attachment b = entity.wordOffset_begin e = entity.wordOffset_end context = text[max(0, b - span):b] not_found = 1 for pattern in self.pattern_list: ser = re.search(pattern, context) if ser: groupdict = pattern.split('>')[0].replace('(?P<', '') _role, _direct, _prob = groupdict.split('_') if re.search('单价', context[-4:]) or re.search('(最低|风险)控制价', context) or float(entity.entity_text)<100: _prob = 6 _label = role2id.get(_role) if _label != entity.label: continue _prob = int(_prob) * 0.1 # print('规则修改金额概率前:', entity.entity_text, entity.label, entity.values) if in_att: _prob = max(0.5, _prob - 0.2) entity.values[_label] = _prob + entity.values[_label] / 20 not_found = 0 # print('规则修改金额概率后:', entity.entity_text, entity.label, entity.values) break if not_found and entity.values[entity.label] > min_prob: if re.search('单价', context[-4:]) or re.search('(最低|风险)控制价', context) or float(entity.entity_text)<100: _prob = 0.6 elif in_att: _prob = max(0.5, min_prob - 0.1) else: _prob = min_prob # _prob = min_prob - 0.1 if in_att else min_prob entity.values[entity.label] = _prob + entity.values[entity.label] / 20 # print('找不到规则修改金额概率:', entity.entity_text, entity.label, entity.values) # 时间类别 class TimePredictor(): def __init__(self,config=None): self.sess = tf.Session(graph=tf.Graph(),config=config) self.inputs_code = None self.outputs_code = None self.input_shape = (2,40,128) self.load_model() def load_model(self): model_path = os.path.dirname(__file__)+'/timesplit_model' if self.inputs_code is None: log("get model of time") with self.sess.as_default(): with self.sess.graph.as_default(): meta_graph_def = tf.saved_model.loader.load(self.sess, tags=["serve"], export_dir=model_path) signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY signature_def = meta_graph_def.signature_def self.inputs_code = [] self.inputs_code.append(
# NOTE(review): `while(p_entitys= length: break` below is corrupted — the
# stripped `<...>` spans removed the loop conditions and several statements of
# search_time_data/embedding (originally `while(p_entitys<len(...))` style
# loops building left/right context windows of self.input_shape). This region
# cannot run as-is; restore from VCS.
# TimePredictor.predict: runs the session on the paired inputs, argmaxes the
# per-entity scores, and resets the label to 0 (with score 0.5) whenever the
# entity text fails timeFormat() validation before entity.set_Role().
self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name)) self.inputs_code.append( self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name)) self.outputs_code = self.sess.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name) return self.inputs_code, self.outputs_code else: return self.inputs_code, self.outputs_code def search_time_data(self,list_sentences,list_entitys): data_x = [] points_entitys = [] for list_sentence, list_entity in zip(list_sentences, list_entitys): p_entitys = 0 p_sentences = 0 list_sentence.sort(key=lambda x: x.sentence_index) while(p_entitys= length: break if item_not_space in model_w2v.vocab: embed[out_index][index] = model_w2v[item_not_space] index += 1 else: embed[out_index][index] = model_w2v['unk'] index += 1 out_index += 1 return embed def predict(self, list_sentences,list_entitys): datas = self.search_time_data(list_sentences, list_entitys) if datas is None: return points_entitys = datas[1] with self.sess.as_default(): predict_y = limitRun(self.sess,[self.outputs_code], feed_dict={self.inputs_code[0]:datas[0][0] ,self.inputs_code[1]:datas[0][1]})[0] for i in range(len(predict_y)): entity = points_entitys[i] label = np.argmax(predict_y[i]) values = [] for item in predict_y[i]: values.append(item) if label != 0: if not timeFormat(entity.entity_text): label = 0 values[0] = 0.5 entity.set_Role(label, values) # 产品字段提取 class ProductPredictor(): def __init__(self,config=None): vocabpath = os.path.dirname(__file__) + "/codename_vocab.pk" self.vocab = load(vocabpath) self.word2index = dict((w, i) for i, w in enumerate(np.array(self.vocab))) self.sess = tf.Session(graph=tf.Graph(),config=config) self.load_model() def load_model(self): # model_path = os.path.dirname(__file__)+'/product_savedmodel/product.pb' model_path = os.path.dirname(__file__)+'/product_savedmodel/productAndfailreason.pb' with self.sess.as_default(): with self.sess.graph.as_default():
# ProductPredictor: frozen-graph (GraphDef .pb) sequence tagger with a CRF
# head; decode() runs Viterbi per sequence with a prepended start row
# ([[small]*7+[0]] — i.e. an 8-label tag space). predict() extracts product
# spans as tag runs "12*3" and failure-reason spans as "45*6".
output_graph_def = tf.GraphDef() with open(model_path, 'rb') as f: output_graph_def.ParseFromString(f.read()) tf.import_graph_def(output_graph_def, name='') self.sess.run(tf.global_variables_initializer()) self.char_input = self.sess.graph.get_tensor_by_name('CharInputs:0') self.length = self.sess.graph.get_tensor_by_name("Sum:0") self.dropout = self.sess.graph.get_tensor_by_name("Dropout:0") self.logit = self.sess.graph.get_tensor_by_name("logits/Reshape:0") self.tran = self.sess.graph.get_tensor_by_name("crf_loss/transitions:0") def decode(self,logits, lengths, matrix): paths = [] small = -1000.0 # start = np.asarray([[small] * 4 + [0]]) start = np.asarray([[small]*7+[0]]) for score, length in zip(logits, lengths): score = score[:length] pad = small * np.ones([length, 1]) logits = np.concatenate([score, pad], axis=1) logits = np.concatenate([start, logits], axis=0) path, _ = viterbi_decode(logits, matrix) paths.append(path[1:]) return paths def predict(self, list_sentences,list_entitys=None,list_articles=[], fail=False, MAX_AREA=5000): ''' 预测实体代码,每个句子最多取MAX_AREA个字,超过截断 :param list_sentences: 多篇公告句子列表,[[一篇公告句子列表],[公告句子列表]] :param list_entitys: 多篇公告实体列表 :param MAX_AREA: 每个句子最多截取多少字 :return: 把预测出来的实体放进实体类 ''' with self.sess.as_default() as sess: with self.sess.graph.as_default(): result = [] product_list = [] if fail and list_articles!=[]: text_list = [list_articles[0].content[:MAX_AREA]] chars = [[self.word2index.get(it, self.word2index.get('')) for it in text] for text in text_list] if USE_API: requests_result = requests.post(API_URL + "/predict_product", json={"inputs": chars}, verify=True) batch_paths = json.loads(requests_result.text)['result'] lengths = json.loads(requests_result.text)['lengths'] else: lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran], feed_dict={ self.char_input: np.asarray(chars), self.dropout: 1.0 }) batch_paths = self.decode(scores, lengths, tran_) for text, path, length in zip(text_list, batch_paths, lengths): tags =
# NOTE(review): `self.word2index.get('')` above and below was presumably
# `self.word2index.get('<unk>')` (or similar) before the `<...>` stripping —
# confirm against VCS. The `fail` branch tags the article head, appends product
# entities, and falls back to two regexes when the model yields no failure
# reason; mutable default `list_articles=[]` is shared across calls (read-only
# here, but fragile).
''.join([str(it) for it in path[:length]]) # 提取产品 for it in re.finditer("12*3", tags): start = it.start() end = it.end() _entity = Entity(doc_id=list_articles[0].id, entity_id="%s_%s_%s_%s" % ( list_articles[0].doc_id, 0, start, end), entity_text=text[start:end], entity_type="product", sentence_index=0, begin_index=0, end_index=0, wordOffset_begin=start, wordOffset_end=end) list_entitys[0].append(_entity) product_list.append(text[start:end]) # 提取失败原因 for it in re.finditer("45*6", tags): start = it.start() end = it.end() result.append(text[start:end].replace('?', '').strip()) reasons = [] for it in result: if "(√)" in it or "(√)" in it: reasons = [it] break if reasons != [] and (it not in reasons[-1] and it not in reasons): reasons.append(it) elif reasons == []: reasons.append(it) if reasons == []: # 如果模型识别不到失败原因 就用规则补充 for text in text_list: ser1 = re.search('\w{,4}(理由|原因):\s*((第\d+包|标项\d+|原因类型)?[::]?[\s*\w,]{2,30}((不满?足|少于|未达)((法定)?[123一二三两]家|(规定)?要求)|(项目|采购)(终止|废标)),?)+',text) ser2 = re.search( '\w{,4}(理由|原因):\s*(第\d+包|标项\d+|原因类型)?[::]?[\s*\w]{4,30},', text) if ser1: reasons.append(ser1.group(0)) break elif ser2: reasons.append(ser2.group(0)) break return {'fail_reason':';'.join(reasons)}, product_list if list_entitys is None: list_entitys = [[] for _ in range(len(list_sentences))] for list_sentence, list_entity in zip(list_sentences,list_entitys): if len(list_sentence)==0: result.append({"product":[]}) continue list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True) _begin_index = 0 item = {"product":[]} temp_list = [] while True: MAX_LEN = len(list_sentence[_begin_index].sentence_text) if MAX_LEN > MAX_AREA: MAX_LEN = MAX_AREA _LEN = MAX_AREA//MAX_LEN chars = [sentence.sentence_text[:MAX_LEN] for sentence in list_sentence[_begin_index:_begin_index+_LEN]] chars = [[self.word2index.get(it, self.word2index.get('')) for it in l] for l in chars] chars = pad_sequences(chars, maxlen=MAX_LEN, padding="post", truncating="post") if USE_API: requests_result =
# Main (non-fail) branch: sentences are length-sorted descending and batched so
# each batch holds MAX_AREA//MAX_LEN sentences truncated to the longest one;
# tagged "12*3" runs become product entities attached to each sentence.
# Line 43 also starts ProductAttributesPredictor: table-based extraction of
# product/quantity/price/brand/spec (and, since 2021/11/10, project, demand,
# budget and time fields); p0/p1/p2 are header-matching regexes in priority
# order.
requests.post(API_URL + "/predict_product", json={"inputs": chars.tolist()}, verify=True) batch_paths = json.loads(requests_result.text)['result'] lengths = json.loads(requests_result.text)['lengths'] else: lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran], feed_dict={ self.char_input: np.asarray(chars), self.dropout: 1.0 }) batch_paths = self.decode(scores, lengths, tran_) for sentence, path, length in zip(list_sentence[_begin_index:_begin_index+_LEN],batch_paths, lengths): tags = ''.join([str(it) for it in path[:length]]) for it in re.finditer("12*3", tags): start = it.start() end = it.end() _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % ( sentence.doc_id, sentence.sentence_index, start, end), entity_text=sentence.sentence_text[start:end], entity_type="product", sentence_index=sentence.sentence_index, begin_index=0, end_index=0, wordOffset_begin=start, wordOffset_end=end,in_attachment=sentence.in_attachment) list_entity.append(_entity) temp_list.append(sentence.sentence_text[start:end]) product_list.append(sentence.sentence_text[start:end]) # item["product"] = list(set(temp_list)) # result.append(item) if _begin_index+_LEN >= len(list_sentence): break _begin_index += _LEN item["product"] = list(set(temp_list)) result.append(item) # 修正bug return {'fail_reason': ""},product_list # 产品数量单价品牌规格提取 #2021/11/10 添加表格中的项目、需求、预算、时间要素提取 class ProductAttributesPredictor(): def __init__(self,): self.p0 = '(类别|类型|物类|目录|类目|分类)(名称|$)|^品名|^品类|^品目|(标项|分项|项目|计划|包组|标段|[分子]?包|子目|服务|招标|中标|成交|工程|招标内容)(名称|内容|描述)' self.p1 = '(标的|维修|系统|报价构成|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|配件|资产|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名|气体)[\))的]?([、\w]{,4}名称|内容|描述)' self.p2 = '标的|标项|项目$|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品|物装|配件|资产|招标内容|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|菜名|^品目$|^品名$|^名称|^内容$|(标项|分项|项目|计划|包组|标段|[分子]?包|子目|服务|招标|中标|成交|工程|招标内容)(名称|内容|描述)' # self.p1 =
# ProductAttributesPredictor helper methods.
# NOTE(review): the isTrueTable docstring originally named HTML tags — e.g.
# "包含<caption>或<th>标签为真" and "外层嵌套子<table>" — that were stripped
# with the other `<...>` spans; the code itself (find_all on 'caption'/'th')
# is intact.
'(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\))]?(名称|内容|描述)' # self.p2 = '设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品|项目|品名|菜名|内容|名称' with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f: self.header_set = pickle.load(f) self.tb = TableTag2List() def isTrueTable(self, table): '''真假表格规则: 1、包含或标签为真 2、包含大量链接、表单、图片或嵌套表格为假 3、表格尺寸太小为假 4、外层嵌套子
# isTrueTable heuristics: caption/th present -> real; >5 form/a/img tags,
# <2 rows, or a nested table -> fake. getTrs collects direct-child <tr>
# elements (including those wrapped in a tbody). getTable flattens rows to
# text, scrubbing control chars, backslashes (doc 272144312 fix) and
# normalizing full-width punctuation. fixSpan duplicates cells to expand
# colspan first, then rowspan (column pass before row pass to avoid mixed-up
# parses); colspan expansion is skipped for very wide (>=10) or very long
# (>=500 chars) cells.
,一般子为真,外为假''' if table.find_all(['caption', 'th']) != []: return True elif len(table.find_all(['form', 'a', 'img'])) > 5: # print('过滤表格:包含链接图片等大于5的为假表格') return False elif len(table.find_all(['tr'])) < 2: # print('过滤表格:行数小于2的为假表格') return False elif len(table.find_all(['table'])) >= 1: # print('过滤表格:包含多个表格的为假表格') return False else: return True def getTrs(self, tbody): # 获取所有的tr trs = [] objs = tbody.find_all(recursive=False) for obj in objs: if obj.name == "tr": trs.append(obj) if obj.name == "tbody": for tr in obj.find_all("tr", recursive=False): trs.append(tr) return trs def getTable(self, tbody): trs = self.getTrs(tbody) inner_table = [] if len(trs) < 2: return inner_table for tr in trs: tr_line = [] tds = tr.findChildren(['td', 'th'], recursive=False) if len(tds) < 2: continue for td in tds: # td_text = re.sub('\s+|…', ' ', td.get_text()).strip() td_text = re.sub('…', '', td.get_text()).strip() td_text = td_text.replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '/').replace('"', '') # 修复272144312 # 产品单价数量提取结果有特殊符号\ 气动执行装置备件\密封组件\NBR+PT td_text = td_text.replace("(", "(").replace(")", ")").replace(':', ':') tr_line.append(td_text) inner_table.append(tr_line) return inner_table def fixSpan(self, tbody): # 处理colspan, rowspan信息补全问题 trs = self.getTrs(tbody) ths_len = 0 ths = list() trs_set = set() # 修改为先进行列补全再进行行补全,否则可能会出现表格解析混乱 # 遍历每一个tr for indtr, tr in enumerate(trs): ths_tmp = tr.findChildren('th', recursive=False) # 不补全含有表格的tr if len(tr.findChildren('table')) > 0: continue if len(ths_tmp) > 0: ths_len = ths_len + len(ths_tmp) for th in ths_tmp: ths.append(th) trs_set.add(tr) # 遍历每行中的element tds = tr.findChildren(recursive=False) if len(tds) < 3: continue # 列数太少的不补全 for indtd, td in enumerate(tds): # 若有colspan 则补全同一行下一个位置 if 'colspan' in td.attrs and str(re.sub("[^0-9]", "", str(td['colspan']))) != "": col = int(re.sub("[^0-9]", "", str(td['colspan']))) if col < 10 and len(td.get_text()) < 500: td['colspan'] = 1 for i in range(1, col, 1):
# fixSpan row pass: rows whose cells all share one rowspan value are left
# alone; otherwise each rowspan-ed cell is copied into the same column of the
# following rows. get_monthlen wraps calendar.monthrange, defaulting to 30
# days on bad input (bare except — deliberate best-effort). fix_time
# normalizes many Chinese/numeric date spellings to a
# ("YYYY-MM-DD", "YYYY-MM-DD") begin/end pair, inferring the year from the
# page HTML, page_time, or the current year in that order.
td.insert_after(copy.copy(td)) for indtr, tr in enumerate(trs): ths_tmp = tr.findChildren('th', recursive=False) # 不补全含有表格的tr if len(tr.findChildren('table')) > 0: continue if len(ths_tmp) > 0: ths_len = ths_len + len(ths_tmp) for th in ths_tmp: ths.append(th) trs_set.add(tr) # 遍历每行中的element tds = tr.findChildren(recursive=False) same_span = 0 if len(tds) > 1 and 'rowspan' in tds[0].attrs: span0 = tds[0].attrs['rowspan'] for td in tds: if 'rowspan' in td.attrs and td.attrs['rowspan'] == span0: same_span += 1 if same_span == len(tds): continue for indtd, td in enumerate(tds): # 若有rowspan 则补全下一行同样位置 if 'rowspan' in td.attrs and str(re.sub("[^0-9]", "", str(td['rowspan']))) != "": row = int(re.sub("[^0-9]", "", str(td['rowspan']))) td['rowspan'] = 1 for i in range(1, row, 1): # 获取下一行的所有td, 在对应的位置插入 if indtr + i < len(trs): tds1 = trs[indtr + i].findChildren(['td', 'th'], recursive=False) if len(tds1) >= (indtd) and len(tds1) > 0: if indtd > 0: tds1[indtd - 1].insert_after(copy.copy(td)) else: tds1[0].insert_before(copy.copy(td)) elif len(tds1) > 0 and len(tds1) == indtd - 1: tds1[indtd - 2].insert_after(copy.copy(td)) def get_monthlen(self, year, month): '''输入年份、月份 int类型 得到该月份天数''' try: weekday, num = calendar.monthrange(int(year), int(month)) except: num = 30 return str(num) def fix_time(self, text, html, page_time): '''输入日期字段返回格式化日期''' for it in [('十二', '12'),('十一', '11'),('十','10'),('九','9'),('八','8'),('七','7'), ('六','6'),('五','5'),('四','4'),('三','3'),('二','2'),('一','1')]: if it[0] in text: text = text.replace(it[0], it[1]) if re.search('^\d{1,2}月$', text): m = re.search('^(\d{1,2})月$', text).group(1) if len(m) < 2: m = '0' + m year = re.search('(\d{4})年(.{,12}采购意向)?', html) if year: y = year.group(1) num = self.get_monthlen(y, m) if len(num) < 2: num = '0' + num order_begin = "%s-%s-01" % (y, m) order_end = "%s-%s-%s" % (y, m, num) elif page_time != "": year = re.search('\d{4}', page_time) if year: y = year.group(0) num = self.get_monthlen(y, m) if len(num) < 2:
# fix_time continued: explicit format handlers — YYYY年M[月], YYYY-M-D,
# "202105", "20210510" — each returning early with a (begin, end) pair that
# covers the whole month when only a month is given.
num = '0' + num order_begin = "%s-%s-01" % (y, m) order_end = "%s-%s-%s" % (y, m, num) else: y = str(datetime.datetime.now().year) num = self.get_monthlen(y, m) if len(num) < 2: num = '0' + num order_begin = "%s-%s-01" % (y, m) order_end = "%s-%s-%s" % (y, m, num) else: y = str(datetime.datetime.now().year) num = self.get_monthlen(y, m) if len(num) < 2: num = '0' + num order_begin = "%s-%s-01" % (y, m) order_end = "%s-%s-%s" % (y, m, num) return order_begin, order_end t1 = re.search('^(\d{4})(年|/|\.|-)(\d{1,2})月?$', text) if t1: year = t1.group(1) month = t1.group(3) num = self.get_monthlen(year, month) if len(month)<2: month = '0'+month if len(num) < 2: num = '0'+num order_begin = "%s-%s-01" % (year, month) order_end = "%s-%s-%s" % (year, month, num) return order_begin, order_end t2 = re.search('^(\d{4})(年|/|\.|-)(\d{1,2})(月|/|\.|-)(\d{1,2})日?$', text) if t2: y = t2.group(1) m = t2.group(3) d = t2.group(5) m = '0'+ m if len(m)<2 else m d = '0'+d if len(d)<2 else d order_begin = order_end = "%s-%s-%s"%(y,m,d) return order_begin, order_end # 时间样式:"202105" t3 = re.search("^(20\d{2})(\d{1,2})$",text) if t3: year = t3.group(1) month = t3.group(2) if int(month)>0 and int(month)<=12: num = self.get_monthlen(year, month) if len(month) < 2: month = '0' + month if len(num) < 2: num = '0' + num order_begin = "%s-%s-01" % (year, month) order_end = "%s-%s-%s" % (year, month, num) return order_begin, order_end # 时间样式:"20210510" t4 = re.search("^(20\d{2})(\d{2})(\d{2})$", text) if t4: year = t4.group(1) month = t4.group(2) day = t4.group(3) if int(month) > 0 and int(month) <= 12 and int(day)>0 and int(day)<=31: order_begin = order_end = "%s-%s-%s"%(year,month,day) return order_begin, order_end all_match = re.finditer('^(?P\d{4})(年|/|\.)(?P\d{1,2})(?:(月|/|\.)(?:(?P\d{1,2})日)?)?'
# NOTE(review): the `(?P` fragments in the range pattern above/below are
# stripped named groups — the loop that reads keys 'y1'/'m1'/'d1'/'y2'/'m2'/'d2'
# from _match.groupdict() shows the original names; likewise
# `'^(\d+\.?\d*)(?([㎡\w/]{,5})'` in fix_quantity has lost a `(?P<...>` /
# `(?:` fragment. Restore both literals from VCS before running.
# fix_time tail: date-range handler ("X到/至/-Y") that fills missing parts
# (end year defaults to start year, start day to 1, end day to month length).
# fix_quantity normalizes a quantity cell to (numeric-string, unit), falling
# back to the unit parsed from the table header.
'(到|至|-)(?:(?P\d{4})(年|/|\.))?(?P\d{1,2})(?:(月|/|\.)' '(?:(?P\d{1,2})日)?)?$', text) y1 = m1 = d1 = y2 = m2 = d2 = "" found_math = False for _match in all_match: if len(_match.group()) > 0: found_math = True for k, v in _match.groupdict().items(): if v!="" and v is not None: if k == 'y1': y1 = v elif k == 'm1': m1 = v elif k == 'd1': d1 = v elif k == 'y2': y2 = v elif k == 'm2': m2 = v elif k == 'd2': d2 = v if not found_math: return "", "" y2 = y1 if y2 == "" else y2 d1 = '1' if d1 == "" else d1 d2 = self.get_monthlen(y2, m2) if d2 == "" else d2 m1 = '0' + m1 if len(m1) < 2 else m1 m2 = '0' + m2 if len(m2) < 2 else m2 d1 = '0' + d1 if len(d1) < 2 else d1 d2 = '0' + d2 if len(d2) < 2 else d2 order_begin = "%s-%s-%s"%(y1,m1,d1) order_end = "%s-%s-%s"%(y2,m2,d2) return order_begin, order_end def fix_quantity(self, quantity_text, header_quan_unit): ''' 产品数量标准化,统一为数值型字符串 :param quantity_text: 原始数量字符串 :param header_quan_unit: 表头数量单位字符串 :return: 返回数量及单位 ''' quantity = quantity_text quantity = re.sub('[一壹]', '1', quantity) quantity = re.sub('[,,约]|(\d+)', '', quantity) ser = re.search('^(\d+\.?\d*)(?([㎡\w/]{,5})', quantity) if ser: quantity = str(ser.group(1)) quantity_unit = ser.group(2) if quantity_unit == "" and header_quan_unit != "": quantity_unit = header_quan_unit else: quantity = "" quantity_unit = "" return quantity, quantity_unit def find_header(self, items,p0, p1, p2): ''' inner_table 每行正则检查是否为表头,是则返回表头所在列序号,及表头内容 :param items: 列表,内容为每个td 文本内容 :param p1: 优先表头正则 :param p2: 第二表头正则 :return: 表头所在列序号,是否表头,表头内容 ''' items = [re.sub('\s', '', it) for it in items] flag = False header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': '', '总价': '', '品目': '', '参数': '', '采购人':'', '备注':'','发布日期':''} product = "" # 产品 quantity = "" # 数量 quantity_unit = "" # 数量单位 unitPrice = "" # 单价 brand = "" # 品牌 specs = "" # 规格 demand = "" # 采购需求 budget = "" # 预算金额 order_time = "" # 采购时间 total_price = "" # 总价 category = "" # 品目 parameter = "" # 参数
# find_header first pass: locate the category (p0) and product-name (p1)
# columns; second pass uses the looser p2 only when no product column was
# found, excluding cells that look like IDs/addresses/companies; a leading
# "第N包/标段" cell is accepted as the name column as a last resort.
tenderee = "" # 采购人 notes = "" # 备注 2024/3/27 达仁 需求 issue_date = "" # 发布日期 2024/3/27 达仁 需求 # for i in range(min(6, len(items))): for i in range(len(items)): it = items[i] if len(it) < 15 and re.search(p0, it) != None: flag = True if category != "" and category != it: continue category = it header_dic['品目'] = i elif len(it) < 15 and re.search(p1, it) != None: flag = True if product !='' and product != it: break product = it header_dic['名称'] = i # break # if not flag: if product == "": # for i in range(min(4, len(items))): for i in range(len(items)): it = items[i] if len(it) < 15 and it != category and re.search(p2, it) and (re.search('^名称|^品名|^品目', it) or re.search( '编号|编码|号|情况|报名|单位|位置|地址|数量|单价|价格|金额|品牌|规格类型|型号|公司|中标人|企业|供应商|候选人', it) == None): flag = True product = it header_dic['名称'] = i break if flag == False and len(items)>3 and re.search('^第[一二三四五六七八九十](包|标段)$', items[0]): product = items[0] header_dic['名称'] = 0 flag = True if flag: # for j in range(i + 1, len(items)): for j in range(len(items)): if items[j] in [product, category]: continue if len(items[j]) > 20 and len(re.sub('[\((].*[)\)]|[^\u4e00-\u9fa5]', '', items[j])) > 10: continue if header_dic['数量']=="" and re.search('数量|采购量', items[j]) and re.search('单价|用途|要求|规格|型号|运输|承运', items[j])==None: header_dic['数量'] = j quantity = items[j] elif header_dic['单位']=="" and re.search('^(数量单位|计量单位|单位)$', items[j]): header_dic['单位'] = j quantity_unit = items[j] elif re.search('单价', items[j]) and re.search('数量|规格|型号|品牌|供应商', items[j])==None: header_dic['单价'] = j unitPrice = items[j] elif re.search('品牌', items[j]): header_dic['品牌'] = j brand = items[j] elif re.search('规格|型号', items[j]): header_dic['规格'] = j specs = items[j] elif re.search('参数', items[j]): header_dic['参数'] = j parameter = items[j] elif re.search('预算单位|(采购|招标|购买)(单位|人|方|主体)|项目业主|采购商|申购单位|需求单位|业主单位',items[j]) and len(items[j])<=8: header_dic['采购人'] = j tenderee = items[j] elif re.search('需求|服务要求|服务标准', items[j]): header_dic['需求'] = j demand = items[j]
# find_header tail: remaining attribute columns; the header is only confirmed
# (flag stays True) when at least one price/brand/spec-like column OR a
# demand/budget column was found alongside the name/category column.
# Line 50 ends with the start of ProductAttributesPredictor.predict, which
# continues past this view (its body on the following lines is truncated here,
# including an `html.replace('...')` whose tag-name arguments were stripped).
elif re.search('预算|控制金额', items[j]) and not re.search('预算单位',items[j]): header_dic['预算'] = j budget = items[j] elif re.search('时间|采购实施月份|采购月份|采购日期', items[j]): header_dic['时间'] = j order_time = items[j] elif re.search('总价|(成交|中标|验收|合同|预算|控制|总|合计))?([金总]额|价格?)|最高限价|价格|金额', items[j]) and re.search('数量|规格|型号|品牌|供应商', items[j])==None: header_dic['总价'] = j total_price = items[j] elif re.search('^备\s*注$|资质要求|预留面向中小企业|是否适宜中小企业采购预算预留|公开征集信息', items[j]): header_dic['备注'] = j notes = items[j] elif re.search('^\w{,4}发布(时间|日期)$', items[j]): header_dic['发布日期'] = j issue_date = items[j] if header_dic.get('名称', "") != "" or header_dic.get('品目', "") != "": # num = 0 # for it in (quantity, unitPrice, brand, specs, product, demand, budget, order_time, total_price): # if it != "": # num += 1 # if num >=2: # return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time) if set([quantity, brand, specs, unitPrice, total_price])!=set([""]) or set([demand, budget])!=set([""]): return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time,tenderee, notes,issue_date) flag = False return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time,tenderee,notes,issue_date) def predict(self, docid='', html='', page_time=""): ''' 正则寻找table表格内 产品相关信息 :param html:公告HTML原文 :return:公告表格内 产品、数量、单价、品牌、规格 ,表头,表头列等信息 ''' html = html.replace('
', '\n').replace('
', '\n') html = re.sub("|||","",html) html = re.sub("##attachment##","",html) soup = BeautifulSoup(html, 'lxml') # flag_yx = True if re.search('采购意向', html) else False flag_yx = True if re.search('采购意向|招标意向|选取意向|意向公告|意向公示|意向公开', html) else False tables = soup.find_all(['table']) headers = [] headers_demand = [] header_col = [] product_link = [] demand_link = [] product_set = set() total_product_money = 0 unit_price_list = [] # 单价列表,用于判断是否重复单价,避免多个表格重复提取造成合计产品价格错误。 total_price_list = [] # 总价列表,拥有判断是否为几行产品合计总价 # print('表格数:', len(tables)) for i in range(len(tables)): # (len(tables)-1, -1, -1) 由从最后到前改为 前到后 table = tables[i] if table.parent.name == 'td' and len(table.find_all('td')) <= 3: table.string = table.get_text() table.name = 'turntable' # print('过滤表格:表格父节点为td,且表格td数量小于等于3') continue if not self.isTrueTable(table): continue # self.fixSpan(table) # inner_table = self.getTable(table) inner_table = self.tb.table2list(table) table.extract() # print(inner_table) i = 0 found_header = False header_quan_unit = "" # 数量表头 包含单位 header_colnum = 0 if flag_yx: # print('意向公告, 提取意向信息') col0_l = [] col1_l = [] for tds in inner_table: if len(tds) == 2: col0_l.append(re.sub('[::]', '', tds[0])) # 处理只有两列的情况 col1_l.append(tds[1]) elif len(tds)>=4 and len(inner_table)==2: # 处理只有两行的情况 col0_l = inner_table[0] col1_l = inner_table[1] break # print(set(col0_l)) # print('head: ',set(col0_l) & self.header_set) if len(set(col0_l) & self.header_set) > len(col0_l) * 0.2 and len(col0_l)==len(col1_l): # 保证两个列数一致 header_list2 = [] product = demand = budget = order_begin = order_end = "" tenderee = "" notes = '' issue_date = '' for i in range(len(col0_l)): if re.search('项目名称', col0_l[i]): header_list2.append(col0_l[i]) product = col1_l[i] elif re.search('采购需求|需求概况', col0_l[i]): header_list2.append(col0_l[i]) demand = col1_l[i] elif re.search('采购预算|预算金额|控制金额', col0_l[i]): header_list2.append(col0_l[i]) _budget = col1_l[i] re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", 
_budget) if re_price: _budget = re_price[0] if '万元' in col0_l[i] and '万' not in _budget: _budget += '万元' budget = str(getUnifyMoney(_budget)) elif re.search('预算单位|(采购|招标|购买)(单位|人|方|主体)|项目业主|采购商|申购单位|需求单位|业主单位', col0_l[i]): header_list2.append(col0_l[i]) tenderee = re.sub("\s","",col1_l[i]) if len(tenderee) > 20: tenderee = "" elif re.search('采购时间|采购实施月份|采购月份|采购日期', col0_l[i]): header_list2.append(col0_l[i]) order_time = col1_l[i].strip() order_begin, order_end = self.fix_time(order_time, html, page_time) elif re.search('^备\s*注$|资质要求|预留面向中小企业|是否适宜中小企业采购预算预留|公开征集信息', col0_l[i]): header_list2.append(col0_l[i]) notes = col1_l[i].strip() elif re.search('^\w{,4}发布(时间|日期)$', col0_l[i]): header_list2.append(col0_l[i]) issue_date = self.fix_time(col1_l[i].strip(), '', '')[0] if order_begin != "" and order_end!="": order_begin_year = int(order_begin.split("-")[0]) order_end_year = int(order_end.split("-")[0]) # 限制附件错误识别时间 if order_begin_year>=2050 or order_end_year>=2050: order_begin = order_end = "" # print(product,demand,budget,order_begin) if product!= "" and demand != "" and budget!="" and order_begin != "" and len(budget)<15: # 限制金额小于15位数的才要 link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget, 'order_begin': order_begin, 'order_end': order_end ,'tenderee':tenderee, 'notes':notes, 'issue_date':issue_date} if link not in demand_link: demand_link.append(link) headers_demand.append('_'.join(header_list2)) continue if len(inner_table)>3 and len(inner_table[0])==2 and len(inner_table[1])==2: # 只有两列且第一列为表头的,行列切换 col0_l = [] col1_l = [] for tds in inner_table: if len(tds) == 2: col0_l.append(re.sub('[::]', '', tds[0])) # 处理只有两列的情况 col1_l.append(tds[1]) else: break if len(set(col0_l) & self.header_set) > len(col0_l) * 0.5 and len(col0_l) == len(col1_l): inner_table = [col0_l, col1_l] elif len(inner_table)>2 and len(inner_table[0])==4 and len(inner_table[1])==4 and len(set(inner_table[0]) & self.header_set)==2: # 只有两列且第一列为表头的,行列切换 col0_l = [] col1_l 
= [] col2_l = [] col3_l = [] for tds in inner_table: if len(tds) == 4 and len(set(tds))>2: col0_l.append(re.sub('[::]', '', tds[0])) # 处理只有两列的情况 col1_l.append(tds[1]) col2_l.append(re.sub('[::]', '', tds[2])) # 处理只有两列的情况 col3_l.append(tds[3]) else: break if len(set(col0_l) & self.header_set) > len(col0_l) * 0.5 and len(set(col2_l) & self.header_set) > len(col2_l) * 0.5: inner_table = [col0_l+col2_l, col1_l+col3_l] while i < (len(inner_table)): tds = inner_table[i] not_empty = [it for it in tds if re.sub('\s', '', it) != ""] if len(set(not_empty))<2 or len(set(tds))<2 or (len(set(tds))==2 and re.search('总计|合计|汇总', tds[0])): # 非空列或者不重复内容小于两列的 继续 i += 1 # print('表格产品提取:非空列或者不重复内容小于两列的 继续', i, tds) continue product = "" # 产品 quantity = "" # 数量 quantity_unit = "" # 数量单位 unitPrice = "" # 单价 brand = "" # 品牌 specs = "" # 规格 demand = "" # 采购需求 budget = "" # 预算金额 order_time = "" # 采购时间 order_begin = "" order_end = "" total_price = "" # 总金额 parameter = "" # 参数 tenderee = "" # 采购人 notes = '' # 备注 issue_date = '' # 发布日期 if len(set([re.sub('[::\s]','',td) for td in tds]) & self.header_set) > len(tds) * 0.4: # if len(set(tds) & self.header_set) > len(tds) * 0.2: header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p0, self.p1, self.p2) if found_header: header_colnum = len(tds) # 保存表头所在行列数 # print('发现表头:', header_colnum, header_dic) if found_header and isinstance(header_list, tuple) and len(header_list) > 2: # 获取表头中的 数量单位 quantity_header = header_list[1].replace('单位:', '') if re.search('(([\w/]{,5}))', quantity_header): header_quan_unit = re.search('(([\w/]{,5}))', quantity_header).group(1) else: header_quan_unit = "" if found_header and ('_'.join(header_list) not in headers or '_'.join(header_list2) not in headers_demand):# and len(headers)<1: # 只保留出现的第一个表头 headers.append('_'.join(header_list)) headers_demand.append('_'.join(header_list2)) header_col.append('_'.join(tds)) i += 1 # print('表头数量占行列数0.4倍不做内容匹配', set([re.sub('[::]','',td) for td in tds]) & 
self.header_set, tds) continue elif found_header: if len(tds) > header_colnum or len(tds)-1= len(tds) or tds[v] in self.header_set: # print('内容属性在表头集合里面', tds[v], v >= len(tds)) not_attr = 1 # break if not_attr>=2: # 只要属性里面有两项为表头,停止匹配 i += 1 found_header = False # print('只要属性里面有两项为表头,停止匹配') continue if id1!="" and re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \ re.search('备注|汇总|合计|总价|价格|金额|^详见|无$|xxx', tds[id1]) == None: product = tds[id1] if id0!="" and re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id0]) and tds[id0] not in self.header_set and \ re.search('备注|汇总|合计|总价|价格|金额|^详见|无$|xxx', tds[id0]) == None: category = tds[id0] product = "%s_%s"%(category, product) if product!="" and product!=category else category if product != "" and product not in ['工程类', '服务类', '货物类', '工程', '服务', '货物']: # print('匹配产品内容: ', product) if id2 != "": if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]): # if re.search('(^\d{,3}(,?\d{3}){2,}(\.\d{2,7},?)$)|万?元', tds[id2]): # 254816100 这篇数量很大,貌似正常 # i += 1 # print('过滤:数量包含金额单位或值很大类似金额', tds[id2]) # continue quantity = tds[id2] elif re.search('\w{5,}', tds[id2]) and re.search('^详见|^详情', tds[id2])==None: i += 1 # print('过滤:数量包含五个字符以上且不包含^详见|^详情等字符', tds[id2]) continue if id2_2 != "": if re.search('^\w{1,4}$', tds[id2_2]) and re.search('元', tds[id2_2])==None: quantity_unit = tds[id2_2] if id3 != "": if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]): unitPrice = tds[id3] elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$|¥|¥|RMB|USD|EUR|JPY|CNY|元$', tds[id3].strip()): unitPrice = tds[id3] elif len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '', tds[id3])) > 5 and re.search('^详见|^详情', tds[id3])==None: i += 1 # print('过滤:产品单价包含金额外的字符数大于5个', tds[id3]) continue if id4 != "": if re.search('\w', tds[id4]): brand = tds[id4] if re.match('^详见|^详情', brand.strip()): brand = "" else: brand = "" if id5 != "": if re.search('\w', tds[id5]): specs = tds[id5][:500] # 限制最多500字 if re.match('^详见|^详情', specs.strip()): 
specs = "" else: specs = "" if id6 != "": if re.search('\w', tds[id6]): demand = tds[id6] else: demand = "" if id7 != "": if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id7]): budget = tds[id7] if id8 != "": if re.search('\w', tds[id8]): order_time = tds[id8].strip() order_begin, order_end = self.fix_time(order_time, html, page_time) if id9 != "": if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id9]): total_price = tds[id9] elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$|¥|¥|RMB|USD|EUR|JPY|CNY|元$', tds[id9].strip()): total_price = tds[id9] elif len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '', tds[id9])) > 5 and re.search('^详见|^详情', tds[id9])==None: i += 1 # print('过滤:产品总价包含金额外的字符数大于5个', tds[id9]) continue if id10 != "": parameter = tds[id10][:500] if re.match('^详见|^详情', parameter.strip()): parameter = "" if id11 != "": tenderee = re.sub("\s","",tds[id11]) if len(tenderee) > 30: tenderee = "" if id12 != "": notes = tds[id12].strip() if id13 != "": issue_date = self.fix_time(tds[id13].strip(), '', '')[0] # print('数量:{0}, 单价:{1}, 品牌:{2}, 规格:{3},总价:{4}'.format(quantity ,unitPrice, brand, specs, total_price)) if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price or '单价' in header_dic or '总价' in header_dic: if id1!="" and id2 != "" and id3 != "" and len(re.split('[;;、,\n]+', tds[id2])) > 1 and len(re.split('[;;、,\n]+', tds[id1])) == len(re.split('[;;、,\n]+', tds[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743 products = re.split('[;;、,\n]+', tds[id1]) quantitys = re.split('[;;、,\n]+', tds[id2]) unitPrices = re.split('[;;、,\n]+', tds[id3]) total_prices = re.split('[;;、,\n]+', total_price) brands = re.split('[;;、,\n]+', brand) if re.search('等$', brand)==None else [brand] specses = re.split('[;;、,\n]+', specs) if re.search('等$', specs)==None else [specs] parameters = re.split('[;;、,\n]+', parameter) if re.search('等$', parameter)==None else [parameter] unitPrices = [""]*len(products) if len(unitPrices)==1 else 
unitPrices total_prices = [""]*len(products) if len(total_prices)==1 else total_prices brands = brands*len(products) if len(brands)==1 else brands specses = specses*len(products) if len(specses)==1 else specses brands = [brand]*len(products) if len(brands) < len(products) else brands specses = [specs] * len(products) if len(specses) < len(products) else specses parameters = parameters*len(products) if len(parameters)==1 else parameters # print('产品拆分:', len(products),len(quantitys) , len(unitPrices),len(brands),len(specses)) if len(products) == len(quantitys) == len(unitPrices) == len(brands) == len(specses): for product, quantity, unitPrice, brand, specs, total_price, parameter in zip(products,quantitys,unitPrices, brands, specses, total_prices, parameters): if quantity != "": quantity, quantity_unit_ = self.fix_quantity(quantity, header_quan_unit) quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit if unitPrice != "": unitPrice, _money_unit = money_process(unitPrice, header_list[3]) unitPrice = str(unitPrice) if unitPrice != 0 and unitPrice<100000000 else "" if budget != "": budget, _money_unit = money_process(budget, header_list2[2]) budget = str(budget) if budget != 0 and budget<50000000000 else '' if total_price != "": total_price, _money_unit = money_process(total_price, header_list[6]) total_price_list.append(total_price) total_price = str(total_price) if total_price != 0 and total_price<50000000000 else "" link = {'product': product, 'quantity': quantity, 'quantity_unit': quantity_unit, 'unitPrice': unitPrice, 'brand': brand[:50], 'specs': specs, 'total_price': total_price, 'parameter': parameter} # if link not in product_link: # product_link.append(link) # mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity']) # if link['unitPrice'] != "" and mat: # try: # total_product_money += float(link['unitPrice']) * float( # mat.group(1).replace(',', '')) if float( # mat.group(1).replace(',', '')) < 50000 else 0 # except: # 
log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % ( # link['unitPrice'], link['quantity'])) if (product, specs, unitPrice, quantity) not in product_set: product_set.add((product, specs, unitPrice, quantity)) product_link.append(link) if link['unitPrice'] != "" and link['quantity'] != '': try: total_product_money += float(link['unitPrice']) * float( link['quantity']) if float(link['quantity']) < 50000 else 0 except: log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % ( link['unitPrice'], link['quantity'])) elif len(product)>100: # 产品名称长于100字 i += 1 # print('过滤: 产品名称长于100字',) continue else: if quantity != "": quantity, quantity_unit_ = self.fix_quantity(quantity, header_quan_unit) quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit if unitPrice != "": unitPrice, _money_unit = money_process(unitPrice, header_list[3]) unitPrice = str(unitPrice) if unitPrice != 0 and unitPrice<100000000 else "" if budget != "": budget, _money_unit = money_process(budget, header_list2[2]) budget = str(budget) if budget != 0 and budget<50000000000 else '' if total_price != "": total_price, _money_unit = money_process(total_price, header_list[6]) total_price_list.append(total_price) total_price = str(total_price) if total_price != 0 and total_price<50000000000 else "" link = {'product': product, 'quantity': quantity, 'quantity_unit': quantity_unit, 'unitPrice': unitPrice, 'brand': brand[:50], 'specs':specs, 'total_price': total_price, 'parameter': parameter} # if link not in product_link: # product_link.append(link) # mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity']) # if link['unitPrice'] != "" and mat: # try: # total_product_money += float(link['unitPrice'])*float(mat.group(1).replace(',', '')) if float(mat.group(1).replace(',', ''))<50000 else 0 # except: # log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity'])) # if (product, unitPrice, quantity) not in product_set: # product_set.add((product, unitPrice, quantity)) if (product, unitPrice,) not in product_set: 
# 2023/09/22 改为只判断产品/单价,只要两个一样就不作为新产品 避免多个表格重复表达有些没数量造成重复提取 353858683 product_set.add((product, unitPrice)) product_link.append(link) if link['unitPrice']: unit_price_list.append(link['unitPrice']) if link['unitPrice'] != "" and link['quantity'] != '': try: total_product_money += float(link['unitPrice'])*float(link['quantity']) if float(link['quantity'])<50000 else 0 except: log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity'])) if order_begin != "" and order_end != "": order_begin_year = int(order_begin.split("-")[0]) order_end_year = int(order_end.split("-")[0]) # 限制附件错误识别时间 if order_begin_year >= 2050 or order_end_year >= 2050: order_begin = order_end = "" # print(budget,order_time) if budget != "" and order_time != "": link = {'project_name': product, 'product':[], 'demand': demand, 'budget': budget, 'order_begin':order_begin, 'order_end':order_end, 'tenderee':tenderee,'notes':notes,'issue_date':issue_date} if link not in demand_link: demand_link.append(link) i += 1 else: i += 1 if len(total_price_list)>1 and len(set(total_price_list))/len(total_price_list)<=0.5: # 2023/7/27 总价一半以上重复的为多行一个总价,需去掉 # print('总价一半以上重复的为多行一个总价,需去掉', total_price_list) for link in product_link: # 预防最后一列总价为所有产品总价,列补全后所有产品总价一样情况 if 'total_price' in link: link['total_price'] = "" if len(unit_price_list)>0 and len(unit_price_list)==len(product_link) and len(set(unit_price_list))/len(unit_price_list)<=0.5: # 2023/7/18 如果单价重复率高不算总产品价避免错误 # print('如果单价重复率高不算总产品价避免错误') total_product_money = 0 # for link in product_link: # if 'unitPrice' in link: # link['unitPrice'] = "" if len(product_link)>0: attr_dic = {'product_attrs':{'data':product_link, 'header':headers, 'header_col':header_col}} else: attr_dic = {'product_attrs': {'data': [], 'header': [], 'header_col': []}} if len(demand_link)>0: demand_dic = {'demand_info':{'data':demand_link, 'header':headers_demand, 'header_col':header_col}} else: demand_dic = {'demand_info':{'data':[], 'header':[], 'header_col':[]}} # 
print('表格产品属性提取:', attr_dic) return [attr_dic, demand_dic], total_product_money def predict_without_table(self,product_attrs,list_sentences,list_entitys,codeName,prem, html='', page_time=""): if len(prem[0]['prem'])==1: list_sentences[0].sort(key=lambda x:x.sentence_index) list_sentence = list_sentences[0] list_entity = list_entitys[0] _data = product_attrs[1]['demand_info']['data'] re_bidding_time = re.compile("(采购|采购实施|预计招标)(时间|月份|日期)[::,].{0,2}$") order_times = [] for entity in list_entity: if entity.entity_type=='time': sentence = list_sentence[entity.sentence_index] s = spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index, end_index=entity.end_index,size=20) entity_left = "".join(s[0]) if re.search(re_bidding_time,entity_left): time_text = entity.entity_text.strip() standard_time = re.compile("((?P\d{4}|\d{2})\s*[-\/年\.]\s*(?P\d{1,2})\s*[-\/月\.]\s*((?P\d{1,2})日?)?)") time_match = re.search(standard_time,time_text) # print(time_text, time_match) if time_match: time_text = time_match.group() order_times.append(time_text) # print(order_times) order_times = [tuple(self.fix_time(order_time, html, page_time)) for order_time in order_times] order_times = [order_time for order_time in order_times if order_time[0]!=""] if len(set(order_times))==1: order_begin,order_end = order_times[0] project_name = codeName[0]['name'] pack_info = [pack for pack in prem[0]['prem'].values()] budget = pack_info[0].get('tendereeMoney',0) product = prem[0]['product'] link = {'project_name': project_name, 'product': product, 'demand': project_name, 'budget': budget, 'order_begin': order_begin, 'order_end': order_end} _data.append(link) product_attrs[1]['demand_info']['data'] = _data # print('predict_without_table: ', product_attrs) return product_attrs def predict_by_text(self,product_attrs,html,list_outlines,product_list,page_time=""): product_entity_list = list(set(product_list)) list_outline = list_outlines[0] get_product_attrs = False for _outline in list_outline: if 
re.search("信息|情况|清单|概况",_outline.outline_summary): outline_text = _outline.outline_text outline_text = outline_text.replace(_outline.outline_summary,"") key_value_list = [_split for _split in re.split("[,。;]",outline_text) if re.search("[::]",_split)] if not key_value_list: continue head_list = [] head_value_list = [] for key_value in key_value_list: key_value = re.sub("^[一二三四五六七八九十]{1,3}[、.]|^[\d]{1,2}[、.]\d{,2}|^[\((]?[一二三四五六七八九十]{1,3}[\))][、]?","",key_value) temp = re.split("[::]",key_value) if len(temp)>2: if temp[0] in head_list: key = temp[0] value = "".join(temp[1:]) else: key = temp[-2] value = temp[-1] else: key = temp[0] value = temp[1] key = re.sub("^[一二三四五六七八九十]{1,3}[、.]|^[\d]{1,2}[、.]\d{,2}|^[\((]?[一二三四五六七八九十]{1,3}[\))][、]?","",key) head_list.append(key) head_value_list.append(value) head_set = set(head_list) # print('head_set',head_set) if len(head_set & self.header_set) > len(head_set)*0.2: loop_list = [] begin_list = [0] for index,head in enumerate(head_list): if head not in loop_list: if re.search('第[一二三四五六七八九十](包|标段)', head) and re.search('第[一二三四五六七八九十](包|标段)', '|'.join(loop_list)): begin_list.append(index) loop_list = [] loop_list.append(head) else: loop_list.append(head) else: begin_list.append(index) loop_list = [] loop_list.append(head) headers = [] headers_demand = [] header_col = [] product_link = [] demand_link = [] product_set = set() for idx in range(len(begin_list)): if idx==len(begin_list)-1: deal_list = head_value_list[begin_list[idx]:] tmp_head_list = head_list[begin_list[idx]:] else: deal_list = head_value_list[begin_list[idx]:begin_list[idx+1]] tmp_head_list = head_list[begin_list[idx]:begin_list[idx+1]] product = "" # 产品 quantity = "" # 数量 quantity_unit = "" # 单位 unitPrice = "" # 单价 brand = "" # 品牌 specs = "" # 规格 demand = "" # 采购需求 budget = "" # 预算金额 order_time = "" # 采购时间 order_begin = "" order_end = "" total_price = "" # 总金额 parameter = "" # 参数 header_dic, found_header, header_list, header_list2 = self.find_header(tmp_head_list, 
self.p0, self.p1,self.p2) if found_header: headers.append('_'.join(header_list)) headers_demand.append('_'.join(header_list2)) header_col.append('_'.join(tmp_head_list)) # print('header_dic: ',header_dic) id0 = header_dic.get('品目', "") id1 = header_dic.get('名称', "") id2 = header_dic.get('数量', "") id2_2 = header_dic.get('单位', "") id3 = header_dic.get('单价', "") id4 = header_dic.get('品牌', "") id5 = header_dic.get('规格', "") id6 = header_dic.get('需求', "") id7 = header_dic.get('预算', "") id8 = header_dic.get('时间', "") id9 = header_dic.get("总价", "") id10 = header_dic.get('参数', "") if id1!='' and re.search('[a-zA-Z\u4e00-\u9fa5]', deal_list[id1]) and deal_list[id1] not in self.header_set and \ re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', deal_list[id1]) == None: product = deal_list[id1] if id0 != "" and re.search('[a-zA-Z\u4e00-\u9fa5]', deal_list[id0]) and deal_list[id0] not in self.header_set and \ re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', deal_list[id0]) == None: category = deal_list[id0] product = "%s_%s" % (category, product) if product != "" else category if product == "": # print(deal_list[id4],deal_list[id5],tmp_head_list,deal_list) if (id4 != "" and deal_list[id4] != "") or (id5 != "" and deal_list[id5] != ""): for head,value in zip(tmp_head_list,deal_list): if value and value in product_entity_list: product = value break if product != "": if id2 != "": if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]): quantity = deal_list[id2] quantity = re.sub('[()(),,约]', '', quantity) quantity = re.sub('[一壹]', '1', quantity) ser = re.search('^(\d+(?:\.\d+)?)([㎡\w/]{,5})', quantity) if ser: quantity = str(ser.group(1)) quantity_unit = ser.group(2) if float(quantity)>=10000*10000: quantity = "" quantity_unit = "" else: quantity = "" quantity_unit = "" if id2_2 != "": if re.search('^\w{1,4}$', deal_list[id2_2]): quantity_unit = deal_list[id2_2] else: quantity_unit = "" # if id2 != "": # if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]): # quantity = 
deal_list[id2] # else: # quantity = "" if id3 != "": if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id3]): _unitPrice = deal_list[id3] re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_unitPrice) if re_price: _unitPrice = re_price[0] if '万元' in header_list[3] and '万' not in _unitPrice: _unitPrice += '万元' unitPrice = getUnifyMoney(_unitPrice) if unitPrice>=10000*10000: unitPrice = "" unitPrice = str(unitPrice) if id4 != "": if re.search('\w', deal_list[id4]): brand = deal_list[id4] if re.match('^详见|^详情', brand.strip()): brand = "" else: brand = "" if id5 != "": if re.search('\w', deal_list[id5]): specs = deal_list[id5][:500] if re.match('^详见|^详情', specs.strip()): brand = "" else: specs = "" if id6 != "": if re.search('\w', deal_list[id6]): demand = deal_list[id6] else: demand = "" if id7 != "": if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id7]): _budget = deal_list[id7] re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_budget) if re_price: _budget = re_price[0] if '万元' in header_list2[2] and '万' not in _budget: _budget += '万元' budget = str(getUnifyMoney(_budget)) if float(budget)>= 100000*10000: budget = "" if id8 != "": if re.search('\w', deal_list[id8]) and re.search("(采购|采购实施|预计招标)(时间|月份|日期)",header_list2[3]): order_time = deal_list[id8].strip() order_begin, order_end = self.fix_time(order_time, html, page_time) if id9 != "": if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id9]): total_price = deal_list[id9] elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$', deal_list[id9].strip()): total_price = deal_list[id9] if id10 != "": parameter = deal_list[id10][:500] if re.match('^详见|^详情', parameter.strip()): parameter = "" if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price: if id1 != "" and id2 != "" and id3 != "" and len(re.split('[;;、,\n]', deal_list[id2])) > 1 and len( re.split('[;;、,\n]', deal_list[id1])) == len(re.split('[;;、,\n]', 
deal_list[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743 products = re.split('[;;、,\n]', deal_list[id1]) quantitys = re.split('[;;、,\n]', deal_list[id2]) unitPrices = re.split('[;;、,\n]', deal_list[id3]) total_prices = re.split('[;;、,\n]', total_price) brands = re.split('[;;、,\n]', brand) if re.search('等$', brand) == None else [brand] specses = re.split('[;;、,\n]', specs) if re.search('等$', specs) == None else [specs] parameters = re.split('[;;、,\n]', parameter) if re.search('等$', parameter) == None else [parameter] unitPrices = [""] * len(products) if len(unitPrices) == 1 else unitPrices total_prices = [""] * len(products) if len(total_prices) == 1 else total_prices brands = brands * len(products) if len(brands) == 1 else brands specses = specses * len(products) if len(specses) == 1 else specses parameters = parameters * len(products) if len(parameters) == 1 else parameters if len(products) == len(quantitys) == len(unitPrices) == len(brands) == len( specses): for product, quantity, unitPrice, brand, specs, total_price, parameter in zip( products, quantitys, unitPrices, brands, specses, total_prices, parameters): if quantity != "": quantity, quantity_unit_ = self.fix_quantity(quantity,quantity_unit) quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit if unitPrice != "": unitPrice, _money_unit = money_process(unitPrice, header_list[3]) unitPrice = str(unitPrice) if unitPrice != 0 and unitPrice<100000000 else "" if budget != "": budget, _money_unit = money_process(budget, header_list2[2]) budget = str(budget) if budget != 0 and budget<50000000000 else '' if total_price != "": total_price, _money_unit = money_process(total_price, header_list[6]) total_price = str(total_price) if total_price != 0 and total_price<50000000000 else "" link = {'product': product, 'quantity': quantity, 'quantity_unit': quantity_unit, 'unitPrice': unitPrice, 'brand': brand[:50], 'specs': specs, 'total_price': total_price, 'parameter': parameter} if (product, 
specs, unitPrice, quantity) not in product_set: product_set.add((product, specs, unitPrice, quantity)) product_link.append(link) # if link['unitPrice'] != "" and link['quantity'] != '': # try: # total_product_money += float(link['unitPrice']) * float( # link['quantity']) if float( # link['quantity']) < 50000 else 0 # except: # log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % ( # link['unitPrice'], link['quantity'])) elif len(unitPrice) > 15 or len(product) > 100: # 单价大于15位数或 产品名称长于100字 # i += 1 continue else: if quantity != "": quantity, quantity_unit_ = self.fix_quantity(quantity, quantity_unit) quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit if unitPrice != "": unitPrice, _money_unit = money_process(unitPrice, header_list[3]) unitPrice = str(unitPrice) if unitPrice != 0 and unitPrice<100000000 else "" if budget != "": budget, _money_unit = money_process(budget, header_list2[2]) budget = str(budget) if budget != 0 and budget<50000000000 else '' if total_price != "": total_price, _money_unit = money_process(total_price, header_list[6]) total_price = str(total_price) if total_price != 0 and total_price<50000000000 else "" link = {'product': product, 'quantity': quantity, 'quantity_unit': quantity_unit, 'unitPrice': unitPrice, 'brand': brand[:50], 'specs': specs, 'total_price': total_price, 'parameter': parameter} if (product, specs, unitPrice, quantity) not in product_set: product_set.add((product, specs, unitPrice, quantity)) product_link.append(link) # if link['unitPrice'] != "" and link['quantity'] != '': # try: # total_product_money += float(link['unitPrice']) * float( # link['quantity']) if float(link['quantity']) < 50000 else 0 # except: # log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % ( # link['unitPrice'], link['quantity'])) if order_begin != "" and order_end != "": order_begin_year = int(order_begin.split("-")[0]) order_end_year = int(order_end.split("-")[0]) # 限制附件错误识别时间 if order_begin_year >= 2050 or order_begin_year < 2000 or order_end_year >= 2050 
or order_end_year < 2000:
                            # Clear out-of-range years (attachment/OCR noise): accepted range is [2000, 2050).
                            order_begin = order_end = ""
                        # print(budget, order_time)
                        # A row carrying both a budget and an order time contributes a demand record.
                        if budget != "" and order_time != "":
                            link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
                                    'order_begin': order_begin, 'order_end': order_end}
                            if link not in demand_link:
                                demand_link.append(link)
                    # Assemble the result dicts; empty shells keep downstream schema stable.
                    if len(product_link) > 0:
                        attr_dic = {'product_attrs': {'data': product_link, 'header': list(set(headers)),
                                                      'header_col': list(set(header_col))}}
                        get_product_attrs = True
                    else:
                        attr_dic = {'product_attrs': {'data': [], 'header': [], 'header_col': []}}
                    if len(demand_link) > 0:
                        demand_dic = {'demand_info': {'data': demand_link, 'header': headers_demand, 'header_col': header_col}}
                    else:
                        demand_dic = {'demand_info': {'data': [], 'header': [], 'header_col': []}}
                    product_attrs[0] = attr_dic
                    # Demand info from tables wins; only fill in when it was empty.
                    if len(product_attrs[1]['demand_info']['data']) == 0:
                        product_attrs[1] = demand_dic
            # Stop at the first outline section that produced product attributes.
            if get_product_attrs:
                break
        # print('predict_by_text: ', product_attrs)
        return product_attrs

    def add_product_attrs(self, channel_dic, product_attrs, list_sentences, list_entitys, list_outlines, product_list, codeName, prem, text, page_time):
        # Entry point: run the fallback extractors only when the table-based
        # pass produced nothing for the corresponding slot.
        if channel_dic['docchannel']['docchannel'] == "采购意向" and len(product_attrs[1]['demand_info']['data']) == 0:
            product_attrs = self.predict_without_table(product_attrs, list_sentences, list_entitys, codeName, prem, text, page_time)
        if len(product_attrs[0]['product_attrs']['data']) == 0:
            product_attrs = self.predict_by_text(product_attrs, text, list_outlines, product_list, page_time)
        if len(product_attrs[1]['demand_info']['data']) > 0:
            for d in product_attrs[1]['demand_info']['data']:
                for product in set(prem[0]['product']):
                    if product in d['project_name'] and product not in d['product']:
                        d['product'].append(product)  # add products that appear in the project name into the demand element


# docchannel (document channel) type classification
class DocChannel():
    def __init__(self, life_model='/channel_savedmodel/channel.pb', type_model='/channel_savedmodel/doctype.pb', config=None):
        self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
self.mask, self.mask_title = self.load_life(life_model,config) self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\ self.type_mask, self.type_mask_title = self.load_type(type_model) self.sequen_len = 200 # 150 200 self.title_len = 30 self.sentence_num = 10 self.kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预' lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯'] lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告'] self.id2type = {k: v for k, v in enumerate(lb_type)} self.id2life = {k: v for k, v in enumerate(lb_life)} self.load_pattern() def load_pattern(self): self.type_dic = { '土地矿产': '供地结果|(土地|用地|宗地|地块|海域|矿)的?(基本信息|基本情况|概况|信息|详情|来源|用途|性质|编号|位置|坐落|使用年限|出让年限)|(土地|山地|农田)(经营权)?(出让|出租|招租|租赁|承包|流转)|流转土地', '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|活动|信息|结果|成交|主体|标的|资产|财产|方式|类型|流程|程序|规则|价格|保证金|时间)|(公开|进行|密封)(拍卖|变卖|竞拍)|第[一二三]次拍卖|(资产|司法|网络)拍卖|交易方式.{,2}拍卖|拍卖会', '产权交易': '(产权|资产|权证)的?(类型|信息|名称|编号|(基本)?情况)|(经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量)(挂牌|转让|出让)|竞价销售|销售结果|房屋所有权房产|免租期限|交易期限|(受让|转让|承租|出租)(人|方)|(店面|店铺|商铺|铺位?|门面|门市|食堂|饭堂|校舍|车位|停车场|厂?房|仓?库|馆|资产|物业|房产|房屋|场地|农田|鱼?塘)\w{,4}(处置|招租|出租|续租|租赁|转让)|(出租|转让|产权|资产)(项目|中标|成交|流标|废标)|出租(用途|类型)|转让底价|租赁(标的物|情况)', '采招数据': '(采购|招标)(条件|范围|文件|内容)|(申请人|投标人|供应商|报价人|参选人)的?资格要求;' # |变更|答疑|澄清|中标|成交|合同|废标|流标 |(采购|招标|代理)(人|机构|单位)| } self.title_type_dic = { '土地矿产': '(土地|用地|宗地|荒地|山地|海域|矿)(出让|出租|招租|租赁|承包|流转|使用权|经营权|征收|划拨|中标|成交)|供地结果|矿业权|探矿权|采矿权|(土地|用地|宗地|地块)(使用权)?(终止|中止|网上)?(挂牌|出让|拍卖|招拍|划拨)|征收土地', '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|公示)|拍卖|变卖|流拍|竞拍', '产权交易': '经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让', '采招数据': '(采购|招标|询价|议价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判)的?(公告|公示|中标|成交|结果|$)|工程招标|定点服务', # |竞价 采招/产权都有竞价方式 # 意向|需求|预公?告|报建|总承包|工程|施工|设计|勘察|代理|监理 |变更|答疑|澄清|中标|成交|合同|废标|流标 '新闻资讯': 
'(考试|面试|笔试)成绩|成绩的?(公告|公示|公布)|公开招聘|招聘(公告|简章|启事|合同制)|疫情防控\s{,5}(通知|情况|提示)|行政审批结果' } self.life_dic = { '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示', '采购意向neg': '发布政府采购意向|采购意向公告已于', '招标预告': '(预计|计划)(采购|招标)(时间|日期)|采购(计划编号|需求方案|预告|预案)|(预|需求)公示|需求(方案|信息|论证|公告|公示)', '招标公告': '(采购|招标|竞选|报名)条件|报名(时间|流程|方法|要求|\w{,5}材料)[:\s]|[^\w]成交规则|参加竞价采购交易资格|(申请人|投标人|供应商|报价人|参选人)的?资格要求|获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)', '资审结果': '资审及业绩公示|资审结果及业绩|资格后审情况报告|资格(后审|预审|审查)结果(公告|公示)|(预审|审查)工作已经?结束|未通过原因', #|资格 '招标答疑': '现澄清(为|如下)|答疑补遗|澄清内容如下|第[0-9一二三四五]次澄清|答疑澄清|(最高(投标)?限价|控制价|拦标价)公示', # |异议的回复 '公告变更': '第[\d一二]次变更|(更正|变更)(公告|公示|信息|内容|事项|原因|理由|日期|时间|如下)|原公告((主要)?(信息|内容)|发布时间)|(变更|更正)[前后]内容|现?在?(变更|更正|修改|更改)(内容)?为|(公告|如下|信息|内容|事项|结果|文件|发布|时间|日期)(更正|变更)', '公告变更neg': '履约变更内容', '候选人公示': '候选人公示|评标结果公示|中标候选人名单公示|现将中标候选人(进行公示|公[示布]如下)|(中标|中选)候选人(信息|情况)[::\s]', '候选人公示neg': '中标候选人公示期|中标候选人公示前', '中标信息': '供地结果信息|采用单源直接采购的?情况说明|[特现]?将\w{,4}(成交|中标|中选|选定结果|选取结果|入围结果|竞价结果)\w{,4}(进行公示|公[示布]如下)|(询价|竞价|遴选)(成交|中标|中选)(公告|公示)|(成交|中标|中选|选定|选取|入围|询价)结果(如下|公告|公示)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|((中标|中选)(人|成交)|成交)\w{,3}(信息|情况)[::\s]', '中标信息2': '\s(成交|中标|中选)(信息|日期|时间|总?金额|价格)[::\s]|(采购|招标|成交|中标|中选|评标)结果|单一来源(采购|招标)?的?(中标|成交|结果)|项目已结束', # |单一来源采购原因|拟采取单一来源方式采购|单一来源采购公示 '中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让)(供应商|供货商|服务商|机构|企业|公司|单位|候选人|人)(信息[,:]?)?(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]', # |唯一 '中标信息neg': '按项目控制价下浮\d%即为成交价|成交原则|不得确定为(中标|成交)|招标人按下列原则选择中标人|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)[:\s]|竞拍起止时间:|询价结果[\s\n::]*不公开|本项目已具备招标条件|现对该项目进行招标公告|发布\w{2}结果后\d天内送达|本次\w{2}结果不对外公示|供应商\s*资格要求|成交情况:\s*[流废]标|中标单位:本次招标拟?中标单位\d家|通知中标单位|影响(成交|中标)结果', # |确定成交供应商[:,\s] '合同公告': '合同(公告|公示|信息|内容)|合同(编号|名称|主体|基本情况|完成(日期|时间))|(供应商乙方|乙方供应商):|合同总?金额|履约信息', '废标公告': 
'(终止|中止|废标|流标|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):?废标|((本|该)(项目|标段|合同|合同包|采购包|次)\w{,5})((失败|终止|流标|废标)|予以废标|(按|做|作)?(流标|废标|废置)处理)|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答|项目)(终止|中止|废标|流标|失败|作废|异常|撤销)', '废标公告2': '(无效|中止|终止|废标|流标|失败|作废|异常|撤销)的?(原因|理由)|本项目因故取消|本(项目|次)(公开)?\w{2}失败|已终止\s*原因:|(人|人数|供应商|单位)(不足|未达\w{,3}数量)|已终止|不足[3三]家|无(废标)|成交情况:\s*[流废]标|现予以废置', '废标公告neg': '超过此报价将作为[废流]标处理|否则按[废流]标处理|终止规则:|成交规则:|视为流标|竞价失败的一切其他情形' } self.title_life_dic = { '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示|意向公开', '招标预告': '预公?告|预公示|报建公告|(批前|标前)公示|(供应|招标)计划表?$|(论证|征求|征集)(供应商)?意见|意见征询|需求评审公告|需求(公告|公示|意见)', '公告变更': '第[\d一二]次变更|(变更|更正(事项)?|更改|延期|暂停)(招标|采购)?的?(公告|公示|通知)|变更$|更正$', '招标答疑': '质疑|澄清|答疑(文件)?|补遗书?|(最高(投标)?限价|控制价|拦标价)(公示|公告|$)', '废标公告': '(终止|中止|废标|废除|废置|流标|失败|作废|异常|撤销|撤回|取消成?交?|流拍)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)|关于废置', '合同公告': '(合同(成交|变更)?)(公告|公示|信息|公式|公开|签订)|合同备案|合同书|合同$', # |(履约|验收)(结果)? '候选人公示': '候选人(变更)?公示|评标(结果)?公示|中标前?公示|中标预公示', '中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|未?入围(公示|公告)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书|中标$', # |开标(记录|信息|情况) '资审结果': '((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示', '招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)|(资审|预审|后审)公告', '开标记录': '开标记录|截标信息|评委名单公示|开标安排|开标数据表|开标信息|开标情况|开标一览表|开标结果', '验收合同': '(验收|履约)(公告|公示)|(验收|履约)(结果|报告|意见|单)(公告|公示)' } def load_life(self,life_model,config): with tf.Graph().as_default() as graph: output_graph_def = graph.as_graph_def() with open(os.path.dirname(__file__)+life_model, 'rb') as f: output_graph_def.ParseFromString(f.read()) tf.import_graph_def(output_graph_def, name='') # print("%d ops in the final graph" % len(output_graph_def.node)) del output_graph_def sess = tf.Session(graph=graph,config=config) sess.run(tf.global_variables_initializer()) inputs = sess.graph.get_tensor_by_name('inputs/inputs:0') prob = 
sess.graph.get_tensor_by_name('inputs/dropout:0') title = sess.graph.get_tensor_by_name('inputs/title:0') mask = sess.graph.get_tensor_by_name('inputs/mask:0') mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0') # logit = sess.graph.get_tensor_by_name('output/logit:0') softmax = sess.graph.get_tensor_by_name('output/softmax:0') return sess, title, inputs, prob, softmax, mask, mask_title def load_type(self,type_model): with tf.Graph().as_default() as graph: output_graph_def = graph.as_graph_def() with open(os.path.dirname(__file__)+type_model, 'rb') as f: output_graph_def.ParseFromString(f.read()) tf.import_graph_def(output_graph_def, name='') # print("%d ops in the final graph" % len(output_graph_def.node)) del output_graph_def sess = tf.Session(graph=graph) sess.run(tf.global_variables_initializer()) inputs = sess.graph.get_tensor_by_name('inputs/inputs:0') prob = sess.graph.get_tensor_by_name('inputs/dropout:0') title = sess.graph.get_tensor_by_name('inputs/title:0') mask = sess.graph.get_tensor_by_name('inputs/mask:0') mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0') # logit = sess.graph.get_tensor_by_name('output/logit:0') softmax = sess.graph.get_tensor_by_name('output/softmax:0') return sess, title, inputs, prob, softmax, mask, mask_title def predict_process(self, docid='', doctitle='', dochtmlcon=''): # print('准备预处理') def get_kw_senten(s, span=10): doc_sens = [] tmp = 0 num = 0 end_idx = 0 for it in re.finditer(self.kws, s): # '|'.join(keywordset) left = s[end_idx:it.end()].split() right = s[it.end():].split() tmp_seg = s[tmp:it.start()].split() if len(tmp_seg) > span or tmp == 0: doc_sens.append(' '.join(left[-span:] + right[:span])) end_idx = it.end() + 1 + len(' '.join(right[:span])) tmp = it.end() num += 1 if num >= self.sentence_num: break if doc_sens == []: doc_sens.append(s) return doc_sens def word2id(wordlist, max_len=self.sequen_len): ids = [getIndexOfWords(w) for w in wordlist] ids = ids[:max_len] if len(ids) >= 
# NOTE(review): the two methods below belong to the enclosing channel-classifier
# class (header outside this mangled chunk); re-indent into that class when the
# file is repaired.  Fixes applied: the bare `except:` now catches only
# Exception, and the dead local `cost_time` was removed.
def predict_process(self, docid='', doctitle='', dochtmlcon=''):
    """Tokenise title/content and build the classifier's input word lists.

    Args:
        docid: unused document id (kept for interface compatibility).
        doctitle: raw document title; segmented with ``selffool``.
        dochtmlcon: pre-segmented document content (space-separated words).

    Returns:
        ``(datas, datas_title)`` — one-element lists holding the content word
        list and the title word list.
    """
    def get_kw_senten(s, span=10):
        # Collect up to self.sentence_num keyword-centred windows of `span`
        # words on each side of every self.kws regex hit.
        doc_sens = []
        tmp = 0
        num = 0
        end_idx = 0
        for it in re.finditer(self.kws, s):
            left = s[end_idx:it.end()].split()
            right = s[it.end():].split()
            tmp_seg = s[tmp:it.start()].split()
            # NOTE(review): the original indentation was lost in this dump;
            # the grouping below (state updated only when a window is
            # accepted) is the most plausible reading — confirm upstream.
            if len(tmp_seg) > span or tmp == 0:
                doc_sens.append(' '.join(left[-span:] + right[:span]))
                end_idx = it.end() + 1 + len(' '.join(right[:span]))
                tmp = it.end()
                num += 1
                if num >= self.sentence_num:
                    break
        if doc_sens == []:
            doc_sens.append(s)  # no keyword hit: fall back to the whole text
        return doc_sens

    def word2id(wordlist, max_len=self.sequen_len):
        # Map words to vocabulary ids, truncating / zero-padding to max_len.
        # NOTE(review): appears unused in the visible body; kept as-is.
        ids = [getIndexOfWords(w) for w in wordlist]
        ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
        assert len(ids) == max_len
        return ids

    datas = []
    datas_title = []
    try:
        segword_title = ' '.join(selffool.cut(doctitle)[0])
        segword_content = dochtmlcon
    except Exception:  # was a bare except; segmentation failure -> empty inputs
        segword_content = ''
        segword_title = ''
    if isinstance(segword_content, float):  # guards against NaN from pandas
        segword_content = ''
    if isinstance(segword_title, float):
        segword_title = ''
    # Re-join segmentation artefacts and strip boilerplate tokens.
    segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
        replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
        replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
    # Keep only CJK characters and whitespace.
    segword_title = re.sub('[^\s\u4e00-\u9fa5]', '', segword_title)
    segword_content = re.sub('[^\s\u4e00-\u9fa5]', '', segword_content)
    doc_word_list = segword_content.split()
    if len(doc_word_list) > self.sequen_len / 2:
        # Long document: head (first 100 words) + keyword-centred windows.
        doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
        doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
    else:
        doc_sens = ' '.join(doc_word_list[:self.sequen_len])
    datas.append(doc_sens.split())
    datas_title.append(segword_title.split())
    return datas, datas_title

def is_houxuan(self, title, content):
    """Decide whether a notice is a bid-candidate publicity announcement.

    Args:
        title: notice title.
        content: notice body (only the first 100 characters are inspected).

    Returns:
        1 if it is a candidate publicity notice, 0 otherwise.
    """
    if re.search('候选人的?公示|评标结果|评审结果|中标公示', title):
        if re.search('变更公告|更正公告|废标|终止|答疑|澄清', title):
            return 0
        return 1
    if re.search('候选人的?公示', content[:100]):
        if re.search('公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清', content[:100]):
            return 0
        return 1
    else:
        return 0
'doctype':not_extract_dic[original_docchannel], "original_docchannel_id": str(original_docchannel)}} if web_source_no in ['02104-7']: return {'docchannel': {'docchannel':'', 'doctype':'采招数据'}} if isinstance(list_sentence, list): token_l = [it.tokens for it in list_sentence] tokens = [it for l in token_l for it in l] content = ' '.join(tokens[:500]) title = re.sub('[^\u4e00-\u9fa5]', '', title) if len(title)>50: title = title[:20]+title[-30:] data_content, data_title = self.predict_process(docid='', doctitle=title[-50:], dochtmlcon=content) # 标题最多取50字 text_len = len(data_content[0]) if len(data_content[0])]*richTextFetch', html) # if ser and len(re.sub('[^\u4e00-\u9fa5]', '', html[:ser.start()]))>500: # html = html[:ser.start()]+'##richTextFetch##' if ser: if len(re.sub('[^\u4e00-\u9fa5]', '', html[:ser.start()])) > 200: html = html[:ser.start()] + '##richTextFetch##' else: html = html[:ser.start() + 500] text = re.sub('<[^<]*?>', '', html).replace(' ', ' ') # text = re.sub('http[0-9a-zA-Z-.:/]+|[0-9a-zA-Z-./@]+', '', text) text = re.sub('\s+', ' ', text) # text = re.sub('[/|[()()]', '', text) text = cut_single_cn_space(text) return text[:20000] def count_diffser(pattern, text): num = 0 kw = [] for p in pattern.split(';'): if re.search(p, text): num += 1 kw.append(re.search(p, text).group(0)) return num, ';'.join(kw) def is_contain_winner(extract_json): if re.search('win_tenderer', extract_json): return True else: return False def is_single_source(bidway, title): if re.search('单一来源|单一性采购', title): return True elif bidway == '单一来源': return True else: return False def get_type(title, text): if re.search(self.title_type_dic['土地矿产'], title) or re.search(self.type_dic['土地矿产'], text): # and re.search('(土地|用地|宗地|地块)(经营权)?(流转|承包|出租|招租|租赁|确权)', text)==None if re.search(self.title_type_dic['采招数据'], title + text.strip().split(' ')[0]): return '采招数据', re.search(self.title_type_dic['采招数据'], title + text.strip().split(' ')[0]).group(0) return '土地矿产', 
(re.search(self.title_type_dic['土地矿产'], title) or re.search(self.type_dic['土地矿产'], text)).group(0) elif (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)): if re.search(self.title_type_dic['采招数据'], title + text.strip().split(' ')[0]): return '采招数据', re.search(self.title_type_dic['采招数据'], title + text.strip().split(' ')[0]).group(0) return '拍卖出让', (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)).group(0) elif re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text): if re.search(self.title_type_dic['采招数据'], title + text.strip().split(' ')[0]): return '采招数据', re.search(self.title_type_dic['采招数据'], title + text.strip().split(' ')[0]).group(0) return '产权交易', (re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text)).group(0) elif re.search(self.title_type_dic['采招数据'], title) or re.search(self.type_dic['采招数据'], title + text): return '采招数据', ( re.search(self.title_type_dic['采招数据'], title) or re.search(self.type_dic['采招数据'], title + text)).group( 0) elif re.search(self.title_type_dic['新闻资讯'], title): if re.search(self.title_type_dic['采招数据'], title +text.strip().split(' ')[0]): return '采招数据', re.search(self.title_type_dic['采招数据'], title +text.strip().split(' ')[0]).group(0) return '新闻资讯', re.search(self.title_type_dic['新闻资讯'], title).group(0) else: return '', '没有公告类型关键词,返回空' def get_life(title, text): title = re.sub('[-()()0-9a-z]|第?[二三四]次公?告?', '', title) first_line = text.split()[0] if len(text.split()) > 2 else '' if title.strip()[-2:] not in ['公告', '公示'] and 5 < len(first_line) < 50 and first_line[-2:] in ['公告', '公示']: # print('title: ', title, first_line) title += first_line def count_score(l): return len(l) + len(set(l)) * 2 life_kw_title = {} life_kw_content = {} life_score = {} # msc = "" # 查找标题每个类别关键词 for k, v in self.title_life_dic.items(): k2 = re.sub('[\da-z]', '', k) if k2 not in life_kw_title: life_kw_title[k2] = [] for 
it in re.finditer(v, title): life_kw_title[k2].append(it.group(0)) # 查找正文每个类别关键词 for k, v in self.life_dic.items(): k2 = re.sub('[\da-z]', '', k) if k2 not in life_kw_content: life_kw_content[k2] = {'pos': [], 'neg': []} for it in re.finditer(v, text): if 'neg' not in k: life_kw_content[k2]['pos'].append(it.group(0)) else: life_kw_content[k2]['neg'].append(it.group(0)) for k2 in life_kw_content: life_score[k2] = count_score(life_kw_content[k2]['pos']) - count_score( life_kw_content[k2]['neg']) life_kw_title = {k: v for k, v in life_kw_title.items() if v != []} life_kw_content = {k: v for k, v in life_kw_content.items() if life_score[k] > 0} msc = [life_kw_title, life_kw_content, life_score] msc = json.dumps(msc, ensure_ascii=False) max_score = 0 life_list = [] for k in life_score.keys(): if life_score[k] > max_score: max_score = life_score[k] life_list = [k] elif life_score[k] == max_score and life_score[k] > 0: life_list.append(k) if '采购意向' in life_kw_title or '采购意向' in life_list: if '中标信息' in life_kw_title or '中标信息' in life_list: return '中标信息', msc elif set(['候选人公示', '合同公告']) & set(life_kw_title) != set(): return '', msc return '采购意向', msc elif '招标预告' in life_kw_title or '招标预告' in life_list: if '中标信息' in life_kw_title or '中标信息' in life_list: return '中标信息', msc elif set(['候选人公示', '合同公告']) & set(life_kw_title) != set(): return '', msc return '招标预告', msc elif '公告变更' in life_kw_title or '公告变更' in life_list: if life_score.get('候选人公示', 0) > 3 or '候选人公示' in life_kw_title: return '候选人公示', msc elif life_score.get('合同公告', 0) > 3 or '合同公告' in life_kw_title: return '合同公告', msc elif life_score.get('中标信息', 0) > 3 or '中标信息' in life_kw_title: return '中标信息', msc elif '招标公告' in life_kw_title and re.search('变更|更正', title[-4:])==None and life_score.get('公告变更', 0) < 4: return '招标公告', msc return '公告变更', msc elif '招标答疑' in life_kw_title or '招标答疑' in life_list: if '招标公告' in life_kw_title and life_score.get('招标答疑', 0) < 4: return '招标公告', msc elif life_score.get('招标答疑', 0) < max_score: if 
max_score > 3 and len(life_list) == 1: return life_list[0], msc return '', msc return '招标答疑', msc elif '开标记录' in life_kw_title: if '开标结果' in title and is_contain_winner(prem_json): return '中标信息', msc return '开标记录', msc elif '验收合同' in life_kw_title: return '验收合同', msc elif '候选人公示' in life_kw_title or '候选人公示' in life_list: if '招标公告' in life_kw_title and life_score.get('招标公告', 0) > 3: return '招标公告', msc elif '废标公告' in life_kw_title or life_score.get('废标公告', 0) > 5: return '废标公告', msc return '候选人公示', msc elif '合同公告' in life_kw_title or '合同公告' in life_list: if '招标公告' in life_kw_title and life_score.get('招标公告', 0) > 3: return '招标公告', msc elif '废标公告' in life_kw_title or life_score.get('废标公告', 0) > 5: return '废标公告', msc return '合同公告', msc elif '中标信息' in life_kw_title or '中标信息' in life_list: if '招标公告' in life_kw_title and life_score.get('招标公告', 0) > 2: # (life_score.get('招标公告', 0)>2 or life_score.get('中标信息', 0)<4) 0.7886409793924245 return '招标公告', msc elif '废标公告' in life_kw_title or life_score.get('废标公告', 0) > 5: return '废标公告', msc elif life_score.get('候选人公示', 0) > 3: return '候选人公示', msc elif life_score.get('合同公告', 0) > 5: return '合同公告', msc return '中标信息', msc elif '废标公告' in life_kw_title or '废标公告' in life_list: if life_score.get('招标公告', 0) > 3 and '废标公告' not in life_kw_title: return '招标公告', msc return '废标公告', msc elif '资审结果' in life_kw_title or '资审结果' in life_list: return '资审结果', msc elif '招标公告' in life_kw_title or '招标公告' in life_list: return '招标公告', msc return '', msc def get_model_inputs(list_sentence): list_sentence = sorted(list_sentence, key=lambda x:x.sentence_index) token_l = [it.tokens for it in list_sentence] tokens = [it for l in token_l for it in l] content = ' '.join(tokens[:500]) data_content, data_title = self.predict_process(docid='', doctitle=title[-50:], dochtmlcon=content) # 标题最多取50字 text_len = len(data_content[0]) if len(data_content[0]) < self.sequen_len else self.sequen_len title_len = len(data_title[0]) if len(data_title[0]) < self.title_len else 
self.title_len array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128)) array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128)) return array_content, array_title ,text_len, title_len, content def type_model_predict(): pred = self.type_sess.run(self.type_softmax, feed_dict={ self.type_title: array_title, self.type_content: array_content, self.type_mask: [[0] * text_len + [1] * (self.sequen_len - text_len)], self.type_mask_title: [[0] * title_len + [1] * (self.title_len - title_len)], self.type_prob: 1} ) id = np.argmax(pred, axis=1)[0] prob = pred[0][id] return id, prob def life_model_predict(): pred = self.lift_sess.run(self.lift_softmax, feed_dict={ self.lift_title: array_title, self.lift_content: array_content, self.mask: [[0] * text_len + [1] * (self.sequen_len - text_len)], self.mask_title: [[0] * title_len + [1] * (self.title_len - title_len)], self.lift_prob: 1} ) id = np.argmax(pred, axis=1)[0] prob = pred[0][id] return id, prob def final_change(msc): ''' 修改逻辑: 1、中标公告、合同公告无中标人且原始为非中标,返回原类型 2、废标公告有中标人且标题无废标关键词,返回中标信息 3、答疑公告标题无答疑关键且原始为招标,返回原始类别 4、招标公告有中标人且原始为中标,返回中标信息 5、预测为招标,原始为预告、意向,返回原始类别 6、预测及原始均在变更、答疑,返回原始类别 7、预测为采招数据,原始为产权且有关键词,返回原始类别 8、废标公告原始为招标、预告且标题无废标关键期,返回原始类别 9、若预测为非采招数据且源网为采招数据且有招标关键词返回采招数据 10、招标公告有中标人,且标题有直购关键词,改为中标信息 11、预测预告,原始为意向、招标且标题无预告关键词,返回原始类别 ''' if result['docchannel']['docchannel'] in ['中标信息', '合同公告'] and origin_dic.get( original_docchannel, '') in ['招标公告', '采购意向', '招标预告', '公告变更'] and is_contain_winner(prem_json)==False: result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '') msc += '最终规则修改:中标公告、合同公告无中标人且原始为非中标,返回原类型' elif result['docchannel']['docchannel'] == '废标公告' and is_contain_winner(prem_json) and re.search( self.title_life_dic['废标公告'], title) == None: result['docchannel']['docchannel'] = '中标信息' msc += '最终规则修改:预测为废标却有中标人且标题无废标关键词改为中标信息;' elif result['docchannel']['docchannel'] in ['招标答疑'] and re.search( self.title_life_dic['招标答疑'], title) == None and 
origin_dic.get( original_docchannel, '') in ['招标公告', '采购意向', '招标预告']: result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '') msc += '最终规则修改:答疑公告标题无答疑关键且原始为招标,返回原始类别;' elif result['docchannel']['docchannel'] == '招标公告' and is_contain_winner(prem_json) and origin_dic.get( original_docchannel, '') == '中标信息': result['docchannel']['docchannel'] = '中标信息' msc += '最终规则修改:预测为招标公告却有中标人且原始为中标改为中标信息;' elif result['docchannel']['docchannel'] in ['招标公告'] and origin_dic.get( original_docchannel, '') in ['采购意向', '招标预告']: result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '') msc += '最终规则修改:预测为招标,原始为预告、意向,返回原始类别' elif result['docchannel']['docchannel'] in ['招标预告'] and origin_dic.get( original_docchannel, '') in ['采购意向', '招标公告'] and re.search( self.title_life_dic['招标预告'], title)==None: result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '') msc += '最终规则修改:预测预告,原始为意向、招标且标题无预告关键词,返回原始类别' elif result['docchannel']['docchannel'] in ['招标答疑', '公告变更'] and origin_dic.get( original_docchannel, '') in ['招标答疑', '公告变更']: result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '') msc += '最终规则修改:预测及原始均在答疑、变更,返回原始类别' elif result['docchannel']['doctype'] == '采招数据' and origin_dic.get( original_docchannel, '') in ['产权交易', '土地矿产'] and re.search('产权|转让|受让|招租|出租|承租|竞价|资产', text): result['docchannel']['doctype'] = origin_dic.get(original_docchannel, '') msc += '最终规则修改:预测为采招数据,原始为产权且有关键词,返回原始类别' elif result['docchannel']['docchannel'] == '废标公告' and origin_dic.get( original_docchannel, '') in ['招标公告', '采购意向', '招标预告'] and re.search( self.title_life_dic['废标公告'], title) == None: result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '') msc += '最终规则修改:废标公告原始为招标、预告且标题无废标关键期,返回原始类别;' elif result['docchannel']['docchannel'] in ['招标公告', '招标预告'] and is_contain_winner( prem_json) and re.search('直购', title): result['docchannel']['docchannel'] = '中标信息' msc += "最终规则修改:预测为招标却有中标人且标题有直购关键词返回中标" if 
result['docchannel']['doctype'] in ['产权交易', '土地矿产', '拍卖出让'] and origin_dic.get( original_docchannel, '') not in ['产权交易', '土地矿产', '拍卖出让'] \ and re.search('产权|转让|受让|招租|招商|出租|承租|竞价|资产|挂牌|出让|拍卖|招拍|划拨|销售', title) == None\ and re.search('(采购|招投?标|投标)(信息|内容|项目|公告|数量|人|单位|方式)|(建设|工程|服务|施工|监理|勘察|设计)项目', text): result['docchannel']['doctype'] = '采招数据' msc += ' 最终规则修改:预测为非采招数据,原始为采招数据且有招标关键词,返回采招数据' '''下面是新格式增加返回字段''' if result['docchannel']['docchannel'] != '': # 预测到生命周期的复制到life_docchannel,否则用数据源结果 result['docchannel']['life_docchannel'] = result['docchannel']['docchannel'] else: result['docchannel']['life_docchannel'] = origin_dic.get(original_docchannel, '原始类别') return msc not_extract_dic = { 104: '招标文件', 106: '法律法规', 107: '新闻资讯', 108: '拟建项目', 109: '展会推广', 110: '企业名录', 111: '企业资质', 112: '全国工程人员', 113: '业主采购' } origin_dic = {51: '公告变更', 52: '招标公告', 101: '中标信息', 102: '招标预告', 103: '招标答疑', 104: '招标文件', 105: '资审结果', 106: '法律法规', 107: '新闻资讯', 108: '拟建项目', 109: '展会推广', 110: '企业名录', 111: '企业资质', 112: '全国工程', 113: '业主采购', 114: '采购意向', 115: '拍卖出让', 116: '土地矿产', 117: '产权交易', 118: '废标公告', 119: '候选人公示', 120: '合同公告'} if original_docchannel in not_extract_dic: return {'docchannel': {'docchannel': '', 'doctype': not_extract_dic[original_docchannel], 'life_docchannel': origin_dic.get(original_docchannel, '原始类别')}}, '公告类别不在提取范围' if web_source_no in ['02104-7', '04733', 'DX007628-6']: # 这些数据源无法识别 return {'docchannel': {'docchannel': '', 'doctype': '采招数据', 'life_docchannel': origin_dic.get(original_docchannel, '原始类别')}}, '此数据源公告分类不明确,返回数据源类别' title = re.sub('[^\u4e00-\u9fa5]+|出租车', '', title) if len(title) > 50: title = title[:20] + title[-30:] text = html2text(html) prem_json = json.dumps(prem, ensure_ascii=False) result = {'docchannel': {'docchannel': '', 'doctype': ''}} doc_type, type_kw = get_type(title, text) # doc_life, life_kw = get_life(title, text, prem_json, bidway, original_docchannel) doc_life, life_kw = get_life(title, text) if doc_type in self.title_type_dic: 
# NOTE(review): the tail of the channel classifier's corrupted `predict` that
# shared these mangled lines is not reproducible from this dump and is not
# reproduced here.
# Deposit payment-way extraction: pull payment-method keywords out of the
# sentence that states how the bid deposit must be paid.
class DepositPaymentWay():
    def __init__(self, ):
        # pattern 1: "保证金…方式: <value>"  -> value is capture group 3
        self.pt = '(保证金的?(交纳|缴纳|应按下列|入账|支付)方式)[::]*([^,。]{,60})'
        # pattern 2: "保证金必须以<value>方式" -> value is capture group 2
        self.pt2 = '保证金(必?须以|必?须?通过|以)(.{,8})方式'
        kws = ['银行转账', '公?对公方?式?转账', '对公转账', '柜台转账', '(线上|网上)自?行?(缴纳|交纳|缴退|收退)',
               '网上银行支付', '现金存入', '直接缴纳', '支票', '汇票', '本票', '电汇', '转账', '汇款',
               '随机码', '入账', '基本账户转出', '基本账户汇入', '诚信库中登记的账户转出', '银行保函',
               '电子保函', '担保函', '保证保险', '合法担保机构出具的担保', '金融机构、担保机构出具的保函']
        # longest-first so the alternation prefers the most specific keyword
        self.kws = sorted(kws, key=lambda x: len(x), reverse=True)

    def predict(self, content):
        """Extract deposit payment methods from ``content``.

        Returns:
            ``{'deposit_patment_way': 'kw1;kw2;...'}`` — empty string when no
            pattern matches (key spelling kept for API compatibility).
        """
        pay_way = {'deposit_patment_way': ''}
        kw_pattern = '|'.join(self.kws)
        # The two patterns previously had duplicated extraction branches;
        # the first matching pattern wins, exactly as before.
        for pattern, group_idx in ((self.pt, 3), (self.pt2, 2)):
            match = re.search(pattern, content)
            if match:
                segment = match.group(group_idx)
                found = [m.group(0) for m in re.finditer(kw_pattern, segment)]
                pay_way['deposit_patment_way'] = ';'.join(found)
                return pay_way
        return pay_way
# Project labelling: match configured keyword lists against the title and the
# combined "core fields" (title + products + project name + package names).
class ProjectLabel():
    def __init__(self, ):
        self.keyword_list = self.get_label_keywords()
        self.kongjing_keyword_list = self.get_kongjing_keywords()

    def get_label_keywords(self):
        """Read project_label_keywords.csv -> [(type, keyword, kw_excludes, type_excludes)]."""
        import csv
        path = os.path.dirname(__file__) + '/project_label_keywords.csv'
        rows = []
        with open(path, 'r', encoding='utf-8') as f:
            for row in csv.reader(f):
                if row[0] == '类型':  # header row
                    continue
                label_type = row[0]
                keyword = row[1]
                kw_excludes = str(row[2])
                kw_excludes = kw_excludes if kw_excludes and kw_excludes != 'nan' else ""
                type_excludes = str(row[3])
                type_excludes = type_excludes if type_excludes and type_excludes != 'nan' else ""
                rows.append((label_type, keyword, kw_excludes, type_excludes))
        return rows

    def get_kongjing_keywords(self):
        """Read kongjing_label_keywords.csv -> [(keyword, keyword2, search_type, info_types)]."""
        import csv
        path = os.path.dirname(__file__) + '/kongjing_label_keywords.csv'
        rows = []
        with open(path, 'r', encoding='utf-8') as f:
            for row in csv.reader(f):
                if row[0] == '关键词':  # header row
                    continue
                keyword = row[0]
                keyword2 = str(row[1])
                keyword2 = keyword2 if keyword2 and keyword2 != 'nan' else ""
                search_type = row[2]
                info_types = str(row[3])
                info_types = info_types if info_types and info_types != 'nan' else ""
                rows.append((keyword, keyword2, search_type, info_types))
        return rows

    def predict(self, doctitle, product, project_name, prem):
        """Match label keywords against the title and the core fields.

        Returns:
            ``{"标题": {label: [(keyword, count), ...]}, "核心字段": {...}}``
        """
        doctitle = doctitle if doctitle else ""
        product = product if product else ""
        product = ",".join(set(product.split(',')))  # de-duplicate product words
        project_name = project_name if project_name else ""
        tenderee = ""
        agency = ""
        sub_project_names = []  # package (标段) names pulled out of prem
        try:
            for pack_name, pack in prem[0]['prem'].items():
                sub_project_names.append(pack_name)
                for link in pack['roleList']:
                    if link['role_name'] == 'tenderee' and tenderee == "":
                        tenderee = link['role_text']
                    if link['role_name'] == 'agency' and agency == "":
                        agency = link['role_text']
        except Exception as e:
            # prem may be absent or malformed; labels are still computed
            pass
        sub_project_names = ";".join(sub_project_names)
        # Core text = title + products + project name + package names.
        main_text = ",".join([doctitle, product, project_name, sub_project_names])
        # Strip tenderee / agency names so they cannot trigger labels.
        if tenderee:
            doctitle = doctitle.replace(tenderee, " ")
            main_text = main_text.replace(tenderee, " ")
        if agency:
            doctitle = doctitle.replace(agency, " ")
            main_text = main_text.replace(agency, " ")
        doctitle_dict = dict()
        main_text_dict = dict()
        for label_type, keyword, kw_excludes, type_excludes in self.keyword_list:
            exclude_re = "|".join(kw_excludes.split('、'))
            # a type-level exclude word anywhere in the core text vetoes the label
            if type_excludes and re.search("|".join(type_excludes.split('、')), main_text):
                continue
            if doctitle and keyword in doctitle:
                if not (exclude_re and re.search(exclude_re, doctitle)):
                    doctitle_dict.setdefault(label_type, []).append((keyword, doctitle.count(keyword)))
            if main_text and keyword in main_text:
                if not (exclude_re and re.search(exclude_re, main_text)):
                    main_text_dict.setdefault(label_type, []).append((keyword, main_text.count(keyword)))
        # Sort each title label's keywords by hit count (desc).
        for hits in doctitle_dict.values():
            hits.sort(key=lambda x: x[1], reverse=True)
        # Keep at most the 10 title labels with the highest aggregate counts.
        if len(doctitle_dict) > 10:
            ranked = [(lbl, sum(w[1] for w in doctitle_dict[lbl])) for lbl in doctitle_dict]
            ranked.sort(key=lambda x: x[1], reverse=True)
            for lbl, _ in ranked[10:]:
                doctitle_dict.pop(lbl)
        # Core-field labels whose keywords matched exactly once are dropped,
        # unless *every* label matched exactly once.
        single_hit_labels = []
        for lbl, hits in main_text_dict.items():
            if sum(j[1] for j in hits) == 1:
                single_hit_labels.append(lbl)
            hits.sort(key=lambda x: x[1], reverse=True)
        if len(single_hit_labels) < len(main_text_dict):
            for lbl in single_hit_labels:
                main_text_dict.pop(lbl)
        # Keep at most the 10 core-field labels with the highest aggregate counts.
        if len(main_text_dict) > 10:
            ranked = [(lbl, sum(w[1] for w in main_text_dict[lbl])) for lbl in main_text_dict]
            ranked.sort(key=lambda x: x[1], reverse=True)
            for lbl, _ in ranked[10:]:
                main_text_dict.pop(lbl)
        return {"标题": doctitle_dict, "核心字段": main_text_dict}

    def predict_other(self, project_label, industry, doctitle, project_name, product, list_articles):
        """Append the '空净通' label when any kongjing keyword rule matches."""
        # Body text: article content before the attachment marker.
        doctextcon = list_articles[0].content.split('##attachment##')[0]
        info_type = industry.get('industry', {}).get("class_name", "")
        doctitle = doctitle if doctitle else ""
        product = product if product else ""
        product = ",".join(set(product.split(',')))  # de-duplicate product words
        project_name = project_name if project_name else ""
        matched = False
        matched_words = []
        for keyword, keyword2, search_type, info_types in self.kongjing_keyword_list:
            allowed_types = info_types.split("|") if info_types else []
            search_text = ""
            if search_type == '正文':
                search_text = ",".join([doctextcon, doctitle, project_name, product])
            elif search_type == '产品':
                search_text = ",".join([doctitle, project_name, product])
            if search_type == '行业':
                # industry rules compare the classified industry name directly
                if info_type == keyword:
                    matched_words.append(keyword)
                    matched = True
            else:
                if keyword in search_text:
                    if keyword2 and keyword2 not in search_text:
                        continue
                    if allowed_types and info_type not in allowed_types:
                        continue
                    if keyword2:
                        matched_words.append(keyword + '+' + keyword2)
                    else:
                        matched_words.append(keyword)
                    matched = True
        if matched:
            project_label["核心字段"]["空净通"] = [[word, 1] for word in matched_words][:10]
        return project_label
# Total-price / unit-price flagging for extracted money entities.
# NOTE(review): the IndustryPredictor __init__ that shared this mangled line
# runs past the visible chunk and is not reproduced here.
class TotalUnitMoney:
    def __init__(self):
        pass

    def predict(self, list_sentences, list_entitys):
        """Mark each money entity as a total price or a unit price.

        Mutates the entities in place: sets ``is_total_money`` / ``is_unit_money``
        to 1 when the surrounding sentence text supports it.
        """
        for doc_idx, entity_list in enumerate(list_entitys):
            for ent in entity_list:
                if ent.entity_type != 'money':
                    continue
                sentence_text = list_sentences[doc_idx][ent.sentence_index].sentence_text
                offsets = [ent.wordOffset_begin, ent.wordOffset_end]
                if ent.label == 1:
                    # bid-amount entities: look for "total money" wording
                    if extract_total_money(sentence_text, ent.entity_text, offsets):
                        ent.is_total_money = 1
                else:
                    # ordinary money entities: look for "unit price" wording
                    if extract_unit_money(sentence_text, ent.entity_text, offsets):
                        ent.is_unit_money = 1
79: '建筑物、构筑物附属结构', 80: '建筑物拆除和场地准备活动', 81: '建筑装饰和装修业', 82: '录音制作', 83: '影视节目制作', 84: '房地产中介服务', 85: '房地产开发经营', 86: '房地产租赁经营', 87: '房屋租赁', 88: '招标代理', 89: '探矿、采矿、选矿和造块设备', 90: '政法、检测专用设备', 91: '教育服务', 92: '教育设备', 93: '文物及非物质文化遗产保护', 94: '文物和陈列品', 95: '文艺创作与表演', 96: '文艺设备', 97: '新闻业', 98: '旅行社及相关服务', 99: '日杂用品', 100: '有色金属冶炼及压延产品', 101: '有色金属矿', 102: '木材、板材等', 103: '木材采集和加工设备', 104: '机械设备', 105: '机械设备经营租赁', 106: '林业产品', 107: '林业服务', 108: '架线和管道工程建筑', 109: '核工业专用设备', 110: '橡胶制品', 111: '殡葬服务', 112: '殡葬设备及用品', 113: '气象服务', 114: '水上交通运输设备', 115: '水上运输业', 116: '水利和水运工程建筑', 117: '水工机械', 118: '水文服务', 119: '水资源管理', 120: '污水处理及其再生利用', 121: '汽车、摩托车修理与维护', 122: '法律服务', 123: '洗染服务', 124: '测绘地理信息服务', 125: '海洋仪器设备', 126: '海洋工程建筑', 127: '海洋服务', 128: '消防设备', 129: '清洁服务', 130: '渔业产品', 131: '渔业服务', 132: '炼焦和金属冶炼轧制设备', 133: '烟草加工设备', 134: '热力生产和供应', 135: '焦炭及其副产品', 136: '煤炭采选产品', 137: '燃气生产和供应业', 138: '物业管理', 139: '特种用途动、植物', 140: '环保咨询', 141: '环境与生态监测检测服务', 142: '环境污染防治设备', 143: '环境治理业', 144: '玻璃及其制品', 145: '理发及美容服务', 146: '生态保护', 147: '电信', 148: '电力、城市燃气、蒸汽和热水、水', 149: '电力供应', 150: '电力工业专用设备', 151: '电力工程施工', 152: '电力生产', 153: '电子和通信测量仪器', 154: '电工、电子专用生产设备', 155: '电影放映', 156: '电气安装', 157: '电气设备', 158: '电气设备修理', 159: '畜牧业服务', 160: '监控设备', 161: '石油制品', 162: '石油和化学工业专用设备', 163: '石油和天然气开采产品', 164: '石油天然气开采专用设备', 165: '研究和试验发展', 166: '社会工作', 167: '社会经济咨询', 168: '科技推广和应用服务业', 169: '科研、医疗、教育用房', 170: '管道和设备安装', 171: '粮油作物和饲料加工设备', 172: '纸、纸制品及印刷品', 173: '纺织原料、毛皮、被服装具', 174: '纺织设备', 175: '绿化管理', 176: '缝纫、服饰、制革和毛皮加工设备', 177: '航空器及其配套设备', 178: '航空客货运输', 179: '航空航天工业专用设备', 180: '节能环保工程施工', 181: '装卸搬运', 182: '计算机和办公设备维修', 183: '计算机设备', 184: '计量标准器具及量具、衡器', 185: '货币处理专用设备', 186: '货币金融服务', 187: '质检技术服务', 188: '资本市场服务', 189: '车辆', 190: '边界勘界和联检专用设备', 191: '运行维护服务', 192: '通信设备', 193: '通用设备修理', 194: '道路货物运输', 195: '邮政专用设备', 196: '邮政业', 197: '采矿业和制造业服务', 198: '铁路、船舶、航空航天等运输设备修理', 199: '铁路、道路、隧道和桥梁工程建筑', 200: '铁路运输设备', 201: '防洪除涝设施管理', 202: '陶瓷制品', 203: '雷达、无线电和卫星导航设备', 204: '非金属矿', 205: 
'非金属矿物制品工业专用设备', 206: '非金属矿物材料', 207: '食品加工专用设备', 208: '食品及加工盐', 209: '餐饮业', 210: '饮料、酒精及精制茶', 211: '饮料加工设备', 212: '饲养动物及其产品', 213: '黑色金属冶炼及压延产品', 214: '黑色金属矿'} self.industry_dic = {'专业施工': {'大类': '专业施工', '门类': '建筑业'}, '专用仪器仪表': {'大类': '专用设备', '门类': '零售批发'}, '专用设备修理': {'大类': '金属制品、机械和设备修理业', '门类': '金属制品、机械和设备修理业'}, '互联网信息服务': {'大类': '互联网和相关服务', '门类': '信息传输、软件和信息技术服务业'}, '互联网安全服务': {'大类': '互联网和相关服务', '门类': '信息传输、软件和信息技术服务业'}, '互联网平台': {'大类': '互联网和相关服务', '门类': '信息传输、软件和信息技术服务业'}, '互联网接入及相关服务': {'大类': '互联网和相关服务', '门类': '信息传输、软件和信息技术服务业'}, '人力资源服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'}, '人造原油': {'大类': '炼焦产品、炼油产品', '门类': '零售批发'}, '仓储业': {'大类': '装卸搬运和运输代理业', '门类': '交通运输、仓储和邮政业'}, '仪器仪表': {'大类': '通用设备', '门类': '零售批发'}, '仪器仪表修理': {'大类': '金属制品、机械和设备修理业', '门类': '金属制品、机械和设备修理业'}, '会计、审计及税务服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'}, '会议、展览及相关服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'}, '住宅、商业用房': {'大类': '房屋建筑业', '门类': '建筑业'}, '体育场地设施管理': {'大类': '体育', '门类': '文化、体育和娱乐业'}, '体育组织': {'大类': '体育', '门类': '文化、体育和娱乐业'}, '体育设备': {'大类': '专用设备', '门类': '零售批发'}, '保险服务': {'大类': '保险业', '门类': '金融业'}, '信息处理和存储支持服务': {'大类': '软件和信息技术服务业', '门类': '信息传输、软件和信息技术服务业'}, '信息技术咨询服务': {'大类': '软件和信息技术服务业', '门类': '信息传输、软件和信息技术服务业'}, '信息系统集成和物联网技术服务': {'大类': '软件和信息技术服务业', '门类': '信息传输、软件和信息技术服务业'}, '修缮工程': {'大类': '修缮工程', '门类': '建筑业'}, '健康咨询': {'大类': '商务服务业', '门类': '租赁和商务服务业'}, '公路旅客运输': {'大类': '道路运输业', '门类': '交通运输、仓储和邮政业'}, '其他专业咨询与调查': {'大类': '商务服务业', '门类': '租赁和商务服务业'}, '其他专业技术服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'}, '其他交通运输设备': {'大类': '专用设备', '门类': '零售批发'}, '其他公共设施管理': {'大类': '公共设施管理业', '门类': '水利、环境和公共设施管理业'}, '其他土木工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'}, '其他工程服务': {'大类': '工程服务', '门类': '科学研究和技术服务业'}, '其他建筑建材': {'大类': '建筑建材', '门类': '零售批发'}, '其他运输业': {'大类': '其他运输业', '门类': '交通运输、仓储和邮政业'}, '农业和林业机械': {'大类': '专用设备', '门类': '零售批发'}, '农业服务': {'大类': '农林牧副渔服务', '门类': '农林牧副渔服务'}, '农产品': {'大类': '农林牧渔业产品', '门类': '零售批发'}, '农副食品,动、植物油制品': {'大类': '食品、饮料和烟草原料', '门类': '零售批发'}, '出版业': {'大类': '新闻和出版业', '门类': '文化、体育和娱乐业'}, '办公消耗用品及类似物品': 
{'大类': '办公消耗用品及类似物品', '门类': '零售批发'}, '办公设备': {'大类': '通用设备', '门类': '零售批发'}, '化学原料及化学制品': {'大类': '基础化学品及相关产品', '门类': '零售批发'}, '化学纤维': {'大类': '基础化学品及相关产品', '门类': '零售批发'}, '化学药品和中药专用设备': {'大类': '专用设备', '门类': '零售批发'}, '医疗设备': {'大类': '专用设备', '门类': '零售批发'}, '医药品': {'大类': '医药品', '门类': '零售批发'}, '卫星传输服务': {'大类': '电信、广播电视和卫星传输服务', '门类': '信息传输、软件和信息技术服务业'}, '卫生': {'大类': '卫生', '门类': '卫生和社会工作'}, '印刷服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'}, '图书和档案': {'大类': '图书和档案', '门类': '零售批发'}, '图书档案设备': {'大类': '通用设备', '门类': '零售批发'}, '图书馆与档案馆': {'大类': '文化艺术业', '门类': '文化、体育和娱乐业'}, '土地管理业': {'大类': '土地管理业', '门类': '水利、环境和公共设施管理业'}, '地质勘查': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'}, '地震服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'}, '场馆、站港用房': {'大类': '房屋建筑业', '门类': '建筑业'}, '城市公共交通运输': {'大类': '道路运输业', '门类': '交通运输、仓储和邮政业'}, '塑料制品、半成品及辅料': {'大类': '橡胶、塑料、玻璃和陶瓷制品', '门类': '零售批发'}, '天然石料': {'大类': '建筑建材', '门类': '零售批发'}, '娱乐设备': {'大类': '专用设备', '门类': '零售批发'}, '婚姻服务': {'大类': '居民服务业', '门类': '居民服务、修理和其他服务业'}, '安全保护服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'}, '安全生产设备': {'大类': '专用设备', '门类': '零售批发'}, '家具用具': {'大类': '家具用具', '门类': '零售批发'}, '家用电器修理': {'大类': '机动车、电子产品和日用产品修理业', '门类': '居民服务、修理和其他服务业'}, '工业、生产用房': {'大类': '房屋建筑业', '门类': '建筑业'}, '工业与专业设计及其他专业技术服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'}, '工矿工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'}, '工程技术与设计服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'}, '工程机械': {'大类': '专用设备', '门类': '零售批发'}, '工程监理服务': {'大类': '工程服务', '门类': '科学研究和技术服务业'}, '工程评价服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'}, '工程造价服务': {'大类': '工程服务', '门类': '科学研究和技术服务业'}, '市场调查': {'大类': '商务服务业', '门类': '租赁和商务服务业'}, '广告业': {'大类': '商务服务业', '门类': '租赁和商务服务业'}, '广播': {'大类': '广播、电视、电影和影视录音制作业', '门类': '文化、体育和娱乐业'}, '广播、电视、电影设备': {'大类': '通用设备', '门类': '零售批发'}, '广播电视传输服务': {'大类': '电信、广播电视和卫星传输服务', '门类': '信息传输、软件和信息技术服务业'}, '废弃资源综合利用业': {'大类': '废弃资源综合利用业', '门类': '废弃资源综合利用业'}, '建筑涂料': {'大类': '建筑建材', '门类': '零售批发'}, '建筑物、构筑物附属结构': {'大类': '建筑建材', '门类': '零售批发'}, '建筑物拆除和场地准备活动': {'大类': '建筑装饰和其他建筑业', '门类': '建筑业'}, '建筑装饰和装修业': {'大类': '建筑装饰和其他建筑业', '门类': 
'建筑业'}, '录音制作': {'大类': '广播、电视、电影和影视录音制作业', '门类': '文化、体育和娱乐业'}, '影视节目制作': {'大类': '广播、电视、电影和影视录音制作业', '门类': '文化、体育和娱乐业'}, '房地产中介服务': {'大类': '房地产业', '门类': '房地产业'}, '房地产开发经营': {'大类': '房地产业', '门类': '房地产业'}, '房地产租赁经营': {'大类': '房地产业', '门类': '房地产业'}, '房屋租赁': {'大类': '租赁业', '门类': '租赁和商务服务业'}, '招标代理': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'}, '探矿、采矿、选矿和造块设备': {'大类': '专用设备', '门类': '零售批发'}, '政法、检测专用设备': {'大类': '专用设备', '门类': '零售批发'}, '教育服务': {'大类': '教育服务', '门类': '教育'}, '教育设备': {'大类': '专用设备', '门类': '零售批发'}, '文体设备和用品出租': {'大类': '租赁业', '门类': '租赁和商务服务业'}, '文物及非物质文化遗产保护': {'大类': '文化艺术业', '门类': '文化、体育和娱乐业'}, '文物和陈列品': {'大类': '文物和陈列品', '门类': '零售批发'}, '文艺创作与表演': {'大类': '文化艺术业', '门类': '文化、体育和娱乐业'}, '文艺设备': {'大类': '专用设备', '门类': '零售批发'}, '新闻业': {'大类': '新闻和出版业', '门类': '文化、体育和娱乐业'}, '旅行社及相关服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'}, '日杂用品': {'大类': '日杂用品', '门类': '零售批发'}, '有色金属冶炼及压延产品': {'大类': '建筑建材', '门类': '零售批发'}, '有色金属矿': {'大类': '矿与矿物', '门类': '零售批发'}, '木材、板材等': {'大类': '建筑建材', '门类': '零售批发'}, '木材采集和加工设备': {'大类': '专用设备', '门类': '零售批发'}, '机械设备': {'大类': '通用设备', '门类': '零售批发'}, '机械设备经营租赁': {'大类': '租赁业', '门类': '租赁和商务服务业'}, '林业产品': {'大类': '农林牧渔业产品', '门类': '零售批发'}, '林业服务': {'大类': '农林牧副渔服务', '门类': '农林牧副渔服务'}, '架线和管道工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'}, '核工业专用设备': {'大类': '专用设备', '门类': '零售批发'}, '橡胶制品': {'大类': '橡胶、塑料、玻璃和陶瓷制品', '门类': '零售批发'}, '殡葬服务': {'大类': '居民服务业', '门类': '居民服务、修理和其他服务业'}, '殡葬设备及用品': {'大类': '专用设备', '门类': '零售批发'}, '气象服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'}, '水上交通运输设备': {'大类': '专用设备', '门类': '零售批发'}, '水上运输业': {'大类': '水上运输业', '门类': '交通运输、仓储和邮政业'}, '水利和水运工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'}, '水工机械': {'大类': '专用设备', '门类': '零售批发'}, '水文服务': {'大类': '水利管理业', '门类': '水利、环境和公共设施管理业'}, '水资源管理': {'大类': '水利管理业', '门类': '水利、环境和公共设施管理业'}, '污水处理及其再生利用': {'大类': '水的生产和供应业', '门类': '电力、热力、燃气及水生产和供应业'}, '汽车、摩托车修理与维护': {'大类': '机动车、电子产品和日用产品修理业', '门类': '居民服务、修理和其他服务业'}, '法律服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'}, '洗染服务': {'大类': '居民服务业', '门类': '居民服务、修理和其他服务业'}, '测绘地理信息服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'}, '海洋仪器设备': 
{'大类': '专用设备', '门类': '零售批发'}, '海洋工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'}, '海洋服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'}, '消防设备': {'大类': '专用设备', '门类': '零售批发'}, '清洁服务': {'大类': '其他服务业', '门类': '居民服务、修理和其他服务业'}, '渔业产品': {'大类': '农林牧渔业产品', '门类': '零售批发'}, '渔业服务': {'大类': '农林牧副渔服务', '门类': '农林牧副渔服务'}, '炼焦和金属冶炼轧制设备': {'大类': '专用设备', '门类': '零售批发'}, '烟草加工设备': {'大类': '专用设备', '门类': '零售批发'}, '热力生产和供应': {'大类': '电力、热力生产和供应业', '门类': '电力、热力、燃气及水生产和供应业'}, '焦炭及其副产品': {'大类': '炼焦产品、炼油产品', '门类': '零售批发'}, '煤炭采选产品': {'大类': '矿与矿物', '门类': '零售批发'}, '燃气生产和供应业': {'大类': '燃气生产和供应业', '门类': '电力、热力、燃气及水生产和供应业'}, '物业管理': {'大类': '房地产业', '门类': '房地产业'}, '特种用途动、植物': {'大类': '农林牧渔业产品', '门类': '零售批发'}, '环保咨询': {'大类': '商务服务业', '门类': '租赁和商务服务业'}, '环境与生态监测检测服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'}, '环境污染防治设备': {'大类': '专用设备', '门类': '零售批发'}, '环境治理业': {'大类': '生态保护和环境治理业', '门类': '水利、环境和公共设施管理业'}, '玻璃及其制品': {'大类': '橡胶、塑料、玻璃和陶瓷制品', '门类': '零售批发'}, '理发及美容服务': {'大类': '居民服务业', '门类': '居民服务、修理和其他服务业'}, '生态保护': {'大类': '生态保护和环境治理业', '门类': '水利、环境和公共设施管理业'}, '电信': {'大类': '电信、广播电视和卫星传输服务', '门类': '信息传输、软件和信息技术服务业'}, '电力、城市燃气、蒸汽和热水、水': {'大类': '电力、城市燃气、蒸汽和热水、水', '门类': '零售批发'}, '电力供应': {'大类': '电力、热力生产和供应业', '门类': '电力、热力、燃气及水生产和供应业'}, '电力工业专用设备': {'大类': '专用设备', '门类': '零售批发'}, '电力工程施工': {'大类': '土木工程建筑业', '门类': '建筑业'}, '电力生产': {'大类': '电力、热力生产和供应业', '门类': '电力、热力、燃气及水生产和供应业'}, '电子和通信测量仪器': {'大类': '通用设备', '门类': '零售批发'}, '电工、电子专用生产设备': {'大类': '专用设备', '门类': '零售批发'}, '电影放映': {'大类': '广播、电视、电影和影视录音制作业', '门类': '文化、体育和娱乐业'}, '电气安装': {'大类': '建筑安装业', '门类': '建筑业'}, '电气设备': {'大类': '通用设备', '门类': '零售批发'}, '电气设备修理': {'大类': '金属制品、机械和设备修理业', '门类': '金属制品、机械和设备修理业'}, '畜牧业服务': {'大类': '农林牧副渔服务', '门类': '农林牧副渔服务'}, '监控设备': {'大类': '通用设备', '门类': '零售批发'}, '石油制品': {'大类': '炼焦产品、炼油产品', '门类': '零售批发'}, '石油和化学工业专用设备': {'大类': '专用设备', '门类': '零售批发'}, '石油和天然气开采产品': {'大类': '矿与矿物', '门类': '零售批发'}, '石油天然气开采专用设备': {'大类': '专用设备', '门类': '零售批发'}, '研究和试验发展': {'大类': '研究和试验发展', '门类': '科学研究和技术服务业'}, '社会工作': {'大类': '社会工作', '门类': '卫生和社会工作'}, '社会经济咨询': {'大类': '商务服务业', '门类': '租赁和商务服务业'}, 
'科技推广和应用服务业': {'大类': '科技推广和应用服务业', '门类': '科学研究和技术服务业'}, '科研、医疗、教育用房': {'大类': '房屋建筑业', '门类': '建筑业'}, '管道和设备安装': {'大类': '建筑安装业', '门类': '建筑业'}, '粮油作物和饲料加工设备': {'大类': '专用设备', '门类': '零售批发'}, '纸、纸制品及印刷品': {'大类': '纸、纸制品及印刷品', '门类': '零售批发'}, '纺织原料、毛皮、被服装具': {'大类': '纺织原料、毛皮、被服装具', '门类': '零售批发'}, '纺织设备': {'大类': '专用设备', '门类': '零售批发'}, '绿化管理': {'大类': '公共设施管理业', '门类': '水利、环境和公共设施管理业'}, '缝纫、服饰、制革和毛皮加工设备': {'大类': '专用设备', '门类': '零售批发'}, '航空器及其配套设备': {'大类': '专用设备', '门类': '零售批发'}, '航空客货运输': {'大类': '航空运输业', '门类': '交通运输、仓储和邮政业'}, '航空航天工业专用设备': {'大类': '专用设备', '门类': '零售批发'}, '节能环保工程施工': {'大类': '土木工程建筑业', '门类': '建筑业'}, '装卸搬运': {'大类': '装卸搬运和运输代理业', '门类': '交通运输、仓储和邮政业'}, '计算机和办公设备维修': {'大类': '机动车、电子产品和日用产品修理业', '门类': '居民服务、修理和其他服务业'}, '计算机设备': {'大类': '通用设备', '门类': '零售批发'}, '计量标准器具及量具、衡器': {'大类': '通用设备', '门类': '零售批发'}, '货币处理专用设备': {'大类': '专用设备', '门类': '零售批发'}, '货币金融服务': {'大类': '货币金融服务', '门类': '金融业'}, '质检技术服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'}, '资本市场服务': {'大类': '资本市场服务', '门类': '金融业'}, '车辆': {'大类': '通用设备', '门类': '零售批发'}, '边界勘界和联检专用设备': {'大类': '专用设备', '门类': '零售批发'}, '运行维护服务': {'大类': '软件和信息技术服务业', '门类': '信息传输、软件和信息技术服务业'}, '通信设备': {'大类': '通用设备', '门类': '零售批发'}, '通用设备修理': {'大类': '金属制品、机械和设备修理业', '门类': '金属制品、机械和设备修理业'}, '道路货物运输': {'大类': '道路运输业', '门类': '交通运输、仓储和邮政业'}, '邮政专用设备': {'大类': '专用设备', '门类': '零售批发'}, '邮政业': {'大类': '邮政业', '门类': '交通运输、仓储和邮政业'}, '采矿业和制造业服务': {'大类': '采矿业和制造业服务', '门类': '农林牧副渔服务'}, '铁路、船舶、航空航天等运输设备修理': {'大类': '金属制品、机械和设备修理业', '门类': '金属制品、机械和设备修理业'}, '铁路、道路、隧道和桥梁工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'}, '铁路运输设备': {'大类': '专用设备', '门类': '零售批发'}, '防洪除涝设施管理': {'大类': '水利管理业', '门类': '水利、环境和公共设施管理业'}, '陶瓷制品': {'大类': '橡胶、塑料、玻璃和陶瓷制品', '门类': '零售批发'}, '雷达、无线电和卫星导航设备': {'大类': '通用设备', '门类': '零售批发'}, '非金属矿': {'大类': '矿与矿物', '门类': '零售批发'}, '非金属矿物制品工业专用设备': {'大类': '专用设备', '门类': '零售批发'}, '非金属矿物材料': {'大类': '建筑建材', '门类': '零售批发'}, '食品加工专用设备': {'大类': '专用设备', '门类': '零售批发'}, '食品及加工盐': {'大类': '食品、饮料和烟草原料', '门类': '零售批发'}, '餐饮业': {'大类': '餐饮业', '门类': '住宿和餐饮业'}, '饮料、酒精及精制茶': {'大类': '食品、饮料和烟草原料', '门类': 
'零售批发'}, '饮料加工设备': {'大类': '专用设备', '门类': '零售批发'}, '饲养动物及其产品': {'大类': '农林牧渔业产品', '门类': '零售批发'}, '黑色金属冶炼及压延产品': {'大类': '建筑建材', '门类': '零售批发'}, '黑色金属矿': {'大类': '矿与矿物', '门类': '零售批发'}} self.sess = tf.Session(graph=tf.Graph()) self.get_model() with open(os.path.dirname(__file__)+'/industry_rule_kw_json/tw_industry_keyword_org/tw_industry_keyword_org.json', 'r', encoding='utf-8') as fp1: self.json_data_industry = json.load(fp1) with open(os.path.dirname(__file__)+'/industry_rule_kw_json/tw_company_classification_keyword/tw_company_classification_keyword.json', 'r', encoding='utf-8') as fp2: self.json_data_company = json.load(fp2) with open(os.path.dirname(__file__)+'/industry_rule_kw_json/tw_custom_keyword/tw_custom_keyword.json', 'r', encoding='utf-8') as fp3: self.json_data_custom = json.load(fp3) def get_model(self): with self.sess.as_default() as sess: with self.sess.graph.as_default(): meta_graph_def = tf.saved_model.loader.load(sess, tags=['serve'], export_dir=os.path.dirname(__file__)+'/industry_model') signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY signature_def = meta_graph_def.signature_def self.title = sess.graph.get_tensor_by_name(signature_def[signature_key].inputs['title'].name) self.project = sess.graph.get_tensor_by_name(signature_def[signature_key].inputs['project'].name) self.product = sess.graph.get_tensor_by_name(signature_def[signature_key].inputs['product'].name) self.outputs = sess.graph.get_tensor_by_name(signature_def[signature_key].outputs['outputs'].name) def text2array(self, text, tenderee='', maxSententLen=20): tenderee = tenderee.replace('(', '(').replace(')', ')') text = text.replace('(', '(').replace(')', ')') text = re.sub( '(废标|终止|综?合?评审|评标|开标|资审|履约|验收|成交|中标人?|中选人?|单一来源|合同|候选人|结果|变更|更正|答疑|澄清|意向|需求|采购|招标|询比?价|磋商|谈判|比选|比价|竞价|议价)的?(公告|预告|公示)?|关于为?|选取|定点|直接|邀请函?|通知书?|备案|公开|公示|公告|记录|竞争性', '', text) text = text.replace(tenderee, '') text = ' ' if text=="" else text words_docs_list = selffool.cut(text) 
words_docs_list = [[it for it in l if re.search('^[\u4e00-\u9fa5]+$', it)][-maxSententLen:] for l in words_docs_list] array = embedding(words_docs_list, shape=(len(words_docs_list), maxSententLen, 128)) return array def process(self, title, project, product, tenderee): return self.text2array(title, tenderee), self.text2array(project, tenderee), self.text2array(product) def predict_model(self, title, project, product, tenderee=''): title_array, project_array, product_array = self.process(title, project, product, tenderee) rs = self.sess.run(self.outputs, feed_dict={ self.title:title_array, self.project:project_array, self.product:product_array } ) pred = np.argmax(rs[0]) return self.id2lb[pred], rs[0][pred] # # 返回top2 结果 # pred_list = np.argsort(-rs[0]) # return self.id2lb[pred_list[0]], self.id2lb[pred_list[1]], rs[0][pred_list[0]], rs[0][pred_list[1]] def predict_rule(self, doctitle, tenderee, win_tenderer, project_name, product): doctitle = doctitle if doctitle else '' tenderee = tenderee if tenderee else '' win_tenderer = win_tenderer if win_tenderer else '' project_name = project_name if project_name else '' product = product if product else '' text_ind = (doctitle + project_name + product).replace(tenderee, '') text_com = win_tenderer length_ind_text = len(text_ind) + 1 length_com_text = len(text_com) + 1 # print(text) dic_res = {} # 行业分类字典 score_lst = [] # 得分列表 word_lst = [] # 关键词列表 # 主要内容关键词 if text_ind: # logging.info("data_ind%s"%str(_json_data_industry[0])) for data_industry in self.json_data_industry: industry = data_industry['xiaolei'] key_word = data_industry['key_word'] key_word_2 = data_industry['key_word2'] power = float(data_industry['power']) if data_industry['power'] else 0 this_score = power * (text_ind.count(key_word) * len(key_word) / length_ind_text) if key_word_2: # key_word_compose = key_word + "+" + key_word_2 if text_ind.count(key_word_2) == 0: this_score = 0 if this_score > 0: # print(industry,key_word,this_score) if industry in 
dic_res.keys(): dic_res[industry] += this_score else: dic_res[industry] = this_score if key_word not in word_lst: word_lst.append(key_word) # 供应商关键词 if text_com: for data_company in self.json_data_company: industry = data_company['industry_type'] key_word = data_company['company_word'] power = float(data_company['industry_rate']) if data_company['industry_rate'] else 0 this_score = power * (text_com.count(key_word) * len(key_word) / length_com_text) if this_score > 0: # print(industry,key_word,this_score) if industry in dic_res.keys(): dic_res[industry] += this_score else: dic_res[industry] = this_score if key_word not in word_lst: word_lst.append(key_word) # 自定义关键词 if text_ind: custom_ind = [ ['tenderee', '医院|疾病预防', ['设备', '系统', '器'], '医疗设备'], ['tenderee', '学校|大学|小学|中学|学院|幼儿园', ['设备', '器'], '教育设备'], ['tenderee', '学校|大学|小学|中学|学院|幼儿园|医院', ['工程'], '科研、医疗、教育用房'], ['tenderee', '供电局|电网|国网|电力|电厂|粤电', ['设备', '器', '物资'], '电力工业专用设备'], ['tenderee', '公安|法院|检察院', ['设备', '器'], '政法、检测专用设备'], ['tenderee', '^中铁|^中交|^中建|中国建筑', ['材料'], '其他建筑建材'], ['doctextcon', '信息技术服务|系统开发|信息化|信息系统', ['监理'], '信息技术咨询服务'], ['doctextcon', '工程', ['消防'], '专业施工'], ['doctextcon', '铁路|航空|船舶|航天|广铁', ['维修'], '铁路、船舶、航空航天等运输设备修理'], ['doctextcon', '设备|仪|器', ['租赁'], '机械设备经营租赁'], ['doctextcon', '交通|铁路|公路|道路|桥梁', ['工程'], '铁路、道路、隧道和桥梁工程建筑'], ['win_tenderer', '电力', ['设备', '器'], '电力工业专用设备'], ['win_tenderer', '信息|网络科技', ['系统'], '信息系统集成和物联网技术服务'], ['tenderee,doctextcon', '铁路|广铁|铁道', ['设备', '器', '物资', '材料', '铁路'], '铁路运输设备'], ] for data_custom in self.json_data_custom: industry_custom = data_custom['industry'] key_word = data_custom['company_word'] power = float(data_custom['industry_rate']) for k in range(len(custom_ind)): subject = '' if 'tenderee' in custom_ind[k][0]: subject += tenderee if 'win_tenderer' in custom_ind[k][0]: subject += win_tenderer if 'doctextcon' in custom_ind[k][0]: subject += text_ind ptn = custom_ind[k][1] # print('ptn',ptn) if re.search(ptn, subject) and industry_custom in custom_ind[k][2]: 
industry = custom_ind[k][3] else: continue this_score = power * (text_ind.count(key_word) * len(key_word) / len(subject)) if this_score > 0: # print(industry,key_word,this_score) if industry in dic_res.keys(): dic_res[industry] += this_score else: dic_res[industry] = this_score if key_word not in word_lst: word_lst.append(key_word) sort_res = sorted(dic_res.items(), key=lambda x: x[1], reverse=True) lst_res = [s[0] for s in sort_res] score_lst = [str(round(float(s[1]), 2)) for s in sort_res] if len(lst_res) > 0: return lst_res, score_lst, word_lst else: return [""], [], [] def predict_merge(self, pinmu_type, industry_lst): ''' 通过一系列规则最终决定使用模型还是规则的结果 :param pinmu_type: 模型预测类别 :param industry_lst: 规则预测类别列表 :return: ''' industry_type = industry_lst[0] if industry_type == "": return pinmu_type if industry_type == '专用设备修理' and re.search('修理|维修|装修|修缮', pinmu_type): final_type = pinmu_type elif industry_type == '其他土木工程建筑' and re.search('工程|建筑|用房|施工|安装|质检|其他专业咨询与调查', pinmu_type): final_type = pinmu_type elif pinmu_type == '专用设备修理' and re.search('工程|修理', industry_type): final_type = industry_type elif pinmu_type == '信息系统集成和物联网技术服务' and re.search('卫星传输|信息处理和存储支持服务|信息技术咨询服务|运行维护服务|其他专业技术服务|医疗设备|医药品', industry_type): final_type = industry_type elif industry_type == '仪器仪表' and re.search('仪器|器具|医疗设备', pinmu_type): final_type = pinmu_type elif industry_type == '医药品' and re.search('医疗设备', pinmu_type): final_type = pinmu_type elif industry_type == '医药品' and re.search('医疗设备', pinmu_type): final_type = pinmu_type elif re.search('设备', industry_type) and re.search('修理|维修', pinmu_type): final_type = pinmu_type elif industry_type == '社会工作' and re.search('工程', pinmu_type): final_type = pinmu_type elif industry_type == '信息系统集成和物联网技术服务' and re.search('信息处理|设备', pinmu_type): final_type = pinmu_type elif industry_type == '研究和试验发展' and re.search('其他专业咨询与调查|质检技术服务|信息系统集成|其他工程服务', pinmu_type): final_type = pinmu_type elif industry_type == '其他专业咨询与调查' and re.search('工程造价服务', pinmu_type): 
final_type = pinmu_type elif industry_type == '广告业' and re.search('印刷服务|影视节目制作|信息系统', pinmu_type): final_type = pinmu_type elif industry_type == '清洁服务' and re.search('工程|环境污染防治设备|修理', pinmu_type): final_type = pinmu_type elif industry_type == '其他公共设施管理' and re.search('信息系统', pinmu_type): final_type = pinmu_type elif industry_type == '其他专业技术服务' and re.search('工程技术与设计服务|质检技术服务|环境与生态监测检测服务', pinmu_type): final_type = pinmu_type elif industry_type == '机械设备经营租赁' and re.search('电信', pinmu_type): final_type = pinmu_type elif industry_type == '货币金融服务' and re.search('信息系统集成和物联网技术服务', pinmu_type): final_type = pinmu_type elif industry_type == '体育场地设施管理' and re.search('体育设备', pinmu_type): final_type = pinmu_type elif industry_type == '安全保护服务' and re.search('信息系统|监控设备|互联网安全服务', pinmu_type): final_type = pinmu_type elif industry_type == '互联网接入及相关服务' and re.search('通信设备', pinmu_type): final_type = pinmu_type elif industry_type == '卫生' and re.search('医疗设备|信息系统', pinmu_type): final_type = pinmu_type elif pinmu_type == '研究和试验发展' and re.search('其他工程服务', industry_type): final_type = industry_type elif pinmu_type == '办公设备' and re.search('教育设备', industry_type): final_type = industry_type elif re.search('车辆|机械设备经营租赁', pinmu_type) and re.search('公路旅客运输', industry_type): final_type = industry_type elif len(industry_lst) > 1 and pinmu_type == industry_lst[1] and re.search('会计|法律|物业|家具|印刷|互联网安全', industry_type) == None \ and re.search('其他|人力资源服务', pinmu_type) == None: final_type = pinmu_type elif industry_type != "": final_type = industry_type else: final_type = pinmu_type return final_type def predict(self, title, project, product, prem): def get_ree_win(prem): tenderee = "" win_tenderer = "" try: for v in prem[0]['prem'].values(): for link in v['roleList']: if link['role_name'] == 'tenderee' and tenderee == "": tenderee = link['role_text'] elif link['role_name'] == 'win_tenderer' and win_tenderer == "": win_tenderer = link['role_text'] except Exception as e: print('解析prem 获取招标人、中标人出错') 
return tenderee, win_tenderer tenderee, win_tenderer = get_ree_win(prem) result_model, prob = self.predict_model(title, project, product, tenderee) industry_lst, score_lst, word_lst = self.predict_rule(title, tenderee, win_tenderer, project, product) final_type = self.predict_merge(result_model, industry_lst) # print('模型:%s;规则:%s;最终:%s'%(result_model, industry_lst[0], final_type)) # return {'industry': final_type} return {'industry': { 'class_name': final_type, 'subclass': self.industry_dic[final_type]['大类'], 'class': self.industry_dic[final_type]['门类'] } } class DistrictPredictor(): def __init__(self): # with open(os.path.dirname(__file__)+'/district_dic.pkl', 'rb') as f: # dist_dic = pickle.load(f) # short_name = '|'.join(sorted(set([v['简称'] for v in dist_dic.values()]), key=lambda x: len(x), reverse=True)) # full_name = '|'.join(sorted(set([v['全称'] for v in dist_dic.values()]), key=lambda x: len(x), reverse=True)) # short2id = {} # full2id = {} # for k, v in dist_dic.items(): # if v['简称'] not in short2id: # short2id[v['简称']] = [k] # else: # short2id[v['简称']].append(k) # if v['全称'] not in full2id: # full2id[v['全称']] = [k] # else: # full2id[v['全称']].append(k) # self.dist_dic = dist_dic # self.short_name = short_name # self.full_name = full_name # self.short2id = short2id # self.full2id = full2id # # self.f = open(os.path.dirname(__file__)+'/../test/data/district_predict.txt', 'w', encoding='utf-8') with open(os.path.dirname(__file__)+'/district_tuple.pkl', 'rb') as f: district_tuple = pickle.load(f) self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic = district_tuple def predict_backup(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""): ''' 先匹配 project_name+tenderee+tenderee_address, 如果缺少省或市 再匹配 title+content :param project_name: :param prem: :param title: :param list_articles: :param web_source_name: :return: ''' def get_ree_addr(prem): tenderee = "" tenderee_address = "" try: for v in 
prem[0]['prem'].values(): for link in v['roleList']: if link['role_name'] == 'tenderee' and tenderee == "": tenderee = link['role_text'] tenderee_address = link['address'] except Exception as e: print('解析prem 获取招标人、及地址出错') return tenderee, tenderee_address def get_area(text, web_source_name, not_in_content=True): score_l = [] id_set = set() if re.search(self.short_name, text): for it in re.finditer(self.full_name, text): name = it.group(0) score = len(name) / len(text) for _id in self.full2id[name]: area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area'])) # score_l.append([_id, score] + area) # w = self.dist_dic[_id]['权重'] score_l.append([_id, score + 1] + area) # 匹配全称的加1 ,不加权重,因为权重某些赋值不好 flag = 0 for it in re.finditer(self.short_name, text): if it.end() < len(text) and re.search('^(村|镇|街|路|江|河|湖|北路|南路|东路|大道|社区)', text[it.end():]) == None: name = it.group(0) score = (it.start() + len(name)) / len(text) for _id in self.short2id[name]: score2 = 0 w = self.dist_dic[_id]['权重'] _type = self.dist_dic[_id]['类型'] area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area'])) if area[0] in ['2', '16', '20', '30']: _type += 10 if w < 1 and it.end() < len(text) and text[it.end()] in ['省', '市', '县']: # 如果简称后面 有省市县权重改为1 w = 1 score2 += w if _id not in id_set: if _type == 20: type_w = 3 elif _type == 30: if it.start()>3 and text[it.start()-1] == '市': # 城市后面 简称不能作为市 type_w = 0 else: type_w = 2 else: if it.end() 0: df_city = df.groupby('city').sum().sort_values(by=['score'], ascending=False) city_id = df_city.index[0] area_dic['city'] = self.dist_dic[city_id]['地区'] df = df[df['district'] != ""] df = df[df['city'] == city_id] if len(df) > 0: df_dist = df.groupby('district').sum().sort_values(by=['score'], ascending=False) dist_id = df_dist.index[0] area_dic['district'] = self.dist_dic[dist_id]['地区'] # print(area_dic) return {'district': area_dic} def get_role_address(text): '''正则匹配获取招标人地址 3:地址直接在招标人后面 招标人:xxx,地址:xxx 4:招标、代理一起,两个地址一起 
招标人:xxx, 代理人:xxx, 地址:xxx, 地址:xxx. ''' p3 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(联系)?地址:(?P(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])' p4 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(招标|采购)?代理(人|机构)(名称)?:[\w()]{4,15},(联系)?地址:(?P(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])' p5 = '(采购|招标)(人|单位)(联系)?地址:(?P(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])' if re.search(p3, text): return re.search(p3, text).group('addr') elif re.search(p4, text): return re.search(p4, text).group('addr') elif re.search(p5, text): return re.search(p5, text).group('addr') else: return '' def get_project_addr(text): p1 = '(项目(施工|实施)?|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?):(?P(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])' if re.search(p1, text): return re.search(p1, text).group('addr') else: return '' def get_bid_addr(text): p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?):(?P(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])' if re.search(p2, text): return re.search(p2, text).group('addr') else: return '' def get_all_addr(list_entitys): tenderee_l = [] addr_l = [] for ent in list_entitys[0]: if ent.entity_type == 'location' and len(ent.entity_text)>2: addr_l.append(ent.entity_text) elif ent.entity_type in ['org', 'company']: if ent.label in [0, 1]: # 加招标或代理 tenderee_l.append(ent.entity_text) return ' '.join(addr_l), ' '.join(tenderee_l) def get_title_addr(text): p1 = '(?P(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])' if re.search(p1, text): return re.search(p1, text).group('addr') else: return '' if '##attachment##' in list_articles[0].content: content, attachment = list_articles[0].content.split('##attachment##') if len(content) < 200: content += attachment else: content = list_articles[0].content tenderee, tenderee_address = get_ree_addr(prem) msc = "" pro_addr = get_project_addr(content) if pro_addr != "": msc += '使用规则提取的项目地址;' tenderee_address = pro_addr else: role_addr = get_role_address(content) if role_addr != "": msc += 
'使用规则提取的联系人地址;' tenderee_address = role_addr if tenderee_address == "": title_addr = get_title_addr(title) if title_addr != "": msc += '使用规则提取的标题地址;' tenderee_address = title_addr else: bid_addr = get_bid_addr(content) if bid_addr != "": msc += '使用规则提取的开标地址;' tenderee_address = bid_addr project_name = str(project_name) tenderee = str(tenderee) # print('招标人地址',role_addr, tenderee_address) project_name = project_name + title if project_name not in title else project_name project_name = project_name.replace(tenderee, '') text1 = "{0} {1} {2}".format(project_name, tenderee, tenderee_address) web_source_name = str(web_source_name) # 修复某些不是字符串类型造成报错 text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1) #预防提取错 合肥 路南 新会 等地区 if pro_addr: msc += '## 使用项目地址输入:%s ##;' % pro_addr rs = get_area(pro_addr, '') msc += '预测结果:省份:%s, 城市:%s,区县:%s;' % ( rs['district']['province'], rs['district']['city'], rs['district']['district']) if rs['district']['province'] != '全国': # print('地区匹配:', msc) return rs # print('text1:', text1) msc += '## 第一次预测输入:%s ##;'%text1 rs = get_area(text1, web_source_name) msc += '预测结果:省份:%s, 城市:%s,区县:%s;' % ( rs['district']['province'], rs['district']['city'], rs['district']['district']) # self.f.write('%s %s \n' % (list_articles[0].id, msc)) # print('地区匹配:', msc) if rs['district']['province'] == '全国' or rs['district']['city'] == '未知': msc = "" all_addr, tenderees = get_all_addr(list_entitys) text2 = tenderees + " " + all_addr + ' ' + title msc += '使用实体列表所有招标人+所有地址;' # text2 += title + content if len(content)<2000 else title + content[:1000] + content[-1000:] text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2) # print('text2:', text2) msc += '## 第二次预测输入:%s ##'%text2 rs2 = get_area(text2, web_source_name, not_in_content=False) rs2['district']['is_in_text'] = True if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国': rs = rs2 elif rs['district']['province'] == rs2['district']['province'] and rs2['district']['city'] != '未知': rs = rs2 msc += '预测结果:省份:%s, 
城市:%s,区县:%s'%( rs['district']['province'],rs['district']['city'],rs['district']['district']) # self.f.write('%s %s \n'%(list_articles[0].id, msc)) # print('地区匹配:', msc) return rs def predict(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""): ''' 先匹配 project_name+tenderee+tenderee_address, 如果缺少省或市 再匹配 title+content :param project_name: :param prem: :param title: :param list_articles: :param web_source_name: :return: ''' def get_ree_addr(prem): tenderee = "" tenderee_address = "" try: for v in prem[0]['prem'].values(): for link in v['roleList']: if link['role_name'] == 'tenderee' and tenderee == "": tenderee = link['role_text'] tenderee_address = link['address'] except Exception as e: print('解析prem 获取招标人、及地址出错') return tenderee, tenderee_address def get_role_address(text): '''正则匹配获取招标人地址 3:地址直接在招标人后面 招标人:xxx,地址:xxx 4:招标、代理一起,两个地址一起 招标人:xxx, 代理人:xxx, 地址:xxx, 地址:xxx. ''' p3 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(联系)?地址:(?P(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])' p4 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(招标|采购)?代理(人|机构)(名称)?:[\w()]{4,15},(联系)?地址:(?P(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])' p5 = '(采购|招标)(人|单位)(联系)?地址:(?P(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])' if re.search(p3, text): return re.search(p3, text).group('addr') elif re.search(p4, text): return re.search(p4, text).group('addr') elif re.search(p5, text): return re.search(p5, text).group('addr') else: return '' def get_project_addr(text): p1 = '(项目(施工|实施)?|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?):(?P(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])' if re.search(p1, text): return re.search(p1, text).group('addr') else: return '' def get_bid_addr(text): p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售|所属)(地址|地点|所在地区?|地域):(?P(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])' if re.search(p2, text): return re.search(p2, text).group('addr') else: return '' def get_all_addr(list_entitys): tenderee_l = [] addr_l = [] for 
ent in list_entitys[0]: if ent.entity_type == 'location' and len(ent.entity_text) > 2: addr_l.append(ent.entity_text) elif ent.entity_type in ['org', 'company']: if ent.label in [0, 1]: # 加招标或代理 tenderee_l.append(ent.entity_text) return ' '.join(addr_l), ' '.join(tenderee_l) def get_title_addr(text): p1 = '(?P(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])' if re.search(p1, text): return re.search(p1, text).group('addr') else: return '' def find_areas(pettern, text): ''' 通过正则匹配字符串返回地址 :param pettern: 地址正则 广东省|广西省|... :param text: 待匹配文本 :return: ''' addr = [] for it in re.finditer(pettern, text): if re.search('[省市区县旗盟]$', it.group(0)) == None and re.search( '^([东南西北中一二三四五六七八九十大小]?(村|镇|街|路|道|社区)|酒店|宾馆)', text[it.end():]): continue if it.group(0) == '站前': # 20240314 修复类似 中铁二局新建沪苏湖铁路工程站前VI标项目 错识别为 省份:辽宁, 城市:营口,区县:站前 continue addr.append((it.group(0), it.start(), it.end())) if re.search('^([分支](公司|局|行|校|院|干?线)|\w{,3}段|地铁|(火车|高铁)?站|\w{,3}项目)', text[it.end():]): addr.append((it.group(0), it.start(), it.end())) return addr def get_pro_city_dis_score(text, text_weight=1): text = re.sub('复合肥|海南岛|兴业银行|双河口', '', text) text = re.sub('珠海城市', '珠海', text) # 修复 426624023 珠海城市 预测为海城市 text = re.sub('怒江州', '怒江傈僳族自治州', text) # 修复 423589589 所属地域:怒江州 识别为广西 - 崇左 - 江州 province_l = find_areas(p_pro, text) city_l = find_areas(p_city, text) district_l = find_areas(p_dis, text) province_l = chage_area2score(province_l, max_len=len(text)) city_l = chage_area2score(city_l, max_len=len(text)) district_l = chage_area2score(district_l, max_len=len(text)) pro_ids = dict() city_ids = dict() dis_ids = dict() for pro in province_l: name, score = pro assert (name in full_dic['province'] or name in short_dic['province']) if name in full_dic['province']: idx = full_dic['province'][name] if idx not in pro_ids: pro_ids[idx] = 0 pro_ids[idx] += (score + 2) else: idx = short_dic['province'][name] if idx not in pro_ids: pro_ids[idx] = 0 pro_ids[idx] += (score + 1) for city in city_l: name, score = city 
if name in full_dic['city']: w = 0.1 if len(full_dic['city'][name]) > 1 else 1 for idx in full_dic['city'][name]: if idx not in city_ids: city_ids[idx] = 0 # weight = idx_dic[idx]['权重'] city_ids[idx] += (score + 2) * w pro_idx = idx_dic[idx]['省'] if pro_idx in pro_ids: pro_ids[pro_idx] += (score + 2) * w else: pro_ids[pro_idx] = (score + 2) * w * 0.5 elif name in short_dic['city']: w = 0.1 if len(short_dic['city'][name]) > 1 else 1 for idx in short_dic['city'][name]: if idx not in city_ids: city_ids[idx] = 0 weight = idx_dic[idx]['权重'] city_ids[idx] += (score + 1) * w * weight pro_idx = idx_dic[idx]['省'] if pro_idx in pro_ids: pro_ids[pro_idx] += (score + 1) * w * weight else: pro_ids[pro_idx] = (score + 1) * w * weight * 0.5 for dis in district_l: name, score = dis if name in full_dic['district']: w = 0.1 if len(full_dic['district'][name]) > 1 else 1 for idx in full_dic['district'][name]: if idx not in dis_ids: dis_ids[idx] = 0 # weight = idx_dic[idx]['权重'] dis_ids[idx] += (score + 1) * w pro_idx = idx_dic[idx]['省'] if pro_idx in pro_ids: pro_ids[pro_idx] += (score + 1) * w else: pro_ids[pro_idx] = (score + 1) * w * 0.5 city_idx = idx_dic[idx]['市'] if city_idx in city_ids: city_ids[city_idx] += (score + 1) * w else: city_ids[city_idx] = (score + 1) * w * 0.5 elif name in short_dic['district']: w = 0.1 if len(short_dic['district'][name]) > 1 else 1 for idx in short_dic['district'][name]: if idx not in dis_ids: dis_ids[idx] = 0 weight = idx_dic[idx]['权重'] dis_ids[idx] += (score + 0) * w pro_idx = idx_dic[idx]['省'] if pro_idx in pro_ids: pro_ids[pro_idx] += (score + 0) * w * weight else: pro_ids[pro_idx] = (score + 0) * w * weight * 0.5 city_idx = idx_dic[idx]['市'] if city_idx in city_ids: city_ids[city_idx] += (score + 0) * w * weight else: city_ids[city_idx] = (score + 0) * w * weight * 0.5 for k, v in pro_ids.items(): pro_ids[k] = v * text_weight for k, v in city_ids.items(): city_ids[k] = v * text_weight for k, v in dis_ids.items(): dis_ids[k] = v * text_weight 
return pro_ids, city_ids, dis_ids def chage_area2score(group_list, max_len): ''' 把匹配的的地址转为分数 :param group_list: [('name', b, e)] :return: ''' area_list = [] if group_list != []: for it in group_list: name, b, e = it area_list.append((name, (e - b + e) / max_len / 2)) return area_list def get_final_addr(pro_ids, city_ids, dis_ids): ''' 先把所有匹配的全称、简称转为id,如果省份不为空,城市不为空且有城市属于省份的取该城市 :param province_l: 匹配到的所有省份 :param city_l: 匹配到的所有城市 :param district_l: 匹配到的所有区县 :return: ''' big_area = "" pred_pro = "" pred_city = "" pred_dis = "" final_pro = "" final_city = "" if len(pro_ids) >= 1: pro_l = sorted([(k, v) for k, v in pro_ids.items()], key=lambda x: x[1], reverse=True) final_pro, score = pro_l[0] if score >= 0.01: pred_pro = idx_dic[final_pro]['返回名称'] big_area = idx_dic[final_pro]['大区'] # else: # print("得分过低,过滤掉", idx_dic[final_pro]['返回名称'], score) if pred_pro != "" and len(city_ids) >= 1: city_l = sorted([(k, v) for k, v in city_ids.items()], key=lambda x: x[1], reverse=True) for it in city_l: if idx_dic[it[0]]['省'] == final_pro: final_city = it[0] pred_city = idx_dic[final_city]['返回名称'] break if final_city != "" and len(set(dis_ids)) >= 1: dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True) for it in dis_l: if idx_dic[it[0]]['市'] == final_city: pred_dis = idx_dic[it[0]]['返回名称'] if pred_city in ['北京', '天津', '上海', '重庆']: pred_city = pred_dis pred_dis = "" return big_area, pred_pro, pred_city, pred_dis def get_area(text, web_name, in_content=False): area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False} pro_ids, city_ids, dis_ids = get_pro_city_dis_score(text) pro_ids1, city_ids1, dis_ids1 = get_pro_city_dis_score(web_name, text_weight=0.2) for k in pro_ids1: if k in pro_ids: pro_ids[k] += pro_ids1[k] else: pro_ids[k] = pro_ids1[k] for k in city_ids1: if k in city_ids: city_ids[k] += city_ids1[k] else: city_ids[k] = city_ids1[k] for k in dis_ids1: if k in dis_ids: dis_ids[k] += dis_ids1[k] else: 
dis_ids[k] = dis_ids1[k] big_area, pred_pro, pred_city, pred_dis = get_final_addr(pro_ids, city_ids, dis_ids) if big_area != "": area_dic['area'] = big_area if pred_pro != "": area_dic['province'] = pred_pro if pred_city != "": area_dic['city'] = pred_city if pred_dis != "": area_dic['district'] = pred_dis if in_content: area_dic['is_in_text'] = True return {'district': area_dic} p_pro, p_city, p_dis, idx_dic, full_dic, short_dic = self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic if '##attachment##' in list_articles[0].content: content, attachment = list_articles[0].content.split('##attachment##') if len(content) < 200: content += attachment else: content = list_articles[0].content tenderee, tenderee_address = get_ree_addr(prem) msc = "" pro_addr = get_project_addr(content) if pro_addr != "": msc += '使用规则提取的项目地址;' tenderee_address = pro_addr else: role_addr = get_role_address(content) if role_addr != "": msc += '使用规则提取的联系人地址;' tenderee_address = role_addr if tenderee_address == "": title_addr = get_title_addr(title) if title_addr != "": msc += '使用规则提取的标题地址;' tenderee_address = title_addr else: bid_addr = get_bid_addr(content) if bid_addr != "": msc += '使用规则提取的开标地址;' tenderee_address = bid_addr project_name = str(project_name) tenderee = str(tenderee) # print('招标人地址',role_addr, tenderee_address) project_name = project_name + title if project_name not in title else title project_name = project_name.replace(tenderee, '') text1 = "{0} {1} {2}".format(tenderee, tenderee_address, project_name) web_source_name = str(web_source_name) # 修复某些不是字符串类型造成报错 text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1) # 预防提取错 合肥 路南 新会 等地区 if pro_addr: msc += '## 使用项目地址输入:%s ##;' % pro_addr rs = get_area(pro_addr, '') msc += '预测结果:省份:%s, 城市:%s,区县:%s;' % ( rs['district']['province'], rs['district']['city'], rs['district']['district']) if rs['district']['province'] != '全国': # print('地区匹配:', msc) return rs # print('text1:', text1) msc += '## 第一次预测输入:%s ##;' % text1 rs 
class TableTag2List():
    '''Convert a soup <table> into a gap-filled text matrix [[td, td, td], [td, td, td]].

    rowspan/colspan cells are expanded so every row has one entry per visual column.
    State is kept in self._output (list of rows) while a table is being processed.
    '''
    def table2list(self, table, text_process=None):
        """Flatten *table* (a bs4 Tag) into a rectangular list of cell texts.

        :param table: bs4 <table> element.
        :param text_process: optional callable(cell, final=...) used to extract
            long cell texts; when None, raw get_text() with control-char cleanup is used.
        :return: list of rows (each a list of cell strings), or [] when the table
            is rejected (too many columns).
        """
        self._output = []
        row_ind = 0
        col_ind = 0
        for row in table.find_all('tr'):
            # record the smallest row_span, so that we know how many rows
            # we should skip
            smallest_row_span = 1
            # Reject tables whose raw (pre-completion) column count exceeds 20.
            if len(row.find_all(['td', 'th'], recursive=False)) > 20:
                log('未补全前表格列数大于20的不做表格处理')
                return []
            for cell in row.children:
                if cell.name in ('td', 'th'):
                    # check multiple rows
                    # pdb.set_trace()
                    # Non-digit characters are stripped defensively before int().
                    row_span = int(re.sub('[^0-9]', '', cell.get('rowspan'))) if cell.get('rowspan') and cell.get('rowspan').isdigit() else 1
                    # try updating smallest_row_span
                    smallest_row_span = min(smallest_row_span, row_span)
                    # check multiple columns
                    col_span = int(re.sub('[^0-9]', '', cell.get('colspan'))) if cell.get('colspan') and cell.get('colspan').isdigit() else 1
                    if col_span > 20:  # fix 335590254 山东港口阳光智采e平台: first row had colspan=200 (>50 columns), which broke extraction; clamp to 20
                        col_span = 20
                    # find the right index: advance past cells already filled by an earlier rowspan
                    while True:
                        if self._check_cell_validity(row_ind, col_ind):
                            break
                        col_ind += 1
                    # insert into self._output
                    try:
                        if text_process != None:
                            # text = [re.sub('\xa0', '', text_process(cell, final=False)), 0]
                            # td_text = re.sub('\xa0', '', text_process(cell, final=False))
                            td_text = re.sub('\s|\xa0', '', str(cell.get_text()))  # fix 370835008: company name inside a td was split in half by a <p> tag
                            if 'title' in cell.attrs and cell.get_text().strip().endswith('...') and cell.get_text().strip()[:-3] in cell.attrs['title']:
                                td_text = cell.attrs['title']  # fix (e.g. 215597851): ellipsis-truncated text, full value hidden in the title attribute
                            elif len(td_text)>30:
                                td_text = re.sub('\xa0', '', text_process(cell, final=False))
                            if td_text == "":
                                td_text = ' '
                            text = [td_text,0]
                        else:
                            text = str(cell.get_text()).strip().replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '').replace("(", "(").replace(')', ')').replace('?', '')
                            # text = re.sub('\s', '', text)[:200]  # only the first 200 chars are needed
                            text = ' ' if text == "" else text
                        self._insert(row_ind, col_ind, row_span, col_span, text)
                    except UnicodeEncodeError:
                        raise Exception(
                            'Failed to decode text; you might want to specify kwargs transformer=unicode'
                        )
                    # update col_ind
                    col_ind += col_span
                    if col_ind > 50 and text_process == None:  # for table-element / candidate extraction: drop tables wider than 50 columns
                        return []
            # update row_ind
            row_ind += smallest_row_span
            col_ind = 0
        return self._output

    def _check_validity(self, i, j, height, width):
        """
        check if a rectangle (i, j, height, width) can be put into self.output
        """
        return all(self._check_cell_validity(ii, jj)
                   for ii in range(i, i+height)
                   for jj in range(j, j+width))

    def _check_cell_validity(self, i, j):
        """
        check if a cell (i, j) can be put into self._output
        """
        if i >= len(self._output):
            return True
        if j >= len(self._output[i]):
            return True
        if self._output[i][j] == "":
            return True
        return False

    def _insert(self, i, j, height, width, val):
        # pdb.set_trace()
        # Write *val* into every cell of the (height x width) rectangle rooted at (i, j),
        # which is how rowspan/colspan cells get duplicated into the grid.
        for ii in range(i, i+height):
            for jj in range(j, j+width):
                self._insert_cell(ii, jj, val)

    def _insert_cell(self, i, j, val):
        # Grow the output grid as needed; only fill a cell if it is still empty.
        while i >= len(self._output):
            self._output.append([])
        while j >= len(self._output[i]):
            self._output[i].append("")
        if self._output[i][j] == "":
            self._output[i][j] = val
self._insert_cell(ii, jj, val) def _insert_cell(self, i, j, val): while i >= len(self._output): self._output.append([]) while j >= len(self._output[i]): self._output[i].append("") if self._output[i][j] == "": self._output[i][j] = val class TablePremExtractor(object): def __init__(self): '''各要素表头规则''' self.head_rule_dic = { 'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)", 'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$", "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|通用|主要标的|^包)(名称?|内容)", "win_sort": "是否(中标|成交|中选)|排名|排序|名次|未(中标|成交)原因|推荐顺序", "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$", "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)", "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(单价|总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价", "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)|承包价|含税价", } with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f: self.headerset = pickle.load(f) self.tb = TableTag2List() def find_header(self, td_list): fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元|拟|\s', '', it) for it in td_list] # 去除表头无关信息,方便匹配判断是否为表头 header_dic = dict() flag = False contain_header = False if len(set(fix_td_list))>=2 and len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6: flag = True for i in range(len(td_list)) : text = td_list[i] if text == '备选中标人': text = '第二候选人' if len(text) > 15: # 长度大于15 不进行表头匹配 continue if re.search('未(中标|成交)原因', text): # 不提取此种表格 return flag, contain_header, dict() num = 0 for k, v in self.head_rule_dic.items(): if re.search('评分|得分|分数|分值', text): continue if re.search(v, text): if k in ['tenderer'] and re.search('是否', text): continue if k in header_dic: continue header_dic[k] = (i, text) num += 1 if num>1: print('表头错误,一个td匹配到两个表头:', header_dic) return flag, 
contain_header, dict() if re.search(';金额((万?元))?;', ';'.join(td_list)): # 召回某些表格只写 金额 作为表头,不能识别为招标或中标金额 if 'tenderer' in header_dic and 'bid_amount' not in header_dic: for i in range(len(td_list)): text = td_list[i] if re.search('^金额((万?元))?$',text): header_dic['bid_amount'] = (i, text) break elif 'tenderee' in header_dic and 'budget' not in header_dic: for i in range(len(td_list)): text = td_list[i] if re.search('^金额((万?元))?$', text): header_dic['budget'] = (i, text) break if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic) and ( 'tenderer' in header_dic or'budget' in header_dic): # 包含标段及招标金额或中标人的进行提取 return flag, contain_header, header_dic elif ('tenderer' in header_dic) and ('bid_amount' in header_dic): # 包含中标人及中标金额的进行提取 if re.search('^(候选)?供应商(名称)?', header_dic['tenderer'][1]) and 'win_sort' not in header_dic: # 只有供应商名称 没排名和包号的去掉,预防错误包提取 334205629 # print('只有供应商名称 没排名和包号的去掉') return flag, contain_header, dict() return flag,contain_header, header_dic elif len(set(fix_td_list) & self.headerset) >= 2 or (len(set(fix_td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头 contain_header = True return flag, contain_header, dict() def is_role(self, text): if len(text) > 25 or len(text)<4: return False elif len(re.findall('有限责?任?公司', text)) > 1: return False elif re.search('[\w()]{4,}(有限责?任?公司|学校|学院|大学|中学|小学|医院|管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园|厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处)$', text): return True else: ners = selffool.ner(text) if len(ners[0]) == 1 and ('company' in ners[0][0] or 'org' in ners[0][0]): return True return False def get_role(self, text, nlp_enterprise): ''' 获取字符串text角色实体 :param text: 待获取实体字符串 :param nlp_enterprise: 公告中的角色实体列表 :return: ''' text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]' , ',', text) text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n if text in nlp_enterprise: return text if len(text) > 
50 or len(text)<4: return '' ners = getNers([text], useselffool=True) roles = [] if ners: for ner in ners[0]: if ner[2] in ['org', 'company', 'location']: roles.append(ner[3]) if roles and len(''.join(roles)) > len(text)*0.8: return roles[0] else: return '' def extract_from_df(self, df, headers, web_source_name): prem_dic = {} previous_package = "" # 上一行包号 multi_same_package = False # 非连续的重复包号 package_fix2raw = dict() # 处理后包号:处理前包号 字典 link_set = set() not_package = True if 'project_name' in headers and re.search('(货物|商品|产品|通用|主要标的)(名称?|内容)', headers['project_name'][1]) and \ 'package_code' not in headers and 'budget' not in headers and "bid_amount" not in headers else False if set(['project_code', 'package_code', 'tenderee', 'tenderer']) & set(headers) == set() and ('project_name' not in headers # 补充没有项目名称或有项目名称且是货物的才过滤掉 or re.search('(货物|商品|产品|通用|主要标的)(名称?|内容)', headers['project_name'][1])): # 20240131修复只有货物名称及最高限价的错误作为多包 396636683; 补充避免423647863采购意向被过滤 # print('没有包号及角色的不要') return {} for i in df.index: same_package = False # 连续重复包号,一般是 rowspan 造成;一包 多个采购 project_code = df.loc[i, headers['project_code'][0]].strip() if "project_code" in headers else "" package_code_raw = df.loc[i, headers['package_code'][0]].strip() if "package_code" in headers else "" project_name = df.loc[i, headers['project_name'][0]].strip() if "project_name" in headers else "" tenderee = df.loc[i, headers['tenderee'][0]].strip() if "tenderee" in headers else "" tenderer = df.loc[i, headers['tenderer'][0]].strip() if "tenderer" in headers else "" budget_ = df.loc[i, headers['budget'][0]].strip() if "budget" in headers else "" bid_amount_ = df.loc[i, headers['bid_amount'][0]].strip() if "bid_amount" in headers else "" win_sort = df.loc[i, headers['win_sort'][0]].strip() if "win_sort" in headers else "" if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_]) & self.headerset != set(): # 只要有一项为表头 停止匹配 # print('只要有一项为表头 停止匹配', set([project_code, 
package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset) break if len(set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort])- set(['', ' '])) < 2: # 内容为空或全部一样 停止匹配 # print('内容为空或全部一样 停止匹配') break if re.search('详见', project_name): # 去除某些表达: 详见招标文件 project_name = "" if package_code_raw == "" and re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))$|^(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}$', project_name): package_code_raw = project_name project_name = "" package_code = package_code_raw if re.search('合计|总计', package_code+project_code): continue if package_code != '' and package_code == previous_package: # 处理 208162730 一个包采购多种东西情况 same_package = True project_name = '' previous_package = package_code if win_sort != "" and re.search('排名|排序|名次|推荐顺序', headers['win_sort'][1]): # 此类型表由 CandidateExtractor类提取 防止类似 328485591 作为多包 break if win_sort != "" and re.search('是否(中标|成交|中选)', headers['win_sort'][1]) and (re.search('否|未(中标|成交|中选)', win_sort) or win_sort==''): # 2024/04/2 修复 252208201 为空的不中标 continue if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉' continue if win_sort == "" and "tenderer" in headers and re.search('候选|入围', headers['tenderer'][1]) and re.search('推荐中标候选人', headers['tenderer'][1])==None: tenderer = "" if tenderer in ['采购失败', '废标']: # 避免类似 353867205 这篇只提取到一个 continue # tenderee = tenderee if self.is_role(tenderee) else "" # tenderer = tenderer if self.is_role(tenderer) else "" tenderee = self.get_role(tenderee, self.nlp_enterprise) if tenderee!="" else tenderee tenderer = self.get_role(tenderer, self.nlp_enterprise) if tenderer!='' else tenderer if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2: break if not_package: if (project_code, package_code, tenderee, tenderer, budget_, bid_amount_) in link_set: continue link_set.add((project_code, package_code, tenderee, tenderer, 
budget_, bid_amount_)) else: if (project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_) in link_set: continue link_set.add((project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_)) package = uniform_package_name(package_code) if package_code else str(len(prem_dic)+1) # 没有包号的自动编号的修改为提取到多少个包,某些行未必中标 if project_code != "": uni_project_code= uniform_package_name(project_code) if uni_project_code != "" and package != "": # print('重组包号:', '%s_%s'%(uni_project_code, package)) package = '%s_%s'%(uni_project_code, package) if package_code_raw!='': if multi_same_package == False and package not in package_fix2raw: # 如果处理后的标段号 已经在列表里面,采用原始标段号文本 package_fix2raw[package] = package_code_raw elif same_package == False: multi_same_package = True if multi_same_package: package = package_code_raw if package not in prem_dic or not same_package: prem_dic[package] = { 'code': '', 'name': '', 'roleList': [], 'tendereeMoney': 0, 'tendereeMoneyUnit': "" } prem_dic[package]['code'] = project_code prem_dic[package]['name'] = project_name if budget_ != "": if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '', budget_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配 break budget_header = headers['budget'][1] if 'budget' in headers else '' budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率', budget_)==None else (0, '') if (re.search('费率|下浮率|[%%‰折]', budget_header + budget_) and budget < 100) or budget > 50000000000: # 如果是费率或大于500亿的金额改为0 budget = 0 if budget > 0: if same_package and prem_dic[package]['tendereeMoney'] != budget: # 处理 类似 136839070 一包多物品多预算 prem_dic[package]['tendereeMoney'] += budget else: prem_dic[package]['tendereeMoney'] = budget prem_dic[package]['tendereeMoneyUnit'] = money_unit if tenderee and not same_package: prem_dic[package]['roleList'].append({ "address": "", "linklist": [], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, 
"money_unit": "" }, "role_name": "tenderee", "role_text": tenderee, "serviceTime": "" }) if tenderer and not same_package: if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '', bid_amount_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配 break bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and re.search('[%%‰折]|浮率', bid_amount_)==None and 'bid_amount' in headers else (0, '') if web_source_name == '河钢供应链管理平台' and 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and bid_amount == 0: # 有中标金额字段却金额为0的过滤掉,防止类似 河钢供应链管理平台 站源错误,金额不为0的才算中标 if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的包 丢弃 prem_dic.pop(package) continue bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else '' if (re.search('费率|下浮率|[%%‰折]', bid_amount_header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000: # 如果是费率或大于500亿的金额改为0 bid_amount = 0 prem_dic[package]['roleList'].append({ "address": "", "linklist": [], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": bid_amount, "money_unit": money_unit }, "role_name": "win_tenderer", "role_text": tenderer, "serviceTime": "" }) if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的 丢弃 并不再继续往下匹配 prem_dic.pop(package) # break # 注释掉避免 400084571 某些包废标 中断匹配 if multi_same_package: # 预处理后包号重复的,使用原始包号 for k, v in package_fix2raw.items(): if k in prem_dic: prem_dic[v] = prem_dic.pop(k) return prem_dic def get_prem(self, soup, web_source_name=''): tables = soup.find_all('table') tables.reverse() rs_dic = {} for table in tables: text = table.text.strip() previous = table.findPreviousSibling() text2 = previous .text.strip() if previous else "" # text2 = table.findPreviousSibling().text.strip() if table.findPreviousSibling() != None else "" if re.search('项目业主|业\s*主', text) and re.search('业\s*绩', text+text2): # 
class CandidateExtractor(object):
    """Extract bid candidates (ranked tenderers) from HTML tables and, as a fallback, from text."""

    def __init__(self):
        '''Header-matching regex rules for each table field.'''
        self.head_rule_dic = {
            'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
            "win_sort": "排名|排序|名次|推荐顺序",
            'win_or_not': '是否中标|是否入围|是否入库|入围结论',
            "candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称)?$",
            "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价|含税价",
            "win_tenderer": "第一名|第一(中标|成交)?候选人",
            "second_tenderer": "第二名|第二(中标|成交)?候选人",
            "third_tenderer": "第三名|第三(中标|成交)?候选人",
        }
        '''Regex used for candidates appearing in running text (not tables).'''
        self.p = '((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单)(名称|名单|全称|\d)?:$'
        self.tb = TableTag2List()
        # Known header phrases (pickled set) used to decide whether a row is a header row.
        with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
            self.headerset = pickle.load(f)

    def find_header(self, td_list):
        """Classify one table row as a header row and map candidate fields to columns.

        :param td_list: list of cell texts of the row.
        :return: (flag, contain_header, header_dic) — same contract as
            TablePremExtractor.find_header.
        """
        # Strip header-irrelevant decorations to improve header-set matching.
        fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元|拟|\s', '', it) for it in td_list]
        header_dic = dict()
        flag = False
        contain_header = False
        # A row is taken as a header row when at least 60% of its distinct cells are known headers.
        if len(set(fix_td_list))>=2 and len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6:
            flag = True
            for i in range(len(td_list)):
                text = td_list[i]
                if len(text) > 15:  # cells longer than 15 chars are not matched as headers
                    continue
                if re.search('未(中标|成交)原因', text):  # do not extract from this kind of table
                    return flag, contain_header, dict()
                num = 0
                for k, v in self.head_rule_dic.items():
                    if k == 'candidate' and re.search('第[一二三]名|第[一二三](中标|成交)?候选人', text):
                        continue
                    if re.search('评分|得分|分数|分值', text):
                        continue
                    if re.search(v, text):
                        if k in ['candidate', 'win_tenderer', 'second_tenderer', 'third_tenderer'] and re.search('是否', text):
                            continue
                        header_dic[k] = (i, text)
                        # if k != 'candidate':  # candidate may overlap the top-three columns
                        num += 1
                if 'win_tenderer'in header_dic and 'second_tenderer' in header_dic and 'candidate' in header_dic:
                    header_dic.pop('candidate')
                if num>1:
                    # print('表头错误,一个td匹配到两个表头:', header_dic)
                    return flag, contain_header, dict()
        if 'candidate' in header_dic or ('win_tenderer' in header_dic and 'second_tenderer' in header_dic):
            return flag, contain_header, header_dic
        elif len(set(fix_td_list) & self.headerset) >= 2 or (len(set(fix_td_list)) == 2 and len(set(fix_td_list) & self.headerset) >= 1):
            # Row contains two or more header phrases, or has only two cells one of which is a header.
            contain_header = True
        return flag, contain_header, dict()

    def is_role(self, text):
        """Heuristic: does *text* look like a single organization/company name?"""
        if len(text) > 25 or len(text) < 4:
            return False
        elif len(re.findall('有限责?任?公司', text)) > 1:
            return False
        elif re.search('[\w()]{4,}(有限责?任?公司|学校|学院|大学|中学|小学|医院|管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园|厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处)$', text):
            return True
        else:
            ners = selffool.ner(text)
            if len(ners[0]) == 1 and ('company' in ners[0][0] or 'org' in ners[0][0]):
                return True
        return False

    def get_role(self, text, nlp_enterprise):
        '''
        Extract the role entity contained in *text*.

        :param text: candidate string to extract an entity from
        :param nlp_enterprise: list of role entities already found in the announcement
        :return: the entity name, or '' when none is confidently found
        '''
        # Drop consortium prefixes before matching.
        text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
                      , ',', text)
        text = re.sub('\s', '', text)  # fix 370835008: entity inside table cell contained '\n'
        if text in nlp_enterprise:
            return text
        if len(text) > 50 or len(text)<4:
            return ''
        ners = getNers([text], useselffool=True)
        roles = []
        if ners:
            for ner in ners[0]:
                if ner[2] in ['org', 'company', 'location']:
                    roles.append(ner[3])
        # Only accept when NER entities cover more than 80% of the text.
        if roles and len(''.join(roles)) > len(text)*0.8:
            return roles[0]
        else:
            return ''

    def extract_from_df(self, df, headers):
        """Walk the data rows of one candidates table.

        :param df: pandas DataFrame of the non-header rows.
        :param headers: field -> (column index, header text) from find_header.
        :return: (prem_dic, candidate_set) — ranked roles per package plus the set of all candidates.
        """
        prem_dic = {}
        link_set = set()
        candidate_set = set()
        role_dic = dict()  # holds the case where 1st/2nd/3rd candidates sit side by side in one row
        findtop3 = False
        findmoney = False
        line_num = 0
        line_package = None
        for i in df.index:
            package_code_raw = df.loc[i, headers['package_code'][0]].strip() if "package_code" in headers else ""
            candidate_ = df.loc[i, headers['candidate'][0]].strip() if "candidate" in headers else ""
            win_or_not = df.loc[i, headers['win_or_not'][0]].strip() if "win_or_not" in headers else ""
            # budget_ = df.loc[i, headers['budget'][0]] if "budget" in headers else ""
            bid_amount_ = df.loc[i, headers['bid_amount'][0]].strip() if "bid_amount" in headers else ""
            win_sort = df.loc[i, headers['win_sort'][0]].strip() if "win_sort" in headers else ""
            win_tenderer = df.loc[i, headers['win_tenderer'][0]].strip() if "win_tenderer" in headers else ""
            second_tenderer = df.loc[i, headers['second_tenderer'][0]].strip() if "second_tenderer" in headers else ""
            third_tenderer = df.loc[i, headers['third_tenderer'][0]].strip() if "third_tenderer" in headers else ""
            if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set():
                # A cell equals a header phrase: stop matching. win_sort is deliberately
                # excluded from this check to avoid missing extractions (367940050).
                # print('包含表头, 停止匹配')
                break
            if len(set([package_code_raw, candidate_,win_sort, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) - set(['', ' '])) < 2:
                # All empty or identical: either a full-row package label, or stop.
                # print('全部为空或内容一样 停止匹配')
                if len(set(df.loc[i,:]))==1 and re.search('^第?([一二三四五六七八九十]{1,3}|[a-zA-Z0-9-]{,9})?[分子]?(标[段包项]?|包[组件标]?|合同[包段])([一二三四五六七八九十]{1,3}|[a-zA-Z0-9-]{,9})?$', win_sort):
                    line_package = win_sort
                    continue
                else:
                    break
            if candidate_ != "" and win_sort == "" and headers['candidate'][0] > 0:
                # Some tables omit a rank header and put the rank text in the previous column.
                col_indx = headers['candidate'][0] -1
                pre_col = df.loc[i, col_indx]
                if col_indx > 0 and pre_col == candidate_:
                    pre_col = df.loc[i, col_indx - 1]
                if re.search('第[一二三]名|第[一二三](中标)?候选人', pre_col):
                    win_sort = pre_col
            package_code = package_code_raw
            if package_code == '' and line_package:
                package_code = line_package
            # candidate = candidate_ if self.is_role(candidate_) else ""
            # tenderer = tenderer if self.is_role(tenderer) else ""
            candidate = self.get_role(candidate_, self.nlp_enterprise)
            # if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
            #     break
            if(candidate_,win_tenderer, second_tenderer,third_tenderer, bid_amount_) in link_set:
                continue
            link_set.add((candidate_, win_tenderer, second_tenderer, third_tenderer, bid_amount_))
            package = package_code
            package = uniform_package_name(package) if package !="" else "Project"
            if candidate:
                if win_or_not and re.search('否|未入围', win_or_not):
                    pass
                else:
                    candidate_set.add(candidate)
            if win_tenderer and second_tenderer:  # third_tenderer intentionally optional: 128778062 has only 1st and 2nd
                # Transposed layout: row label in column 0/1 says whether this row holds names or prices.
                if re.search("(候选人|投标人|单位|公司)名?称?$", df.loc[i, 0]) or re.search("(候选人|投标人|单位|公司)名?称?", df.loc[i, 1]):
                    findtop3 = True
                    for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
                                          [win_tenderer, second_tenderer, third_tenderer]):
                        text = self.get_role(text, self.nlp_enterprise)
                        if text:  # if self.is_role(text):
                            if type not in role_dic:
                                role_dic[type] = dict()
                            role_dic[type]['role_text'] = text
                            if type in ['second_tenderer', 'third_tenderer']:
                                candidate_set.add(text)
                elif re.search('投标报价|报价$', df.loc[i, 0]) or re.search('投标报价|报价$', df.loc[i, 1]):
                    findmoney = True
                    header = df.loc[i, 0] if re.search('投标报价|报价$', df.loc[i, 0]) else df.loc[i, 1]
                    for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
                                          [win_tenderer, second_tenderer, third_tenderer]):
                        if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '', text)) > 5:
                            # More than 5 non-money characters: abort matching.
                            break
                        money, money_unit = money_process(text, header)
                        if (re.search('费率|下浮率|[%%‰折]', header+text) and money < 100) or money > 50000000000:
                            # Rates and amounts above 50 billion are zeroed out.
                            money = 0
                        if money > 0:
                            if type not in role_dic:
                                role_dic[type] = dict()
                            role_dic[type]['money'] = money
                            role_dic[type]['money_unit'] = money_unit
                else:
                    line_num += 1
                if findtop3 and findmoney:
                    break
                if line_num > 3:
                    break
            elif candidate and win_sort:
                # Normal layout: one candidate per row with an explicit rank cell.
                role_type = ""
                if re.search('第[一1]|^[一1]$', win_sort):
                    role_type = "win_tenderer"
                elif re.search('第[二2]|^[二2]$', win_sort):
                    role_type = "second_tenderer"
                elif re.search('第[三3]|^[三3]$', win_sort):
                    role_type = "third_tenderer"
                if role_type != "":
                    if package not in prem_dic:
                        prem_dic[package] = {
                            'code': '',
                            'name': '',
                            'roleList': [],
                            'tendereeMoney': 0,
                            'tendereeMoneyUnit': ""
                        }
                    if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '', bid_amount_))> 5:
                        # More than 5 non-money characters: abort matching.
                        break
                    bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if "bid_amount" in headers else (0, "")
                    header = headers['bid_amount'][1] if "bid_amount" in headers else ''
                    if (re.search('费率|下浮率|[%%‰折]', header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000:
                        # Rates and amounts above 50 billion are zeroed out.
                        bid_amount = 0
                    prem_dic[package]['roleList'].append({
                        "address": "",
                        "linklist": [],
                        "role_money": {
                            "discount_ratio": "",
                            "downward_floating_ratio": "",
                            "floating_ratio": "",
                            "money": bid_amount,
                            "money_unit": money_unit
                        },
                        "role_name": role_type,
                        "role_text": candidate,
                        "serviceTime": ""
                    })
        if len(prem_dic[package]['roleList']) == 0:
            # Packages with only code/name are discarded.
            prem_dic.pop(package)
        if role_dic and prem_dic == dict():
            # Build the package from the side-by-side (transposed) candidates layout.
            if package not in prem_dic:
                prem_dic[package] = {
                    'code': '',
                    'name': '',
                    'roleList': [],
                    'tendereeMoney': 0,
                    'tendereeMoneyUnit': ""
                }
            for role_type, v in role_dic.items():
                role_text = v.get('role_text', '')
                if role_text == "":
                    continue
                money = v.get('money', 0)
                money_unit = v.get('money_unit', '')
                prem_dic[package]['roleList'].append({
                    "address": "",
                    "linklist": [],
                    "role_money": {
                        "discount_ratio": "",
                        "downward_floating_ratio": "",
                        "floating_ratio": "",
                        "money": money,
                        "money_unit": money_unit
                    },
                    "role_name": role_type,
                    "role_text": role_text,
                    "serviceTime": ""
                })
            if len(prem_dic[package]['roleList']) == 0:
                # Packages with only code/name are discarded.
                prem_dic.pop(package)
        return prem_dic, candidate_set

    def get_prem(self, soup):
        """Find header rows in every table of *soup* and extract candidates from the data rows below them."""
        tables = soup.find_all('table')
        tables.reverse()
        rs_dic = {}
        candidate_set = set()
        for table in tables:
            trs = self.tb.table2list(table)
            table.extract()
            i = 0
            headers = ""
            while i < len(trs) - 1:
                flag_, contain_header_, headers_ = self.find_header(trs[i])
                if flag_ and headers_ != dict():
                    table_items = []
                    headers = headers_
                    for j in range(i + 1, len(trs)):
                        if len(trs[j]) == len(trs[i]):
                            flag_, contain_header_, headers_ = self.find_header(trs[j])
                            if flag_ or contain_header_:
                                break
                            else:
                                table_items.append(trs[j])
                        else:
                            # print('表头,内容 列数不一致', len(trs[i]), len(trs[j]))
                            break
                    if len(table_items) >= 1:
                        df = pd.DataFrame(table_items)
                        prem_, candidate_set_ = self.extract_from_df(df, headers)
                        # print('prem_: ', prem_)
                        rs_dic.update(prem_)
                        candidate_set.update(candidate_set_)
                    i = j - 1
                i += 1
        return rs_dic, candidate_set

    def get_candidates_from_text(self, list_sentences, list_entitys):
        """Fallback: collect candidates from sentence text using entity labels and the self.p pattern.

        :param list_sentences: [[Sentence, ...]] — sentences of the document.
        :param list_entitys: [[Entity, ...]] — extracted entities.
        :return: set of candidate entity texts.
        """
        candidates = set()
        sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
        for ent in list_entitys[0]:
            if ent.entity_type in ['org', 'company']:
                sen_index = ent.sentence_index
                text = sentences[sen_index].sentence_text
                b = ent.wordOffset_begin
                e = ent.wordOffset_end
                if ent.label in [2,3,4]:
                    # Labels 2/3/4 are taken directly as candidates; otherwise check the
                    # 10 characters before the entity against the candidate pattern.
                    candidates.add(ent.entity_text)
                elif isinstance(b, int) and isinstance(e, int):
                    foreword = text[max(0, b - 10):b]
                    if re.search(self.p, foreword):
                        candidates.add(ent.entity_text)
        return candidates

    def predict(self, html, list_sentences, list_entitys, nlp_enterprise):
        """Entry point: extract (prem, candidates) from tables, falling back to attachment tables, then to text."""
        self.nlp_enterprise = nlp_enterprise
        html = html.replace('比选申请单位', '中标候选人')  # 82347769
        # NOTE(review): the "|||" pattern looks garbled — it matches empty strings only;
        # it was presumably control characters originally — verify against VCS history.
        html = re.sub("|||","",html)
        html = re.sub("##attachment##","",html)
        soup = BeautifulSoup(html, 'lxml')
        richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
        if richText:
            richText = richText.extract()  # filter out the attachment part
        prem, candidate_set = self.get_prem(soup)
        if prem == {} and richText:
            prem, candidate_set = self.get_prem(richText)
        if prem == {} and candidate_set == set():
            candidate_set = self.get_candidates_from_text(list_sentences, list_entitys)
        return prem, {'candidate': ','.join(candidate_set)}
def role_special_predictor(web_source_name, content, nlp_enterprise):
    """Site-specific tenderee recall rules.

    For a handful of known data sources the tenderee appears in a fixed textual
    position; match it with a per-site regex and accept it only when the captured
    name is one of the entities already recognized in the announcement.

    :param web_source_name: name of the data source site.
    :param content: preprocessed announcement text.
    :param nlp_enterprise: role entities already found in the announcement.
    :return: the recalled tenderee name, or None when no site rule applies/matches.
    """
    site_patterns = {
        '中国电子科技集团有限公司电子采购平台': ',(\w{5,30}),发布时间:\d+',
        '高校仪器设备竞价网': '--(\w{5,30}),申购单主题',
        '台泥阳光采购平台': ',(\w{5,30})招标公告,',
    }
    pattern = site_patterns.get(web_source_name)
    if pattern is None:
        return None
    match = re.search(pattern, content)
    if match and match.group(1) in nlp_enterprise:
        return match.group(1)
    return None
class WebsourceTenderee():
    """Recall/override the tenderee in prem using a per-source unique-tenderee mapping."""

    def __init__(self):
        # NOTE(review): the file has a .pkl extension but is read as UTF-8 JSON —
        # presumably the file content really is JSON; confirm before renaming.
        with open(os.path.dirname(__file__)+'/websource_tenderee.pkl', 'r', encoding='utf-8') as f:
            self.webno2ree = json.load(f)  # web_source_no -> unique tenderee name

    def get_websource_tenderee(self, web_source_no, prem):
        '''
        Recall/adjust the tenderee in prem via the source's unique tenderee.

        :param web_source_no: data-source number.
        :param prem: extraction result; prem[0]['prem'] holds the package dict.
        :return: the (possibly modified) prem.
        '''
        # Keyword pattern for institution-like tenderees (kept for the commented rules below).
        p = '(医院|学院|学校|中学|小学|大学|幼儿园|保健院|党校|银行|研究院|血站|红十字会|防治院|研究所)'
        web_ree = self.webno2ree.get(web_source_no, '')
        # Hard-coded fallbacks for two source prefixes without a mapping entry.
        if web_source_no.startswith('18591-') and web_ree == "":
            web_ree = '中国人民解放军总医院'
        elif web_source_no.startswith('Y00484-') and web_ree == "":
            web_ree = '航空总医院'
        if web_ree != '':
            if 'Project' in prem[0]['prem']:
                find_tenderee = False
                for d in prem[0]['prem']['Project']['roleList']:
                    if d['role_name'] == 'tenderee':
                        find_tenderee = True
                        if d['role_text'] == "":
                            d['role_text'] = web_ree
                        elif re.search('大学$', web_ree) and re.search('学院$', d['role_text']) and web_ree not in d['role_text']:
                            # A university-level source tenderee outranks an extracted faculty (学院) name.
                            d['role_text'] = web_ree
                        elif d.get('role_prob', 0) < 0.8 and get_business_data(d['role_text'])[0] == False:
                            # 20240201: replace low-confidence (<0.8) tenderees lacking business-registry data.
                            d['role_text'] = web_ree
                        # elif re.search(p, web_ree) and (re.search(p, d['role_text'])==None and len(d['role_text'])<6):  # source tenderee ends with hospital-like keyword, extracted role has none and is short: replace
                        #     d['role_text'] = web_ree
                        # elif re.search('有限(责任)?公司', web_ree) and (re.search('有限(责任)?公司', d['role_text'])==None and len(d['role_text'])<6):
                        #     d['role_text'] = web_ree
                        break
                if not find_tenderee:  # no tenderee role present: append one
                    prem[0]['prem']['Project']['roleList'].append({'role_name': 'tenderee', 'role_text': '%s' % web_ree,
                                                                  'role_money': {'money': 0, 'money_unit': '', 'floating_ratio': '',
                                                                                 'downward_floating_ratio': '', 'discount_ratio': ''},
                                                                  'linklist': [], 'serviceTime': '', 'address': ''})
            else:
                # No 'Project' package at all: create one holding only the tenderee role.
                prem[0]['prem']['Project'] = {'code': '', 'tendereeMoney': 0, 'roleList': [
                    {'role_name': 'tenderee', 'role_text': '%s' % web_ree,
                     'role_money': {'money': 0, 'money_unit': '', 'floating_ratio': '',
                                    'downward_floating_ratio': '', 'discount_ratio': ''},
                     'linklist': [], 'serviceTime': '', 'address': ''}
                ]}
        return prem
def getSavedModel():
    """Export the Keras form model to a TF SavedModel directory (one-off dev utility)."""
    #predictor = FormPredictor()
    graph = tf.Graph()
    with graph.as_default():
        model = tf.keras.models.load_model("../form/model/model_form.model_item.hdf5",
                                           custom_objects={"precision":precision,"recall":recall,"f1_score":f1_score})
        #print(tf.graph_util.remove_training_nodes(model))
        tf.saved_model.simple_save(
            tf.keras.backend.get_session(),
            "./h5_savedmodel/",
            inputs={"image": model.input},
            outputs={"scores": model.output}
        )


def getBiLSTMCRFModel(MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights):
    """Build the BiLSTM-CRF sequence-labeling model (Keras functional API).

    :param MAX_LEN: unused here; input length is dynamic (shape=(None,)).
    :param vocab: vocabulary (its length sizes the embedding).
    :param EMBED_DIM: embedding dimension.
    :param BiRNN_UNITS: total BiLSTM units (split across the two directions).
    :param chunk_tags: label set (its length sizes the CRF output).
    :param weights: optional pretrained embedding matrix.

    NOTE(review): `models`, `layers` and `CRF` come from imports that are commented
    out at the top of this file, and `models.Model(input=..., output=...)` uses the
    old Keras 1 keyword names — this function will fail if called as-is; it is
    presumably kept for offline/dev use only. Confirm before relying on it.
    """
    '''
    model = models.Sequential()
    model.add(layers.Embedding(len(vocab), EMBED_DIM, mask_zero=True))  # Random embedding
    model.add(layers.Bidirectional(layers.LSTM(BiRNN_UNITS // 2, return_sequences=True)))
    crf = CRF(len(chunk_tags), sparse_target=True)
    model.add(crf)
    model.summary()
    model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
    return model
    '''
    input = layers.Input(shape=(None,),dtype="int32")
    if weights is not None:
        embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True,weights=[weights],trainable=True)(input)
    else:
        embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True)(input)
    bilstm = layers.Bidirectional(layers.LSTM(BiRNN_UNITS//2,return_sequences=True))(embedding)
    bilstm_dense = layers.TimeDistributed(layers.Dense(len(chunk_tags)))(bilstm)
    crf = CRF(len(chunk_tags),sparse_target=True)
    crf_out = crf(bilstm_dense)
    model = models.Model(input=[input],output = [crf_out])
    model.summary()
    model.compile(optimizer = 'adam', loss = crf.loss_function, metrics = [crf.accuracy])
    return model


import h5py


def h5_to_graph(sess,graph,h5file):
    """Copy weights from a Keras .h5 file directly into same-named tensors of a TF graph.

    :param sess: tf.Session bound to *graph*.
    :param graph: tf.Graph whose variables are assigned from the h5 weights.
    :param h5file: path to the Keras HDF5 weights file.
    """
    f = h5py.File(h5file,'r')  # open the h5 file

    def getValue(v):
        # Resolve a tensor's value from the h5 tree by walking its slash-separated name.
        _value = f["model_weights"]
        list_names = str(v.name).split("/")
        for _index in range(len(list_names)):
            print(v.name)
            if _index==1:
                _value = _value[list_names[0]]
            _value = _value[list_names[_index]]
        return _value.value

    def _load_attributes_from_hdf5_group(group, name):
        """Loads attributes of the specified name from the HDF5 group.

        This method deals with an inherent problem of HDF5 file which is not
        able to store data larger than HDF5_OBJECT_HEADER_LIMIT bytes.

        # Arguments
            group: A pointer to a HDF5 group.
            name: A name of the attributes to load.

        # Returns
            data: Attributes data.
        """
        if name in group.attrs:
            data = [n.decode('utf8') for n in group.attrs[name]]
        else:
            data = []
            chunk_id = 0
            while ('%s%d' % (name, chunk_id)) in group.attrs:
                data.extend([n.decode('utf8') for n in group.attrs['%s%d' % (name, chunk_id)]])
                chunk_id += 1
        return data

    def readGroup(gr,parent_name,data):
        # Recursively collect [name, value] pairs from the h5 group tree.
        # NOTE(review): the comparison below against "" looks garbled — it was
        # presumably a check against the h5py Group class repr; as written the
        # recursion branch never triggers. Verify against VCS history.
        for subkey in gr:
            print(subkey)
            if parent_name!=subkey:
                if parent_name=="":
                    _name = subkey
                else:
                    _name = parent_name+"/"+subkey
            else:
                _name = parent_name
            if str(type(gr[subkey]))=="":
                readGroup(gr[subkey],_name,data)
            else:
                data.append([_name,gr[subkey].value])
                print(_name,gr[subkey].shape)

    layer_names = _load_attributes_from_hdf5_group(f["model_weights"], 'layer_names')
    list_name_value = []
    readGroup(f["model_weights"], "", list_name_value)
    '''
    for k, name in enumerate(layer_names):
        g = f["model_weights"][name]
        weight_names = _load_attributes_from_hdf5_group(g, 'weight_names')
        #weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
        for weight_name in weight_names:
            list_name_value.append([weight_name,np.asarray(g[weight_name])])
    '''
    for name_value in list_name_value:
        name = name_value[0]
        '''
        if re.search("dense",name) is not None:
            name = name[:7]+"_1"+name[7:]
        '''
        value = name_value[1]
        print(name,graph.get_tensor_by_name(name),np.shape(value))
        sess.run(tf.assign(graph.get_tensor_by_name(name),value))


def initialize_uninitialized(sess):
    """Initialize only the not-yet-initialized Adam optimizer variables in *sess*."""
    global_vars = tf.global_variables()
    is_not_initialized = sess.run([tf.is_variable_initialized(var) for var in global_vars])
    not_initialized_vars = [v for (v, f) in zip(global_vars, is_not_initialized) if not f]
    adam_vars = []
    for _vars in not_initialized_vars:
        if re.search("Adam",_vars.name) is not None:
            adam_vars.append(_vars)
    print([str(i.name) for i in adam_vars])  # only for testing
    if len(adam_vars):
        sess.run(tf.variables_initializer(adam_vars))
adam_vars.append(_vars) print([str(i.name) for i in adam_vars]) # only for testing if len(adam_vars): sess.run(tf.variables_initializer(adam_vars)) def save_codename_model(): # filepath = "../projectCode/models/model_project_"+str(60)+"_"+str(200)+".hdf5" filepath = "../../dl_dev/projectCode/models_tf/59-L0.471516189943-F0.8802154826344823-P0.8789179683459191-R0.8815168335321886/model.ckpt" vocabpath = "../projectCode/models/vocab.pk" classlabelspath = "../projectCode/models/classlabels.pk" # vocab = load(vocabpath) # class_labels = load(classlabelspath) w2v_matrix = load('codename_w2v_matrix.pk') graph = tf.get_default_graph() with graph.as_default() as g: '''''' # model = getBiLSTMCRFModel(None, vocab, 60, 200, class_labels,weights=None) #model = models.load_model(filepath,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score,"CRF":CRF,"loss":CRF.loss_function}) sess = tf.Session(graph=g) # sess = tf.keras.backend.get_session() char_input, logits, target, keepprob, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix) #with sess.as_default(): sess.run(tf.global_variables_initializer()) # print(sess.run("time_distributed_1/kernel:0")) # model.load_weights(filepath) saver = tf.train.Saver() saver.restore(sess, filepath) # print("logits",sess.run(logits)) # print("#",sess.run("time_distributed_1/kernel:0")) # x = load("codename_x.pk") #y = model.predict(x) # y = sess.run(model.output,feed_dict={model.input:x}) # for item in np.argmax(y,-1): # print(item) tf.saved_model.simple_save( sess, "./codename_savedmodel_tf/", inputs={"inputs": char_input, "inputs_length":length, 'keepprob':keepprob}, outputs={"logits": logits, "trans":trans} ) def save_role_model(): ''' @summary: 保存model为savedModel,部署到PAI平台上调用 ''' model_role = PREMPredict().model_role with model_role.graph.as_default(): model = model_role.getModel() sess = tf.Session(graph=model_role.graph) print(type(model.input)) sess.run(tf.global_variables_initializer()) 
h5_to_graph(sess, model_role.graph, model_role.model_role_file) model = model_role.getModel() tf.saved_model.simple_save(sess, "./role_savedmodel/", inputs={"input0":model.input[0], "input1":model.input[1], "input2":model.input[2]}, outputs={"outputs":model.output} ) def save_money_model(): model_file = os.path.dirname(__file__)+"/../money/models/model_money_word.h5" graph = tf.Graph() with graph.as_default(): sess = tf.Session(graph=graph) with sess.as_default(): # model = model_money.getModel() # model.summary() # sess.run(tf.global_variables_initializer()) # h5_to_graph(sess, model_money.graph, model_money.model_money_file) model = models.load_model(model_file,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score}) model.summary() print(model.weights) tf.saved_model.simple_save(sess, "./money_savedmodel2/", inputs = {"input0":model.input[0], "input1":model.input[1], "input2":model.input[2]}, outputs = {"outputs":model.output} ) def save_person_model(): model_person = EPCPredict().model_person with model_person.graph.as_default(): x = load("person_x.pk") _data = np.transpose(np.array(x),(1,0,2,3)) model = model_person.getModel() sess = tf.Session(graph=model_person.graph) with sess.as_default(): sess.run(tf.global_variables_initializer()) model_person.load_weights() #h5_to_graph(sess, model_person.graph, model_person.model_person_file) predict_y = sess.run(model.output,feed_dict={model.input[0]:_data[0],model.input[1]:_data[1]}) #predict_y = model.predict([_data[0],_data[1]]) print(np.argmax(predict_y,-1)) tf.saved_model.simple_save(sess, "./person_savedmodel/", inputs={"input0":model.input[0], "input1":model.input[1]}, outputs = {"outputs":model.output}) def save_form_model(): model_form = FormPredictor() with model_form.graph.as_default(): model = model_form.getModel("item") sess = tf.Session(graph=model_form.graph) sess.run(tf.global_variables_initializer()) h5_to_graph(sess, model_form.graph, model_form.model_file_item) 
tf.saved_model.simple_save(sess, "./form_savedmodel/", inputs={"inputs":model.input}, outputs = {"outputs":model.output}) def save_codesplit_model(): filepath_code = "../../dl_dev/projectCode/models/model_code.hdf5" graph = tf.Graph() with graph.as_default(): model_code = models.load_model(filepath_code, custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score}) sess = tf.Session() sess.run(tf.global_variables_initializer()) h5_to_graph(sess, graph, filepath_code) tf.saved_model.simple_save(sess, "./codesplit_savedmodel/", inputs={"input0":model_code.input[0], "input1":model_code.input[1], "input2":model_code.input[2]}, outputs={"outputs":model_code.output}) def save_timesplit_model(): filepath = '../time/model_label_time_classify.model.hdf5' with tf.Graph().as_default() as graph: time_model = models.load_model(filepath, custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score}) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) h5_to_graph(sess, graph, filepath) tf.saved_model.simple_save(sess, "./timesplit_model/", inputs={"input0":time_model.input[0], "input1":time_model.input[1]}, outputs={"outputs":time_model.output}) if __name__=="__main__": #save_role_model() # save_codename_model() # save_money_model() #save_person_model() #save_form_model() #save_codesplit_model() # save_timesplit_model() ''' # with tf.Session(graph=tf.Graph()) as sess: # from tensorflow.python.saved_model import tag_constants # meta_graph_def = tf.saved_model.loader.load(sess, [tag_constants.SERVING], "./person_savedModel") # graph = tf.get_default_graph() # signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY # signature = meta_graph_def.signature_def # input0 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input0"].name) # input1 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input1"].name) # outputs = 
sess.graph.get_tensor_by_name(signature[signature_key].outputs["outputs"].name) # x = load("person_x.pk") # _data = np.transpose(x,[1,0,2,3]) # y = sess.run(outputs,feed_dict={input0:_data[0],input1:_data[1]}) # print(np.argmax(y,-1)) ''' # MAX_LEN = 1000 # # vocabpath = os.path.dirname(__file__) + "/codename_vocab.pk" # # vocab = load(vocabpath) # # word2index = dict((w, i) for i, w in enumerate(np.array(vocab))) # # index_unk = word2index.get("") # # sentence = "招标人:广州市重点公共建设项目管理中心,联系人:李工,联系方式:020-22905689,招标代理:广东重工建设监理有限公司," \ # # "代理联系人:薛家伟,代理联系方式:13535014481,招标监督机构:广州市重点公共建设项目管理中心,监督电话:020-22905690," \ # # "备注:以上为招标公告简要描述,招标公告详细信息请查看“招标公告”附件," # # sentence = sentence*5 # # list_sentence = [sentence]*200 # # # print(list_sentence) # # x = [[word2index.get(word, index_unk) for word in sentence] for sentence in # # list_sentence] # # x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x] # # # print(x_len) # # x = pad_sequences(x, maxlen=MAX_LEN, padding="post", truncating="post") # # # # requests_result = requests.post(API_URL + "/predict_codeName", json={"inouts": x.tolist(), "inouts_len": x_len}, # # verify=True) # # # predict_y = json.loads(requests_result.text)['result'] # # print("cost_time:", json.loads(requests_result.text)['cost_time']) # # print(MAX_LEN, len(sentence), len(list_sentence)) # # requests_result = requests.post(API_URL + "/predict_codeName", json={"inouts": x.tolist(), "inouts_len": x_len}, # # verify=True) # # # predict_y = json.loads(requests_result.text)['result'] # # print("cost_time:", json.loads(requests_result.text)['cost_time']) # # print(MAX_LEN, len(sentence), len(list_sentence)) # docid = "" # title = '' # with open('d:/html/2.html', 'r', encoding='utf-8') as f: # html = f.read() # product_attr = ProductAttributesPredictor() # rs = product_attr.predict(docid='', html=html, page_time="") # print(rs) # docid = "" # title = '' # with open('d:/html/2.html', 'r', encoding='utf-8') as f: # html = f.read() # tb_extract = 
TablePremExtractor() # rs = tb_extract.predict(html, [ # "广东省广裕集团嘉顺实业有限责任公司", # "广州顺为招标采购有限公司", # "中华人民共和国" # ], web_source_name = '河钢供应链管理平台') # print('标段数:',len(rs)) # print(rs) # # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770] # # # ids = [42078089, 51828144, 54913238, 60511017, 67472815, 69042200, 75804469, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 90157660, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770] # # # ids = [42078089, 51828144, 60511017, 69042200, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770] # # # ids = [ 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770] # # # ids = [37756133, 39743626, 42068246, 51176657, 70624901, 75687028, 85489552, 95342532, 97337474, 109601526, 111464967, 112548665, 116223553, 117329696, 117850214, 120619166, 
121717252, 122345499, 128511969, 133403846, 133602236, 136564970, 137772969, 138020374, 140929169, 147414295, 152659064, 155485083, 186412244, 195546784, 196135909, 202981523, 214647448, 216377830, 217957372, 218789230, 225050691, 228064464, 228590691, 236342514, 237352780, 239814252] # # # ids = [51176657, 70624901, 85489552, 95342532, 109601526, 111464967, 112548665, 116223553, 117329696, 117850214, 120619166, 121717252, 122345499, 128511969, 133403846, 133602236, 136564970, 137772969, 138020374, 140929169, 147414295, 152659064, 155485083, 186412244, 195546784, 196135909, 202981523, 214647448, 216377830, 217957372, 218789230, 225050691, 228064464, 228590691, 236342514, 237352780, 239814252] # ids = [31995310, 33586422, 34213587, 36093749, 37238528, 37739743, 39150739, 39281429, 40038908, 40289771, 40581071, 40591331, 42200293, 42739447, 42923948, 43351479, 44237678, 44506815, 44592013, 45106514, 45469037, 48411467, 51822565, 52127391, 54236264, 54706723, 54894477, 54898083, 55934378, 56104538, 56218948, 59606477, 60116927, 60638934, 61523351, 61685037, 61706106, 62187765, 62203118, 62843892, 63850238, 64139401, 65707507, 66072846, 66137391, 66738991, 67676932, 67902417, 69795866, 70868740, 71180456, 71796375, 77613620, 77641817, 77748144, 77761818, 78250390, 78606698, 78717682, 78854831, 79597122, 79597366, 79819968, 80377018, 82461832, 84018089, 84134439, 84815332, 85123470, 85123525, 85456789, 87474450, 88129399, 88288685, 88329278, 88342999, 88747517, 89632339, 89861712, 89985134, 91538446, 93323837, 94609104, 95522891, 97476802, 97629540, 98662744, 100207494, 100558146, 100755026, 101009561, 101275254, 101348782, 101462933, 101857772, 102924005, 103432276, 103459091, 104062674, 106601819, 106812124, 107065735, 107559314, 108201680, 108455612, 108544389, 108832580, 108995821, 109196083, 110726641, 110780095, 111234020, 111588327, 111656418, 111797176, 111993708, 114376859, 115869547, 117725909, 118032923, 118349683, 119080451, 119224972, 120120112, 120304657, 
120830324, 122331341, 122856799, 123439110, 123641276, 123733047, 123733333, 123874242, 123918651, 124253086, 124942182, 125372140, 125464462, 125568385, 126185770, 126305386, 126512513, 126840529, 126844209, 126902118, 127254675, 127510817, 127670247, 128441465, 128498056, 129557176, 129833289, 129875792, 130121559, 130554345, 130556979, 131051006, 131142204, 131480539, 133743564, 133834740, 133984477, 134796953, 135533772, 135986763, 136777096, 137403576, 137864604, 138148591, 139840028, 139974803, 140105753, 145439181, 149105875, 150129836, 150828866, 152675649, 153688731, 155564708, 155599250, 155600699, 156728197, 161246902, 161775170, 162476194, 162914022, 162963943, 164007344, 164775490, 165339842, 175705079, 176218853, 176944891, 178251502, 178372090, 179732253, 180379187, 181626147, 184044160, 184404217, 186383436, 188468811, 192103014, 192574092, 192754157, 193358322, 195686462, 195868255, 196060419, 199113788, 201588003, 201874243, 201879319, 204796942, 205348530, 206735492, 208308899, 210310963, 210313993, 212124901, 212363133, 212389173, 213573782, 213818877, 214044075, 214989980, 215356671, 215367201, 215646443, 216212563, 216377823, 216490415, 217483041, 217486509, 218429429, 219181483, 219411056, 219971724, 220400698, 220780247, 221398716, 222545237, 223267606, 223906281, 224074580, 224383778, 224995705, 225390819, 227536610, 227829175, 227908020, 227980430, 229421942, 229862241, 230217038, 230227848, 230391553, 230592027, 233836843, 234465556, 235108306, 235217324, 235995802, 236010068, 236359727, 236419142, 236997002, 238069580, 238106585, 238534142, 238567209, 238839802, 239260141, 240214254, 240263848, 240535275, 240680028] # df = pd.read_csv('E:\产品单价数量/待预测数据html内容4.csv') # print('公告数:', len(df), len(ids)) # df = df[df['docid'].isin(ids)] # ids = [] # for docid,html in zip(df['docid'],df['dochtmlcon']): # product_attr = ProductAttributesPredictor() # rs, _ = product_attr.predict(docid='', html=html, page_time="") # # print(docid, rs) # # 
print(docid, rs[0]['product_attrs']['header_col']) # # print('*'*20) # if rs[0]['product_attrs']['header_col'] == []: # ids.append(docid) # print(docid, rs[0]['product_attrs']['header_col']) # print('*' * 20) # else: # print(docid, rs[0]['product_attrs']['header_col']) # print('*' * 20) # print(len(ids), ids) # role = RoleRulePredictor() # labels = [] # keywords = [] # # df = pd.read_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果.xlsx') # df = pd.read_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果60000-90000.xlsx') # columns = ['docid', 'type', 'label', 'value', 'front', 'behind', # 'front6', 'entity_text', 'behind6', 'front6_reverse', 'rule_label', 'keyword', 'pos'] # print(df.columns) # df.fillna('', inplace=True) # for front, center, behind, entity_text in zip(df['front'], df['entity_text'], df['behind'], df['entity_text']): # front = str(front) # behind = str(behind) # label, _prob, _flag, keyword = role.rule_predict(front, center, behind, entity_text) # labels.append(label) # keywords.append(keyword) # df['rule_label'] = pd.Series(labels) # df['keyword'] = pd.Series(keywords) # df['front6'] = df['front'].apply(lambda x: str(x)[-6:]) # df['behind6'] = df['behind'].apply(lambda x: str(x)[:6]) # df['pos'] = df.apply(lambda x: 1 if x['label']==x['rule_label'] else 0, axis=1) # # df.to_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果_rule_predict.xlsx', index=False, columns=columns) # df.to_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果60000-90000_rule_predict.xlsx', index=False, columns=columns)