predictor.py 583 KB


  1. '''
  2. Created on 2018年12月26日
  3. @author: User
  4. '''
  5. import os
  6. import sys
  7. from BiddingKG.dl.common.nerUtils import *
  8. sys.path.append(os.path.abspath("../.."))
  9. # from keras.engine import topology
  10. # from keras import models
  11. # from keras import layers
  12. # from keras_contrib.layers.crf import CRF
  13. # from keras.preprocessing.sequence import pad_sequences
  14. # from keras import optimizers,losses,metrics
  15. from BiddingKG.dl.common.Utils import *
  16. from BiddingKG.dl.interface.modelFactory import *
  17. import tensorflow as tf
  18. import pandas as pd
  19. from BiddingKG.dl.product.data_util import decode, process_data
  20. from BiddingKG.dl.interface.Entitys import Entity
  21. from BiddingKG.dl.complaint.punish_predictor import Punish_Extract
  22. from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
  23. from bs4 import BeautifulSoup
  24. import copy
  25. import calendar
  26. import datetime
  27. from BiddingKG.dl.entityLink.entityLink import get_business_data
  28. from BiddingKG.dl.proposed_building.pb_extract import PBPredictor
  29. # from BiddingKG.dl.interface.getAttributes import turnMoneySource
  30. from BiddingKG.dl.common.Utils import del_tabel_achievement, clean_company
  31. from BiddingKG.dl.interface.getAttributes import turnMoneySource, extract_serviceTime
  32. from BiddingKG.dl.time.re_servicetime import extract_servicetime
  33. # import fool # 统一用 selffool ,阿里云上只有selffool 包
# Thread count for TensorFlow sessions, read from the CPU_NUM environment
# variable (0 when the variable is not set).
cpu_num = int(os.environ.get("CPU_NUM",0))
# A ConfigProto is built from cpu_num here ...
sess_config = tf.ConfigProto(
    inter_op_parallelism_threads = cpu_num,
    intra_op_parallelism_threads = cpu_num,
    log_device_placement=True)
# ... but it is immediately replaced by None, so every predictor that receives
# `config=sess_config` is created with config=None.
# NOTE(review): looks like a manual on/off switch for the config above - confirm.
sess_config = None
# Pickled lookup data stored next to this module. `agency_set` is consulted by
# is_agency(); `header_set` is not used in the part of the file visible here.
# NOTE(review): pickle.load executes arbitrary code from the file - only load
# trusted artefacts.
file = os.path.dirname(__file__) + '/agency_set.pkl'
with open(file, 'rb') as f:
    agency_set = pickle.load(f)
with open(os.path.dirname(__file__) + '/header_set.pkl', 'rb') as f:
    header_set = pickle.load(f)
  45. def is_agency(entity_text):
  46. if re.search('(招投?标|采购|代理|咨询|管理|物资|事务所?|顾问|监理|拍卖)[()\w]{,4}(有限)?(责任)?公司|(采购|招投?标|交易|代理|咨询)[()\w]{,4}(中心|服务所)|法院$',
  47. entity_text) or entity_text in agency_set:
  48. return True
  49. return False
  50. def get_role(text, nlp_enterprise):
  51. '''
  52. 获取字符串text角色实体
  53. :param text: 待获取实体字符串
  54. :param nlp_enterprise: 公告中的角色实体列表
  55. :return:
  56. '''
  57. text = re.sub('主报名人:|联合报名人:|联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
  58. , ',', text)
  59. text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
  60. text = re.sub('[一二三四五六七八九十]+标段[::]|标段[一二三四五六七八九十]+[::]|第[一二三四五六七八九十]+名[::]', '', text) # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
  61. text = re.sub('1[3-9]\d{9}|\d{3}-\d{8}|\d{4}-\d{7}', '', text) # 2024/4/23 去除电话
  62. if text in nlp_enterprise:
  63. return text
  64. if len(text) > 50 or len(text)<4:
  65. return ''
  66. ners = getNers([text], useselffool=True)
  67. roles = []
  68. if ners:
  69. for ner in ners[0]:
  70. if ner[2] in ['org', 'company']:
  71. roles.append(ner[3])
  72. elif ner[2] in ['location'] and re.search('^\w{3,10}(海关|殡仪馆|店|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场)$', ner[3]):
  73. roles.append(ner[3])
  74. if roles and len(''.join(roles)) > len(text)*0.8:
  75. entity = clean_company(roles[0])
  76. return entity
  77. else:
  78. return ''
  79. from threading import RLock
  80. dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
  81. "prem":{"predictor":None,"Lock":RLock()},
  82. "epc":{"predictor":None,"Lock":RLock()},
  83. "roleRule":{"predictor":None,"Lock":RLock()},
  84. "roleRuleFinal":{"predictor":None,"Lock":RLock()},
  85. "tendereeRuleRecall":{"predictor":None,"Lock":RLock()},
  86. "form":{"predictor":None,"Lock":RLock()},
  87. "time":{"predictor":None,"Lock":RLock()},
  88. "punish":{"predictor":None,"Lock":RLock()},
  89. "product":{"predictor":None,"Lock":RLock()},
  90. "product_attrs":{"predictor":None,"Lock":RLock()},
  91. "channel": {"predictor": None, "Lock": RLock()},
  92. "deposit_payment_way": {"predictor": None, "Lock": RLock()},
  93. "total_unit_money": {"predictor": None, "Lock": RLock()},
  94. "industry": {"predictor": None, "Lock": RLock()},
  95. "rolegrade": {"predictor": None, "Lock": RLock()},
  96. "moneygrade": {"predictor": None, "Lock": RLock()},
  97. "district": {"predictor": None, "Lock": RLock()},
  98. 'tableprem': {"predictor": None, "Lock": RLock()},
  99. 'candidate': {"predictor": None, "Lock": RLock()},
  100. 'websource_tenderee': {"predictor": None, "Lock": RLock()},
  101. 'project_label': {"predictor": None, "Lock": RLock()},
  102. 'pb_extract': {"predictor": None, "Lock": RLock()},
  103. 'property_label': {"predictor": None, "Lock": RLock()},
  104. 'approval': {"predictor": None, "Lock": RLock()}, # 审批项目预测
  105. 'bid_score': {"predictor": None, "Lock": RLock()}, # 评标评分
  106. 'entity_type_rule': {"predictor": None, "Lock": RLock()}, # 地址、时间分类
  107. }
  108. def getPredictor(_type):
  109. if _type in dict_predictor:
  110. with dict_predictor[_type]["Lock"]:
  111. if dict_predictor[_type]["predictor"] is None:
  112. if _type == "codeName":
  113. dict_predictor[_type]["predictor"] = CodeNamePredict(config=sess_config)
  114. if _type == "prem":
  115. dict_predictor[_type]["predictor"] = PREMPredict(config=sess_config)
  116. if _type == "epc":
  117. dict_predictor[_type]["predictor"] = EPCPredict(config=sess_config)
  118. if _type == "roleRule":
  119. dict_predictor[_type]["predictor"] = RoleRulePredictor()
  120. if _type == "roleRuleFinal":
  121. dict_predictor[_type]["predictor"] = RoleRuleFinalAdd()
  122. if _type == "tendereeRuleRecall":
  123. dict_predictor[_type]["predictor"] = TendereeRuleRecall()
  124. if _type == "form":
  125. dict_predictor[_type]["predictor"] = FormPredictor(config=sess_config)
  126. if _type == "time":
  127. dict_predictor[_type]["predictor"] = TimePredictor(config=sess_config)
  128. if _type == "punish":
  129. dict_predictor[_type]["predictor"] = Punish_Extract()
  130. if _type == "product":
  131. dict_predictor[_type]["predictor"] = ProductPredictor(config=sess_config)
  132. if _type == "product_attrs":
  133. dict_predictor[_type]["predictor"] = ProductAttributesPredictor()
  134. if _type == "channel":
  135. dict_predictor[_type]["predictor"] = DocChannel(config=sess_config)
  136. if _type == 'deposit_payment_way':
  137. dict_predictor[_type]["predictor"] = DepositPaymentWay()
  138. if _type == 'total_unit_money':
  139. dict_predictor[_type]["predictor"] = TotalUnitMoney()
  140. if _type == 'industry':
  141. dict_predictor[_type]["predictor"] = IndustryPredictor()
  142. if _type == 'rolegrade':
  143. dict_predictor[_type]["predictor"] = RoleGrade()
  144. if _type == 'moneygrade':
  145. dict_predictor[_type]["predictor"] = MoneyGrade()
  146. if _type == 'district':
  147. dict_predictor[_type]["predictor"] = DistrictPredictor()
  148. if _type == 'tableprem':
  149. dict_predictor[_type]["predictor"] = TablePremExtractor()
  150. if _type == 'candidate':
  151. dict_predictor[_type]["predictor"] = CandidateExtractor()
  152. if _type == 'websource_tenderee':
  153. dict_predictor[_type]['predictor'] = WebsourceTenderee()
  154. if _type == 'project_label':
  155. dict_predictor[_type]['predictor'] = ProjectLabel()
  156. if _type == 'pb_extract':
  157. dict_predictor[_type]['predictor'] = PBPredictor()
  158. if _type == 'property_label':
  159. dict_predictor[_type]['predictor'] = PropertyLabel()
  160. if _type == 'approval':
  161. dict_predictor[_type]['predictor'] = ApprovalPredictor()
  162. if _type == 'bid_score':
  163. dict_predictor[_type]['predictor'] = BiddingScore()
  164. if _type == 'entity_type_rule':
  165. dict_predictor[_type]['predictor'] = EntityTypeRulePredictor()
  166. return dict_predictor[_type]["predictor"]
  167. raise NameError("no this type of predictor")
  168. # 编号名称模型
  169. class CodeNamePredict():
  170. def __init__(self,EMBED_DIM=None,BiRNN_UNITS=None,lazyLoad=getLazyLoad(),config=None):
  171. self.model = None
  172. self.MAX_LEN = None
  173. self.model_code = None
  174. if EMBED_DIM is None:
  175. self.EMBED_DIM = 60
  176. else:
  177. self.EMBED_DIM = EMBED_DIM
  178. if BiRNN_UNITS is None:
  179. self.BiRNN_UNITS = 200
  180. else:
  181. self.BiRNN_UNITS = BiRNN_UNITS
  182. self.filepath = os.path.dirname(__file__)+"/../projectCode/models/model_project_"+str(self.EMBED_DIM)+"_"+str(self.BiRNN_UNITS)+".hdf5"
  183. #self.filepath = "../projectCode/models/model_project_60_200_200ep017-loss6.456-val_loss7.852-val_acc0.969.hdf5"
  184. self.filepath_code = os.path.dirname(__file__)+"/../projectCode/models/model_code.hdf5"
  185. vocabpath = os.path.dirname(__file__)+"/codename_vocab.pk"
  186. classlabelspath = os.path.dirname(__file__)+"/codename_classlabels.pk"
  187. self.vocab = load(vocabpath)
  188. self.class_labels = load(classlabelspath)
  189. #生成提取编号和名称的正则
  190. id_PC_B = self.class_labels.index("PC_B")
  191. id_PC_M = self.class_labels.index("PC_M")
  192. id_PC_E = self.class_labels.index("PC_E")
  193. id_PN_B = self.class_labels.index("PN_B")
  194. id_PN_M = self.class_labels.index("PN_M")
  195. id_PN_E = self.class_labels.index("PN_E")
  196. self.PC_pattern = re.compile(str(id_PC_B)+str(id_PC_M)+"*"+str(id_PC_E))
  197. self.PN_pattern = re.compile(str(id_PN_B)+str(id_PN_M)+"*"+str(id_PN_E))
  198. # print("pc",self.PC_pattern)
  199. # print("pn",self.PN_pattern)
  200. self.word2index = dict((w,i) for i,w in enumerate(np.array(self.vocab)))
  201. self.inputs = None
  202. self.outputs = None
  203. self.sess_codename = tf.Session(graph=tf.Graph(),config=config)
  204. self.sess_codesplit = tf.Session(graph=tf.Graph(),config=config)
  205. self.inputs_code = None
  206. self.outputs_code = None
  207. if not lazyLoad:
  208. self.getModel()
  209. self.getModel_code()
  210. def getModel(self):
  211. '''
  212. @summary: 取得编号和名称模型
  213. '''
  214. if self.inputs is None:
  215. log("get model of codename")
  216. with self.sess_codename.as_default():
  217. with self.sess_codename.graph.as_default():
  218. meta_graph_def = tf.saved_model.loader.load(self.sess_codename, ["serve"], export_dir=os.path.dirname(__file__)+"/codename_savedmodel_tf")
  219. signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
  220. signature_def = meta_graph_def.signature_def
  221. self.inputs = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs"].name)
  222. self.inputs_length = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs_length"].name)
  223. self.keepprob = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["keepprob"].name)
  224. self.logits = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["logits"].name)
  225. self.trans = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["trans"].name)
  226. return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans
  227. else:
  228. return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans
  229. '''
  230. if self.model is None:
  231. self.model = self.getBiLSTMCRFModel(self.MAX_LEN, self.vocab, self.EMBED_DIM, self.BiRNN_UNITS, self.class_labels,weights=None)
  232. self.model.load_weights(self.filepath)
  233. return self.model
  234. '''
  235. def getModel_code(self):
  236. if self.inputs_code is None:
  237. log("get model of code")
  238. with self.sess_codesplit.as_default():
  239. with self.sess_codesplit.graph.as_default():
  240. meta_graph_def = tf.saved_model.loader.load(self.sess_codesplit, ["serve"], export_dir=os.path.dirname(__file__)+"/codesplit_savedmodel")
  241. signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
  242. signature_def = meta_graph_def.signature_def
  243. self.inputs_code = []
  244. self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name))
  245. self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name))
  246. self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name))
  247. self.outputs_code = self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
  248. self.sess_codesplit.graph.finalize()
  249. return self.inputs_code,self.outputs_code
  250. else:
  251. return self.inputs_code,self.outputs_code
  252. '''
  253. if self.model_code is None:
  254. log("get model of model_code")
  255. with self.sess_codesplit.as_default():
  256. with self.sess_codesplit.graph.as_default():
  257. self.model_code = models.load_model(self.filepath_code, custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
  258. return self.model_code
  259. '''
  260. def getBiLSTMCRFModel(self,MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights):
  261. '''
  262. model = models.Sequential()
  263. model.add(layers.Embedding(len(vocab), EMBED_DIM, mask_zero=True)) # Random embedding
  264. model.add(layers.Bidirectional(layers.LSTM(BiRNN_UNITS // 2, return_sequences=True)))
  265. crf = CRF(len(chunk_tags), sparse_target=True)
  266. model.add(crf)
  267. model.summary()
  268. model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
  269. return model
  270. '''
  271. input = layers.Input(shape=(None,))
  272. if weights is not None:
  273. embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True,weights=[weights],trainable=True)(input)
  274. else:
  275. embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True)(input)
  276. bilstm = layers.Bidirectional(layers.LSTM(BiRNN_UNITS//2,return_sequences=True))(embedding)
  277. bilstm_dense = layers.TimeDistributed(layers.Dense(len(chunk_tags)))(bilstm)
  278. crf = CRF(len(chunk_tags),sparse_target=True)
  279. crf_out = crf(bilstm_dense)
  280. model = models.Model(input=[input],output = [crf_out])
  281. model.summary()
  282. model.compile(optimizer = 'adam', loss = crf.loss_function, metrics = [crf.accuracy])
  283. return model
  284. #根据规则补全编号或名称两边的符号
  285. def fitDataByRule(self,data):
  286. symbol_dict = {"(":")",
  287. "(":")",
  288. "[":"]",
  289. "【":"】",
  290. ")":"(",
  291. ")":"(",
  292. "]":"[",
  293. "】":"【"}
  294. leftSymbol_pattern = re.compile("[\((\[【]")
  295. rightSymbol_pattern = re.compile("[\))\]】]")
  296. leftfinds = re.findall(leftSymbol_pattern,data)
  297. rightfinds = re.findall(rightSymbol_pattern,data)
  298. result = data
  299. if len(leftfinds)+len(rightfinds)==0:
  300. return data
  301. elif len(leftfinds)==len(rightfinds):
  302. return data
  303. elif abs(len(leftfinds)-len(rightfinds))==1:
  304. if len(leftfinds)>len(rightfinds):
  305. if symbol_dict.get(data[0]) is not None:
  306. result = data[1:]
  307. else:
  308. #print(symbol_dict.get(leftfinds[0]))
  309. result = data+symbol_dict.get(leftfinds[0])
  310. else:
  311. if symbol_dict.get(data[-1]) is not None:
  312. result = data[:-1]
  313. else:
  314. result = symbol_dict.get(rightfinds[0])+data
  315. return result
  316. def decode(self,logits, trans, sequence_lengths, tag_num):
  317. viterbi_sequences = []
  318. for logit, length in zip(logits, sequence_lengths):
  319. score = logit[:length]
  320. viterbi_seq, viterbi_score = viterbi_decode(score, trans)
  321. viterbi_sequences.append(viterbi_seq)
  322. return viterbi_sequences
  323. def predict(self,list_sentences,list_entitys=None,MAX_AREA = 5000):
  324. #@summary: 获取每篇文章的code和name
  325. # pattern_score = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")
  326. pattern_score = re.compile('建设项目|服务项目|工程项目|工程施工|建设工程|服务中心|基础设施|物业管理|工程设计|妇幼保健|咨询服务|管理系统|管理中心|改建工程|配套工程|公安局|幼儿园|管理局|使用权|办公楼|教育局|管理处|图书馆|经营权|项目|采购|工程|改造|服务|设备|中心|医院|系统|建设|监理|施工|维修|学院|安装|设计|关于|标段|招标|技术|询价|管理|学校|小学|中学|平台|提升|设施|检测|整治|社区|装修|政府|绿化|物资|租赁|地块|医疗|编制|公开|规划|监控|教育|维护|校区|治理|升级|安置|竞价|购置|评估|勘察|承包|实验|大学|材料|生产|耗材|招租|硬化|维保|用地|消防|审计|拍卖|物业|入围|养护|机关|企业|用房|出让|资产|分局|验收|宣传|处置|校园|研究|咨询|修缮|更换|装饰|劳务|保养|物流|出租|局|院')
  327. result = []
  328. index_unk = self.word2index.get("<unk>")
  329. # index_pad = self.word2index.get("<pad>")
  330. if list_entitys is None:
  331. list_entitys = [[] for _ in range(len(list_sentences))]
  332. for list_sentence,list_entity in zip(list_sentences,list_entitys):
  333. if len(list_sentence)==0:
  334. result.append([{"code":[],"name":""}])
  335. continue
  336. doc_id = list_sentence[0].doc_id
  337. # sentences = []
  338. # for sentence in list_sentence:
  339. # if len(sentence.sentence_text)>MAX_AREA:
  340. # for _sentence_comma in re.split("[;;,\n]",sentence):
  341. # _comma_index = 0
  342. # while(_comma_index<len(_sentence_comma)):
  343. # sentences.append(_sentence_comma[_comma_index:_comma_index+MAX_AREA])
  344. # _comma_index += MAX_AREA
  345. # else:
  346. # sentences.append(sentence+"。")
  347. list_sentence.sort(key=lambda x:len(x.sentence_text),reverse=True)
  348. _begin_index = 0
  349. item = {"code":[],"name":""}
  350. code_set = set()
  351. dict_name_freq_score = dict()
  352. while(True):
  353. MAX_LEN = len(list_sentence[_begin_index].sentence_text)
  354. if MAX_LEN>MAX_AREA:
  355. MAX_LEN = MAX_AREA
  356. _LEN = MAX_AREA//MAX_LEN
  357. #预测
  358. x = [[self.word2index.get(word,index_unk)for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
  359. # x = [[getIndexOfWord(word) for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
  360. x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x]
  361. x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")
  362. if USE_API:
  363. requests_result = requests.post(API_URL + "/predict_codeName", json={"inouts": x.tolist(), "inouts_len": x_len},verify=True)
  364. predict_y = json.loads(requests_result.text)['result']
  365. # print("cost_time:", json.loads(requests_result.text)['cost_time'])
  366. # print(MAX_LEN,_LEN,_begin_index)
  367. else:
  368. with self.sess_codename.as_default():
  369. t_input,t_input_length,t_keepprob,t_logits,t_trans = self.getModel()
  370. _logits,_trans = self.sess_codename.run([t_logits,t_trans],feed_dict={t_input:x,
  371. t_input_length:x_len,
  372. t_keepprob:1.0})
  373. predict_y = self.decode(_logits,_trans,x_len,7)
  374. # print('==========',_logits)
  375. '''
  376. for item11 in np.argmax(predict_y,-1):
  377. print(item11)
  378. print(predict_y)
  379. '''
  380. # print(predict_y)
  381. for sentence,predict in zip(list_sentence[_begin_index:_begin_index+_LEN],np.array(predict_y)):
  382. pad_sentence = sentence.sentence_text[:MAX_LEN]
  383. join_predict = "".join([str(s) for s in predict])
  384. # print(pad_sentence)
  385. # print(join_predict)
  386. code_x = []
  387. code_text = []
  388. pre_text = []
  389. temp_entitys = []
  390. for iter in re.finditer(self.PC_pattern,join_predict):
  391. get_len = 40
  392. if iter.span()[0]<get_len:
  393. begin = 0
  394. else:
  395. begin = iter.span()[0]-get_len
  396. end = iter.span()[1]+get_len
  397. code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""),pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
  398. code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]].replace(",", ""))
  399. pre_text.append(pad_sentence[begin:iter.span()[0]])
  400. _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""),entity_type="code",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
  401. temp_entitys.append(_entity)
  402. #print("code",code_text)
  403. if len(code_x)>0:
  404. code_x = np.transpose(np.array(code_x,dtype=np.float32),(1,0,2,3))
  405. if USE_PAI_EAS:
  406. request = tf_predict_pb2.PredictRequest()
  407. request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
  408. request.inputs["input0"].array_shape.dim.extend(np.shape(code_x[0]))
  409. request.inputs["input0"].float_val.extend(np.array(code_x[0],dtype=np.float64).reshape(-1))
  410. request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
  411. request.inputs["input1"].array_shape.dim.extend(np.shape(code_x[1]))
  412. request.inputs["input1"].float_val.extend(np.array(code_x[1],dtype=np.float64).reshape(-1))
  413. request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
  414. request.inputs["input2"].array_shape.dim.extend(np.shape(code_x[2]))
  415. request.inputs["input2"].float_val.extend(np.array(code_x[2],dtype=np.float64).reshape(-1))
  416. request_data = request.SerializeToString()
  417. list_outputs = ["outputs"]
  418. _result = vpc_requests(codeclasses_url, codeclasses_authorization, request_data, list_outputs)
  419. if _result is not None:
  420. predict_code = _result["outputs"]
  421. else:
  422. with self.sess_codesplit.as_default():
  423. with self.sess_codesplit.graph.as_default():
  424. predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
  425. else:
  426. with self.sess_codesplit.as_default():
  427. with self.sess_codesplit.graph.as_default():
  428. inputs_code,outputs_code = self.getModel_code()
  429. predict_code = limitRun(self.sess_codesplit,[outputs_code],feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]})[0]
  430. #predict_code = self.sess_codesplit.run(outputs_code,feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]})
  431. #predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
  432. for h in range(len(predict_code)):
  433. if predict_code[h][0]>0.5:
  434. the_code = self.fitDataByRule(code_text[h])
  435. # print(the_code)
  436. #add code to entitys
  437. list_entity.append(temp_entitys[h])
  438. if re.search(',|/|;|、|,', the_code) and len(the_code)>25:
  439. for it in re.split(',|/|;|、|,', the_code):
  440. if len(it) > 8:
  441. if it not in code_set:
  442. code_set.add(it)
  443. # item['code'].append(it)
  444. if re.search("(项目编号|招标编号):?$", pre_text[h]):
  445. item['code'].append((it, 0, sentence.sentence_index))
  446. elif re.search('采购(计划)?编号:?$', pre_text[h]):
  447. item['code'].append((it, 1, sentence.sentence_index))
  448. elif re.search('(询价|合同)编号:?$', pre_text[h]):
  449. item['code'].append((it, 2, sentence.sentence_index))
  450. elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
  451. item['code'].append((it, 2.5, sentence.sentence_index))
  452. else:
  453. item['code'].append((it, 3, sentence.sentence_index))
  454. elif len(item['code']) > 0:
  455. new_it = item['code'][-1][0] + re.search(',|/|;|、|,', the_code).group(0) + it
  456. if new_it not in code_set:
  457. code_set.add(new_it)
  458. # item['code'][-1] = new_it
  459. if re.search("(项目编号|招标编号):?$", pre_text[h]):
  460. item['code'][-1] = (new_it, 0, sentence.sentence_index)
  461. elif re.search('采购(计划)?编号:?$', pre_text[h]):
  462. item['code'][-1] = (new_it, 1, sentence.sentence_index)
  463. elif re.search('(询价|合同)编号:?$', pre_text[h]):
  464. item['code'][-1] = (new_it, 2, sentence.sentence_index)
  465. elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
  466. item['code'].append((new_it, 2.5, sentence.sentence_index))
  467. else:
  468. item['code'][-1] = (new_it, 3, sentence.sentence_index)
  469. else:
  470. if the_code not in code_set:
  471. code_set.add(the_code)
  472. # item['code'].append(the_code)
  473. if re.search("(项目编号|招标编号):?$", pre_text[h]):
  474. item['code'].append((the_code, 0, sentence.sentence_index))
  475. elif re.search('采购(计划)?编号:?$', pre_text[h]):
  476. item['code'].append((the_code, 1, sentence.sentence_index))
  477. elif re.search('(询价|合同)编号:?$', pre_text[h]):
  478. item['code'].append((the_code, 2, sentence.sentence_index))
  479. elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
  480. item['code'].append((the_code, 2.5, sentence.sentence_index))
  481. else:
  482. item['code'].append((the_code, 3, sentence.sentence_index))
  483. break
  484. elif the_code not in code_set:
  485. if len(the_code)<5: # 避免510545935 这种把 招标项目编号:2024年第二期 只提取2024
  486. continue
  487. code_set.add(the_code)
  488. # item['code'].append(the_code)
  489. if re.search("(项目编号|招标编号):?$", pre_text[h]):
  490. item['code'].append((the_code, 0, sentence.sentence_index))
  491. elif re.search('采购(计划)?编号:?$', pre_text[h]):
  492. item['code'].append((the_code, 1, sentence.sentence_index))
  493. elif re.search('(询价|合同)编号:?$', pre_text[h]):
  494. item['code'].append((the_code, 2, sentence.sentence_index))
  495. elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
  496. item['code'].append((the_code, 2.5, sentence.sentence_index))
  497. else:
  498. item['code'].append((the_code, 3, sentence.sentence_index))
  499. # if the_code not in code_set:
  500. # code_set.add(the_code)
  501. # item['code'] = list(code_set)
  502. for iter in re.finditer(self.PN_pattern,join_predict):
  503. _name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
  504. if len(_name)>200: # 避免模型预测类似 202750503 这种很长重复字很多的错误项目名称
  505. continue
  506. elif '公司:你单位在' in _name: # 避免类似 339900030 这种作为项目名称,导致中标角色作为招标角色
  507. continue
  508. elif _name.endswith('公司') and len(_name)<20: # 修复 456957250 雄县辉茂纸塑包装制品销售有限公司 作为项目名称
  509. continue
  510. #add name to entitys
  511. _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
  512. list_entity.append(_entity)
  513. # w = 1 if re.search('(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
  514. w = 1 if re.search('(项目|工程|招标|采购|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题|项目)[::\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
  515. if _name not in dict_name_freq_score:
  516. # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
  517. len_name = len(_name) if len(_name) <50 else 100-len(_name) # 2023/03/02 超出50长度的逐渐递减
  518. dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len_name * 0.05), w]
  519. else:
  520. dict_name_freq_score[_name][0] += 1
  521. if w > dict_name_freq_score[_name][2]:
  522. dict_name_freq_score[_name][2] = w
  523. '''
  524. for iter in re.finditer(self.PN_pattern,join_predict):
  525. print("name-",self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]))
  526. if item[1]['name']=="":
  527. for iter in re.finditer(self.PN_pattern,join_predict):
  528. #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
  529. item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
  530. break
  531. '''
  532. if _begin_index+_LEN>=len(list_sentence):
  533. break
  534. _begin_index += _LEN
  535. list_name_freq_score = []
  536. # print('模型预测项目名称:', dict_name_freq_score)
  537. # 2020/11/23 大网站规则调整
  538. if len(dict_name_freq_score) == 0:
  539. # name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]'
  540. name_re1 = '(项目|工程|招标|采购(条目)?|合同|标项|标的|计划|询价|询价单|询价通知书|申购单|申购)(名称|标名|标题|主题)[::\s]+(?P<name>[^,。:;]{2,60})[,。]'
  541. name_re2 = '(合同|采购)包\d((?P<name>[^,。:;]{2,60}))[:,。]' # 20241202 补充合同包 包名表达 558410976
  542. for sentence in list_sentence:
  543. # pad_sentence = sentence.sentence_text
  544. othername = re.search(name_re1, sentence.sentence_text)
  545. if othername == None:
  546. othername = re.search(name_re2, sentence.sentence_text)
  547. if othername != None:
  548. project_name = othername.group('name')
  549. if re.search('[\u4e00-\u9fa5]+', project_name) == None: # 没有中文的项目名称去除
  550. # log('没有中文的项目名称去除')
  551. continue
  552. beg = find_index([project_name], sentence.sentence_text)[0]
  553. end = beg + len(project_name)
  554. _name = self.fitDataByRule(sentence.sentence_text[beg:end])
  555. # print('规则召回项目名称:', _name)
  556. # add name to entitys
  557. _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
  558. sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name,
  559. entity_type="name", sentence_index=sentence.sentence_index, begin_index=0,
  560. end_index=0, wordOffset_begin=beg, wordOffset_end=end,in_attachment=sentence.in_attachment)
  561. list_entity.append(_entity)
  562. w = 1
  563. if _name not in dict_name_freq_score:
  564. # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
  565. dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05), w]
  566. else:
  567. dict_name_freq_score[_name][0] += 1
  568. # othername = re.search(name_re1, sentence.sentence_text)
  569. # if othername != None:
  570. # _name = othername.group(3)
  571. # if _name not in dict_name_freq_score:
  572. # dict_name_freq_score[_name] = [1, len(re.findall(pattern_score, _name)) + len(_name) * 0.1]
  573. # else:
  574. # dict_name_freq_score[_name][0] += 1
  575. for _name in dict_name_freq_score.keys():
  576. list_name_freq_score.append([_name,dict_name_freq_score[_name]])
  577. # print(list_name_freq_score)
  578. if len(list_name_freq_score)>0:
  579. list_name_freq_score.sort(key=lambda x:x[1][0]*x[1][1]*x[1][2],reverse=True)
  580. item['name'] = list_name_freq_score[0][0]
  581. # for it in list_name_freq_score:
  582. # print('项目名称及分值:',it[0],it[1], it[1][0]*it[1][1])
  583. # if list_name_freq_score[0][1][0]>1:
  584. # item[1]['name'] = list_name_freq_score[0][0]
  585. # else:
  586. # list_name_freq_score.sort(key=lambda x:x[1][1],reverse=True)
  587. # item[1]["name"] = list_name_freq_score[0][0]
  588. #下面代码加上去用正则添加某些识别不到的项目编号
  589. if item['code'] == []:
  590. for sentence in list_sentence:
  591. # othercode = re.search('(采购计划编号|询价编号)[\))]?[::]?([\[\]a-zA-Z0-9\-]{5,30})', sentence.sentence_text)
  592. # if othercode != None:
  593. # item[1]['code'].append(othercode.group(2))
  594. # 2020/11/23 大网站规则调整
  595. othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价[单书]|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告|工程|寻源|标书|包件|谈判|申购)(单据?号|编号|标号|编码|代码|备案号|号)[::\s]+(?P<code>[^,。;:、]{6,30}[a-zA-Z0-9\号期])[\),。\u4e00-\u9fa5]', sentence.sentence_text)
  596. if othercode != None:
  597. # item['code'].append(othercode.group('code'))
  598. if re.search("(项目编号|招标编号):?$", othercode.group(0)):
  599. item['code'].append((othercode.group('code'), 0, sentence.sentence_index))
  600. elif re.search('采购(计划)?编号:?$', othercode.group(0)):
  601. item['code'].append((othercode.group('code'), 1, sentence.sentence_index))
  602. elif re.search('(询价|合同)编号:?$', othercode.group(0)):
  603. item['code'].append((othercode.group('code'), 2, sentence.sentence_index))
  604. elif re.search('(询价|合同|采购|招标|项目)标号:?$', othercode.group(0)):
  605. item['code'].append((othercode.group('code'), 2.5, sentence.sentence_index))
  606. else:
  607. item['code'].append((othercode.group('code'), 3, sentence.sentence_index))
  608. # print('规则召回项目编号:', othercode.group('code'))
  609. # item['code'] = [code for code in item['code'] if len(code)<500]
  610. # item['code'].sort(key=lambda x:len(x),reverse=True)
  611. item['code'] = [code for code in item['code'] if len(code[0]) < 500]
  612. item['code'].sort(key=lambda x: [x[1],x[2]])
  613. item['code'] = [it[0] for it in item['code']]
  614. result.append(item)
  615. list_sentence.sort(key=lambda x: x.sentence_index,reverse=False)
  616. return result
  617. '''
  618. #当数据量过大时会报错
  619. def predict(self,articles,MAX_LEN = None):
  620. sentences = []
  621. for article in articles:
  622. for sentence in article.content.split("。"):
  623. sentences.append([sentence,article.id])
  624. if MAX_LEN is None:
  625. sent_len = [len(sentence[0]) for sentence in sentences]
  626. MAX_LEN = max(sent_len)
  627. #print(MAX_LEN)
  628. #若为空,则直接返回空
  629. result = []
  630. if MAX_LEN==0:
  631. for article in articles:
  632. result.append([article.id,{"code":[],"name":""}])
  633. return result
  634. index_unk = self.word2index.get("<unk>")
  635. index_pad = self.word2index.get("<pad>")
  636. x = [[self.word2index.get(word,index_unk)for word in sentence[0]]for sentence in sentences]
  637. x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")
  638. predict_y = self.getModel().predict(x)
  639. last_doc_id = ""
  640. item = []
  641. for sentence,predict in zip(sentences,np.argmax(predict_y,-1)):
  642. pad_sentence = sentence[0][:MAX_LEN]
  643. doc_id = sentence[1]
  644. join_predict = "".join([str(s) for s in predict])
  645. if doc_id!=last_doc_id:
  646. if last_doc_id!="":
  647. result.append(item)
  648. item = [doc_id,{"code":[],"name":""}]
  649. code_set = set()
  650. code_x = []
  651. code_text = []
  652. for iter in re.finditer(self.PC_pattern,join_predict):
  653. get_len = 40
  654. if iter.span()[0]<get_len:
  655. begin = 0
  656. else:
  657. begin = iter.span()[0]-get_len
  658. end = iter.span()[1]+get_len
  659. code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]],pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
  660. code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]])
  661. if len(code_x)>0:
  662. code_x = np.transpose(np.array(code_x),(1,0,2,3))
  663. predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
  664. for h in range(len(predict_code)):
  665. if predict_code[h][0]>0.5:
  666. the_code = self.fitDataByRule(code_text[h])
  667. if the_code not in code_set:
  668. code_set.add(the_code)
  669. item[1]['code'] = list(code_set)
  670. if item[1]['name']=="":
  671. for iter in re.finditer(self.PN_pattern,join_predict):
  672. #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
  673. item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
  674. break
  675. last_doc_id = doc_id
  676. result.append(item)
  677. return result
  678. '''
  679. #角色金额模型
  680. class PREMPredict():
  681. def __init__(self,config=None):
  682. #self.model_role_file = os.path.abspath("../role/models/model_role.model.hdf5")
  683. # self.model_role_file = os.path.dirname(__file__)+"/../role/log/new_biLSTM-ep012-loss0.028-val_loss0.040-f10.954.h5"
  684. self.model_role = Model_role_classify_word(config=config)
  685. self.model_money = Model_money_classify(config=config)
  686. # self.role_file = open('/data/python/lsm/role_model_predict.txt', 'a', encoding='utf-8')
  687. # self.money_file = open('/data/python/lsm/money_model_predict.txt', 'a', encoding='utf-8')
  688. return
  689. def search_role_data(self,list_sentences,list_entitys):
  690. '''
  691. @summary:根据句子list和实体list查询角色模型的输入数据
  692. @param:
  693. list_sentences:文章的sentences
  694. list_entitys:文章的entitys
  695. @return:角色模型的输入数据
  696. '''
  697. text_list = []
  698. data_x = []
  699. points_entitys = []
  700. for list_entity,list_sentence in zip(list_entitys,list_sentences):
  701. list_entity.sort(key=lambda x:x.sentence_index)
  702. list_sentence.sort(key=lambda x:x.sentence_index)
  703. p_entitys = 0
  704. p_sentences = 0
  705. while(p_entitys<len(list_entity)):
  706. entity = list_entity[p_entitys]
  707. if entity.entity_type in ['org','company']:
  708. while(p_sentences<len(list_sentence)):
  709. sentence = list_sentence[p_sentences]
  710. if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
  711. # text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin-13):entity.wordOffset_end+10])
  712. text_sen = sentence.sentence_text
  713. b = entity.wordOffset_begin
  714. e = entity.wordOffset_end
  715. text_list.append((text_sen[max(0, b-13):b], text_sen[b:e], text_sen[e:e+15]))
  716. # item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_ROLE_INPUT_SHAPE[1]),shape=settings.MODEL_ROLE_INPUT_SHAPE)
  717. # item_x = self.model_role.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,entity_text=entity.entity_text)
  718. item_x = self.model_role.encode_word(sentence_text=text_sen, begin_index=entity.wordOffset_begin, end_index=entity.wordOffset_end, size=30)
  719. data_x.append(item_x)
  720. points_entitys.append(entity)
  721. break
  722. p_sentences += 1
  723. p_entitys += 1
  724. if len(points_entitys)==0:
  725. return None
  726. return [data_x,points_entitys, text_list]
  727. def search_money_data(self,list_sentences,list_entitys):
  728. '''
  729. @summary:根据句子list和实体list查询金额模型的输入数据
  730. @param:
  731. list_sentences:文章的sentences
  732. list_entitys:文章的entitys
  733. @return:金额模型的输入数据
  734. '''
  735. text_list = []
  736. data_x = []
  737. points_entitys = []
  738. for list_entity,list_sentence in zip(list_entitys,list_sentences):
  739. list_entity.sort(key=lambda x:x.sentence_index)
  740. list_sentence.sort(key=lambda x:x.sentence_index)
  741. p_entitys = 0
  742. while(p_entitys<len(list_entity)):
  743. entity = list_entity[p_entitys]
  744. if entity.entity_type=="money":
  745. p_sentences = 0
  746. while(p_sentences<len(list_sentence)):
  747. sentence = list_sentence[p_sentences]
  748. if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
  749. # text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin - 13):entity.wordOffset_begin])
  750. text_sen = sentence.sentence_text
  751. b = entity.wordOffset_begin
  752. e = entity.wordOffset_end
  753. text_list.append((text_sen[max(0, b - 13):b], text_sen[b:e], text_sen[e:e + 10]))
  754. #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_MONEY_INPUT_SHAPE[1]),shape=settings.MODEL_MONEY_INPUT_SHAPE)
  755. #item_x = embedding_word(spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index, end_index=entity.end_index, size=10, center_include=True, word_flag=True),shape=settings.MODEL_MONEY_INPUT_SHAPE)
  756. item_x = self.model_money.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index)
  757. data_x.append(item_x)
  758. points_entitys.append(entity)
  759. break
  760. p_sentences += 1
  761. p_entitys += 1
  762. if len(points_entitys)==0:
  763. return None
  764. return [data_x,points_entitys, text_list]
  765. def predict_role(self,list_sentences, list_entitys):
  766. datas = self.search_role_data(list_sentences, list_entitys)
  767. if datas is None:
  768. return
  769. points_entitys = datas[1]
  770. text_list = datas[2]
  771. if USE_PAI_EAS:
  772. _data = datas[0]
  773. _data = np.transpose(np.array(_data),(1,0,2))
  774. request = tf_predict_pb2.PredictRequest()
  775. request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
  776. request.inputs["input0"].array_shape.dim.extend(np.shape(_data[0]))
  777. request.inputs["input0"].float_val.extend(np.array(_data[0],dtype=np.float64).reshape(-1))
  778. request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
  779. request.inputs["input1"].array_shape.dim.extend(np.shape(_data[1]))
  780. request.inputs["input1"].float_val.extend(np.array(_data[1],dtype=np.float64).reshape(-1))
  781. request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
  782. request.inputs["input2"].array_shape.dim.extend(np.shape(_data[2]))
  783. request.inputs["input2"].float_val.extend(np.array(_data[2],dtype=np.float64).reshape(-1))
  784. request_data = request.SerializeToString()
  785. list_outputs = ["outputs"]
  786. _result = vpc_requests(role_url, role_authorization, request_data, list_outputs)
  787. if _result is not None:
  788. predict_y = _result["outputs"]
  789. else:
  790. predict_y = self.model_role.predict(datas[0])
  791. else:
  792. predict_y = self.model_role.predict(np.array(datas[0],dtype=np.float64))
  793. for i in range(len(predict_y)):
  794. entity = points_entitys[i]
  795. label = np.argmax(predict_y[i])
  796. values = predict_y[i]
  797. # text = text_list[i]
  798. text_tup = text_list[i]
  799. front, middle, behind = text_tup
  800. whole = "".join(text_tup)
  801. # print('模型预测角色:', front, entity.entity_text, behind,label, values)
  802. # if label in [0, 1, 2, 3, 4]:
  803. # self.role_file.write("{0}#split#{1}#split#{2}#split#{3}#split#{4}\n".format(front, entity.entity_text, behind,label, entity.doc_id))
  804. if re.search('^以\d+[\d,.]+万?元中标', behind) and label != 2: # 优化244261884预测错误 大连长之琳科技发展有限公司以7.63277万元中标
  805. label = 2
  806. values[label] = 0.8
  807. if label in [0, 1, 2, 3, 4] and values[label] < 0.5: # 小于阈值的设为其他,让后面的规则召回重新判断
  808. # print(' # 小于阈值的设为其他,让后面的规则召回重新判断', values[label])
  809. label = 5
  810. elif label in [2,3,4] and re.search('序号:\d+,\w{,2}候选', front):
  811. label = 5
  812. elif label == 0:
  813. if re.search('拟邀请$|受邀谈判方', front):
  814. label = 2
  815. values[label] = 0.501
  816. elif re.search('(发布(人|方|单位|机构|组织|用户|业主|主体|部门|公司|企业)|组织(单位|人|方|机构)?|(采购|招标|发布)机构)(名称)?[是为:]+', front) and is_agency(entity.entity_text):
  817. label = 1
  818. values[label] = 0.501
  819. elif re.search('受托人((盖章))?:$', front):
  820. label = 1
  821. values[label] = 0.501
  822. elif re.search('采用$|异议受理部门|本次招标有:$|直购企业:$|主报名人:$|采购候选人:$', front): # 368177736 因本项目招标采用广西壮族自治区公共资源交易平台系统- 标公告,本次招标有:内黄县汇融钢材有限公司、安阳正元建筑工程有限公司、内黄县鸿业贸易有限责任公司三家合格供应商进行报名投标。 438880541 直购企业可能为多个,其中一个中标
  823. label = 5
  824. elif re.search(',单位名称:$', front) and re.search('^,(中标|中选)价格', behind):
  825. label = 2
  826. values[label] = 0.501
  827. elif label == 2:
  828. if re.search('中标单位和.{,25}签订合同', whole):
  829. label = 0
  830. values[label] = 0.501
  831. elif re.search('尊敬的供应商:.{,25}我公司', whole):
  832. label = 0
  833. values[label] = 0.801
  834. elif re.search('尊敬的供应商:$|本项目确定1名中[标选]人为$', front):
  835. label = 0
  836. values[label] = 0.501
  837. elif re.search('第[4-9四五六]中标候选人|(提交单位|竞投单位):$|第[4-9四五六七八九十]名', front): #修复第4以上的预测错为中标人
  838. label = 5
  839. values[2] = 0.5
  840. elif re.search('(排名|排序|名次):([4-9]|\d{2,}),', front): # or re.search('序号:\d+,(供应商|投标|候选)', front): # 293225236 附件中 排名预测错误
  841. values[2] = 0.5
  842. label = 5
  843. elif re.search('税费', front) and re.search('^承担', behind):
  844. label = 5
  845. elif re.search('第一候补|第一后备|备选', front):
  846. label = 3
  847. values[label] = 0.6
  848. elif re.search('^放弃中标资格|是否中标:否|^(中标|成交)(公示|公告)', behind):
  849. values[2] = 0.5
  850. label = 5
  851. elif re.search('^,?(投标报价|(资格性审查:|符合性审查:)?(不通过|不符合))', behind) and re.search('中标|成交|中选|排名|排序|名次|第[一1]', front)==None and values[2]<0.7: #20241126补充条件避免漏提 560768263 第一候选人:单位名称: 上海理想信息产业(集团)有限公司 ,投标报价:
  852. values[2] = 0.5
  853. label = 5
  854. elif re.search('(承包权人|帐户名称|债务人|推荐预审合格投标人名单):$|确定为标的的受让方,$|[主次出]入口?,?$|确定(项目|\w{,2})成交供应商,$|,承刻单位:$|乙方接受为$|丙方:$', front): # 234501112 民币元,序号:1,债务人: 东营市海宁工贸有限责任公司 ,债权本金: 262414286 八、中标后签约单位,合同签约单位: 241929628 1月9,承刻单位: 肃宁县超凡网络光敏印章刻印部 ,印章预留印模
  855. label = 5
  856. elif re.search(',来源:$', front) and re.search('^,', behind): # 修复 472062585 项目采购-关于定制手机询比价采购中标公告,来源:深圳市网联安瑞网络科技有限公司 预测为中标
  857. label = 0
  858. values[label] = 0.5
  859. elif re.search('合同供方:?$|合同签约单位', front):
  860. label = 0
  861. values[label] = 0.5
  862. elif re.search('现由$', front) and re.search('^作为\d个单位的牵头(单位|公司)?', behind): # 修复 469369884 站源批量预测错误 现由第七合同段保利长大工程有限公司作为6个单位的牵头单位,
  863. label = 5
  864. elif re.search('(中标|成交)?|结果)?)(人|公告|公示),$|中标人信息:$', front): # 20250227修复中标错误 588005167 现确定贵公司为该项目的中标人,中国二冶集团有限公司,2025年01月26日,
  865. label = 5
  866. elif re.search('确定$', front) and re.search('^\w{,5}(项目|采购|招标)', behind):
  867. label = 5
  868. elif re.search('由$', front) and re.search('^进行招标', behind):
  869. label = 0
  870. values[0] = 0.5
  871. elif re.search('^为\w{,10}第二(成交|中标)单位', behind): # 中标预测错误,例:601143888 河南省创慧新材料科技有限公司为铸咀采购项目第二成交单位
  872. label = 3
  873. values[3] = 0.5
  874. elif re.search('是否中标:是,供应商', front) and label == 5:
  875. label = 2
  876. values[label] = 0.9
  877. elif label == 1:
  878. if re.search('委托(单位|人|方)[是为:]+',front) and re.search('受委托(单位|人|方)[是为:]+', front)==None:
  879. label = 0
  880. values[label] = 0.501
  881. elif re.search('([,。:]|^)(第一)?(服务|中选|中标)(中介服务|代理)?(公司|机构)(名称)?', front):
  882. label = 2
  883. values[label] = 0.501
  884. elif re.search('在中介超市委托$', front) and re.search('^负责', behind):
  885. label = 2
  886. values[label] = 0.501
  887. elif re.search('^:受', behind): # 354009560 附件格式问题 ,中选中介服务机构通知书,编号:HZ2305120541,中汕项目管理有限公司:受惠东县人民政府大岭街道办事处委托
  888. label = 5
  889. elif re.search('发布机构', front) and not is_agency(entity.entity_text):
  890. label = 0
  891. values[label] = 0.501
  892. elif re.search('开户银行:$|环境影响评价机构|环评机构|评价机构', front): # 368214232 法定代表人:委托代理人:开户银行:鸡东建行
  893. label = 5
  894. elif re.search('委托$', front) and re.search('^(抽样|送检|看样)', behind):
  895. label = 5
  896. elif re.search('推荐入围的招标代理单位:$', front): # 20240709 修复302505502预测错为代理
  897. label = 2
  898. values[label] = 0.501
  899. elif label in [3,4]:
  900. if re.search('第[二三]分(公司|店),中标(人|供应商|单位|公司):$', front):
  901. label = 2
  902. values[label] = 0.7
  903. elif re.search('决定选择第[二三]名', front) and re.search('^作为(中标|成交)(人|供应商|单位|公司)', behind):
  904. label = 2
  905. values[label] = 0.8
  906. elif re.search('\d+\.\d+,供应商名称:', front): # 341385226 30.2,供应商名称: 预测为第二名
  907. label = 2
  908. values[label] = 0.501
  909. elif re.search('\d+\.\d+[,、]?(中标|成交)候选人|[;,][23]、(中标|中选|成交)候选人:', front):
  910. label = 5
  911. values[label] = 0.501
  912. elif re.search('第一名:$', front):
  913. label = 2
  914. values[label] = 0.7
  915. elif re.search('(中标|成交)通知书[,:]$', front) and re.search('^:', behind) and label != 2:
  916. label = 2
  917. values[label] = 0.8
  918. elif label==5 and re.search('^拟(招标|采购)一批|^须购置一批', front):
  919. label = 0
  920. values[label] = 0.7
  921. entity.set_Role(label, values)
  922. def predict_money(self,list_sentences,list_entitys):
  923. datas = self.search_money_data(list_sentences, list_entitys)
  924. if datas is None:
  925. return
  926. points_entitys = datas[1]
  927. _data = datas[0]
  928. text_list = datas[2]
  929. if USE_PAI_EAS:
  930. _data = np.transpose(np.array(_data),(1,0,2,3))
  931. request = tf_predict_pb2.PredictRequest()
  932. request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
  933. request.inputs["input0"].array_shape.dim.extend(np.shape(_data[0]))
  934. request.inputs["input0"].float_val.extend(np.array(_data[0],dtype=np.float64).reshape(-1))
  935. request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
  936. request.inputs["input1"].array_shape.dim.extend(np.shape(_data[1]))
  937. request.inputs["input1"].float_val.extend(np.array(_data[1],dtype=np.float64).reshape(-1))
  938. request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
  939. request.inputs["input2"].array_shape.dim.extend(np.shape(_data[2]))
  940. request.inputs["input2"].float_val.extend(np.array(_data[2],dtype=np.float64).reshape(-1))
  941. request_data = request.SerializeToString()
  942. list_outputs = ["outputs"]
  943. _result = vpc_requests(money_url, money_authorization, request_data, list_outputs)
  944. if _result is not None:
  945. predict_y = _result["outputs"]
  946. else:
  947. predict_y = self.model_money.predict(_data)
  948. else:
  949. predict_y = self.model_money.predict(_data)
  950. for i in range(len(predict_y)):
  951. entity = points_entitys[i]
  952. label = np.argmax(predict_y[i])
  953. values = predict_y[i]
  954. # text = text_list[i]
  955. text_tup = text_list[i]
  956. front, middle, behind = text_tup
  957. whole = "".join(text_tup)
  958. # print('金额: ', entity.entity_text, label, values, front, middle, behind)
  959. # if label in [0, 1]:
  960. # self.money_file.write("{0} {1} {2} {3}\n".format(front, entity.entity_text, behind, label))
  961. if label in [0, 1] and values[label] < 0.5: # 小于阈值的设为其他金额,让后面的规则召回重新判断
  962. # print('模型预测金额: ', entity.entity_text, label, values, front, middle, behind)
  963. label = 2
  964. elif label == 1: # 错误中标金额处理
  965. if re.search('[::,。](总金额|总价|单价|合价)((万?元))?:?$', front) and re.search('(中标|投标|成交|中价)', front)==None:
  966. values[label] = 0.5
  967. elif re.search('[\+=]((中标|成交)(金?额|价格?)|[若如]果?(中标|成交)(金?额|价格?)为?', front): # 处理例如 241561780 如中标金额为 500-1000万元,则代理服务费=100 万元×0.5%+400万元×0.35%+(中标金额-500)万元
  968. values[label] = 0.49
  969. elif re.search('^(以[上下])?按[\d.%]+收取|^及?以[上下]|^[()]?[+×*-][\d.%]+', behind):
  970. values[label] = 0.49
  971. elif re.search('(含|在|包括|[大小等高低]于|达到)$|[\d.%]+[+×*-]$', front):
  972. values[label] = 0.49
  973. # elif entity.notes == '单价' and float(entity.entity_text)<5000: # 20241128 注释,单价单独存放
  974. # label = 2
  975. elif label ==0: # 错误招标金额处理
  976. if entity.notes in ["投资", "总投资","工程造价"] or re.search('投资(金额|规模):$', front): # 545988699 金额不大的投资金额作为备选招标金额
  977. values[label] = 0.51
  978. elif re.search('最低限价:?$|注册资本', front) or re.search('服务内容:([\d,.]+万?亿?元?-?)$', front):
  979. values[label] = 0.49
  980. label = 2
  981. elif re.search('^(以[上下])?按[\d.%]+收取|^及?以[上下]|^[()]?[+×*-][\d.%]+|(含)', behind):
  982. values[label] = 0.49
  983. # elif re.search('(含|在|包括|[大小等高低]于|如预算金额为)$|[\d.%]+((含))?[+×*-]$', front): # 2024/10/30 注销,避免漏提 预算金额:控制在26000元以内由合作银行出资 ;投资金额不低于人民币500万元
  984. # values[label] = 0.49
  985. # elif entity.notes == '单价' and float(entity.entity_text)<5000: # 20241128 注释,单价单独存放
  986. # label = 2
  987. elif re.search('招标金额|限价|预算|控制价|拦标价', front) == None and re.search('预计约?为?$',
  988. front): # 20241206纠正 565894149(预计约2500元)预测为预算
  989. label = 2
  990. elif re.search('报价:预估不?含税总价[为:]$', front) and (label != 1 or values[label]<0.5):
  991. label = 1
  992. values[label] = 0.8
  993. entity.set_Money(label, values)
  994. def correct_money_by_rule(self, title, list_entitys, list_articles):
  995. if (len(re.findall('监理|施工|设计|勘察', title)) == 1 and re.search('施工|总承包|epc|EPC', title) == None) or re.search('服务金额', list_articles[0].content):
  996. # keyword = re.search('监理|设计|勘察', title).group(0)
  997. for list_entity in list_entitys:
  998. for _entity in list_entity:
  999. # print('keyword:',keyword, '_entity.notes :',_entity.notes)
  1000. # if _entity.entity_type == "money" and _entity.notes == keyword and _entity.label == 2:
  1001. if _entity.entity_type == "money" and _entity.notes == '招标或中标金额' and _entity.label == 2:
  1002. # if channel_dic['docchannel'] == "招标公告":
  1003. if re.search('中标|成交|中选|中价|中租|结果|入围', title + list_articles[0].content[:100]) == None:
  1004. _entity.values[0] = 0.55
  1005. _entity.set_Money(0, _entity.values) # 2021/11/18 根据公告类别把费用改为招标或中投标金额
  1006. else:
  1007. _entity.values[1] = 0.55
  1008. _entity.set_Money(1, _entity.values)
  1009. def predict(self,list_sentences,list_entitys):
  1010. self.predict_role(list_sentences,list_entitys)
  1011. self.predict_money(list_sentences,list_entitys)
  1012. #联系人模型
  1013. class EPCPredict():
  1014. def __init__(self,config=None):
  1015. self.model_person = Model_person_classify(config=config)
  1016. def search_person_data(self,list_sentences,list_entitys):
  1017. '''
  1018. @summary:根据句子list和实体list查询联系人模型的输入数据
  1019. @param:
  1020. list_sentences:文章的sentences
  1021. list_entitys:文章的entitys
  1022. @return:联系人模型的输入数据
  1023. '''
  1024. data_x = []
  1025. points_entitys = []
  1026. pre_texts = []
  1027. for list_entity,list_sentence in zip(list_entitys,list_sentences):
  1028. p_entitys = 0
  1029. dict_index_sentence = {}
  1030. for _sentence in list_sentence:
  1031. dict_index_sentence[_sentence.sentence_index] = _sentence
  1032. _list_entity = [entity for entity in list_entity if entity.entity_type=="person"]
  1033. while(p_entitys<len(_list_entity)):
  1034. entity = _list_entity[p_entitys]
  1035. if entity.entity_type=="person":
  1036. sentence = dict_index_sentence[entity.sentence_index]
  1037. item_x = self.model_person.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index)
  1038. data_x.append(item_x)
  1039. points_entitys.append(entity)
  1040. pre_texts.append(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=20))
  1041. p_entitys += 1
  1042. if len(points_entitys)==0:
  1043. return None
  1044. # return [data_x,points_entitys,dianhua]
  1045. return [data_x,points_entitys, pre_texts]
  1046. def predict_person(self,list_sentences, list_entitys):
  1047. datas = self.search_person_data(list_sentences, list_entitys)
  1048. if datas is None:
  1049. return
  1050. points_entitys = datas[1]
  1051. pre_texts = datas[2]
  1052. # phone = datas[2]
  1053. if USE_PAI_EAS:
  1054. _data = datas[0]
  1055. _data = np.transpose(np.array(_data),(1,0,2,3))
  1056. request = tf_predict_pb2.PredictRequest()
  1057. request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
  1058. request.inputs["input0"].array_shape.dim.extend(np.shape(_data[0]))
  1059. request.inputs["input0"].float_val.extend(np.array(_data[0],dtype=np.float64).reshape(-1))
  1060. request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
  1061. request.inputs["input1"].array_shape.dim.extend(np.shape(_data[1]))
  1062. request.inputs["input1"].float_val.extend(np.array(_data[1],dtype=np.float64).reshape(-1))
  1063. request_data = request.SerializeToString()
  1064. list_outputs = ["outputs"]
  1065. _result = vpc_requests(person_url, person_authorization, request_data, list_outputs)
  1066. if _result is not None:
  1067. predict_y = _result["outputs"]
  1068. else:
  1069. predict_y = self.model_person.predict(datas[0])
  1070. else:
  1071. predict_y = self.model_person.predict(datas[0])
  1072. # assert len(predict_y)==len(points_entitys)==len(phone)
  1073. assert len(predict_y)==len(points_entitys)
  1074. for i in range(len(predict_y)):
  1075. entity = points_entitys[i]
  1076. label = np.argmax(predict_y[i])
  1077. pre_text = ''.join(pre_texts[i][0])
  1078. # print('pre_text', pre_text)
  1079. if label==0 and re.search('(谈判|磋商|询价|资格审查|评审专家|(评选|议标|评标|评审)委员会?|专家|评委)(小?组|小?组成员)?(成员|名单)[:,](\w{2,4}((组长)|(成员))?[、,,])*$', pre_text):
  1080. # print(entity.entity_text, re.search('(谈判|磋商|询价|资格审查|评审专家|(评选|议标|评标|评审)委员会?|专家|评委)(小?组|小?组成员)?(成员|名单)[:,](\w{2,4}((组长)|(成员))?[、,,])*$', pre_text).group(0))
  1081. label = 4
  1082. values = []
  1083. for item in predict_y[i]:
  1084. values.append(item)
  1085. # phone_number = phone[i]
  1086. # entity.set_Person(label,values,phone_number)
  1087. entity.set_Person(label,values,[])
  1088. # 为联系人匹配电话
  1089. # self.person_search_phone(list_sentences, list_entitys)
  1090. def person_search_phone(self,list_sentences, list_entitys):
  1091. def phoneFromList(phones):
  1092. # for phone in phones:
  1093. # if len(phone)==11:
  1094. # return re.sub('电话[:|:]|联系方式[:|:]','',phone)
  1095. return re.sub('电话[:|:]|联系方式[:|:]', '', phones[0])
  1096. for list_entity, list_sentence in zip(list_entitys, list_sentences):
  1097. # p_entitys = 0
  1098. # p_sentences = 0
  1099. #
  1100. # key_word = re.compile('电话[:|:].{0,4}\d{7,12}|联系方式[:|:].{0,4}\d{7,12}')
  1101. # # phone = re.compile('1[3|4|5|7|8][0-9][-—-]?\d{4}[-—-]?\d{4}|\d{3,4}[-—-]\d{7,8}/\d{3,8}|\d{3,4}[-—-]\d{7,8}转\d{1,4}|\d{3,4}[-—-]\d{7,8}|[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}') # 联系电话
  1102. # # 2020/11/25 增加发现的号码段
  1103. # phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-]?\d{4}[-—-]?\d{4}|'
  1104. # '\d{3,4}[-—-][1-9]\d{6,7}/\d{3,8}|'
  1105. # '\d{3,4}[-—-]\d{7,8}转\d{1,4}|'
  1106. # '\d{3,4}[-—-]?[1-9]\d{6,7}|'
  1107. # '[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|'
  1108. # '[1-9]\d{6,7}') # 联系电话
  1109. # dict_index_sentence = {}
  1110. # for _sentence in list_sentence:
  1111. # dict_index_sentence[_sentence.sentence_index] = _sentence
  1112. #
  1113. # dict_context_itemx = {}
  1114. # last_person = "####****++++$$^"
  1115. # last_person_phone = "####****++++$^"
  1116. # _list_entity = [entity for entity in list_entity if entity.entity_type == "person"]
  1117. # while (p_entitys < len(_list_entity)):
  1118. # entity = _list_entity[p_entitys]
  1119. # if entity.entity_type == "person" and entity.label in [1,2,3]:
  1120. # sentence = dict_index_sentence[entity.sentence_index]
  1121. # # item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_PERSON_INPUT_SHAPE[1]),shape=settings.MODEL_PERSON_INPUT_SHAPE)
  1122. #
  1123. # # s = spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=20)
  1124. #
  1125. # # 2021/5/8 取上下文的句子,解决表格处理的分句问题
  1126. # left_sentence = dict_index_sentence.get(entity.sentence_index - 1)
  1127. # left_sentence_tokens = left_sentence.tokens if left_sentence else []
  1128. # right_sentence = dict_index_sentence.get(entity.sentence_index + 1)
  1129. # right_sentence_tokens = right_sentence.tokens if right_sentence else []
  1130. # entity_beginIndex = entity.begin_index + len(left_sentence_tokens)
  1131. # entity_endIndex = entity.end_index + len(left_sentence_tokens)
  1132. # context_sentences_tokens = left_sentence_tokens + sentence.tokens + right_sentence_tokens
  1133. # s = spanWindow(tokens=context_sentences_tokens, begin_index=entity_beginIndex,
  1134. # end_index=entity_endIndex, size=20)
  1135. #
  1136. # _key = "".join(["".join(x) for x in s])
  1137. # if _key in dict_context_itemx:
  1138. # _dianhua = dict_context_itemx[_key][0]
  1139. # else:
  1140. # s1 = ''.join(s[1])
  1141. # # s1 = re.sub(',)', '-', s1)
  1142. # s1 = re.sub('\s', '', s1)
  1143. # have_key = re.findall(key_word, s1)
  1144. # have_phone = re.findall(phone, s1)
  1145. # s0 = ''.join(s[0])
  1146. # # s0 = re.sub(',)', '-', s0)
  1147. # s0 = re.sub('\s', '', s0)
  1148. # have_key2 = re.findall(key_word, s0)
  1149. # have_phone2 = re.findall(phone, s0)
  1150. #
  1151. # s3 = ''.join(s[1])
  1152. # # s0 = re.sub(',)', '-', s0)
  1153. # s3 = re.sub(',|,|\s', '', s3)
  1154. # have_key3 = re.findall(key_word, s3)
  1155. # have_phone3 = re.findall(phone, s3)
  1156. #
  1157. # s4 = ''.join(s[0])
  1158. # # s0 = re.sub(',)', '-', s0)
  1159. # s4 = re.sub(',|,|\s', '', s0)
  1160. # have_key4 = re.findall(key_word, s4)
  1161. # have_phone4 = re.findall(phone, s4)
  1162. #
  1163. # _dianhua = ""
  1164. # if have_phone:
  1165. # if entity.entity_text != last_person and s0.find(last_person) != -1 and s1.find(
  1166. # last_person_phone) != -1:
  1167. # if len(have_phone) > 1:
  1168. # _dianhua = phoneFromList(have_phone[1:])
  1169. # else:
  1170. # _dianhua = phoneFromList(have_phone)
  1171. # elif have_key:
  1172. # if entity.entity_text != last_person and s0.find(last_person) != -1 and s1.find(
  1173. # last_person_phone) != -1:
  1174. # if len(have_key) > 1:
  1175. # _dianhua = phoneFromList(have_key[1:])
  1176. # else:
  1177. # _dianhua = phoneFromList(have_key)
  1178. # elif have_phone2:
  1179. # if entity.entity_text != last_person and s0.find(last_person) != -1 and s0.find(
  1180. # last_person_phone) != -1:
  1181. # if len(have_phone2) > 1:
  1182. # _dianhua = phoneFromList(have_phone2[1:])
  1183. # else:
  1184. # _dianhua = phoneFromList(have_phone2)
  1185. # elif have_key2:
  1186. # if entity.entity_text != last_person and s0.find(last_person) != -1 and s0.find(
  1187. # last_person_phone) != -1:
  1188. # if len(have_key2) > 1:
  1189. # _dianhua = phoneFromList(have_key2[1:])
  1190. # else:
  1191. # _dianhua = phoneFromList(have_key2)
  1192. # elif have_phone3:
  1193. # if entity.entity_text != last_person and s4.find(last_person) != -1 and s3.find(
  1194. # last_person_phone) != -1:
  1195. # if len(have_phone3) > 1:
  1196. # _dianhua = phoneFromList(have_phone3[1:])
  1197. # else:
  1198. # _dianhua = phoneFromList(have_phone3)
  1199. # elif have_key3:
  1200. # if entity.entity_text != last_person and s4.find(last_person) != -1 and s3.find(
  1201. # last_person_phone) != -1:
  1202. # if len(have_key3) > 1:
  1203. # _dianhua = phoneFromList(have_key3[1:])
  1204. # else:
  1205. # _dianhua = phoneFromList(have_key3)
  1206. # elif have_phone4:
  1207. # if entity.entity_text != last_person and s4.find(last_person) != -1 and s4.find(
  1208. # last_person_phone) != -1:
  1209. # if len(have_phone4) > 1:
  1210. # _dianhua = phoneFromList(have_phone4)
  1211. # else:
  1212. # _dianhua = phoneFromList(have_phone4)
  1213. # elif have_key4:
  1214. # if entity.entity_text != last_person and s4.find(last_person) != -1 and s4.find(
  1215. # last_person_phone) != -1:
  1216. # if len(have_key4) > 1:
  1217. # _dianhua = phoneFromList(have_key4)
  1218. # else:
  1219. # _dianhua = phoneFromList(have_key4)
  1220. # else:
  1221. # _dianhua = ""
  1222. # # dict_context_itemx[_key] = [item_x, _dianhua]
  1223. # dict_context_itemx[_key] = [_dianhua]
  1224. # # points_entitys.append(entity)
  1225. # # dianhua.append(_dianhua)
  1226. # last_person = entity.entity_text
  1227. # if _dianhua:
  1228. # # 更新联系人entity联系方式(person_phone)
  1229. # entity.person_phone = _dianhua
  1230. # last_person_phone = _dianhua
  1231. # else:
  1232. # last_person_phone = "####****++++$^"
  1233. # p_entitys += 1
  1234. from scipy.optimize import linear_sum_assignment
  1235. from BiddingKG.dl.interface.Entitys import Match
  1236. def dispatch(match_list):
  1237. main_roles = list(set([match.main_role for match in match_list]))
  1238. attributes = list(set([match.attribute for match in match_list]))
  1239. label = np.zeros(shape=(len(main_roles), len(attributes)))
  1240. for match in match_list:
  1241. main_role = match.main_role
  1242. attribute = match.attribute
  1243. value = match.value
  1244. label[main_roles.index(main_role), attributes.index(attribute)] = value + 10000
  1245. # print(label)
  1246. gragh = -label
  1247. # km算法
  1248. row, col = linear_sum_assignment(gragh)
  1249. max_dispatch = [(i, j) for i, j, value in zip(row, col, gragh[row, col]) if value]
  1250. return [Match(main_roles[row], attributes[col]) for row, col in max_dispatch]
  1251. # km算法
  1252. key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)(\d{7,12})')
  1253. phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
  1254. '\+86.?1[3|4|5|6|7|8|9]\d{9}|'
  1255. '0\d{2,3}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
  1256. '0\d{2,3}[-—-―]\d{7,8}转\d{1,4}|'
  1257. '0\d{2,3}[-—-―]?[1-9]\d{6,7}|'
  1258. '[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|'
  1259. '[1-9]\d{6,7}')
  1260. phone_entitys = []
  1261. for _sentence in list_sentence:
  1262. sentence_text = _sentence.sentence_text
  1263. res_set = set()
  1264. for i in re.finditer(phone,sentence_text):
  1265. res_set.add((i.group(),i.start(),i.end()))
  1266. for i in re.finditer(key_word,sentence_text):
  1267. res_set.add((i.group(2),i.start()+len(i.group(1)),i.end()))
  1268. for item in list(res_set):
  1269. phone_left = sentence_text[max(0,item[1]-10):item[1]]
  1270. phone_right = sentence_text[item[2]:item[2]+8]
  1271. # 排除传真号 和 其它错误项
  1272. if re.search("传,?真|信,?箱|邮,?箱",phone_left):
  1273. if not re.search("电,?话",phone_left):
  1274. continue
  1275. if re.search("帐,?号|编,?号|报,?价|证,?号|价,?格|[\((]万?元[\))]",phone_left):
  1276. continue
  1277. if re.search("[.,]\d{2,}",phone_right):
  1278. continue
  1279. _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, None, None,item[1], item[2],in_attachment=_sentence.in_attachment)
  1280. phone_entitys.append(_entity)
  1281. person_entitys = []
  1282. for entity in list_entity:
  1283. if entity.entity_type == "person":
  1284. entity.person_phone = ""
  1285. person_entitys.append(entity)
  1286. _list_entity = phone_entitys + person_entitys
  1287. _list_entity = sorted(_list_entity,key=lambda x:(x.sentence_index,x.wordOffset_begin))
  1288. words_num_dict = dict()
  1289. last_words_num = 0
  1290. list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index)
  1291. for sentence in list_sentence:
  1292. _index = sentence.sentence_index
  1293. if _index == 0:
  1294. words_num_dict[_index] = 0
  1295. else:
  1296. words_num_dict[_index] = words_num_dict[_index - 1] + last_words_num
  1297. last_words_num = len(sentence.sentence_text)
  1298. match_list = []
  1299. for index in range(len(_list_entity)):
  1300. entity = _list_entity[index]
  1301. if entity.entity_type=="person" and entity.label in [1,2,3]:
  1302. match_nums = 0
  1303. for after_index in range(index + 1, min(len(_list_entity), index + 5)):
  1304. after_entity = _list_entity[after_index]
  1305. if after_entity.entity_type=="phone":
  1306. sentence_distance = after_entity.sentence_index - entity.sentence_index
  1307. distance = (words_num_dict[after_entity.sentence_index] + after_entity.wordOffset_begin) - (
  1308. words_num_dict[entity.sentence_index] + entity.wordOffset_end)
  1309. if sentence_distance < 2 and distance < 50:
  1310. value = (-1 / 2 * (distance ** 2)) / 10000
  1311. match_list.append(Match(entity, after_entity, value))
  1312. match_nums += 1
  1313. else:
  1314. break
  1315. if after_entity.entity_type=="person":
  1316. if after_entity.label not in [1,2,3]:
  1317. break
  1318. if not match_nums:
  1319. for previous_index in range(index-1, max(0,index-5), -1):
  1320. previous_entity = _list_entity[previous_index]
  1321. if previous_entity.entity_type == "phone":
  1322. sentence_distance = entity.sentence_index - previous_entity.sentence_index
  1323. distance = (words_num_dict[entity.sentence_index] + entity.wordOffset_begin) - (
  1324. words_num_dict[previous_entity.sentence_index] + previous_entity.wordOffset_end)
  1325. if sentence_distance < 1 and distance<30:
  1326. # 前向 没有 /10000
  1327. value = (-1 / 2 * (distance ** 2))
  1328. match_list.append(Match(entity, previous_entity, value))
  1329. else:
  1330. break
  1331. result = dispatch(match_list)
  1332. for match in result:
  1333. entity = match.main_role
  1334. # 更新 list_entity
  1335. entity_index = list_entity.index(entity)
  1336. list_entity[entity_index].person_phone = match.attribute.entity_text
    def predict(self,list_sentences,list_entitys):
        """Run person prediction on the given sentences and entities.

        Forwards both arguments unchanged to ``self.predict_person`` and
        discards whatever that call returns, so this method returns ``None``.

        :param list_sentences: passed through as the first argument
        :param list_entitys: passed through as the second argument
        """
        self.predict_person(list_sentences,list_entitys)
  1339. #表格预测
  1340. class FormPredictor():
  1341. def __init__(self,lazyLoad=getLazyLoad(),config=None):
  1342. self.model_file_line = os.path.dirname(__file__)+"/../form/model/model_form.model_line.hdf5"
  1343. self.model_file_item = os.path.dirname(__file__)+"/../form/model/model_form.model_item.hdf5"
  1344. self.model_form_item = Model_form_item(config=config)
  1345. self.model_dict = {"line":[None,self.model_file_line]}
  1346. self.model_form_context = Model_form_context(config=config)
  1347. def getModel(self,type):
  1348. if type=="item":
  1349. return self.model_form_item
  1350. elif type=="context":
  1351. return self.model_form_context
  1352. else:
  1353. return self.getModel(type)
  1354. def encode(self,data,**kwargs):
  1355. return encodeInput([data], word_len=50, word_flag=True,userFool=False)[0]
  1356. return encodeInput_form(data)
  1357. def predict(self,form_datas,type):
  1358. if type=="item":
  1359. return self.model_form_item.predict(form_datas)
  1360. elif type=="context":
  1361. return self.model_form_context.predict(form_datas)
  1362. else:
  1363. return self.getModel(type).predict(form_datas)
  1364. #角色规则
  1365. #依据正则给所有无角色的实体赋予角色,给予等于阈值的最低概率
  1366. class RoleRulePredictor():
  1367. def __init__(self):
  1368. # (?P<tenderee_left_w1> 正则组名 后面的 w1 为概率权重关键词
  1369. self.pattern_tenderee_left_55 = "(?P<tenderee_left_55>((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|甲方?|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包|最终|建设|业主|竞卖|申购|公选)" \
  1370. "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂|银行)|需求?方|买方|业主|权属人|甲方当事人|询价书企业|比选发起人|采购(执行|实施)单位)"\
  1371. "[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章|异议受理部门)[))])?(是|为|:|:|\s*)+$)"
  1372. self.pattern_tenderee_left_60 = "(?P<tenderee_left_60>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|甲|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包)" \
  1373. "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂|银行))"\
  1374. "[))]?(信息|联系方式|概况)?[,,。::]?([((]?(1|2|1.1|1.2)[))]?)?((公司|单位)?名称)?([((](全称|盖章|异议受理部门)[))])?(是|为|:|:|,|\s*)+$)" # 367784094 隆道-大企业采购平台 采购商:C5石油树脂-中国建材集团有限公司-四川省/成都市/市辖区
  1375. self.pattern_tenderee_left_50 = "(?P<tenderee_left_50>((所需|需[用求]|购货|征集|发布|交易发起|开户|申报|填报|开票|收货)" \
  1376. "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂)|[转流]出方|文章来源|委托机构|产权所有人|承包权人|结算单位|收货地址)" \
  1377. "[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章|异议受理部门)[))])?(是|为|:|:|\s*)+$|(采购商|招标人):(\w{2,10}-)?$|实施主体(基本情况,)?名称:$)"
  1378. self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}的?委托|现将[\w()()]{5,20}[\d年月季度至()]+采购意向|尊敬的供应商(伙伴)?:\w{5,20}(以下简称“\w{2,5}”)))"
  1379. self.pattern_tenderee_right = "(?P<tenderee_right>^(机关)?([((](以下简称)?[,\"“]*((招标|采购)(人|单位|机构)|(服务)?购买方)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束)|^([拟须需]|计划)(采购|招标|购置|购买)|^须购[买置]一批|作为(采购|招标)(人|单位)|^关于)" #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
  1380. self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
  1381. self.pattern_agency_left = "(?P<agency_left>((代理|拍卖)(?:人|机构|公司|企业|单位|组织)|专业采购机构|集中采购机构|招标组织机构|交易机构|集采机构|[招议))]+标机构|(采购|招标)代理)(名称|.{,4}名,?称|全称)?(是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
  1382. self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)" # |^受托 会与 受托生产等冲突,代理表达一般会在后面有逗号
  1383. # 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
  1384. self.pattern_winTenderer_left_50 = "(?P<winTenderer_left_51>" \
  1385. "(乙|竞得|受让|买受|签约|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租((包))?|入围|入选|竞买)(候选|投标)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?|银行)(:?单位名称|:?名称|盖章)?[::是为]+$" \
  1386. "|(选定单位|指定的中介服务机构|实施主体|中标银行|中标通知书,致|征集结果|选择中介|选择结果|成交对象|勘察人|(,|审计|处置|勘察|设计)服务单位|受托[人方])[::是为]+$" \
  1387. "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|成交供应商信息[,:]?(序号1)?:?|供应商名称$|竞争性选择申请人名称:$" \
  1388. "|单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(中标|成交)供应商、(中标|成交)(金额|价格),$|合作伙伴名称:$|供应商(乙方)-?$" \
  1389. "|现(公布|宣布|公示)中标单位如下:$|现将中标单位(公布|公示)如下:$|现宣布以下(企业|单位|公司)中标:$|经讨论,决定采用$|第\d+(包件?|标段?)(中标|中选|成交)候选人:$)" # 承办单位:不作为中标 83914772 |施工 单位不作为中标人 例:386692187
  1390. self.pattern_winTenderer_left_60 = "(?P<winTenderer_left_60>" \
  1391. "(,|。|:|^)((中标(投标)?|[拟预]中标|中选|中价|中签|成交)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?|银行)|(中标候选人)?第?[一1]名|第[一1](中标|中选|成交)?候选人|服务机构)" \
  1392. "(:?单位名称|:?名称|盖章)?[,,]?([((]按综合排名排序[))]|:择优选取)?[::,,]$|选取(情况|说明):中选,中介机构名称:$|排名如下:1、$|第[一1]名,?投标(人|单位|银行|公司):$)" # 解决表头识别不到加逗号情况,需前面为,。空 20240621补充 中选 云南省投资审批中介超市 补充排名如下 南阳师范学院
  1393. self.pattern_winTenderer_left_55 = "(?P<winTenderer_left_55>(中标(投标)?|[拟预]中标|中选|中价|中签|成交|入选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?|银行)" \
  1394. "(:?单位名称|:?名称|盖章)?([((]按综合排名排序[))]|:择优选取)?[::是为]+$" \
  1395. "|结果公示如下:摇出球号:\d+号,中介机构:$)" # 取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系 # 中标候选人不能作为中标 # |直购企业:$不能作为中标人,看到有些公告会又多个公司,然后还会发布中选结果的公告,其中一个公司中标
  1396. self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选|排序)?(人|单位|机构|供应商|公司|企业|厂商|银行)))|" \
  1397. "^((报价|价格)最低,|以\w{5,10})?(确定|成|作)?为[\w“”()]{3,25}((成交|中选|中标|服务)(人|单位|供应商|企业|公司)|供货单位|供应商|第一中标候选人)[,。]" \
  1398. "|^:贵公司参与|^:?你方于|^(胜出)?(中标|成交)[,。]|^取得中标(单位)?资格|^以\d+[\d,.]+万?元(中标|成交|中选)" \
  1399. "|^通过(挂牌|拍卖)方式(以[\d.,]+万?元)?竞得|^[((](中标|成交|承包)人名?称?[))]))" # 去掉 |\w{,20} 修复 460216955 网上公布的与本次采购项目有关的信息视为已送达各响应供应商。 作为中标
  1400. self.pattern_winTenderer_whole = "(?P<winTenderer_center>(贵公司|由).{,15}以\w{,15}中标|确定[\w()]{5,20}为[^,。;]{5,50}的?中标单位" \
  1401. "|选定报价最低的[“”\w()]{5,25}为[^,。;]{5,50}的?(服务|中标|成交)单位" \
  1402. "|拟邀请[\w()]{5,20}(进行)?单一来源谈判|(承办单位|报价人|投标人|中介机构)(名称)?:[\w()]{5,20},(中标|承办|中选)(价格|金额)" \
  1403. "|(谈判结果:|结果|最终|确定|决定)[以由为][^,。;]{5,25}(向我单位)?(供货|承担|承接|中标|竞买成功)|中标通知书.{,15}你方|单一来源方?式?[从向][()\w]{5,20}采购|供应商名称:[()\w]{5,20},独家采购原因)" # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
  1404. self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司|银行))))(名称)?[::是为]+$)|((评审结果|名次|排名|排序)[::]第?[二2]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
  1405. self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|银行)))"
  1406. self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司|银行))))(名称)?[::是为]+$|((评审结果|名次|排名|排序)[::]第?[三3]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
  1407. self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|银行)))"
  1408. self.candidate_left = "(?P<candidate_left>(((中[标选商]|成交|入围|入选)候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)|服务单位)(:?单位名称|:?名称|全称|(?盖\w{,5}章)?|如下|:?牵头人|[及与和](成交|中标)金额)?[::是为]+$)"
  1409. self.pattern_left = [
  1410. self.pattern_tenderee_left_60,
  1411. self.pattern_tenderee_left_55,
  1412. self.pattern_tenderee_left_50,
  1413. self.pattern_agency_left,
  1414. self.pattern_secondTenderer_left,
  1415. self.pattern_thirdTenderer_left,
  1416. self.pattern_winTenderer_left_60,
  1417. self.pattern_winTenderer_left_55,
  1418. self.pattern_winTenderer_left_50,
  1419. ]
  1420. self.pattern_whole = [
  1421. self.pattern_winTenderer_whole,
  1422. self.pattern_tenderee_center,
  1423. ]
  1424. self.pattern_right = [
  1425. self.pattern_thirdTenderer_right,
  1426. self.pattern_secondTenderer_right,
  1427. self.pattern_agency_right,
  1428. self.pattern_tendereeORagency_right,
  1429. self.pattern_tenderee_right,
  1430. self.pattern_winTenderer_right,
  1431. ]
  1432. self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
  1433. self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?[为:]+\w{2,4}资金|采购成本价|总费用约?为|(招标|采购)总?(规模|额度|资金)|资金来源|合同价暂定") # |建安费用 不作为招标金额
  1434. self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(综合)?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):|经评审的价格|报价不?含税") # 单写 总价 不能作为中标金额,很多表格有单价、总价
  1435. self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元(报价)?(中标|中选|成交)")
  1436. self.pattern_money_other = re.compile("代理费|服务费")
  1437. self.pattern_money_bank_tenderee = "存[款放](操作)?,?总?(金额|总额|规模|额度|资金)|招标的?资金总量|(项目|资金)总?(规模|额度)|现金管理的?(操作)?(额度|规模|总额)|定期存款|存款大?约|定期存储|竞争性存放|项目资金|日均存款|资金现状|存量金额|招标分配的资金量|资金总[量额]|总(规模|额度|金额)|投资金额" # 存款类招标金额
  1438. self.pattern_money_bank_tenderee_right = "^,?(提供定期存放服务|存[款放](期限|时间)|存期|结构性存款|期限|\w{,4}(定期存款|公款存放|资金存放))" # 存款类招标金额
  1439. self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"
  1440. # self.role_file = open('/data/python/lsm/role_rule_predict.txt', 'a', encoding='utf-8')
  1441. def _check_input(self,text, ignore=False):
  1442. if not text:
  1443. return []
  1444. if not isinstance(text, list):
  1445. text = [text]
  1446. null_index = [i for i, t in enumerate(text) if not t]
  1447. if null_index and not ignore:
  1448. raise Exception("null text in input ")
  1449. return text
  1450. def ser_role(self, pattern_list, text, entity_text):
  1451. for _pattern in pattern_list:
  1452. for _iter in re.finditer(_pattern, text):
  1453. for _group, _v_group in _iter.groupdict().items():
  1454. if _v_group is not None and _v_group != "":
  1455. _role = _group.split("_")[0]
  1456. if _role == "tendereeORagency": # 2022/3/9 新增不确定招标代理判断逻辑
  1457. # print('p_entity_sentenceindex:', p_entity.sentence_index)
  1458. # if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', entity_text) \
  1459. # or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', entity_text) == None:
  1460. if is_agency(entity_text):
  1461. _role = 'tenderee'
  1462. else:
  1463. _role = "agency"
  1464. _direct = _group.split("_")[1]
  1465. # _weight = _group.split("_")[2] if len(_group.split("_")) == 3 else ""
  1466. prob = int(_group.split("_")[2])/100 if len(_group.split("_")) == 3 else 0.55
  1467. # print('实体召回概率:', prob)
  1468. _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
  1469. "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
  1470. return (_label, prob, _iter.group(0))
  1471. return (5, 0.5, '')
  1472. def rule_predict(self, before, center, after, entity_text):
  1473. # before = before if isinstance(before, str) else ""
  1474. # center = center if isinstance(center, str) else ""
  1475. # after = after if isinstance(after, str) else ""
  1476. _label, _prob, keyword = self.ser_role(self.pattern_left, before, entity_text) # 前文匹配
  1477. keyword = "left_" + keyword if keyword!="" else keyword
  1478. if _label == 2 and re.search(
  1479. '各.{,5}供应商|尊敬的供应商|[^\w]候选供应商|业绩|拟招|(交易|采购|招标|建设)服务(单位|机构)|第[四五六七4567]|是否中标:否|序号:\d+,\w{,2}候选|(排名|排序|名次):([4-9]|\d{2,})|未(中[标选]|入围)|不得确定为|(响应|参[加与]报价|通过资格审查)的?供应商',
  1480. # 135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
  1481. before) != None:
  1482. _label = 5
  1483. elif _label == 2 and re.search('为$', before) and re.match('\w', after): # 排除错误 前文为结尾,后文不是标点符号结尾的,如 353824459 供应商为社会团体的, 供应商为玉田县中医医院提供安保服务
  1484. _label = 5
  1485. elif _label == 2 and re.search('评委|未中标', after[:5]): # 397194341 过滤掉错误召回中标人
  1486. _label = 5
  1487. elif _label == 2 and re.search('^,?(投标报价|(资格性审查:|符合性审查:)?(不通过|不符合))', after) and re.search('中标|成交|中选|排名|排序|名次|第[一1]名', before[-10:])==None: #20240705 处理类似 493939047 错误
  1488. _label = 5
  1489. if _label == 5:
  1490. _label, _prob, keyword = self.ser_role(self.pattern_whole, before + center + after, entity_text) # 前后文匹配
  1491. keyword = 'whole_'+ keyword[:keyword.find(entity_text)] if keyword!="" else keyword
  1492. if _label == 2 and re.search('以[^,。;]{10,30}为准', before + center + after)!=None:
  1493. _label = 5
  1494. if _label != 5 and self.ser_role(self.pattern_whole, before, entity_text)[0] != 5 or \
  1495. self.ser_role(self.pattern_whole, after, entity_text)[0] != 5:
  1496. _label = 5
  1497. if _label == 5:
  1498. _label, _prob, keyword = self.ser_role(self.pattern_right, after, entity_text) # 后文匹配
  1499. keyword = "right_" + keyword if keyword!="" else keyword
  1500. if _label==5 and re.search('(中标|中选|成交)?)(结果)?(公告|公示|通知书?),', before) and re.match(':', after):
  1501. _label = 2
  1502. _prob = 0.5
  1503. _flag = False if _label==5 else True
  1504. return (_label, _prob, _flag, keyword)
  1505. def predict(self, list_articles, list_sentences, list_entitys, list_codenames, channel_dic, on_value=0.5, all_winner=False, req_scope=[], deposit_project=False):
  1506. '''
  1507. :param list_articles:
  1508. :param list_sentences:
  1509. :param list_entitys:
  1510. :param list_codenames:
  1511. :param channel_dic:
  1512. :param on_value: 最低阈值
  1513. :param all_winner: 是否存款、入围等公告,不分排名作为中标人
  1514. :param req_scope: 大纲采购内容开始结束位置[((开头句子index, 开头位置), (结束句子index, 结束句子位置)]
  1515. :param is_deposit_project: 是否为银行存款类项目
  1516. :return:
  1517. '''
  1518. for article, list_entity, list_sentence, list_codename in zip(list_articles, list_entitys, list_sentences,
  1519. list_codenames):
  1520. list_sentence.sort(key=lambda x: x.sentence_index) # 2022/1/5 按句子顺序排序
  1521. # list_name = list_codename["name"]
  1522. list_name = [] # 2022/1/5 改为实体列表内所有项目名称
  1523. name_entitys = [] # 2023/6/30 保存项目名称实体,直接通过位置判断角色是否在项目名称里面
  1524. candidates = [] # 保存不能确定为第几的候选人 2023/04/14
  1525. notfound_tenderer = True # 未找到前三候选人
  1526. deposit_moneys = [] # 保存存款类项目采购内容中大于百万的其他金额实体
  1527. for entity in list_entity:
  1528. if entity.entity_type == 'name':
  1529. list_name.append(entity.entity_text)
  1530. name_entitys.append(entity)
  1531. list_name = self._check_input(list_name) + [article.title]
  1532. for p_entity in list_entity:
  1533. if p_entity.entity_type in ["org", "company"]:
  1534. # 只解析角色为无的或者概率低于阈值的
  1535. if p_entity.label is None:
  1536. continue
  1537. # 将上下文包含标题的实体概率置为0.6,因为标题中的实体不一定是招标人
  1538. if str(p_entity.label) == "0":
  1539. find_flag = False
  1540. for _sentence in list_sentence:
  1541. if _sentence.sentence_index == p_entity.sentence_index:
  1542. # _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
  1543. # end_index=p_entity.end_index, size=20, center_include=True,
  1544. # word_flag=True, use_text=True,
  1545. # text=re.sub(")", ")", re.sub("(", "(", p_entity.entity_text)))
  1546. _span = get_context(_sentence.sentence_text, p_entity.wordOffset_begin,
  1547. p_entity.wordOffset_end, size=20, center_include=True)
  1548. if re.search(self.pattern_tenderee_left_50, _span[0]) or re.search(self.pattern_tenderee_left_55, _span[0]): # 前面有关键词的实体不判断是否在项目名称中出现
  1549. find_flag = True
  1550. break
  1551. if re.search('(项目|工程|招标|采购(条目)?|合同|标项|标的|计划|询价|询价单|询价通知书|申购单|申购)(名称|标名|标题|主题):$', _span[0]):
  1552. find_flag = True
  1553. if re.search('(局|院|府|学|处|站|会|所|校|馆|队|厅|室|司|心|园|厂)$', p_entity.entity_text):
  1554. p_entity.values[0] = 0.6 if p_entity.values[0]>0.6 else 0.55
  1555. else:
  1556. p_entity.values[0] = on_value # 项目名称里面实体修改为最低概率
  1557. break
  1558. for _name in name_entitys:
  1559. if _name.sentence_index == p_entity.sentence_index and p_entity.wordOffset_begin >=_name.wordOffset_begin and p_entity.wordOffset_end < _name.wordOffset_end:
  1560. find_flag = True
  1561. if re.search('(局|院|府|学|处|站|会|所|校|馆|队|厅|室|司|心|园|厂)$', p_entity.entity_text):
  1562. p_entity.values[0] = 0.6 if p_entity.values[0] > 0.6 else 0.55
  1563. else:
  1564. p_entity.values[0] = on_value # 项目名称里面实体修改为最低概率
  1565. break
  1566. # if p_entity.values[0] > on_value:
  1567. # p_entity.values[0] = 0.5 + (p_entity.values[0] - 0.5) / 10
  1568. # else:
  1569. # p_entity.values[0] = on_value # 2022/03/08 修正类似 223985179 公司在文章开头的项目名称概率又没达到0.5的情况
  1570. # for _name in list_name:
  1571. # if _name != "" and str(_span[0][-10:]+_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0: #加上前面一些信息,修复公司不在项目名称开头的,检测不到
  1572. # find_flag = True
  1573. # if p_entity.values[0] > on_value:
  1574. # p_entity.values[0] = 0.5 + (p_entity.values[0] - 0.5) / 10
  1575. # else:
  1576. # p_entity.values[0] = on_value # 2022/03/08 修正类似 223985179 公司在文章开头的项目名称概率又没达到0.5的情况
  1577. if find_flag:
  1578. continue
  1579. # 正则从概率低于阈值或其他类别中召回角色
  1580. role_prob = float(p_entity.values[int(p_entity.label)])
  1581. if role_prob < on_value or str(p_entity.label) == "5":
  1582. # 将标题中的实体置为招标人
  1583. _list_name = self._check_input(list_name, ignore=True)
  1584. find_flag = False
  1585. for _name in _list_name: # 2022/1/5修正只要项目名称出现过的角色,所有位置都标注为招标人
  1586. if str(_name).find(p_entity.entity_text) >= 0 and p_entity.sentence_index < 4:
  1587. for _sentence in list_sentence:
  1588. if _sentence.sentence_index == p_entity.sentence_index:
  1589. # _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
  1590. # end_index=p_entity.end_index, size=20, center_include=True,
  1591. # word_flag=True, use_text=True, text=p_entity.entity_text)
  1592. _span = get_context(_sentence.sentence_text, p_entity.wordOffset_begin,
  1593. p_entity.wordOffset_end, size=20, center_include=True)
  1594. if _span[2].startswith(":"): # 实体后面为冒号的不作为招标人,避免项目名称出错中标变招标 368122675 陇西兴恒建建筑有限责任公司:线路安全保护区内环境治理专项整改(第二标段)项目
  1595. break
  1596. if str(_span[0][-len(str(_name)):]+_span[1] + _span[2][:len(str(_name))]).find(
  1597. _name) >= 0 or str(_name).startswith(p_entity.entity_text): # 20240621 补充公司开头的项目名称召回,避免name太长召回失败 例 367033697
  1598. # if p_entity.entity_text in agency_set or re.search('(代理|管理|咨询|招投?标|采购)\w{,6}公司', p_entity.entity_text): # 在代理人集合的作为代理人
  1599. if is_agency(p_entity.entity_text): # 2024/3/29 统一方法判断是否为代理
  1600. find_flag = True
  1601. _label = 1
  1602. p_entity.label = _label
  1603. p_entity.values[int(_label)] = on_value
  1604. break
  1605. else:
  1606. find_flag = True
  1607. _label = 0
  1608. p_entity.label = _label
  1609. p_entity.values[int(_label)] = on_value + p_entity.values[int(_label)] / 10
  1610. if 6<len(p_entity.entity_text) < 20 and p_entity.entity_type == 'org': # 标题中角色长度在一定范围内的加分 优化类似367720967 标题中两个实体选择错误问题
  1611. p_entity.values[int(_label)] += 0.005
  1612. break
  1613. if p_entity.sentence_index >= 4:
  1614. break
  1615. if find_flag:
  1616. break
  1617. # 若是实体在标题中,默认为招标人,不进行以下的规则匹配
  1618. if find_flag:
  1619. continue
  1620. for s_index in range(len(list_sentence)):
  1621. if p_entity.doc_id == list_sentence[s_index].doc_id and p_entity.sentence_index == \
  1622. list_sentence[s_index].sentence_index:
  1623. tokens = list_sentence[s_index].tokens
  1624. begin_index = p_entity.begin_index
  1625. end_index = p_entity.end_index
  1626. size = 40 #15
  1627. spans = spanWindow(tokens, begin_index, end_index, size, center_include=True,
  1628. word_flag=True, use_text=False)
  1629. # _flag = False
  1630. # 添加中标通知书类型特殊处理
  1631. try:
  1632. if s_index == 0 and re.search('中标通知书.{,30}[,:]%s:'%p_entity.entity_text.replace('(', '').replace(')', ''),
  1633. list_sentence[s_index].sentence_text.replace('(', '').replace(')', '')[:100]):
  1634. p_entity.label = 2
  1635. p_entity.values[2] = 0.5
  1636. notfound_tenderer = False
  1637. # log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group, _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
  1638. break
  1639. except Exception as e:
  1640. print('正则报错:', e)
  1641. before, center, after = spans[0], spans[1], spans[2]
  1642. entity_text = p_entity.entity_text
  1643. _label, _prob, _flag, kw = self.rule_predict(before, center, after, entity_text)
  1644. if _label == 5 and re.search(':(1[.、])?$', before) and re.search('^[、;,&/。]', after) and re.search('(监督|管理)(机构|部门|单位):', before)==None and re.search(
  1645. '(中标|成交|中选))?(人|单位|供应商|银行|合作伙伴)?(公示)?(信息|情况|结果|如下)(公[示告]如下)?:|(遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取)结果(如下)(公[示告]如下)?:', list_sentence[s_index].sentence_text[:p_entity.wordOffset_begin]): # 补充召回 例:514053647 标段1:中国建设银行西安南大街支行,标段2:中国农业银行股份有限公司西安分行,
  1646. _flag = True
  1647. _label = 2
  1648. _prob = 0.5
  1649. elif _label == 5 and all_winner==1 or (all_winner==2 and re.search('(排[名序]|名次|顺序|第):?[0-9一二三四五六七八九十]+', before)==None):
  1650. if re.search('(中标|中选|成交|入围|入选)(人|单位|供应商|银行)(名称)?:', before) and re.search('未(中标|中选|成交|入围|入选)', before)==None:
  1651. _flag = True
  1652. _label = 2
  1653. _prob = 0.55
  1654. elif re.search('(:|[::,]\d{1,2}[.、])$', before) and re.search('^[、;,&/。]', after) and re.search('(监督|管理)(机构|部门|单位):', before)==None and re.search('(入围|合格)(人|单位|供应商|银行|候选人|合作伙伴)?(信息|情况|结果|如下)(公[示告]如下)?(:|,?((入围)?排名不分先后))', list_sentence[s_index].sentence_text[:p_entity.wordOffset_begin]):
  1655. _flag = True
  1656. _label = 2
  1657. _prob = 0.51
  1658. elif re.search('(候选|投标|应答|响应)(人|单位|供应商|银行)(名称)?:', before):
  1659. _flag = True
  1660. _label = 2
  1661. _prob = 0.5
  1662. # if _label in [0, 1, 2, 3, 4]:
  1663. # self.role_file.write("{0}#split#{1}#split#{2}#split#{3}#split#{4}\n".format(before,
  1664. # entity.entity_text,
  1665. # after,
  1666. # _label,
  1667. # entity.doc_id))
  1668. # 得到结果
  1669. if _flag:
  1670. if _label in [2, 3, 4]:
  1671. notfound_tenderer = False
  1672. p_entity.label = _label
  1673. # p_entity.values[int(_label)] = on_value + p_entity.values[
  1674. # int(_label)] / 10
  1675. p_entity.values[_label] = _prob + p_entity.values[int(_label)] / 10
  1676. # log('正则召回实体: %s, %s, %d, %.4f, %s'%(kw, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], before+" "+after))
  1677. break
  1678. if re.search(self.candidate_left, before) and re.search('尊敬的|各', before[-10:])==None:
  1679. candidates.append(p_entity)
  1680. elif channel_dic['docchannel']['docchannel'] in ['中标信息', '候选人公示', '合同公告'] and re.search(':$', before) and re.search('^[,。]', after) and re.search('候选人', before): # 补充 577756336 候选人,三期A160、A166地块:中国建设银行成都第九支行,
  1681. candidates.append(p_entity)
  1682. # # 使用正则+距离解决冲突
  1683. # # 2021/6/11update center: spans[1] --> spans[0][-30:]+spans[1]
  1684. # list_spans = [spans[0][-30:], spans[0][-10:] + spans[1] + spans[2][:25], spans[2]] # 实体左、中、右 信息
  1685. # for _i_span in range(len(list_spans)):
  1686. # _flag = False
  1687. # _prob_weight = 1
  1688. #
  1689. # # print(list_spans[_i_span],p_entity.entity_text)
  1690. # for _pattern in self.pattern_whole:
  1691. # for _iter in re.finditer(_pattern, list_spans[_i_span]):
  1692. # for _group, _v_group in _iter.groupdict().items():
  1693. # if _v_group is not None and _v_group != "":
  1694. # _role = _group.split("_")[0]
  1695. # if _role == "tendereeORagency": # 2022/3/9 新增不确定招标代理判断逻辑
  1696. # # print('p_entity_sentenceindex:', p_entity.sentence_index)
  1697. # if p_entity.sentence_index>=1: # 只在第一句进行这种模糊匹配
  1698. # continue
  1699. # if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', p_entity.entity_text)\
  1700. # or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', p_entity.entity_text) == None:
  1701. # _role = 'tenderee'
  1702. # else:
  1703. # _role = "agency"
  1704. # _direct = _group.split("_")[1]
  1705. # _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
  1706. # # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
  1707. # # "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
  1708. # if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标|建设)服务(单位|机构)|第[四五六七4567]|是否中标:否|序号:\d+,\w{,2}候选|(排名|排序|名次):([4-9]|\d{2,})', #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
  1709. # list_spans[0]) == None: # 2021/12/22 修正错误中标召回 例子208668937
  1710. # _flag = True
  1711. # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
  1712. # "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
  1713. # _prob_weight = 1.2 if _weight=='w1' else 1
  1714. # # print('_v_group:',_group, _v_group, p_entity.entity_text)
  1715. #
  1716. # if _i_span == 1 and _direct == "center" and _v_group.find(p_entity.entity_text) != -1 and re.search('以[^,。;]{10,30}为准', list_spans[1])==None:
  1717. # _flag = True
  1718. # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
  1719. # "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
  1720. # _prob_weight = 1.2 if _weight == 'w1' else 1
  1721. # # print('_v_group:', _group, _v_group, p_entity.entity_text)
  1722. #
  1723. # if _i_span == 2 and _direct == "right":
  1724. # _flag = True
  1725. # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
  1726. # "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
  1727. # _prob_weight = 1.2 if _weight == 'w1' else 1
  1728. # # print('_v_group:', _group, _v_group, p_entity.entity_text)
  1729. # # 得到结果
  1730. # if _flag:
  1731. # if _label in [2, 3, 4]:
  1732. # notfound_tenderer = False
  1733. # p_entity.label = _label
  1734. # p_entity.values[int(_label)] = on_value*_prob_weight + p_entity.values[int(_label)] / 10
  1735. # # log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group, _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
  1736. # break
  1737. # if _i_span == 0 and re.search(self.candidate_left, list_spans[_i_span]):
  1738. # candidates.append(p_entity)
  1739. elif str(p_entity.label) in ['2', '3', '4']:
  1740. notfound_tenderer = False
  1741. # 其他金额通过正则召回可能是招标或中投标的金额
  1742. if p_entity.entity_type in ["money"]:
  1743. if str(p_entity.label) == "2":
  1744. for _sentence in list_sentence:
  1745. if _sentence.sentence_index == p_entity.sentence_index:
  1746. # _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
  1747. # end_index=p_entity.end_index, size=10, center_include=True,
  1748. # word_flag=True, text=p_entity.entity_text)
  1749. _span = get_context(_sentence.sentence_text, p_entity.wordOffset_begin, p_entity.wordOffset_end, size=20, center_include=True) # 20241101 修复spanWindow方法取词错误, ['金额(万元:', '27000,存', '期:3个月,四、投标人资格:1.在嘉兴']
  1750. if re.search('(含|在|包括)(\d+)?$', _span[0]):
  1751. continue
  1752. if re.search(self.pattern_money_tenderee, _span[0]) is not None and re.search(
  1753. self.pattern_money_other, _span[0]) is None:
  1754. front_text = _span[0][re.search(self.pattern_money_tenderee, _span[0]).end():]
  1755. if re.search('\d[万亿]?元|元)?:?\d', front_text): # 当前金额与关键词中间有金额的过滤掉
  1756. break
  1757. p_entity.values[0] = 0.62 + p_entity.values[0] / 10
  1758. p_entity.label = 0
  1759. elif deposit_project:
  1760. if re.search(self.pattern_money_bank_tenderee,
  1761. _span[0]) is not None and re.search(
  1762. self.pattern_money_other, _span[0]) is None:
  1763. front_text = _span[0][re.search(self.pattern_money_bank_tenderee, _span[0]).end():]
  1764. if re.search('\d[万亿]?元|元)?:?\d', front_text): # 当前金额与关键词中间有金额的过滤掉
  1765. break
  1766. p_entity.values[0] = 0.6 + p_entity.values[0] / 10
  1767. p_entity.label = 0
  1768. elif re.search(self.pattern_money_bank_tenderee_right, _span[2]):
  1769. p_entity.values[0] = 0.55 + p_entity.values[0] / 10
  1770. p_entity.label = 0
  1771. elif (re.search('存款|总额度', _span[0]) or re.search('存[款放]|专项债资金', _span[2])):
  1772. front_text = _span[0][(re.search('存款|总额度', _span[0]) or re.search('存[款放]|专项债资金', _span[2])).end():]
  1773. if re.search('\d[万亿]?元|元)?:?\d', front_text): # 当前金额与关键词中间有金额的过滤掉
  1774. break
  1775. p_entity.values[0] = 0.55
  1776. p_entity.label = 0
  1777. # print('规则召回预算金额 4:', p_entity.entity_text, _span[0],p_entity.values[0])
  1778. if re.search(self.pattern_money_tenderer, _span[0]) is not None:
  1779. front_text = _span[0][re.search(self.pattern_money_tenderer, _span[0]).end():]
  1780. if re.search('\d[万亿]?元|元)?:?\d', front_text): # 当前金额与关键词中间有金额的过滤掉
  1781. break
  1782. elif re.search('合同价暂定为?$', _span[0]): # 20250310 修复 598504921 合同价暂定 为招标金额
  1783. break
  1784. if re.search(self.pattern_money_other, _span[0]) is not None:
  1785. if re.search(self.pattern_money_tenderer, _span[0]).span()[1] > \
  1786. re.search(self.pattern_money_other, _span[0]).span()[1]:
  1787. p_entity.values[1] = 0.6 + p_entity.values[1] / 10
  1788. p_entity.label = 1
  1789. else:
  1790. p_entity.values[1] = 0.6 + p_entity.values[1] / 10
  1791. p_entity.label = 1
  1792. if re.search(self.pattern_money_tenderer_whole,"".join(_span)) and re.search(self.pattern_money_tenderer_whole, _span[0])==None \
  1793. and re.search(self.pattern_money_tenderer_whole, _span[2])==None and re.search(self.pattern_money_other,_span[0])==None:
  1794. p_entity.values[1] = 0.6 + p_entity.values[1] / 10
  1795. p_entity.label = 1
  1796. elif re.search('(预算金额|最高(投标)?上?限[价额]?格?|招标控制价))?:?([\d.,]+万?元[,(]其中)?(第?[一二三四五0-9](标[段|包]|[分子]包):?[\d.,]+万?元,)*第?[一二三四五0-9](标[段|包]|[分子]包):?$'
  1797. , _sentence.sentence_text[:p_entity.wordOffset_begin]): # 处理几个标段金额相邻情况 例子:191705231
  1798. p_entity.values[0] = 0.6 + p_entity.values[0] / 10
  1799. p_entity.label = 0
  1800. if deposit_project and p_entity.label in [1,2]:
  1801. if req_scope and float(p_entity.entity_text)>1000000 and (p_entity.sentence_index>req_scope[0][0]\
  1802. or (p_entity.sentence_index==req_scope[0][0] and p_entity.wordOffset_begin>req_scope[0][1])) and (p_entity.sentence_index<req_scope[1][0]\
  1803. or (p_entity.sentence_index==req_scope[1][0] and p_entity.wordOffset_end<=req_scope[1][1])):
  1804. deposit_moneys.append(p_entity)
  1805. if deposit_moneys:
  1806. moneys = [float(p.entity_text) for p in deposit_moneys]
  1807. for p in deposit_moneys:
  1808. if float(p.entity_text)==max(moneys):
  1809. p.values[0] = 0.55
  1810. p.label = 0
  1811. else:
  1812. p.values[0] = 0.5
  1813. p.label = 0
  1814. if notfound_tenderer and len(set([ent.entity_text for ent in candidates])) == 1 and channel_dic['docchannel']['docchannel'] in ['中标信息', '候选人公示', '合同公告']:
  1815. for p_entity in candidates:
  1816. # print('只有一个候选人的作为中标人', p_entity.entity_text)
  1817. p_entity.label = 2
  1818. p_entity.values[2] = on_value
  1819. # 增加招标金额扩展,招标金额+连续的未识别金额,并且都可以匹配到标段信息,则将为识别的金额设置为招标金额
  1820. list_p = []
  1821. state = 0
  1822. for p_entity in list_entity:
  1823. for _sentence in list_sentence:
  1824. if _sentence.sentence_index == p_entity.sentence_index:
  1825. # _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
  1826. # end_index=p_entity.end_index, size=20, center_include=True, word_flag=True,
  1827. # text=p_entity.entity_text)
  1828. _span = get_context(_sentence.sentence_text, p_entity.wordOffset_begin, p_entity.wordOffset_end,
  1829. size=30, center_include=True)
  1830. if state == 2:
  1831. for _p in list_p[1:]:
  1832. if _p.label == 2:
  1833. _p.values[0] = 0.5 + _p.values[0] / 10
  1834. _p.label = 0
  1835. state = 0
  1836. list_p = []
  1837. if state == 0:
  1838. if p_entity.entity_type in ["money"]:
  1839. if str(p_entity.label) == "0" and re.search(self.pattern_pack,
  1840. _span[0] + "-" + _span[2]) is not None:
  1841. state = 1
  1842. list_p.append(p_entity)
  1843. elif state == 1:
  1844. if p_entity.entity_type in ["money"]:
  1845. if str(p_entity.label) in ["0", "2"] and re.search(self.pattern_pack,
  1846. _span[0] + "-" + _span[
  1847. 2]) is not None and re.search(
  1848. self.pattern_money_other,
  1849. _span[0] + "-" + _span[2]) is None and p_entity.sentence_index == list_p[
  1850. 0].sentence_index:
  1851. list_p.append(p_entity)
  1852. else:
  1853. state = 2
  1854. if len(list_p) > 1:
  1855. for _p in list_p[1:]:
  1856. # print("==",_p.entity_text,_p.sentence_index,_p.label)
  1857. if _p.label == 2:
  1858. _p.values[0] = 0.5 + _p.values[0] / 10
  1859. _p.label = 0
  1860. state = 0
  1861. list_p = []
  1862. for p_entity in list_entity:
  1863. # 将属于集合中的不可能是中标人的标签置为无
  1864. if p_entity.entity_text in self.SET_NOT_TENDERER:
  1865. p_entity.label = 5
  1866. '''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
  1867. class RoleRuleFinalAdd():
  1868. def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
  1869. '''
  1870. 最终规则召回角色
  1871. :param list_articles:
  1872. :param list_sentences:
  1873. :param list_entitys:
  1874. :param list_codenames:
  1875. :return:
  1876. '''
  1877. # text_end = list_articles[0].content.split('##attachment##')[0][-40:]
  1878. main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
  1879. if len(list_sentences[0])>0 and list_sentences[0][-1].in_attachment:
  1880. main_sentences = list_sentences[0][-1:] + main_sentences[-2:]
  1881. if len(main_sentences)==0:
  1882. return 0
  1883. # end_tokens = []
  1884. for sentence in main_sentences[-5:][::-1]: # 402073799 最后五句由后往前,匹配文末角色,日期
  1885. # end_tokens.extend(sentence.tokens)
  1886. # text_end = "".join(end_tokens[-30:])
  1887. # text_end = "".join(end_tokens)
  1888. text_end = "".join(sentence.tokens)
  1889. text_end = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", '', text_end) # 去除网址
  1890. text_end = re.sub(',?(招标办|招投标管理中心|国有资产管理处|采办共享中心|采购与招标管理办公室|附件\d*:[^附件,。]{5,100}\.(docx|doc|rar|xlsx|xls|jpg|pdf)|附件\d*:.{,100})', '', text_end)[-200:] # 处理 类似 285264698 传真:0512-62690315,苏州卫生职业技术学院,国有资产管理处,2022年11月24日, 这种情况
  1891. # sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
  1892. sear_ent = re.search('([,。;]|^)(?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,8})?),?\s*(公告日期:)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
  1893. if sear_ent:
  1894. b, e = sear_ent.span()
  1895. if re.search('报价记录|竞价成交', text_end[max(b-10, 0):b] + text_end[e:]):
  1896. sear_ent = None
  1897. break
  1898. if sear_ent == None:
  1899. text_end = list_articles[0].content[-100:]
  1900. sear_ent = re.search(
  1901. '([,。;]|^)(?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,8})?),?\s*(公告日期:)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?',
  1902. text_end)
  1903. if sear_ent:
  1904. b, e = sear_ent.span()
  1905. if re.search('报价记录|竞价成交', text_end[max(b-10, 0):b] + text_end[e:]):
  1906. sear_ent = None
  1907. sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
  1908. sear_ent2 = re.search('[,:](户名|开户名称|发票抬头|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
  1909. if sear_ent2 and sear_ent2.group(1) in ['单位名称','名称'] and re.search('报价|(中标|成交|结果|候选人|评标|开标)(公告|公示)', list_articles[0].content[:5000]): # 排除 341354479 这种作为招标人
  1910. sear_ent2 = None
  1911. sear_ent3 = re.search('(买家信息|所有权人|土地权属单位|报名咨询|[收送交]货地点)[,:](?P<entity>[\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
  1912. sear_ent4 = re.search('(发布(?:人|单位|机构|企业)|项目业主|所属公司|寻源单位)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})[,。]', list_articles[0].content[:5000])
  1913. sear_list = [sear_ent4 , sear_ent3 , sear_ent2 ,sear_ent1, sear_ent]
  1914. tenderee_notfound = True
  1915. agency_notfound = True
  1916. tenderee_list = []
  1917. agency_list = []
  1918. ents = []
  1919. for ent in list_entitys[0]:
  1920. if ent.entity_type in ['org', 'company']:
  1921. if ent.label == 0 and ent.values[ent.label]>0.55:
  1922. if '公共资源交易中心' in ent.entity_text: # 公共资源交易中心不算招标或代理,只算平台
  1923. # ent.label = 5
  1924. ent.values[ent.label] = 0.6 if ent.values[ent.label]>0.6 else 0.5 # 改为降低概率,不改类别,防止 382573066 明显招标人表达不提取
  1925. continue
  1926. tenderee_list.append(ent.entity_text)
  1927. tenderee_notfound = False
  1928. elif ent.label == 1 and ent.values[ent.label]>0.55:
  1929. agency_list.append(ent.entity_text)
  1930. agency_notfound = False
  1931. elif ent.label == 5:
  1932. if '公共资源交易中心' in ent.entity_text:
  1933. continue
  1934. ents.append(ent)
  1935. if sear_ent or sear_ent1 or sear_ent2 or sear_ent3 or sear_ent4:
  1936. for _sear_ent in [_sear for _sear in sear_list if _sear]:
  1937. ent_re = _sear_ent.group('entity')
  1938. ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
  1939. if tenderee_notfound or agency_notfound:
  1940. n = 0
  1941. for i in range(len(ents) - 1, -1, -1):
  1942. if not ents[i].in_attachment:
  1943. n += 1
  1944. if n > 3 and _sear_ent==sear_ent: # 文章末尾角色加日期这种只找后三个实体
  1945. break
  1946. elif _sear_ent==sear_ent and ents[i].label != 5: # 后面有角色的实体的停止继续往前
  1947. break
  1948. if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and re.search('(大学|中学|小学|幼儿园|医院)$', ents[i].entity_text)) or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
  1949. if agency_notfound and is_agency(ents[i].entity_text) and ents[i].entity_text not in tenderee_list:
  1950. ents[i].label = 1
  1951. ents[i].values[1] = 0.51 # 修改为比标题概率略高
  1952. agency_notfound = False
  1953. elif tenderee_notfound and not is_agency(ents[i].entity_text) and ents[i].entity_text not in agency_list:
  1954. ents[i].label = 0
  1955. ents[i].values[0] = 0.51 # 修改为比标题概率略高
  1956. tenderee_notfound = False
  1957. # log('正则最后补充实体: %s'%(ent_re))
  1958. break
  1959. if not tenderee_notfound:
  1960. break
  1961. # 招标人角色召回规则
  1962. class TendereeRuleRecall():
  1963. def __init__(self):
  1964. # self.tenderee_left = re.compile("(发布(人|单位|机构)|需求方(信息[,:])?(单位|公司)?名称|购买主体|收货单位|项目申请单位|发起组织|联系单位|"
  1965. # "询价(机构|企业)|联系(人|方式),?(单位|公司)(名称)?|联系(人|方式),名称)[::是为][^。;,]{,5}$")
  1966. # self.tenderee_left_1 = re.compile("采购商公司|询价单位|项目法人单位|项目法人|项目业主名称|申购单位|预算单位|预算单位名称|预算单位单位名称|买方单位|需求公司|寻源单位|项目业主|采购商|业主单位咨询电话|需用单位|采购工厂|征集单位")
  1967. self.tenderee_left_1 = re.compile("((?:采购商|项目法人|项目业主)(名称)?|(?:采购商|询价|项目法人|项目业主|申购|预算|买方|需求|寻源|需用|征集)(单位|公司)((?:单位|公司)?名称)?|询价企业|"
  1968. "业主单位咨询电话|购买主体|采购工厂|需求方(信息[,:])?(单位|公司)?名称|采购单位[\((].{1,6}[\))])[::是为][^。;,]{,2}$")
  1969. self.tenderee_left_2 = re.compile("(招标承办单位|交易人(?:名称)?|招标人代表|(采购|招标)联系人|交易单位|发起(单位|组织)|收货单位|使用方|买家信息)[::是为][^。;,]{,2}$")
  1970. self.tenderee_left_3 = re.compile("[本我](?:公司|单位)[\(\[(【]?$")
  1971. # self.tenderee_left_4 = re.compile("(采购机构|组织机构|组织方|执行单位|采购组织单位|招标组织单位|招标组织部门|采购执行方|采购执行单位|询价执行组织|组织单位|联系单位|联系部门)[::是为][^。;,]{,2}$")
  1972. self.tenderee_left_4 = re.compile("(采购机构|(?:采购|招标|询价)?(组织|执行)(机构|方|单位|部门|组织)|联系(单位|部门)|联系(人|方式),?(单位|公司)(名称)?|联系(人|方式),名称)[::是为][^。;,]{,2}$")
  1973. self.tenderee_left_5 = re.compile("(撰写单位|发布(?:人|单位|机构|公司|部门|企业))[^。;,]{,2}$")
  1974. self.tenderee_right = re.compile("^[^。;::]{,5}[((](以?下简?称)?,?[,\"“]*[我本][\u4e00-\u9fa5]{1,2}[,\"”]*[))]|"
  1975. "^[\((][^。;::\))]{,5}称(?:招标|采购)(?:人|单位)|"
  1976. "^[^。;::]{,10}[对就][^。;,]+,?[^。;,]{,20}进行[^。;,]*(采购|询比?价|遴选|招投?标|征集)|"
  1977. "^[^。;::]{,10}关于[^。;,]+,?[^。;,]{,20}的[^。;,]{,20}公告|"
  1978. "^[^。;,::]{,10}的[^。;,]+,?[^。;,]{,20}正在[^。;,]{,5}进行|"
  1979. "^[^。;,::]{,10}的[^。;,]+,?[^。,;]{,20}已?[^。;,]{,20}批准|"
  1980. "^[^。;,::]{,15}(选定|选取|征集|遴选)[^。;,]{,20}(供应商|(代理|咨询|设计)[^。;,]{,5}机构|代理人)")
  1981. self.tenderee_right2 = re.compile("^[^。;,::]{,10}(招标办|采购部|办事处|采购小?组)")
  1982. self.tenderee_right3 = re.compile("^[^。;,::]{,10}(对|就|关于|的)(?P<project>[^。;,?!::]{4,40})")
  1983. # 公告主语判断规则
  1984. self.subject = re.compile("[我本][院校局]")
  1985. # 未识别实体召回正则
  1986. self.unrecognized1 = re.compile("(?P<tenderee_left>((遴选|采购|招标|竞价|议价|比选|委托|询比?价|评选|谈判|邀标|邀请|洽谈|约谈)" \
  1987. "(人|商|公司|单位|组织|用户|业主|主体|方|部门))" \
  1988. "(信息[,:]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
  1989. self.unrecognized2 = re.compile("(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|选取|抽取|抽选|出售|标卖|比价|处置)" \
  1990. "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
  1991. "[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
  1992. # 未识别实体尾部判断
  1993. # self.unrecognized_end1 = re.compile(
  1994. # "^[\u4e00-\u9fa5]{2,}?(?:公司|医院|学校|学院|大学|中学|小学|幼儿园|政府|指挥部|办公室|项目部|业主大会|监狱|教育局|委员会|研究所|招标办|采购部|办事处|水利局|公墓|中心|联合社|合作社)")
  1995. # self.unrecognized_end2 = re.compile("^[\u4e00-\u9fa5]{4,}(?:署|局|厅|处|室|科|部|站|所|股|行|园)")
  1996. def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
  1997. self.get_tenderee = False
  1998. ents = []
  1999. list_name = []
  2000. agency_set = set()
  2001. for ent in list_entitys[0]:
  2002. if ent.entity_type == 'name':
  2003. list_name.append(ent.entity_text)
  2004. if ent.entity_type in ['org', 'company']:
  2005. if ent.label == 0 and ent.values[ent.label]>=0.5:
  2006. self.get_tenderee = True
  2007. break
  2008. elif ent.label == 1:
  2009. if ent.values[ent.label]>0.5:
  2010. agency_set.add(ent.entity_text)
  2011. elif ent.label == 5:
  2012. if len(ent.entity_text)>=4:
  2013. ents.append(ent)
  2014. if not self.get_tenderee:
  2015. self.entity_context_rule(ents,list_name,list_sentences,list(agency_set))
  2016. if not self.get_tenderee:
  2017. self.subject_rule(ents,list_articles,list_sentences)
  2018. # if not self.get_tenderee:
  2019. # self.unrecognized_entity_rule(self.unrecognized1,list_sentences,list_entitys,0.55)
  2020. # if not self.get_tenderee:
  2021. # self.unrecognized_entity_rule(self.unrecognized2,list_sentences,list_entitys,0.5)
  2022. #entity上下文正则判断
  2023. def entity_context_rule(self,entitys,list_name,list_sentences,list_agency):
  2024. list_sentences[0].sort(key=lambda x:x.sentence_index)
  2025. entity_data = []
  2026. for ent in entitys:
  2027. _sentence = list_sentences[0][ent.sentence_index]
  2028. _span = spanWindow(tokens=_sentence.tokens, begin_index=ent.begin_index,
  2029. end_index=ent.end_index, size=40, center_include=True,
  2030. word_flag=True, use_text=True,
  2031. text=re.sub(")", ")", re.sub("(", "(", ent.entity_text)))
  2032. entity_data.append((ent,_span))
  2033. if not self.get_tenderee:
  2034. for _data in entity_data:
  2035. ent = _data[0]
  2036. _span = _data[1]
  2037. if re.search(self.tenderee_left_1,_span[0]):
  2038. ent.label = 0
  2039. ent.values[0] = 0.5 + ent.values[0] / 10
  2040. self.get_tenderee = True
  2041. if not self.get_tenderee:
  2042. for _data in entity_data:
  2043. ent = _data[0]
  2044. _span = _data[1]
  2045. if re.search(self.tenderee_left_2,_span[0]):
  2046. ent.label = 0
  2047. ent.values[0] = 0.5 + ent.values[0] / 10
  2048. self.get_tenderee = True
  2049. if not self.get_tenderee:
  2050. for _data in entity_data:
  2051. ent = _data[0]
  2052. _span = _data[1]
  2053. if re.search(self.tenderee_left_3,_span[0]):
  2054. ent.label = 0
  2055. ent.values[0] = 0.5 + ent.values[0] / 10
  2056. self.get_tenderee = True
  2057. if not self.get_tenderee:
  2058. for _data in entity_data:
  2059. ent = _data[0]
  2060. _span = _data[1]
  2061. if re.search(self.tenderee_left_4,_span[0]):
  2062. if len(list_agency)>0:
  2063. _same = False
  2064. for agency in list_agency:
  2065. if ent.entity_text in agency or agency in ent.entity_text:
  2066. _same = True
  2067. break
  2068. if not _same:
  2069. ent.label = 0
  2070. ent.values[0] = 0.5 + ent.values[0] / 10
  2071. self.get_tenderee = True
  2072. else:
  2073. if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent.entity_text
  2074. ) or not re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent.entity_text) or re.search("自行.?采购",list_sentences[0][ent.sentence_index].sentence_text):
  2075. ent.label = 0
  2076. ent.values[0] = 0.5 + ent.values[0] / 10
  2077. self.get_tenderee = True
  2078. if not self.get_tenderee:
  2079. for _data in entity_data:
  2080. ent = _data[0]
  2081. _span = _data[1]
  2082. if re.search(self.tenderee_left_5,_span[0]):
  2083. if len(list_agency)>0:
  2084. _same = False
  2085. for agency in list_agency:
  2086. if ent.entity_text in agency or agency in ent.entity_text:
  2087. _same = True
  2088. break
  2089. if not _same:
  2090. ent.label = 0
  2091. ent.values[0] = 0.5 + ent.values[0] / 10
  2092. self.get_tenderee = True
  2093. else:
  2094. if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent.entity_text
  2095. ) or not re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent.entity_text):
  2096. ent.label = 0
  2097. ent.values[0] = 0.5 + ent.values[0] / 10
  2098. self.get_tenderee = True
  2099. if not self.get_tenderee:
  2100. for _data in entity_data:
  2101. ent = _data[0]
  2102. _span = _data[1]
  2103. if re.search(self.tenderee_right, _span[2]):
  2104. ent.label = 0
  2105. ent.values[0] = 0.5 + ent.values[0] / 10
  2106. self.get_tenderee = True
  2107. if not self.get_tenderee:
  2108. for _data in entity_data:
  2109. ent = _data[0]
  2110. _span = _data[1]
  2111. if re.search(self.tenderee_right2, _span[2]):
  2112. ent.label = 0
  2113. ent.values[0] = 0.5 + ent.values[0] / 10
  2114. self.get_tenderee = True
  2115. if not self.get_tenderee:
  2116. if list_name:
  2117. for _data in entity_data:
  2118. ent = _data[0]
  2119. _span = _data[1]
  2120. pj_name = re.search(self.tenderee_right3, _span[2])
  2121. if pj_name:
  2122. pj_name = pj_name.groupdict()["project"]
  2123. for _name in list_name:
  2124. if _name in pj_name:
  2125. ent.label = 0
  2126. ent.values[0] = 0.5
  2127. self.get_tenderee = True
  2128. break
  2129. # for _data in entity_data:
  2130. # ent = _data[0]
  2131. # _span = _data[1]
  2132. # if re.search(self.tenderee_left,_span[0]):
  2133. # ent.label = 0
  2134. # ent.values[0] = 0.5 + ent.values[0] / 10
  2135. # self.get_tenderee = True
  2136. # elif re.search(self.tenderee_right,_span[2]):
  2137. # ent.label = 0
  2138. # ent.values[0] = 0.5 + ent.values[0] / 10
  2139. # self.get_tenderee = True
  2140. # elif re.search(self.tenderee_right2, _span[2]):
  2141. # ent.label = 0
  2142. # ent.values[0] = 0.5 + ent.values[0] / 10
  2143. # self.get_tenderee = True
  2144. # elif list_name:
  2145. # pj_name = re.search(self.tenderee_right3, _span[2])
  2146. # if pj_name:
  2147. # pj_name = pj_name.groupdict()["project"]
  2148. # for _name in list_name:
  2149. # if _name in pj_name:
  2150. # ent.label = 0
  2151. # ent.values[0] = 0.5
  2152. # self.get_tenderee = True
  2153. # break
  2154. # 公告主语判断
  2155. def subject_rule(self, entitys,list_articles,list_sentences):
  2156. content = list_articles[0].content.split('##attachment##')[0]
  2157. if re.search(self.subject,content):
  2158. _subject = re.search(self.subject,content).group()
  2159. for ent in entitys:
  2160. if re.search("院",_subject) and re.search("医院|学院",ent.entity_text):
  2161. ent.label = 0
  2162. ent.values[0] = 0.5 + ent.values[0] / 10
  2163. self.get_tenderee = True
  2164. elif re.search("校",_subject) and re.search("学校|学院|大学|高中|初中|中学|小学",ent.entity_text):
  2165. ent.label = 0
  2166. ent.values[0] = 0.5 + ent.values[0] / 10
  2167. self.get_tenderee = True
  2168. elif re.search("局", _subject) and re.search("局", ent.entity_text):
  2169. _sentence = list_sentences[0][ent.sentence_index]
  2170. _span = spanWindow(tokens=_sentence.tokens, begin_index=ent.begin_index,
  2171. end_index=ent.end_index, size=20, center_include=True,
  2172. word_flag=True, use_text=True,
  2173. text=re.sub(")", ")", re.sub("(", "(", ent.entity_text)))
  2174. if not re.search("监督|投诉",_span[0][-10:]):
  2175. ent.label = 0
  2176. ent.values[0] = 0.5 + ent.values[0] / 10
  2177. self.get_tenderee = True
  2178. # 正则召回未识别实体
  2179. # def unrecognized_entity_rule(self,pattern,list_sentences,list_entitys,on_value=0.5):
  2180. # list_sentence = list_sentences[0]
  2181. # for in_attachment in [False,True]:
  2182. # for sentence in [sentence for sentence in list_sentence if sentence.in_attachment==in_attachment]:
  2183. # sentence_text = sentence.sentence_text
  2184. # tokens = sentence.tokens
  2185. # doc_id = sentence.doc_id
  2186. # in_attachment = sentence.in_attachment
  2187. # list_tokenbegin = []
  2188. # begin = 0
  2189. # for i in range(0, len(tokens)):
  2190. # list_tokenbegin.append(begin)
  2191. # begin += len(str(tokens[i]))
  2192. # list_tokenbegin.append(begin + 1)
  2193. # for _match in re.finditer(pattern,sentence_text):
  2194. # _groupdict = _match.groupdict()
  2195. # _match_text = _match.group()
  2196. # _unrecognized_text = _groupdict["unrecognized"]
  2197. # _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text)
  2198. # if not _unrecognized:
  2199. # _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text)
  2200. # if _unrecognized:
  2201. # _unrecognized = _unrecognized.group()
  2202. # else:
  2203. # continue
  2204. # # print(_unrecognized)
  2205. # if re.search("某|乙方|代理",_unrecognized) or len(_unrecognized)>15:
  2206. # continue
  2207. # begin_index_temp = _match.start()+len(_groupdict['tenderee_left'])
  2208. # for j in range(len(list_tokenbegin)):
  2209. # if list_tokenbegin[j] == begin_index_temp:
  2210. # begin_index = j
  2211. # break
  2212. # elif list_tokenbegin[j] > begin_index_temp:
  2213. # begin_index = j - 1
  2214. # break
  2215. # index = begin_index_temp + len(_unrecognized)
  2216. # end_index_temp = index
  2217. # for j in range(begin_index, len(list_tokenbegin)):
  2218. # if list_tokenbegin[j] >= index:
  2219. # end_index = j - 1
  2220. # break
  2221. # entity_id = "%s_%d_%d_%d" % (doc_id, sentence.sentence_index, begin_index, end_index)
  2222. # entity_text = _unrecognized
  2223. # new_entity = Entity(doc_id, entity_id, entity_text, 'company', sentence.sentence_index, begin_index, end_index,
  2224. # begin_index_temp, end_index_temp, in_attachment=in_attachment)
  2225. # new_entity.label = 0
  2226. # new_entity.values = [on_value,0,0,0,0,0]
  2227. # list_entitys[0].append(new_entity)
  2228. # self.get_tenderee = True
  2229. # if self.get_tenderee:
  2230. # list_entitys[0] = sorted(list_entitys[0], key=lambda x: (x.sentence_index, x.begin_index))
  2231. # break
  2232. class RoleGrade():
  2233. def __init__(self):
  2234. self.tenderee_left_9 = "(?P<tenderee_left_9>(招标|采购|遴选|寻源|竞价|议价|比选|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|方|单位))"
  2235. self.tenderee_center_8 = "(?P<tenderee_center_8>受.{5,20}委托)"
  2236. self.tenderee_left_8 = "(?P<tenderee_left_8>(尊敬的供应商|项目法人|(需求|最终|发包|征集|甲|转让|出租|处置)(人|方|单位|组织|用户|业主|主体|部门|公司)))"
  2237. self.tenderee_left_6 = "(?P<tenderee_left_6>(业主|建设|委托)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|业主|买方)"
  2238. self.tenderee_left_5 = "(?P<tenderee_left_5>(发布)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|买方|发布机构)"
  2239. self.agency_left_9 = "(?P<agency_left_9>代理)"
  2240. self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得)|第[1一](名|候选)|排[名序]:1|名次:1)"
  2241. self.winTenderer_left_8 = "(?P<winTenderer_left_8>(入选供应商|供货商|乙方|最[终后]选[择取]))" # 229435497 最后选择西平,县中原彩印有限公司,作为此项目中标供应商,
  2242. self.winTenderer_left_6 = "(?P<winTenderer_left_6>(入围|承[接建包修做制担租销]))"
  2243. self.winTenderer_right_9 = "(?P<winTenderer_right_9>^(为(中标|成交|中选)(人|单位|供应商|公司)|以\d+[\d.,]+万?元中标))"
  2244. self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2](名|候选)|排[名序]:2|名次:2))"
  2245. self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3](名|候选)|排[名序]:3|名次:3))"
  2246. self.pattern_list = [self.tenderee_left_9,self.tenderee_center_8, self.tenderee_left_8,self.tenderee_left_6,self.tenderee_left_5,self.agency_left_9,
  2247. self.winTenderer_left_9,self.winTenderer_left_8, self.winTenderer_right_9, self.winTenderer_left_6, self.secondTenderer_left_9, self.thirdTenderer_left_9] # 概率要由高到低 274941849
  2248. def predict(self, list_sentences, list_entitys, original_docchannel, span=15, min_prob=0.7):
  2249. '''
  2250. 根据规则给角色分配不同等级概率;分三级:0.9-1,0.8-0.9,0.7-0.8;附件0.7-0.8,0.6-0.7,0.5-0.6
  2251. 修改概率小于0.6的且在大数据代理集合里面的招标人为代理人
  2252. :param list_articles:
  2253. :param list_sentences:
  2254. :param list_entitys:
  2255. :param codeName:
  2256. :return:
  2257. '''
  2258. sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
  2259. role2id = {"tenderee": 0, "agency": 1, "winTenderer": 2, "secondTenderer": 3, "thirdTenderer": 4}
  2260. org_winner = []
  2261. company_winner = []
  2262. org_tenderee = []
  2263. agency_l = []
  2264. agency_like_tenderee = [] # 类似招标人的代理人实体列表
  2265. low_prob_agency = []
  2266. low_prob_tenderee = []
  2267. low_prob_winner = []
  2268. all_tenderee_agency = []
  2269. for entity in list_entitys[0]:
  2270. if entity.entity_type in ['org', 'company'] and entity.label in [0, 1, 2, 3, 4] and entity.values[entity.label]> min_prob:
  2271. text = sentences[entity.sentence_index].sentence_text
  2272. in_att = sentences[entity.sentence_index].in_attachment
  2273. pre_prob = entity.values[entity.label] # 模型预测角色概率
  2274. b = entity.wordOffset_begin
  2275. e = entity.wordOffset_end
  2276. not_found = 1
  2277. if re.search('(乙方:甲方:|甲方((买方)?,|:)乙方((卖方)?)?:)$', text[max(0, b-span):b]):
  2278. entity.label = 0 if entity.entity_type == 'org' else 5 # 修复 290777022 乙方:甲方: 重庆机场集团有限公司 错分为中标
  2279. entity.values[entity.label] = 0.55
  2280. continue
  2281. elif re.search('(采购|招标)人(?或其?(采购|招标)?代理机构)?', text[max(0, b-span-2):b]): # 修复 275206588 招标人或其招标代理机构:(盖章)
  2282. entity.label = 1 if is_agency(entity.entity_text) else 0
  2283. entity.values[entity.label] = 0.8
  2284. continue
  2285. elif re.search('(采购|招标|询比?价|遴选|寻源|比选)机构[是为:]+', text[max(0, b-span):b]) and not is_agency(entity.entity_text):
  2286. agency_like_tenderee.append(entity)
  2287. for pattern in self.pattern_list:
  2288. if 'left' in pattern:
  2289. context = text[max(0, b-span):b]
  2290. elif 'right' in pattern:
  2291. context = text[e:e+span]
  2292. elif 'center' in pattern:
  2293. context = text[max(0, b-span):e+span]
  2294. else:
  2295. print('规则错误', pattern)
  2296. ser = re.search(pattern, context)
  2297. if ser:
  2298. groupdict = pattern.split('>')[0].replace('(?P<', '')
  2299. _role, _direct, _prob = groupdict.split('_')
  2300. _label = role2id.get(_role)
  2301. if _label != entity.label:
  2302. continue
  2303. _prob = int(_prob)*0.1
  2304. # print('规则修改角色概率前:', entity.entity_text, entity.label, entity.values)
  2305. if in_att:
  2306. _prob = _prob - 0.1 # 0.2
  2307. if pre_prob < _prob: # 如果模型预测概率小于关键词概率
  2308. _prob = 0.65
  2309. if len(entity.entity_text) < 6 and re.search('大学|医院', entity.entity_text)==None: # 如果实体名称小于6个字,概率再降0.05
  2310. _prob -= 0.05
  2311. if re.search('(地址|联系方式):$', context): # 地址结尾的概率 概率降低
  2312. _prob -= 0.05
  2313. if _label == 0 and is_agency(entity.entity_text): # 20250116 修复 584333688 同时有招标单位 : 安徽省招标集团股份有限公司,.采购人信息 名 称:安徽开放大学
  2314. _prob -= 0.1
  2315. entity.values[_label] = _prob + entity.values[_label] / 20
  2316. not_found = 0
  2317. # print('规则修改角色概率后:', entity.entity_text, entity.label, entity.values)
  2318. break
  2319. if not_found and entity.values[entity.label]> min_prob:
  2320. _prob = min_prob - 0.1 if in_att else min_prob
  2321. entity.values[entity.label] = _prob + entity.values[entity.label] / 20
  2322. # print('找不到规则修改角色概率:', entity.entity_text, entity.label, entity.values)
  2323. if entity.label == 2 and entity.values[entity.label]> min_prob:
  2324. if entity.entity_type == 'org':
  2325. org_winner.append(entity)
  2326. elif entity.entity_type == 'company':
  2327. company_winner.append(entity) # 保存中标人实体
  2328. if entity.label == 0 and entity.values[entity.label]> min_prob:
  2329. org_tenderee.append(entity.entity_text) # 保存所有招标人名称
  2330. elif entity.label == 1 and entity.values[entity.label]> min_prob:
  2331. agency_l.append(entity.entity_text)
  2332. # if entity.entity_type in ['org', 'company'] and entity.label == 0 and entity.entity_text in agency_set and entity.values[entity.label]<0.6: # 修改概率小于0.6的且在大数据代理集合里面的招标人为代理人
  2333. # # log('修改概率小于0.6的且在大数据代理集合里面的招标人为代理人%s:'%entity.entity_text)
  2334. # entity.label = 1
  2335. # entity.values[entity.label] = 0.5
  2336. elif entity.entity_type in ['org', 'company'] and entity.label in [1, 0] and 0.5<=entity.values[entity.label]<0.6:
  2337. if entity.label == 1:
  2338. low_prob_agency.append(entity)
  2339. else:
  2340. low_prob_tenderee.append(entity)
  2341. elif entity.entity_type in ['org', 'company'] and entity.label == 2 and 0.5<=entity.values[entity.label]<0.6:
  2342. low_prob_winner.append(entity)
  2343. if entity.entity_type in ['org', 'company'] and entity.label in [1, 0] and 0.6<entity.values[entity.label]: # 由0.5调为0.6,避免367217504 同时为低概率招标、中标被改
  2344. all_tenderee_agency.append(entity.entity_text)
  2345. if org_tenderee == [] and agency_like_tenderee:
  2346. for entity in agency_like_tenderee:
  2347. entity.label = 0
  2348. entity.values[entity.label] = 0.6
  2349. for entity in low_prob_agency: # 如果低概率代理在招标人列表,改为招标人
  2350. if entity.entity_text in org_tenderee:
  2351. entity.label = 0
  2352. entity.values[entity.label] = 0.6
  2353. for entity in low_prob_tenderee:
  2354. if entity.entity_text in agency_l:
  2355. entity.label = 1
  2356. entity.values[entity.label] = 0.6
  2357. for entity in low_prob_winner: # 如果低概率中标人在招标或代理列表,改为非角色
  2358. if entity.entity_text in all_tenderee_agency:
  2359. entity.label = 5
  2360. # elif entity.in_attachment: # 附件低概率中标角色不要 避免:516109391 桂林银行崇左宁明支行,宁明县城中镇兴宁大道中70号,预测为中标 20241126 注释掉,558294326 附件单个候选人漏提取
  2361. # entity.label = 5
  2362. if org_winner != []:
  2363. flag = 0
  2364. if org_tenderee != []:
  2365. for ent in org_winner:
  2366. if ent.entity_text in org_tenderee:
  2367. # log('如果org中标人同时为招标人角色,降低中标概率:%s, %s' % (ent.entity_text, ent.label))
  2368. ent.values[2] = 0.6
  2369. flag = 1
  2370. # if flag == 0 and company_winner != []: # 2024/04/18 注释掉 避免提取不到 273351465 供应商(乙方:湖南省第二测绘院
  2371. # for ent in org_winner:
  2372. # if ent.label == 2 and ent.values[2] > 0.6:
  2373. # # log('如果同时包含org和company中标人,降低org中标人概率为0.6:%s, %s' % (ent.entity_text, ent.values[2]))
  2374. # ent.values[2] = 0.6
  2375. class MoneyGrade():
  2376. def __init__(self):
  2377. self.tenderee_money_left_9 = "(?P<tenderee_left_9>最高(投标)?限价)|控制价|拦标价"
  2378. self.tenderee_money_left_8 = "(?P<tenderee_left_8>预算|限价|起始|起拍|底价|标底)"
  2379. self.tenderer_money_left_9 = "(?P<tenderer_left_9>(中标|成交|合同|总报价))"
  2380. self.tenderer_money_left_8 = "(?P<tenderer_left_8>(投标|总价))"
  2381. self.pattern_list = [self.tenderee_money_left_8, self.tenderer_money_left_8, self.tenderee_money_left_9, self.tenderer_money_left_9]
  2382. def predict(self, list_sentences, list_entitys, span=10, min_prob=0.7):
  2383. sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
  2384. role2id = {"tenderee": 0, "tenderer": 1}
  2385. for entity in list_entitys[0]:
  2386. if entity.entity_type in ['money'] and entity.label in [0, 1] and entity.values[entity.label]> 0.6:
  2387. text = sentences[entity.sentence_index].sentence_text
  2388. in_att = sentences[entity.sentence_index].in_attachment
  2389. b = entity.wordOffset_begin
  2390. e = entity.wordOffset_end
  2391. context = text[max(0, b - span):b]
  2392. not_found = 1
  2393. for pattern in self.pattern_list:
  2394. ser = re.search(pattern, context)
  2395. if ser:
  2396. groupdict = pattern.split('>')[0].replace('(?P<', '')
  2397. _role, _direct, _prob = groupdict.split('_')
  2398. if re.search('单价', context[-4:]) or re.search('(最低|风险)控制价', context) or entity.notes == '总投资':# or float(entity.entity_text)<100:
  2399. _prob = 6
  2400. _label = role2id.get(_role)
  2401. if _label != entity.label:
  2402. continue
  2403. _prob = int(_prob) * 0.1
  2404. # print('规则修改金额概率前:', entity.entity_text, entity.label, entity.values)
  2405. if in_att:
  2406. _prob = max(0.5, _prob - 0.2)
  2407. entity.values[_label] = _prob + entity.values[_label] / 20
  2408. not_found = 0
  2409. # print('规则修改金额概率后:', entity.entity_text, entity.label, entity.values)
  2410. break
  2411. if not_found and entity.values[entity.label] > min_prob:
  2412. if re.search('单价', context[-4:]) or re.search('(最低|风险)控制价', context) or float(entity.entity_text)<100:
  2413. _prob = 0.6
  2414. elif in_att:
  2415. _prob = max(0.5, min_prob - 0.1)
  2416. else:
  2417. _prob = min_prob
  2418. # _prob = min_prob - 0.1 if in_att else min_prob
  2419. entity.values[entity.label] = _prob + entity.values[entity.label] / 20
  2420. # print('找不到规则修改金额概率:', entity.entity_text, entity.label, entity.values)
  2421. # if entity.entity_type in ['money'] and entity.label in [0, 1] and 0.5<=entity.values[entity.label]<0.75 and float(entity.entity_text)<100: # 20241011 低概率小金额改为其他金额 # 20241128 小金额可能为单价,放单价存放
  2422. # entity.label = 2
  2423. # 时间类别
  2424. class TimePredictor():
  2425. def __init__(self,config=None):
  2426. self.sess = tf.Session(graph=tf.Graph(),config=config)
  2427. self.inputs_code = None
  2428. self.outputs_code = None
  2429. self.input_shape = (2,40,128)
  2430. self.load_model()
  2431. def load_model(self):
  2432. model_path = os.path.dirname(__file__)+'/timesplit_model'
  2433. if self.inputs_code is None:
  2434. log("get model of time")
  2435. with self.sess.as_default():
  2436. with self.sess.graph.as_default():
  2437. meta_graph_def = tf.saved_model.loader.load(self.sess, tags=["serve"], export_dir=model_path)
  2438. signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
  2439. signature_def = meta_graph_def.signature_def
  2440. self.inputs_code = []
  2441. self.inputs_code.append(
  2442. self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name))
  2443. self.inputs_code.append(
  2444. self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name))
  2445. self.outputs_code = self.sess.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
  2446. return self.inputs_code, self.outputs_code
  2447. else:
  2448. return self.inputs_code, self.outputs_code
  2449. def search_time_data(self,list_sentences,list_entitys):
  2450. data_x = []
  2451. points_entitys = []
  2452. for list_sentence, list_entity in zip(list_sentences, list_entitys):
  2453. p_entitys = 0
  2454. p_sentences = 0
  2455. list_sentence.sort(key=lambda x: x.sentence_index)
  2456. while(p_entitys<len(list_entity)):
  2457. entity = list_entity[p_entitys]
  2458. if entity.entity_type in ['time']:
  2459. while(p_sentences<len(list_sentence)):
  2460. sentence = list_sentence[p_sentences]
  2461. if entity.doc_id == sentence.doc_id and entity.sentence_index == sentence.sentence_index:
  2462. # left = sentence.sentence_text[max(0,entity.wordOffset_begin-self.input_shape[1]):entity.wordOffset_begin]
  2463. # right = sentence.sentence_text[entity.wordOffset_end:entity.wordOffset_end+self.input_shape[1]]
  2464. s = spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=self.input_shape[1])
  2465. left = s[0]
  2466. right = s[1]
  2467. context = [left, right]
  2468. x = self.embedding_words(context, shape=self.input_shape)
  2469. data_x.append(x)
  2470. points_entitys.append(entity)
  2471. break
  2472. p_sentences += 1
  2473. p_entitys += 1
  2474. if len(points_entitys)==0:
  2475. return None
  2476. data_x = np.transpose(np.array(data_x), (1, 0, 2, 3))
  2477. return [data_x, points_entitys]
  2478. def embedding_words(self, datas, shape):
  2479. '''
  2480. @summary:查找词汇对应的词向量
  2481. @param:
  2482. datas:词汇的list
  2483. shape:结果的shape
  2484. @return: array,返回对应shape的词嵌入
  2485. '''
  2486. model_w2v = getModel_w2v()
  2487. embed = np.zeros(shape)
  2488. length = shape[1]
  2489. out_index = 0
  2490. for data in datas:
  2491. index = 0
  2492. for item in data:
  2493. item_not_space = re.sub("\s*", "", item)
  2494. if index >= length:
  2495. break
  2496. if item_not_space in model_w2v.vocab:
  2497. embed[out_index][index] = model_w2v[item_not_space]
  2498. index += 1
  2499. else:
  2500. embed[out_index][index] = model_w2v['unk']
  2501. index += 1
  2502. out_index += 1
  2503. return embed
  2504. def predict(self, list_sentences,list_entitys):
  2505. datas = self.search_time_data(list_sentences, list_entitys)
  2506. if datas is None:
  2507. return
  2508. points_entitys = datas[1]
  2509. with self.sess.as_default():
  2510. predict_y = limitRun(self.sess,[self.outputs_code], feed_dict={self.inputs_code[0]:datas[0][0]
  2511. ,self.inputs_code[1]:datas[0][1]})[0]
  2512. for i in range(len(predict_y)):
  2513. entity = points_entitys[i]
  2514. label = np.argmax(predict_y[i])
  2515. values = []
  2516. for item in predict_y[i]:
  2517. values.append(item)
  2518. if label != 0:
  2519. if not timeFormat(entity.entity_text):
  2520. label = 0
  2521. values[0] = 0.5
  2522. entity.set_Role(label, values)
  2523. # 产品字段提取
  2524. class ProductPredictor():
  2525. def __init__(self,config=None):
  2526. vocabpath = os.path.dirname(__file__) + "/codename_vocab.pk"
  2527. self.vocab = load(vocabpath)
  2528. self.word2index = dict((w, i) for i, w in enumerate(np.array(self.vocab)))
  2529. self.sess = tf.Session(graph=tf.Graph(),config=config)
  2530. self.load_model()
  2531. def load_model(self):
  2532. # model_path = os.path.dirname(__file__)+'/product_savedmodel/product.pb'
  2533. model_path = os.path.dirname(__file__)+'/product_savedmodel/productAndfailreason.pb'
  2534. with self.sess.as_default():
  2535. with self.sess.graph.as_default():
  2536. output_graph_def = tf.GraphDef()
  2537. with open(model_path, 'rb') as f:
  2538. output_graph_def.ParseFromString(f.read())
  2539. tf.import_graph_def(output_graph_def, name='')
  2540. self.sess.run(tf.global_variables_initializer())
  2541. self.char_input = self.sess.graph.get_tensor_by_name('CharInputs:0')
  2542. self.length = self.sess.graph.get_tensor_by_name("Sum:0")
  2543. self.dropout = self.sess.graph.get_tensor_by_name("Dropout:0")
  2544. self.logit = self.sess.graph.get_tensor_by_name("logits/Reshape:0")
  2545. self.tran = self.sess.graph.get_tensor_by_name("crf_loss/transitions:0")
  2546. def decode(self,logits, lengths, matrix):
  2547. paths = []
  2548. small = -1000.0
  2549. # start = np.asarray([[small] * 4 + [0]])
  2550. start = np.asarray([[small]*7+[0]])
  2551. for score, length in zip(logits, lengths):
  2552. score = score[:length]
  2553. pad = small * np.ones([length, 1])
  2554. logits = np.concatenate([score, pad], axis=1)
  2555. logits = np.concatenate([start, logits], axis=0)
  2556. path, _ = viterbi_decode(logits, matrix)
  2557. paths.append(path[1:])
  2558. return paths
  2559. def predict(self, list_sentences,list_entitys=None,list_articles=[], fail=False, MAX_AREA=5000, out_lines=[]):
  2560. '''
  2561. 预测实体代码,每个句子最多取MAX_AREA个字,超过截断
  2562. :param list_sentences: 多篇公告句子列表,[[一篇公告句子列表],[公告句子列表]]
  2563. :param list_entitys: 多篇公告实体列表
  2564. :param MAX_AREA: 每个句子最多截取多少字
  2565. :return: 把预测出来的实体放进实体类
  2566. '''
  2567. p = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设|分包)(的?(主要|简要|基本|具体|名称及))?" \
  2568. "(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况|名称)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \
  2569. "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模|(设备|材料|仪器|需求|产品|采购单?)(清单|名称|信息))为?([::,]|$)"
  2570. # sentence_range = [] #20240827 取消,修复线上接口产品耗时长问题
  2571. # if len(out_lines) >= 3: # 三个以上大纲
  2572. # for i in range(len(out_lines)-1):
  2573. # text, s1, b1 = out_lines[i]
  2574. # _, s2, b2 = out_lines[i+1]
  2575. # if 3<text.find(':')<20:
  2576. # text = text.split(':')[0]
  2577. # if re.search(p, text[:15]):
  2578. # sentence_range.append((s1, s2))
  2579. with self.sess.as_default() as sess:
  2580. with self.sess.graph.as_default():
  2581. result = []
  2582. product_list = []
  2583. if fail and list_articles!=[]:
  2584. text_list = [list_articles[0].content[:MAX_AREA]]
  2585. chars = [[self.word2index.get(it, self.word2index.get('<unk>')) for it in text] for text in text_list]
  2586. if USE_API:
  2587. requests_result = requests.post(API_URL + "/predict_product",
  2588. json={"inputs": chars}, verify=True)
  2589. batch_paths = json.loads(requests_result.text)['result']
  2590. lengths = json.loads(requests_result.text)['lengths']
  2591. else:
  2592. lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran],
  2593. feed_dict={
  2594. self.char_input: np.asarray(chars),
  2595. self.dropout: 1.0
  2596. })
  2597. batch_paths = self.decode(scores, lengths, tran_)
  2598. for text, path, length in zip(text_list, batch_paths, lengths):
  2599. tags = ''.join([str(it) for it in path[:length]])
  2600. # 提取产品
  2601. for it in re.finditer("12*3", tags):
  2602. start = it.start()
  2603. end = it.end()
  2604. _entity = Entity(doc_id=list_articles[0].id, entity_id="%s_%s_%s_%s" % (
  2605. list_articles[0].doc_id, 0, start, end),
  2606. entity_text=text[start:end],
  2607. entity_type="product", sentence_index=0,
  2608. begin_index=0, end_index=0, wordOffset_begin=start,
  2609. wordOffset_end=end)
  2610. list_entitys[0].append(_entity)
  2611. product_list.append(text[start:end])
  2612. # 提取失败原因
  2613. for it in re.finditer("45*6", tags):
  2614. start = it.start()
  2615. end = it.end()
  2616. result.append(text[start:end].replace('?', '').strip())
  2617. reasons = []
  2618. for it in result:
  2619. if "(√)" in it or "(√)" in it:
  2620. reasons = [it]
  2621. break
  2622. if reasons != [] and (it not in reasons[-1] and it not in reasons):
  2623. reasons.append(it)
  2624. elif reasons == []:
  2625. reasons.append(it)
  2626. if reasons == []: # 如果模型识别不到失败原因 就用规则补充
  2627. for text in text_list:
  2628. ser1 = re.search('\w{,4}(理由|原因):\s*((第\d+包|标项\d+|原因类型)?[::]?[\s*\w,]{2,30}((不满?足|少于|未达)((法定)?[123一二三两]家|(规定)?要求)|(项目|采购)(终止|废标)),?)+',text)
  2629. ser2 = re.search(
  2630. '\w{,4}(理由|原因):\s*(第\d+包|标项\d+|原因类型)?[::]?[\s*\w]{4,30},', text)
  2631. if ser1:
  2632. reasons.append(ser1.group(0))
  2633. break
  2634. elif ser2:
  2635. reasons.append(ser2.group(0))
  2636. break
  2637. return {'fail_reason':';'.join(reasons)}, product_list
  2638. if list_entitys is None:
  2639. list_entitys = [[] for _ in range(len(list_sentences))]
  2640. for list_sentence, list_entity in zip(list_sentences,list_entitys):
  2641. if len(list_sentence)==0:
  2642. result.append({"product":[]})
  2643. continue
  2644. # 20240827 取消,修复线上接口产品耗时长问题
  2645. # if sentence_range: # 20240815 如果有招标内容大纲,只从前两句及大纲内提取产品,避免类似 514920213 提取错其他内容 银行流水
  2646. # new_list = []
  2647. # word_num = 0
  2648. # for sentence in list_sentence:
  2649. # if sentence.sentence_index<2:
  2650. # new_list.append(sentence)
  2651. # continue
  2652. # for s1, s2 in sentence_range:
  2653. # if sentence.sentence_index < s1:
  2654. # continue
  2655. # elif s1<=sentence.sentence_index <=s2:
  2656. # new_list.append(sentence)
  2657. # word_num += len(sentence.sentence_text)
  2658. # elif sentence.sentence_index >= s2:
  2659. # break
  2660. # if word_num > 100:
  2661. # list_sentence = new_list
  2662. list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True)
  2663. _begin_index = 0
  2664. item = {"product":[]}
  2665. temp_list = []
  2666. while True:
  2667. MAX_LEN = len(list_sentence[_begin_index].sentence_text)
  2668. if MAX_LEN > MAX_AREA:
  2669. MAX_LEN = MAX_AREA
  2670. _LEN = MAX_AREA//MAX_LEN
  2671. chars = [sentence.sentence_text[:MAX_LEN] for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
  2672. chars = [[self.word2index.get(it, self.word2index.get('<unk>')) for it in l] for l in chars]
  2673. chars = pad_sequences(chars, maxlen=MAX_LEN, padding="post", truncating="post")
  2674. if USE_API:
  2675. requests_result = requests.post(API_URL + "/predict_product",
  2676. json={"inputs": chars.tolist()}, verify=True)
  2677. batch_paths = json.loads(requests_result.text)['result']
  2678. lengths = json.loads(requests_result.text)['lengths']
  2679. else:
  2680. lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran],
  2681. feed_dict={
  2682. self.char_input: np.asarray(chars),
  2683. self.dropout: 1.0
  2684. })
  2685. batch_paths = self.decode(scores, lengths, tran_)
  2686. for sentence, path, length in zip(list_sentence[_begin_index:_begin_index+_LEN],batch_paths, lengths):
  2687. tags = ''.join([str(it) for it in path[:length]])
  2688. for it in re.finditer("12*3", tags):
  2689. start = it.start()
  2690. end = it.end()
  2691. _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
  2692. sentence.doc_id, sentence.sentence_index, start, end),
  2693. entity_text=sentence.sentence_text[start:end],
  2694. entity_type="product", sentence_index=sentence.sentence_index,
  2695. begin_index=0, end_index=0, wordOffset_begin=start,
  2696. wordOffset_end=end,in_attachment=sentence.in_attachment)
  2697. list_entity.append(_entity)
  2698. temp_list.append(sentence.sentence_text[start:end])
  2699. product_list.append(sentence.sentence_text[start:end])
  2700. # item["product"] = list(set(temp_list))
  2701. # result.append(item)
  2702. if _begin_index+_LEN >= len(list_sentence):
  2703. break
  2704. _begin_index += _LEN
  2705. item["product"] = list(set(temp_list))
  2706. result.append(item) # 修正bug
  2707. return {'fail_reason': ""},product_list
  2708. # 产品数量单价品牌规格提取 #2021/11/10 添加表格中的项目、需求、预算、时间要素提取
  2709. class ProductAttributesPredictor():
  2710. def __init__(self,):
  2711. self.p0 = '(类别|类型|物类|目录|类目|分类)(名称|$)|^品名|^品类|^品目|(标项|分项|项目|计划|包组|标段|[分子]?包|子目|服务|招标|中标|成交|工程|招标内容)(名称|内容|描述)'
  2712. self.p1 = '(标的|维修|系统|报价构成|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|配件|资产|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名|气体)[\))的]?([、\w]{,4}名称|内容|描述)'
  2713. self.p2 = '标的|标项|项目$|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品|物装|配件|资产|招标内容|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|菜名|^品目$|^品名$|^名称|^内容$|(标项|分项|项目|计划|包组|标段|[分子]?包|子目|服务|招标|中标|成交|工程|招标内容)(名称|内容|描述)'
  2714. # self.p1 = '(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\))]?(名称|内容|描述)'
  2715. # self.p2 = '设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品|项目|品名|菜名|内容|名称'
  2716. with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
  2717. self.header_set = pickle.load(f)
  2718. self.tb = TableTag2List()
  2719. def isTrueTable(self, table):
  2720. '''真假表格规则:
  2721. 1、包含<caption>或<th>标签为真
  2722. 2、包含大量链接、表单、图片或嵌套表格为假
  2723. 3、表格尺寸太小为假
  2724. 4、外层<table>嵌套子<table>,一般子为真,外为假'''
  2725. if table.find_all(['caption', 'th']) != []:
  2726. return True
  2727. elif len(table.find_all(['form', 'a', 'img'])) > 5:
  2728. # print('过滤表格:包含链接图片等大于5的为假表格')
  2729. return False
  2730. elif len(table.find_all(['tr'])) < 2:
  2731. # print('过滤表格:行数小于2的为假表格')
  2732. return False
  2733. elif len(table.find_all(['table'])) >= 1:
  2734. # print('过滤表格:包含多个表格的为假表格')
  2735. return False
  2736. else:
  2737. return True
  2738. def getTrs(self, tbody):
  2739. # 获取所有的tr
  2740. trs = []
  2741. objs = tbody.find_all(recursive=False)
  2742. for obj in objs:
  2743. if obj.name == "tr":
  2744. trs.append(obj)
  2745. if obj.name == "tbody":
  2746. for tr in obj.find_all("tr", recursive=False):
  2747. trs.append(tr)
  2748. return trs
  2749. def getTable(self, tbody):
  2750. trs = self.getTrs(tbody)
  2751. inner_table = []
  2752. if len(trs) < 2:
  2753. return inner_table
  2754. for tr in trs:
  2755. tr_line = []
  2756. tds = tr.findChildren(['td', 'th'], recursive=False)
  2757. if len(tds) < 2:
  2758. continue
  2759. for td in tds:
  2760. # td_text = re.sub('\s+|…', ' ', td.get_text()).strip()
  2761. td_text = re.sub('…', '', td.get_text()).strip()
  2762. td_text = td_text.replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '/').replace('"', '') # 修复272144312 # 产品单价数量提取结果有特殊符号\ 气动执行装置备件\密封组件\NBR+PT
  2763. td_text = td_text.replace("(", "(").replace(")", ")").replace(':', ':')
  2764. tr_line.append(td_text)
  2765. inner_table.append(tr_line)
  2766. return inner_table
    def fixSpan(self, tbody):
        """Expand ``colspan``/``rowspan`` cells in place by inserting copies of the spanning cell.

        Columns are expanded first and rows second; doing it the other way round can scramble
        the table. Rows that contain a nested <table> are left untouched.

        :param tbody: table-like element accepted by ``getTrs``; its tree is mutated
        """
        # Fill in the cells implied by colspan / rowspan.
        trs = self.getTrs(tbody)
        ths_len = 0
        ths = list()
        trs_set = set()
        # Column completion runs before row completion, otherwise table parsing may get confused.
        # Pass 1: walk every tr and expand colspan.
        for indtr, tr in enumerate(trs):
            ths_tmp = tr.findChildren('th', recursive=False)
            # Do not expand a tr that contains a nested table.
            if len(tr.findChildren('table')) > 0:
                continue
            if len(ths_tmp) > 0:
                # Header bookkeeping; none of these three values is read again in this method.
                ths_len = ths_len + len(ths_tmp)
                for th in ths_tmp:
                    ths.append(th)
                trs_set.add(tr)
            # Walk the elements of this row.
            tds = tr.findChildren(recursive=False)
            if len(tds) < 3:
                continue  # too few columns: do not expand
            for indtd, td in enumerate(tds):
                # A cell with colspan is duplicated into the next position(s) of the same row.
                if 'colspan' in td.attrs and str(re.sub("[^0-9]", "", str(td['colspan']))) != "":
                    col = int(re.sub("[^0-9]", "", str(td['colspan'])))
                    # Only moderately wide spans with short text are expanded.
                    if col < 10 and len(td.get_text()) < 500:
                        td['colspan'] = 1
                        for i in range(1, col, 1):
                            td.insert_after(copy.copy(td))
        # Pass 2: walk every tr again and expand rowspan.
        for indtr, tr in enumerate(trs):
            ths_tmp = tr.findChildren('th', recursive=False)
            # Do not expand a tr that contains a nested table.
            if len(tr.findChildren('table')) > 0:
                continue
            if len(ths_tmp) > 0:
                ths_len = ths_len + len(ths_tmp)
                for th in ths_tmp:
                    ths.append(th)
                trs_set.add(tr)
            # Walk the elements of this row.
            tds = tr.findChildren(recursive=False)
            same_span = 0
            if len(tds) > 1 and 'rowspan' in tds[0].attrs:
                span0 = tds[0].attrs['rowspan']
                for td in tds:
                    if 'rowspan' in td.attrs and td.attrs['rowspan'] == span0:
                        same_span += 1
                # Every cell carries the same rowspan as the first one: skip the row.
                if same_span == len(tds):
                    continue
            for indtd, td in enumerate(tds):
                # A cell with rowspan is duplicated into the same position of the following row(s).
                if 'rowspan' in td.attrs and str(re.sub("[^0-9]", "", str(td['rowspan']))) != "":
                    row = int(re.sub("[^0-9]", "", str(td['rowspan'])))
                    td['rowspan'] = 1
                    for i in range(1, row, 1):
                        # Fetch the cells of the following row and insert at the matching position.
                        if indtr + i < len(trs):
                            tds1 = trs[indtr + i].findChildren(['td', 'th'], recursive=False)
                            if len(tds1) >= (indtd) and len(tds1) > 0:
                                if indtd > 0:
                                    tds1[indtd - 1].insert_after(copy.copy(td))
                                else:
                                    tds1[0].insert_before(copy.copy(td))
                            elif len(tds1) > 0 and len(tds1) == indtd - 1:
                                # The target row is one cell short: append after its last cell.
                                tds1[indtd - 2].insert_after(copy.copy(td))
  2833. def get_monthlen(self, year, month):
  2834. '''输入年份、月份 int类型 得到该月份天数'''
  2835. try:
  2836. weekday, num = calendar.monthrange(int(year), int(month))
  2837. except:
  2838. num = 30
  2839. return str(num)
  2840. def fix_time(self, text, html, page_time):
  2841. '''输入日期字段返回格式化日期'''
  2842. for it in [('十二', '12'),('十一', '11'),('十','10'),('九','9'),('八','8'),('七','7'),
  2843. ('六','6'),('五','5'),('四','4'),('三','3'),('二','2'),('一','1')]:
  2844. if it[0] in text:
  2845. text = text.replace(it[0], it[1])
  2846. if re.search('^\d{1,2}月$', text):
  2847. m = re.search('^(\d{1,2})月$', text).group(1)
  2848. if len(m) < 2:
  2849. m = '0' + m
  2850. year = re.search('(\d{4})年(.{,12}采购意向)?', html)
  2851. if year:
  2852. y = year.group(1)
  2853. num = self.get_monthlen(y, m)
  2854. if len(num) < 2:
  2855. num = '0' + num
  2856. order_begin = "%s-%s-01" % (y, m)
  2857. order_end = "%s-%s-%s" % (y, m, num)
  2858. elif page_time != "":
  2859. year = re.search('\d{4}', page_time)
  2860. if year:
  2861. y = year.group(0)
  2862. num = self.get_monthlen(y, m)
  2863. if len(num) < 2:
  2864. num = '0' + num
  2865. order_begin = "%s-%s-01" % (y, m)
  2866. order_end = "%s-%s-%s" % (y, m, num)
  2867. else:
  2868. y = str(datetime.datetime.now().year)
  2869. num = self.get_monthlen(y, m)
  2870. if len(num) < 2:
  2871. num = '0' + num
  2872. order_begin = "%s-%s-01" % (y, m)
  2873. order_end = "%s-%s-%s" % (y, m, num)
  2874. else:
  2875. y = str(datetime.datetime.now().year)
  2876. num = self.get_monthlen(y, m)
  2877. if len(num) < 2:
  2878. num = '0' + num
  2879. order_begin = "%s-%s-01" % (y, m)
  2880. order_end = "%s-%s-%s" % (y, m, num)
  2881. return order_begin, order_end
  2882. t1 = re.search('^(\d{4})(年|/|\.|-)(\d{1,2})月?$', text)
  2883. if t1:
  2884. year = t1.group(1)
  2885. month = t1.group(3)
  2886. num = self.get_monthlen(year, month)
  2887. if len(month)<2:
  2888. month = '0'+month
  2889. if len(num) < 2:
  2890. num = '0'+num
  2891. order_begin = "%s-%s-01" % (year, month)
  2892. order_end = "%s-%s-%s" % (year, month, num)
  2893. return order_begin, order_end
  2894. t2 = re.search('^(\d{4})(年|/|\.|-)(\d{1,2})(月|/|\.|-)(\d{1,2})日?$', text)
  2895. if t2:
  2896. y = t2.group(1)
  2897. m = t2.group(3)
  2898. d = t2.group(5)
  2899. m = '0'+ m if len(m)<2 else m
  2900. d = '0'+d if len(d)<2 else d
  2901. order_begin = order_end = "%s-%s-%s"%(y,m,d)
  2902. return order_begin, order_end
  2903. # 时间样式:"202105"
  2904. t3 = re.search("^(20\d{2})(\d{1,2})$",text)
  2905. if t3:
  2906. year = t3.group(1)
  2907. month = t3.group(2)
  2908. if int(month)>0 and int(month)<=12:
  2909. num = self.get_monthlen(year, month)
  2910. if len(month) < 2:
  2911. month = '0' + month
  2912. if len(num) < 2:
  2913. num = '0' + num
  2914. order_begin = "%s-%s-01" % (year, month)
  2915. order_end = "%s-%s-%s" % (year, month, num)
  2916. return order_begin, order_end
  2917. # 时间样式:"20210510"
  2918. t4 = re.search("^(20\d{2})(\d{2})(\d{2})$", text)
  2919. if t4:
  2920. year = t4.group(1)
  2921. month = t4.group(2)
  2922. day = t4.group(3)
  2923. if int(month) > 0 and int(month) <= 12 and int(day)>0 and int(day)<=31:
  2924. order_begin = order_end = "%s-%s-%s"%(year,month,day)
  2925. return order_begin, order_end
  2926. all_match = re.finditer('^(?P<y1>\d{4})(年|/|\.)(?P<m1>\d{1,2})(?:(月|/|\.)(?:(?P<d1>\d{1,2})日)?)?'
  2927. '(到|至|-)(?:(?P<y2>\d{4})(年|/|\.))?(?P<m2>\d{1,2})(?:(月|/|\.)'
  2928. '(?:(?P<d2>\d{1,2})日)?)?$', text)
  2929. y1 = m1 = d1 = y2 = m2 = d2 = ""
  2930. found_math = False
  2931. for _match in all_match:
  2932. if len(_match.group()) > 0:
  2933. found_math = True
  2934. for k, v in _match.groupdict().items():
  2935. if v!="" and v is not None:
  2936. if k == 'y1':
  2937. y1 = v
  2938. elif k == 'm1':
  2939. m1 = v
  2940. elif k == 'd1':
  2941. d1 = v
  2942. elif k == 'y2':
  2943. y2 = v
  2944. elif k == 'm2':
  2945. m2 = v
  2946. elif k == 'd2':
  2947. d2 = v
  2948. if not found_math:
  2949. return "", ""
  2950. y2 = y1 if y2 == "" else y2
  2951. d1 = '1' if d1 == "" else d1
  2952. d2 = self.get_monthlen(y2, m2) if d2 == "" else d2
  2953. m1 = '0' + m1 if len(m1) < 2 else m1
  2954. m2 = '0' + m2 if len(m2) < 2 else m2
  2955. d1 = '0' + d1 if len(d1) < 2 else d1
  2956. d2 = '0' + d2 if len(d2) < 2 else d2
  2957. order_begin = "%s-%s-%s"%(y1,m1,d1)
  2958. order_end = "%s-%s-%s"%(y2,m2,d2)
  2959. return order_begin, order_end
  2960. def fix_quantity(self, quantity_text, header_quan_unit):
  2961. '''
  2962. 产品数量标准化,统一为数值型字符串
  2963. :param quantity_text: 原始数量字符串
  2964. :param header_quan_unit: 表头数量单位字符串
  2965. :return: 返回数量及单位
  2966. '''
  2967. quantity = quantity_text
  2968. quantity = re.sub('[一壹]', '1', quantity)
  2969. quantity = re.sub('[,,约]|(\d+)', '', quantity)
  2970. ser = re.search('^(\d+\.?\d*)(?([㎡\w/]{,5})', quantity)
  2971. if ser:
  2972. quantity = str(ser.group(1))
  2973. quantity_unit = ser.group(2)
  2974. if quantity_unit == "" and header_quan_unit != "":
  2975. quantity_unit = header_quan_unit
  2976. else:
  2977. quantity = ""
  2978. quantity_unit = ""
  2979. return quantity, quantity_unit
  2980. def find_header(self, items,p0, p1, p2):
  2981. '''
  2982. inner_table 每行正则检查是否为表头,是则返回表头所在列序号,及表头内容
  2983. :param items: 列表,内容为每个td 文本内容
  2984. :param p1: 优先表头正则
  2985. :param p2: 第二表头正则
  2986. :return: 表头所在列序号,是否表头,表头内容
  2987. '''
  2988. items = [re.sub('\s', '', it) for it in items]
  2989. flag = False
  2990. header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': '', '总价': '', '品目': '', '参数': '', '采购人':'', '备注':'','发布日期':'', '品目号':'', '品目名':''}
  2991. product = "" # 产品
  2992. quantity = "" # 数量
  2993. quantity_unit = "" # 数量单位
  2994. unitPrice = "" # 单价
  2995. brand = "" # 品牌
  2996. specs = "" # 规格
  2997. demand = "" # 采购需求
  2998. budget = "" # 预算金额
  2999. order_time = "" # 采购时间
  3000. total_price = "" # 总价
  3001. category = "" # 品目
  3002. parameter = "" # 参数
  3003. tenderee = "" # 采购人
  3004. notes = "" # 备注 2024/3/27 达仁 需求
  3005. issue_date = "" # 发布日期 2024/3/27 达仁 需求
  3006. pinmu_no = "" # 品目号
  3007. pinmu_name = "" # 品目名称
  3008. # for i in range(min(6, len(items))):
  3009. for i in range(len(items)):
  3010. it = items[i]
  3011. if len(it) < 15 and re.search(p0, it) != None:
  3012. flag = True
  3013. if category != "" and category != it:
  3014. continue
  3015. category = it
  3016. header_dic['品目'] = i
  3017. elif len(it) < 15 and re.search(p1, it) != None:
  3018. flag = True
  3019. if product !='' and product != it:
  3020. break
  3021. product = it
  3022. header_dic['名称'] = i
  3023. # break
  3024. # if not flag:
  3025. if product == "":
  3026. # for i in range(min(4, len(items))):
  3027. for i in range(len(items)):
  3028. it = items[i]
  3029. if len(it) < 15 and it != category and re.search(p2, it) and (re.search('^名称|^品名|^品目', it) or re.search(
  3030. '编号|编码|号|情况|报名|单位|位置|地址|数量|单价|价格|金额|品牌|规格类型|型号|公司|中标人|企业|供应商|候选人', it) == None):
  3031. flag = True
  3032. product = it
  3033. header_dic['名称'] = i
  3034. break
  3035. if flag == False and len(items)>3 and re.search('^第[一二三四五六七八九十](包|标段)$', items[0]):
  3036. product = items[0]
  3037. header_dic['名称'] = 0
  3038. flag = True
  3039. if flag:
  3040. # for j in range(i + 1, len(items)):
  3041. for j in range(len(items)):
  3042. if header_dic['品目号'] == "" and re.search('(品目|品类)(编?号|编码|序号)', items[j]):
  3043. header_dic['品目号'] = j
  3044. pinmu_no = items[j]
  3045. elif header_dic['品目名'] == "" and re.search('(品目|品类)名称|采购(品目|品类)$', items[j]):
  3046. header_dic['品目名'] = j
  3047. pinmu_name = items[j]
  3048. if items[j] in [product, category]:
  3049. continue
  3050. if len(items[j]) > 20 and len(re.sub('[\((].*[)\)]|[^\u4e00-\u9fa5]', '', items[j])) > 10:
  3051. continue
  3052. if header_dic['数量']=="" and re.search('数量|采购量', items[j]) and re.search('单价|用途|要求|规格|型号|运输|承运', items[j])==None:
  3053. header_dic['数量'] = j
  3054. quantity = items[j]
  3055. elif header_dic['单位']=="" and re.search('^(数量单位|计量单位|单位)$', items[j]):
  3056. header_dic['单位'] = j
  3057. quantity_unit = items[j]
  3058. elif re.search('单价', items[j]) and re.search('数量|规格|型号|品牌|供应商', items[j])==None:
  3059. header_dic['单价'] = j
  3060. unitPrice = items[j]
  3061. elif re.search('品牌', items[j]):
  3062. header_dic['品牌'] = j
  3063. brand = items[j]
  3064. elif re.search('规格|型号', items[j]):
  3065. header_dic['规格'] = j
  3066. specs = items[j]
  3067. elif re.search('参数', items[j]):
  3068. header_dic['参数'] = j
  3069. parameter = items[j]
  3070. elif re.search('预算单位|(采购|招标|购买)(单位|人|方|主体)|项目业主|采购商|申购单位|需求单位|业主单位',items[j]) and len(items[j])<=8:
  3071. header_dic['采购人'] = j
  3072. tenderee = items[j]
  3073. elif re.search('需求|服务要求|服务标准', items[j]):
  3074. header_dic['需求'] = j
  3075. demand = items[j]
  3076. elif re.search('预算|控制金额', items[j]) and not re.search('预算单位',items[j]):
  3077. header_dic['预算'] = j
  3078. budget = items[j]
  3079. elif re.search('时间|采购实施月份|采购月份|采购日期|预计(招标|采购|发标|发包)(时间|月份)', items[j]):
  3080. header_dic['时间'] = j
  3081. order_time = items[j]
  3082. elif re.search('总价|(成交|中标|验收|合同|预算|控制|总|合计))?([金总]额|价格?)|最高限价|价格|金额', items[j]) and re.search('数量|规格|型号|品牌|供应商', items[j])==None:
  3083. header_dic['总价'] = j
  3084. total_price = items[j]
  3085. elif re.search('^备\s*注$|资质要求|预留面向中小企业|是否适宜中小企业采购预算预留|公开征集信息', items[j]):
  3086. header_dic['备注'] = j
  3087. notes = items[j]
  3088. elif re.search('^\w{,4}发布(时间|日期)$', items[j]):
  3089. header_dic['发布日期'] = j
  3090. issue_date = items[j]
  3091. if header_dic.get('名称', "") != "" or header_dic.get('品目', "") != "":
  3092. # num = 0
  3093. # for it in (quantity, unitPrice, brand, specs, product, demand, budget, order_time, total_price):
  3094. # if it != "":
  3095. # num += 1
  3096. # if num >=2:
  3097. # return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time)
  3098. if set([quantity, brand, specs, unitPrice, total_price])!=set([""]) or set([demand, budget])!=set([""]):
  3099. return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter, pinmu_no, pinmu_name), (product, demand, budget, order_time,tenderee, notes,issue_date)
  3100. flag = False
  3101. return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter, pinmu_no, pinmu_name), (product, demand, budget, order_time,tenderee,notes,issue_date)
  3102. def predict(self, docid='', html='', page_time=""):
  3103. '''
  3104. 正则寻找table表格内 产品相关信息
  3105. :param html:公告HTML原文
  3106. :return:公告表格内 产品、数量、单价、品牌、规格 ,表头,表头列等信息
  3107. '''
  3108. html = html.replace('<br>', '\n').replace('<br/>', '\n')
  3109. html = re.sub("<html>|</html>|<body>|</body>","",html)
  3110. html = re.sub("##attachment##","",html)
  3111. soup = BeautifulSoup(html, 'lxml')
  3112. # flag_yx = True if re.search('采购意向', html) else False
  3113. flag_yx = True if re.search('采购意向|招标意向|选取意向|意向公告|意向公示|意向公开', html) else False
  3114. tables = soup.find_all(['table'])
  3115. headers = []
  3116. headers_demand = []
  3117. header_col = []
  3118. product_link = []
  3119. demand_link = []
  3120. product_set = set()
  3121. total_product_money = 0
  3122. unit_price_list = [] # 单价列表,用于判断是否重复单价,避免多个表格重复提取造成合计产品价格错误。
  3123. total_price_list = [] # 总价列表,拥有判断是否为几行产品合计总价
  3124. # print('表格数:', len(tables))
  3125. for i in range(len(tables)): # (len(tables)-1, -1, -1) 由从最后到前改为 前到后
  3126. table = tables[i]
  3127. if table.parent.name == 'td' and len(table.find_all('td')) <= 3:
  3128. table.string = table.get_text()
  3129. table.name = 'turntable'
  3130. # print('过滤表格:表格父节点为td,且表格td数量小于等于3')
  3131. continue
  3132. if not self.isTrueTable(table):
  3133. continue
  3134. # self.fixSpan(table)
  3135. # inner_table = self.getTable(table)
  3136. inner_table = self.tb.table2list(table)
  3137. table.extract()
  3138. # print(inner_table)
  3139. i = 0
  3140. found_header = False
  3141. header_quan_unit = "" # 数量表头 包含单位
  3142. header_colnum = 0
  3143. if flag_yx:
  3144. # print('意向公告, 提取意向信息')
  3145. col0_l = []
  3146. col1_l = []
  3147. for tds in inner_table:
  3148. if len(tds) == 2:
  3149. col0_l.append(re.sub('[::]', '', tds[0])) # 处理只有两列的情况
  3150. col1_l.append(tds[1])
  3151. elif len(tds)>=4 and len(inner_table)==2: # 处理只有两行的情况
  3152. col0_l = inner_table[0]
  3153. col1_l = inner_table[1]
  3154. break
  3155. # print(set(col0_l))
  3156. # print('head: ',set(col0_l) & self.header_set)
  3157. if len(set(col0_l) & self.header_set) > len(col0_l) * 0.2 and len(col0_l)==len(col1_l): # 保证两个列数一致
  3158. header_list2 = []
  3159. product = demand = budget = order_begin = order_end = ""
  3160. tenderee = ""
  3161. notes = ''
  3162. issue_date = ''
  3163. for i in range(len(col0_l)):
  3164. if re.search('项目名称', col0_l[i]):
  3165. header_list2.append(col0_l[i])
  3166. product = col1_l[i]
  3167. elif re.search('采购需求|需求概况|招标内容|项目概况', col0_l[i]):
  3168. header_list2.append(col0_l[i])
  3169. demand = col1_l[i]
  3170. elif re.search('采购预算|预算金额|控制金额', col0_l[i]):
  3171. header_list2.append(col0_l[i])
  3172. _budget = col1_l[i]
  3173. re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", _budget)
  3174. if re_price:
  3175. # _budget = re_price[0]
  3176. # if '万元' in col0_l[i] and '万' not in _budget:
  3177. # _budget += '万元'
  3178. # budget = str(getUnifyMoney(_budget))
  3179. _budget, _money_unit = money_process(_budget, col0_l[i])
  3180. budget = str(_budget)
  3181. if '.' in budget:
  3182. budget = budget.rstrip('0').rstrip('.')
  3183. if float(budget)>= 500*100000000:
  3184. budget = ""
  3185. elif re.search('预算单位|(采购|招标|购买)(单位|人|方|主体)|项目业主|采购商|申购单位|需求单位|业主单位', col0_l[i]):
  3186. header_list2.append(col0_l[i])
  3187. tenderee = re.sub("\s","",col1_l[i])
  3188. if len(tenderee) > 20:
  3189. tenderee = ""
  3190. elif re.search('采购时间|采购实施月份|采购月份|采购日期|预计(招标|采购|发标|发包)(时间|月份)', col0_l[i]):
  3191. header_list2.append(col0_l[i])
  3192. order_time = col1_l[i].strip()
  3193. order_begin, order_end = self.fix_time(order_time, html, page_time)
  3194. elif re.search('^备\s*注$|资质要求|预留面向中小企业|是否适宜中小企业采购预算预留|公开征集信息', col0_l[i]):
  3195. header_list2.append(col0_l[i])
  3196. notes = col1_l[i].strip()
  3197. elif re.search('^\w{,4}发布(时间|日期)$', col0_l[i]):
  3198. header_list2.append(col0_l[i])
  3199. issue_date = self.fix_time(col1_l[i].strip(), '', '')[0]
  3200. if order_begin != "" and order_end!="":
  3201. order_begin_year = int(order_begin.split("-")[0])
  3202. order_end_year = int(order_end.split("-")[0])
  3203. # 限制附件错误识别时间
  3204. if order_begin_year>=2050 or order_end_year>=2050:
  3205. order_begin = order_end = ""
  3206. # print(product,demand,budget,order_begin)
  3207. if product!= "" and demand != "" and budget!="" and order_begin != "":
  3208. link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
  3209. 'order_begin': order_begin, 'order_end': order_end ,'tenderee':tenderee, 'notes':notes, 'issue_date':issue_date}
  3210. if link not in demand_link:
  3211. demand_link.append(link)
  3212. headers_demand.append('_'.join(header_list2))
  3213. continue
  3214. if len(inner_table)>3 and len(inner_table[0])==2 and len(inner_table[1])==2: # 只有两列且第一列为表头的,行列切换
  3215. col0_l = []
  3216. col1_l = []
  3217. for tds in inner_table:
  3218. if len(tds) == 2:
  3219. col0_l.append(re.sub('[::]', '', tds[0])) # 处理只有两列的情况
  3220. col1_l.append(tds[1])
  3221. else:
  3222. break
  3223. if len(set(col0_l) & self.header_set) > len(col0_l) * 0.5 and len(col0_l) == len(col1_l):
  3224. inner_table = [col0_l, col1_l]
  3225. elif len(inner_table)>2 and len(inner_table[0])==4 and len(inner_table[1])==4 and len(set(inner_table[0]) & self.header_set)==2: # 只有两列且第一列为表头的,行列切换
  3226. col0_l = []
  3227. col1_l = []
  3228. col2_l = []
  3229. col3_l = []
  3230. for tds in inner_table:
  3231. if len(tds) == 4 and len(set(tds))>2:
  3232. col0_l.append(re.sub('[::]', '', tds[0])) # 处理只有两列的情况
  3233. col1_l.append(tds[1])
  3234. col2_l.append(re.sub('[::]', '', tds[2])) # 处理只有两列的情况
  3235. col3_l.append(tds[3])
  3236. else:
  3237. break
  3238. if len(set(col0_l) & self.header_set) > len(col0_l) * 0.5 and len(set(col2_l) & self.header_set) > len(col2_l) * 0.5:
  3239. inner_table = [col0_l+col2_l, col1_l+col3_l]
  3240. while i < (len(inner_table)):
  3241. tds = inner_table[i]
  3242. not_empty = [it for it in tds if re.sub('\s', '', it) != ""]
  3243. if len(set(not_empty))<2 or len(set(tds))<2 or (len(set(tds))==2 and re.search('总计|合计|汇总', tds[0])): # 非空列或者不重复内容小于两列的 继续
  3244. i += 1
  3245. # print('表格产品提取:非空列或者不重复内容小于两列的 继续', i, tds)
  3246. continue
  3247. product = "" # 产品
  3248. quantity = "" # 数量
  3249. quantity_unit = "" # 数量单位
  3250. unitPrice = "" # 单价
  3251. brand = "" # 品牌
  3252. specs = "" # 规格
  3253. demand = "" # 采购需求
  3254. budget = "" # 预算金额
  3255. order_time = "" # 采购时间
  3256. order_begin = ""
  3257. order_end = ""
  3258. total_price = "" # 总金额
  3259. parameter = "" # 参数
  3260. tenderee = "" # 采购人
  3261. notes = '' # 备注
  3262. issue_date = '' # 发布日期
  3263. pinmu_no = '' # 品目号
  3264. pinmu_name = '' # 品目名称
  3265. if len(set([re.sub('[::\s]','',td) for td in tds]) & self.header_set) > len(tds) * 0.4:
  3266. # if len(set(tds) & self.header_set) > len(tds) * 0.2:
  3267. header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p0, self.p1, self.p2)
  3268. if found_header:
  3269. header_colnum = len(tds) # 保存表头所在行列数
  3270. if found_header and isinstance(header_list, tuple) and len(header_list) > 2: # 获取表头中的 数量单位
  3271. quantity_header = header_list[1].replace('单位:', '')
  3272. if re.search('(([\w/]{,5}))', quantity_header):
  3273. header_quan_unit = re.search('(([\w/]{,5}))', quantity_header).group(1)
  3274. else:
  3275. header_quan_unit = ""
  3276. if found_header and ('_'.join(header_list) not in headers or '_'.join(header_list2) not in headers_demand):# and len(headers)<1: # 只保留出现的第一个表头
  3277. headers.append('_'.join(header_list))
  3278. headers_demand.append('_'.join(header_list2))
  3279. header_col.append('_'.join(tds))
  3280. i += 1
  3281. # print('表头数量占行列数0.4倍不做内容匹配', set([re.sub('[::]','',td) for td in tds]) & self.header_set, tds)
  3282. continue
  3283. elif found_header:
  3284. if len(tds) > header_colnum or len(tds)-1<max([it for it in header_dic.values() if it!=""]): # 表头、属性列数不一致跳过
  3285. i += 1
  3286. # print('表头、属性列数不一致跳过', len(tds), header_colnum, tds)
  3287. continue
  3288. id0 = header_dic.get('品目', "")
  3289. id1 = header_dic.get('名称', "")
  3290. id2 = header_dic.get('数量', "")
  3291. id2_2 = header_dic.get('单位', "")
  3292. id3 = header_dic.get('单价', "")
  3293. id4 = header_dic.get('品牌', "")
  3294. id5 = header_dic.get('规格', "")
  3295. id6 = header_dic.get('需求', "")
  3296. id7 = header_dic.get('预算', "")
  3297. id8 = header_dic.get('时间', "")
  3298. id9 = header_dic.get("总价", "")
  3299. id10 = header_dic.get('参数', "")
  3300. id11 = header_dic.get('采购人', "")
  3301. id12 = header_dic.get('备注', "")
  3302. id13 = header_dic.get('发布日期', "")
  3303. id14 = header_dic.get('品目号', "")
  3304. id15 = header_dic.get('品目名', "")
  3305. not_attr = 0
  3306. for k, v in header_dic.items():
  3307. if isinstance(v, int):
  3308. if v >= len(tds) or tds[v] in self.header_set:
  3309. # print('内容属性在表头集合里面', tds[v], v >= len(tds))
  3310. not_attr = 1
  3311. # break
  3312. if not_attr>=2: # 只要属性里面有两项为表头,停止匹配
  3313. i += 1
  3314. found_header = False
  3315. # print('只要属性里面有两项为表头,停止匹配')
  3316. continue
  3317. if id1!="" and re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \
  3318. re.search('备注|汇总|合计|总价|价格|金额|^详见|无$|xxx', tds[id1]) == None:
  3319. product = tds[id1]
  3320. if id0!="" and re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id0]) and tds[id0] not in self.header_set and \
  3321. re.search('备注|汇总|合计|总价|价格|金额|^详见|无$|xxx', tds[id0]) == None:
  3322. category = tds[id0]
  3323. product = "%s_%s"%(category, product) if product!="" and product!=category else category
  3324. if product != "" and product not in ['工程类', '服务类', '货物类', '工程', '服务', '货物']:
  3325. # print('匹配产品内容: ', product)
  3326. if id2 != "":
  3327. if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
  3328. # if re.search('(^\d{,3}(,?\d{3}){2,}(\.\d{2,7},?)$)|万?元', tds[id2]): # 254816100 这篇数量很大,貌似正常
  3329. # i += 1
  3330. # print('过滤:数量包含金额单位或值很大类似金额', tds[id2])
  3331. # continue
  3332. quantity = tds[id2]
  3333. elif re.search('\w{5,}', tds[id2]) and re.search('^详见|^详情', tds[id2])==None:
  3334. i += 1
  3335. # print('过滤:数量包含五个字符以上且不包含^详见|^详情等字符', tds[id2])
  3336. continue
  3337. if id2_2 != "":
  3338. if re.search('^\w{1,4}$', tds[id2_2]) and re.search('元', tds[id2_2])==None:
  3339. quantity_unit = tds[id2_2]
  3340. if id3 != "":
  3341. if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
  3342. unitPrice = tds[id3]
  3343. elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$|¥|¥|RMB|USD|EUR|JPY|CNY|元$', tds[id3].strip()):
  3344. unitPrice = tds[id3]
  3345. elif len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '', tds[id3])) > 5 and re.search('^详见|^详情', tds[id3])==None:
  3346. i += 1
  3347. # print('过滤:产品单价包含金额外的字符数大于5个', tds[id3])
  3348. continue
  3349. else:
  3350. unitPrice = tds[id3]
  3351. if id4 != "":
  3352. if re.search('\w', tds[id4]):
  3353. brand = tds[id4]
  3354. if re.match('^详见|^详情', brand.strip()):
  3355. brand = ""
  3356. else:
  3357. brand = ""
  3358. if id5 != "":
  3359. if re.search('\w', tds[id5]):
  3360. specs = tds[id5][:500] # 限制最多500字
  3361. if re.match('^详见|^详情', specs.strip()):
  3362. specs = ""
  3363. else:
  3364. specs = ""
  3365. if id6 != "":
  3366. if re.search('\w', tds[id6]):
  3367. demand = tds[id6]
  3368. else:
  3369. demand = ""
  3370. if id7 != "":
  3371. if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id7]):
  3372. budget = tds[id7]
  3373. if id8 != "":
  3374. if re.search('\w', tds[id8]):
  3375. order_time = tds[id8].strip()
  3376. order_begin, order_end = self.fix_time(order_time, html, page_time)
  3377. if id9 != "":
  3378. if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id9]):
  3379. total_price = tds[id9]
  3380. elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$|¥|¥|RMB|USD|EUR|JPY|CNY|元$', tds[id9].strip()):
  3381. total_price = tds[id9]
  3382. elif len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '', tds[id9])) > 5 and re.search('^详见|^详情', tds[id9])==None:
  3383. i += 1
  3384. # print('过滤:产品总价包含金额外的字符数大于5个', tds[id9])
  3385. continue
  3386. if id10 != "":
  3387. parameter = tds[id10][:500]
  3388. if re.match('^详见|^详情', parameter.strip()):
  3389. parameter = ""
  3390. if id11 != "":
  3391. tenderee = re.sub("\s","",tds[id11])
  3392. if len(tenderee) > 30:
  3393. tenderee = ""
  3394. if id12 != "":
  3395. notes = tds[id12].strip()
  3396. if id13 != "":
  3397. issue_date = self.fix_time(tds[id13].strip(), '', '')[0]
  3398. if id14 != "":
  3399. pinmu_no = tds[id14].strip()
  3400. if id15 != "":
  3401. pinmu_name = tds[id15].strip()
  3402. # print('数量:{0}, 单价:{1}, 品牌:{2}, 规格:{3},总价:{4}'.format(quantity ,unitPrice, brand, specs, total_price))
  3403. if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price or '单价' in header_dic or '总价' in header_dic:
  3404. if id1!="" and id2 != "" and id3 != "" and len(re.split('[;;、,\n]+', tds[id2])) > 1 and len(re.split('[;;、,\n]+', tds[id1])) == len(re.split('[;;、,\n]+', tds[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
  3405. products = re.split('[;;、,\n]+', tds[id1])
  3406. quantitys = re.split('[;;、,\n]+', tds[id2])
  3407. unitPrices = re.split('[;;、,\n]+', tds[id3])
  3408. total_prices = re.split('[;;、,\n]+', total_price)
  3409. brands = re.split('[;;、,\n]+', brand) if re.search('等$', brand)==None else [brand]
  3410. specses = re.split('[;;、,\n]+', specs) if re.search('等$', specs)==None else [specs]
  3411. parameters = re.split('[;;、,\n]+', parameter) if re.search('等$', parameter)==None else [parameter]
  3412. unitPrices = [""]*len(products) if len(unitPrices)==1 else unitPrices
  3413. total_prices = [""]*len(products) if len(total_prices)==1 else total_prices
  3414. brands = brands*len(products) if len(brands)==1 else brands
  3415. specses = specses*len(products) if len(specses)==1 else specses
  3416. brands = [brand]*len(products) if len(brands) < len(products) else brands
  3417. specses = [specs] * len(products) if len(specses) < len(products) else specses
  3418. parameters = parameters*len(products) if len(parameters)==1 else parameters
  3419. # print('产品拆分:', len(products),len(quantitys) , len(unitPrices),len(brands),len(specses))
  3420. if len(products) == len(quantitys) == len(unitPrices) == len(brands) == len(specses):
  3421. for product, quantity, unitPrice, brand, specs, total_price, parameter in zip(products,quantitys,unitPrices, brands, specses, total_prices, parameters):
  3422. if product.strip() == '': # 20241219修复 572876124 最后一个符号分割产品所有要素为空问题
  3423. continue
  3424. if quantity != "":
  3425. quantity, quantity_unit_ = self.fix_quantity(quantity, header_quan_unit)
  3426. quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
  3427. if unitPrice != "":
  3428. unitPrice, _money_unit = money_process(unitPrice, header_list[3])
  3429. unitPrice = str(unitPrice) if unitPrice != 0 and unitPrice<100000000 else ""
  3430. if budget != "":
  3431. budget, _money_unit = money_process(budget, header_list2[2])
  3432. budget = str(budget) if budget != 0 and budget<50000000000 else ''
  3433. if total_price != "":
  3434. total_price, _money_unit = money_process(total_price, header_list[6])
  3435. total_price_list.append(total_price)
  3436. total_price = str(total_price) if total_price != 0 and total_price<50000000000 else ""
  3437. link = {'product': product, 'quantity': quantity,
  3438. 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
  3439. 'brand': brand[:50], 'specs': specs, 'total_price': total_price, 'parameter': parameter}
  3440. # if link not in product_link:
  3441. # product_link.append(link)
  3442. # mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
  3443. # if link['unitPrice'] != "" and mat:
  3444. # try:
  3445. # total_product_money += float(link['unitPrice']) * float(
  3446. # mat.group(1).replace(',', '')) if float(
  3447. # mat.group(1).replace(',', '')) < 50000 else 0
  3448. # except:
  3449. # log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
  3450. # link['unitPrice'], link['quantity']))
  3451. if (product, specs, unitPrice, quantity) not in product_set:
  3452. product_set.add((product, specs, unitPrice, quantity))
  3453. product_link.append(link)
  3454. if link['unitPrice'] != "" and link['quantity'] != '':
  3455. try:
  3456. total_product_money += float(link['unitPrice']) * float(
  3457. link['quantity']) if float(link['quantity']) < 50000 else 0
  3458. except:
  3459. log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
  3460. link['unitPrice'], link['quantity']))
  3461. elif len(product)>100: # 产品名称长于100字
  3462. i += 1
  3463. # print('过滤: 产品名称长于100字',)
  3464. continue
  3465. else:
  3466. if quantity != "":
  3467. quantity, quantity_unit_ = self.fix_quantity(quantity, header_quan_unit)
  3468. quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
  3469. if unitPrice != "":
  3470. unitPrice, _money_unit = money_process(unitPrice, header_list[3])
  3471. unitPrice = str(unitPrice) if unitPrice != 0 and unitPrice<100000000 else ""
  3472. if budget != "":
  3473. budget, _money_unit = money_process(budget, header_list2[2])
  3474. budget = str(budget) if budget != 0 and budget<50000000000 else ''
  3475. if total_price != "":
  3476. total_price, _money_unit = money_process(total_price, header_list[6])
  3477. total_price_list.append(total_price)
  3478. total_price = str(total_price) if total_price != 0 and total_price<50000000000 else ""
  3479. link = {'product': product, 'quantity': quantity, 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
  3480. 'brand': brand[:50], 'specs':specs, 'total_price': total_price, 'parameter': parameter,
  3481. 'pinmu_no': pinmu_no, 'pinmu_name': pinmu_name}
  3482. # if link not in product_link:
  3483. # product_link.append(link)
  3484. # mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
  3485. # if link['unitPrice'] != "" and mat:
  3486. # try:
  3487. # total_product_money += float(link['unitPrice'])*float(mat.group(1).replace(',', '')) if float(mat.group(1).replace(',', ''))<50000 else 0
  3488. # except:
  3489. # log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
  3490. # if (product, unitPrice, quantity) not in product_set:
  3491. # product_set.add((product, unitPrice, quantity))
  3492. if (product, unitPrice,) not in product_set: # 2023/09/22 改为只判断产品/单价,只要两个一样就不作为新产品 避免多个表格重复表达有些没数量造成重复提取 353858683
  3493. product_set.add((product, unitPrice))
  3494. product_link.append(link)
  3495. if link['unitPrice']:
  3496. unit_price_list.append(link['unitPrice'])
  3497. if link['unitPrice'] != "" and link['quantity'] != '':
  3498. try:
  3499. total_product_money += float(link['unitPrice'])*float(link['quantity']) if float(link['quantity'])<50000 else 0
  3500. if float(link['unitPrice'])>10000 and float(link['quantity'])>100: # 修复 325105750 总价做单价 造成中标金额错误
  3501. total_product_money = 0
  3502. except:
  3503. log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
  3504. if order_begin != "" and order_end != "":
  3505. order_begin_year = int(order_begin.split("-")[0])
  3506. order_end_year = int(order_end.split("-")[0])
  3507. # 限制附件错误识别时间
  3508. if order_begin_year >= 2050 or order_end_year >= 2050:
  3509. order_begin = order_end = ""
  3510. # print(budget,order_time)
  3511. if budget != "" and order_time != "":
  3512. link = {'project_name': product, 'product':[], 'demand': demand, 'budget': budget, 'order_begin':order_begin, 'order_end':order_end, 'tenderee':tenderee,'notes':notes,'issue_date':issue_date}
  3513. if link not in demand_link:
  3514. demand_link.append(link)
  3515. i += 1
  3516. else:
  3517. i += 1
  3518. if len(total_price_list)>1 and len(set(total_price_list))/len(total_price_list)<=0.5: # 2023/7/27 总价一半以上重复的为多行一个总价,需去掉
  3519. # print('总价一半以上重复的为多行一个总价,需去掉', total_price_list)
  3520. for link in product_link: # 预防最后一列总价为所有产品总价,列补全后所有产品总价一样情况
  3521. if 'total_price' in link:
  3522. link['total_price'] = ""
  3523. if len(demand_link) > 2 and demand_link[0].get('budget', '') != '' and len(set([d.get('budget', '') for d in demand_link])) == 1: # 20250310 去掉多项目共用招标金额 例:598019007
  3524. for d in demand_link:
  3525. if 'budget' in d:
  3526. d['budget'] = ""
  3527. if len(unit_price_list)>0 and len(unit_price_list)==len(product_link) and len(set(unit_price_list))/len(unit_price_list)<=0.5: # 2023/7/18 如果单价重复率高不算总产品价避免错误
  3528. # print('如果单价重复率高不算总产品价避免错误')
  3529. total_product_money = 0
  3530. # for link in product_link:
  3531. # if 'unitPrice' in link:
  3532. # link['unitPrice'] = ""
  3533. if len(product_link)>0:
  3534. product_link = [{k:v for k,v in d.items() if v!=''} for d in product_link]
  3535. attr_dic = {'product_attrs':{'data':product_link, 'header':headers, 'header_col':header_col}}
  3536. else:
  3537. attr_dic = {'product_attrs': {'data': [], 'header': [], 'header_col': []}}
  3538. if len(demand_link)>0:
  3539. demand_link = [{k: v for k, v in d.items() if v != ''} for d in demand_link]
  3540. demand_dic = {'demand_info':{'data':demand_link, 'header':headers_demand, 'header_col':header_col}}
  3541. else:
  3542. demand_dic = {'demand_info':{'data':[], 'header':[], 'header_col':[]}}
  3543. # print('表格产品属性提取:', attr_dic)
  3544. return [attr_dic, demand_dic], total_product_money
  3545. def predict_without_table(self,product_attrs,list_sentences,list_entitys,codeName,prem, html='', page_time=""):
  3546. if len(prem[0]['prem'])==1:
  3547. list_sentences[0].sort(key=lambda x:x.sentence_index)
  3548. list_sentence = list_sentences[0]
  3549. list_entity = list_entitys[0]
  3550. _data = product_attrs[1]['demand_info']['data']
  3551. re_bidding_time = re.compile("(采购|采购实施|预计(招标|采购|发标|发包))(时间|月份|日期)[::,].{0,2}$")
  3552. order_times = []
  3553. for entity in list_entity:
  3554. if entity.entity_type=='time':
  3555. sentence = list_sentence[entity.sentence_index]
  3556. s = spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index,
  3557. end_index=entity.end_index,size=20)
  3558. entity_left = "".join(s[0])
  3559. if re.search(re_bidding_time,entity_left):
  3560. time_text = entity.entity_text.strip()
  3561. standard_time = re.compile("((?P<year>\d{4}|\d{2})\s*[-\/年\.]\s*(?P<month>\d{1,2})\s*[-\/月\.]\s*((?P<day>\d{1,2})日?)?)")
  3562. time_match = re.search(standard_time,time_text)
  3563. # print(time_text, time_match)
  3564. if time_match:
  3565. time_text = time_match.group()
  3566. order_times.append(time_text)
  3567. # print(order_times)
  3568. order_times = [tuple(self.fix_time(order_time, html, page_time)) for order_time in order_times]
  3569. order_times = [order_time for order_time in order_times if order_time[0]!=""]
  3570. if len(set(order_times))==1:
  3571. order_begin,order_end = order_times[0]
  3572. project_name = codeName[0]['name']
  3573. pack_info = [pack for pack in prem[0]['prem'].values()]
  3574. budget = pack_info[0].get('tendereeMoney',0)
  3575. product = prem[0]['product']
  3576. link = {'project_name': project_name, 'product': product, 'demand': project_name, 'budget': budget,
  3577. 'order_begin': order_begin, 'order_end': order_end}
  3578. _data.append(link)
  3579. product_attrs[1]['demand_info']['data'] = _data
  3580. # print('predict_without_table: ', product_attrs)
  3581. return product_attrs
  3582. def predict_by_text(self,product_attrs,html,list_outlines,product_list,page_time=""):
  3583. product_entity_list = list(set(product_list))
  3584. list_outline = list_outlines[0]
  3585. get_product_attrs = False
  3586. for _outline in list_outline:
  3587. if re.search("信息|情况|清单|概况",_outline.outline_summary):
  3588. outline_text = _outline.outline_text
  3589. outline_text = outline_text.replace(_outline.outline_summary,"")
  3590. key_value_list = [_split for _split in re.split("[,。;]",outline_text) if re.search("[::]",_split)]
  3591. if not key_value_list:
  3592. continue
  3593. head_list = []
  3594. head_value_list = []
  3595. for key_value in key_value_list:
  3596. key_value = re.sub("^[一二三四五六七八九十]{1,3}[、.]|^[\d]{1,2}[、.]\d{,2}|^[\((]?[一二三四五六七八九十]{1,3}[\))][、]?","",key_value)
  3597. temp = re.split("[::]",key_value)
  3598. if len(temp)>2:
  3599. if temp[0] in head_list:
  3600. key = temp[0]
  3601. value = "".join(temp[1:])
  3602. else:
  3603. key = temp[-2]
  3604. value = temp[-1]
  3605. else:
  3606. key = temp[0]
  3607. value = temp[1]
  3608. key = re.sub("^[一二三四五六七八九十]{1,3}[、.]|^[\d]{1,2}[、.]\d{,2}|^[\((]?[一二三四五六七八九十]{1,3}[\))][、]?","",key)
  3609. head_list.append(key)
  3610. head_value_list.append(value)
  3611. head_set = set(head_list)
  3612. # print('head_set',head_set)
  3613. if len(head_set & self.header_set) > len(head_set)*0.2:
  3614. loop_list = []
  3615. begin_list = [0]
  3616. for index,head in enumerate(head_list):
  3617. if head not in loop_list:
  3618. if re.search('第[一二三四五六七八九十](包|标段)', head) and re.search('第[一二三四五六七八九十](包|标段)', '|'.join(loop_list)):
  3619. begin_list.append(index)
  3620. loop_list = []
  3621. loop_list.append(head)
  3622. else:
  3623. loop_list.append(head)
  3624. else:
  3625. begin_list.append(index)
  3626. loop_list = []
  3627. loop_list.append(head)
  3628. headers = []
  3629. headers_demand = []
  3630. header_col = []
  3631. product_link = []
  3632. demand_link = []
  3633. product_set = set()
  3634. for idx in range(len(begin_list)):
  3635. if idx==len(begin_list)-1:
  3636. deal_list = head_value_list[begin_list[idx]:]
  3637. tmp_head_list = head_list[begin_list[idx]:]
  3638. else:
  3639. deal_list = head_value_list[begin_list[idx]:begin_list[idx+1]]
  3640. tmp_head_list = head_list[begin_list[idx]:begin_list[idx+1]]
  3641. product = "" # 产品
  3642. quantity = "" # 数量
  3643. quantity_unit = "" # 单位
  3644. unitPrice = "" # 单价
  3645. brand = "" # 品牌
  3646. specs = "" # 规格
  3647. demand = "" # 采购需求
  3648. budget = "" # 预算金额
  3649. order_time = "" # 采购时间
  3650. order_begin = ""
  3651. order_end = ""
  3652. total_price = "" # 总金额
  3653. parameter = "" # 参数
  3654. header_dic, found_header, header_list, header_list2 = self.find_header(tmp_head_list, self.p0, self.p1,self.p2)
  3655. if found_header:
  3656. headers.append('_'.join(header_list))
  3657. headers_demand.append('_'.join(header_list2))
  3658. header_col.append('_'.join(tmp_head_list))
  3659. # print('header_dic: ',header_dic)
  3660. id0 = header_dic.get('品目', "")
  3661. id1 = header_dic.get('名称', "")
  3662. id2 = header_dic.get('数量', "")
  3663. id2_2 = header_dic.get('单位', "")
  3664. id3 = header_dic.get('单价', "")
  3665. id4 = header_dic.get('品牌', "")
  3666. id5 = header_dic.get('规格', "")
  3667. id6 = header_dic.get('需求', "")
  3668. id7 = header_dic.get('预算', "")
  3669. id8 = header_dic.get('时间', "")
  3670. id9 = header_dic.get("总价", "")
  3671. id10 = header_dic.get('参数', "")
  3672. if id1!='' and re.search('[a-zA-Z\u4e00-\u9fa5]', deal_list[id1]) and deal_list[id1] not in self.header_set and \
  3673. re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', deal_list[id1]) == None:
  3674. product = deal_list[id1]
  3675. if id0 != "" and re.search('[a-zA-Z\u4e00-\u9fa5]', deal_list[id0]) and deal_list[id0] not in self.header_set and \
  3676. re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', deal_list[id0]) == None:
  3677. category = deal_list[id0]
  3678. product = "%s_%s" % (category, product) if product != "" else category
  3679. if product == "":
  3680. # print(deal_list[id4],deal_list[id5],tmp_head_list,deal_list)
  3681. if (id4 != "" and deal_list[id4] != "") or (id5 != "" and deal_list[id5] != ""):
  3682. for head,value in zip(tmp_head_list,deal_list):
  3683. if value and value in product_entity_list:
  3684. product = value
  3685. break
  3686. if product != "":
  3687. if id2 != "":
  3688. if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]):
  3689. quantity = deal_list[id2]
  3690. quantity = re.sub('[()(),,约]', '', quantity)
  3691. quantity = re.sub('[一壹]', '1', quantity)
  3692. ser = re.search('^(\d+(?:\.\d+)?)([㎡\w/]{,5})', quantity)
  3693. if ser:
  3694. quantity = str(ser.group(1))
  3695. quantity_unit = ser.group(2)
  3696. if float(quantity)>=10000*10000:
  3697. quantity = ""
  3698. quantity_unit = ""
  3699. else:
  3700. quantity = ""
  3701. quantity_unit = ""
  3702. if id2_2 != "":
  3703. if re.search('^\w{1,4}$', deal_list[id2_2]):
  3704. quantity_unit = deal_list[id2_2]
  3705. else:
  3706. quantity_unit = ""
  3707. # if id2 != "":
  3708. # if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]):
  3709. # quantity = deal_list[id2]
  3710. # else:
  3711. # quantity = ""
  3712. if id3 != "":
  3713. if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id3]):
  3714. _unitPrice = deal_list[id3]
  3715. re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_unitPrice)
  3716. if re_price:
  3717. # _unitPrice = re_price[0]
  3718. # if '万元' in header_list[3] and '万' not in _unitPrice:
  3719. # _unitPrice += '万元'
  3720. # unitPrice = getUnifyMoney(_unitPrice)
  3721. # if unitPrice>=10000*10000:
  3722. # unitPrice = ""
  3723. # unitPrice = str(unitPrice)
  3724. _unitPrice, _money_unit = money_process(_unitPrice, header_list[3])
  3725. if _unitPrice >= 10000 * 10000:
  3726. _unitPrice = ""
  3727. unitPrice = str(_unitPrice)
  3728. if '.' in unitPrice:
  3729. unitPrice = unitPrice.rstrip('0').rstrip('.')
  3730. if id4 != "":
  3731. if re.search('\w', deal_list[id4]):
  3732. brand = deal_list[id4]
  3733. if re.match('^详见|^详情', brand.strip()):
  3734. brand = ""
  3735. else:
  3736. brand = ""
  3737. if id5 != "":
  3738. if re.search('\w', deal_list[id5]):
  3739. specs = deal_list[id5][:500]
  3740. if re.match('^详见|^详情', specs.strip()):
  3741. brand = ""
  3742. else:
  3743. specs = ""
  3744. if id6 != "":
  3745. if re.search('\w', deal_list[id6]):
  3746. demand = deal_list[id6]
  3747. else:
  3748. demand = ""
  3749. if id7 != "":
  3750. if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id7]):
  3751. _budget = deal_list[id7]
  3752. re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_budget)
  3753. if re_price:
  3754. # _budget = re_price[0]
  3755. # if '万元' in header_list2[2] and '万' not in _budget:
  3756. # _budget += '万元'
  3757. # budget = str(getUnifyMoney(_budget))
  3758. _budget, _money_unit = money_process(_budget, header_list2[2])
  3759. budget = str(_budget)
  3760. if '.' in budget:
  3761. budget = budget.rstrip('0').rstrip('.')
  3762. if float(budget)>= 100000*10000:
  3763. budget = ""
  3764. if id8 != "":
  3765. if re.search('\w', deal_list[id8]) and re.search("(采购|采购实施|预计(招标|采购|发标|发包))(时间|月份|日期)",header_list2[3]):
  3766. order_time = deal_list[id8].strip()
  3767. order_begin, order_end = self.fix_time(order_time, html, page_time)
  3768. if id9 != "":
  3769. if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id9]):
  3770. total_price = deal_list[id9]
  3771. elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$', deal_list[id9].strip()):
  3772. total_price = deal_list[id9]
  3773. if id10 != "":
  3774. parameter = deal_list[id10][:500]
  3775. if re.match('^详见|^详情', parameter.strip()):
  3776. parameter = ""
  3777. if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price:
  3778. if id1 != "" and id2 != "" and id3 != "" and len(re.split('[;;、,\n]', deal_list[id2])) > 1 and len(
  3779. re.split('[;;、,\n]', deal_list[id1])) == len(re.split('[;;、,\n]', deal_list[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
  3780. products = re.split('[;;、,\n]', deal_list[id1])
  3781. quantitys = re.split('[;;、,\n]', deal_list[id2])
  3782. unitPrices = re.split('[;;、,\n]', deal_list[id3])
  3783. total_prices = re.split('[;;、,\n]', total_price)
  3784. brands = re.split('[;;、,\n]', brand) if re.search('等$', brand) == None else [brand]
  3785. specses = re.split('[;;、,\n]', specs) if re.search('等$', specs) == None else [specs]
  3786. parameters = re.split('[;;、,\n]', parameter) if re.search('等$', parameter) == None else [parameter]
  3787. unitPrices = [""] * len(products) if len(unitPrices) == 1 else unitPrices
  3788. total_prices = [""] * len(products) if len(total_prices) == 1 else total_prices
  3789. brands = brands * len(products) if len(brands) == 1 else brands
  3790. specses = specses * len(products) if len(specses) == 1 else specses
  3791. parameters = parameters * len(products) if len(parameters) == 1 else parameters
  3792. if len(products) == len(quantitys) == len(unitPrices) == len(brands) == len(
  3793. specses):
  3794. for product, quantity, unitPrice, brand, specs, total_price, parameter in zip(
  3795. products, quantitys, unitPrices, brands, specses, total_prices,
  3796. parameters):
  3797. if quantity != "":
  3798. quantity, quantity_unit_ = self.fix_quantity(quantity,quantity_unit)
  3799. quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
  3800. if unitPrice != "":
  3801. unitPrice, _money_unit = money_process(unitPrice, header_list[3])
  3802. unitPrice = str(unitPrice) if unitPrice != 0 and unitPrice<100000000 else ""
  3803. if budget != "":
  3804. budget, _money_unit = money_process(budget, header_list2[2])
  3805. budget = str(budget) if budget != 0 and budget<50000000000 else ''
  3806. if total_price != "":
  3807. total_price, _money_unit = money_process(total_price,
  3808. header_list[6])
  3809. total_price = str(total_price) if total_price != 0 and total_price<50000000000 else ""
  3810. link = {'product': product, 'quantity': quantity,
  3811. 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
  3812. 'brand': brand[:50], 'specs': specs, 'total_price': total_price,
  3813. 'parameter': parameter}
  3814. if (product, specs, unitPrice, quantity) not in product_set:
  3815. product_set.add((product, specs, unitPrice, quantity))
  3816. product_link.append(link)
  3817. # if link['unitPrice'] != "" and link['quantity'] != '':
  3818. # try:
  3819. # total_product_money += float(link['unitPrice']) * float(
  3820. # link['quantity']) if float(
  3821. # link['quantity']) < 50000 else 0
  3822. # except:
  3823. # log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
  3824. # link['unitPrice'], link['quantity']))
  3825. elif len(unitPrice) > 15 or len(product) > 100: # 单价大于15位数或 产品名称长于100字
  3826. # i += 1
  3827. continue
  3828. else:
  3829. if quantity != "":
  3830. quantity, quantity_unit_ = self.fix_quantity(quantity, quantity_unit)
  3831. quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
  3832. if unitPrice != "":
  3833. unitPrice, _money_unit = money_process(unitPrice, header_list[3])
  3834. unitPrice = str(unitPrice) if unitPrice != 0 and unitPrice<100000000 else ""
  3835. if budget != "":
  3836. budget, _money_unit = money_process(budget, header_list2[2])
  3837. budget = str(budget) if budget != 0 and budget<50000000000 else ''
  3838. if total_price != "":
  3839. total_price, _money_unit = money_process(total_price, header_list[6])
  3840. total_price = str(total_price) if total_price != 0 and total_price<50000000000 else ""
  3841. link = {'product': product, 'quantity': quantity,
  3842. 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
  3843. 'brand': brand[:50], 'specs': specs, 'total_price': total_price,
  3844. 'parameter': parameter}
  3845. if (product, specs, unitPrice, quantity) not in product_set:
  3846. product_set.add((product, specs, unitPrice, quantity))
  3847. product_link.append(link)
  3848. # if link['unitPrice'] != "" and link['quantity'] != '':
  3849. # try:
  3850. # total_product_money += float(link['unitPrice']) * float(
  3851. # link['quantity']) if float(link['quantity']) < 50000 else 0
  3852. # except:
  3853. # log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
  3854. # link['unitPrice'], link['quantity']))
  3855. if order_begin != "" and order_end != "":
  3856. order_begin_year = int(order_begin.split("-")[0])
  3857. order_end_year = int(order_end.split("-")[0])
  3858. # 限制附件错误识别时间
  3859. if order_begin_year >= 2050 or order_begin_year < 2000 or order_end_year >= 2050 or order_end_year < 2000:
  3860. order_begin = order_end = ""
  3861. # print(budget, order_time)
  3862. if budget != "" and order_time != "":
  3863. link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
  3864. 'order_begin': order_begin, 'order_end': order_end}
  3865. if link not in demand_link:
  3866. demand_link.append(link)
  3867. if len(product_link) > 0:
  3868. attr_dic = {'product_attrs': {'data': product_link, 'header': list(set(headers)), 'header_col': list(set(header_col))}}
  3869. get_product_attrs = True
  3870. else:
  3871. attr_dic = {'product_attrs': {'data': [], 'header': [], 'header_col': []}}
  3872. if len(demand_link) > 0:
  3873. demand_dic = {'demand_info': {'data': demand_link, 'header': headers_demand, 'header_col': header_col}}
  3874. else:
  3875. demand_dic = {'demand_info': {'data': [], 'header': [], 'header_col': []}}
  3876. product_attrs[0] = attr_dic
  3877. if len(product_attrs[1]['demand_info']['data']) == 0:
  3878. product_attrs[1] = demand_dic
  3879. if get_product_attrs:
  3880. break
  3881. # print('predict_by_text: ', product_attrs)
  3882. return product_attrs
  3883. def add_product_attrs(self,channel_dic, product_attrs, list_sentences,list_entitys,list_outlines,product_list,codeName,prem,text,page_time):
  3884. # print(1,product_attrs[1]['demand_info']['data'])
  3885. if channel_dic['docchannel']['docchannel']=="采购意向" and len(product_attrs[1]['demand_info']['data']) == 0:
  3886. product_attrs = self.predict_without_table(product_attrs, list_sentences,list_entitys,codeName,prem,text,page_time)
  3887. # print(2,product_attrs[1]['demand_info']['data'])
  3888. if len(product_attrs[0]['product_attrs']['data']) == 0:
  3889. product_attrs = self.predict_by_text(product_attrs,text,list_outlines,product_list,page_time)
  3890. # print(3,product_attrs[1]['demand_info']['data'])
  3891. if len(product_attrs[1]['demand_info']['data'])>0:
  3892. for d in product_attrs[1]['demand_info']['data']:
  3893. for product in set(prem[0]['product']):
  3894. if product in d['project_name'] and product not in d['product']:
  3895. d['product'].append(product) #把产品在项目名称中的添加进需求要素中
  3896. # docchannel类型提取
  3897. class DocChannel():
  3898. def __init__(self, life_model='/channel_savedmodel/channel.pb', type_model='/channel_savedmodel/doctype.pb',config=None):
  3899. self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
  3900. self.mask, self.mask_title = self.load_life(life_model,config)
  3901. self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\
  3902. self.type_mask, self.type_mask_title = self.load_type(type_model)
  3903. self.sequen_len = 200 # 150 200
  3904. self.title_len = 30
  3905. self.sentence_num = 10
  3906. self.kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
  3907. lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
  3908. lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
  3909. self.id2type = {k: v for k, v in enumerate(lb_type)}
  3910. self.id2life = {k: v for k, v in enumerate(lb_life)}
  3911. self.load_pattern()
  3912. def load_pattern(self):
  3913. self.type_dic = {
  3914. '土地矿产': '供地结果|(土地|用地|宗地|地块|海域|矿)的?(基本信息|基本情况|概况|信息|详情|来源|用途|性质|编号|位置|坐落|使用年限|出让年限)|(土地|山地|农田)(经营权)?(出让|出租|招租|租赁|承包|流转)|流转土地',
  3915. '拍卖出让': '(拍卖|变卖|流拍|竞拍|竞买)的?(公告|活动|信息|结果|成交|主体|标的|资产|财产|方式|类型|流程|程序|规则|价格|保证金|时间)|(公开|进行|密封)(拍卖|变卖|竞拍)|第[一二三四五六七八九\d]次拍卖|(资产|司法|网络)拍卖|交易方式.{,2}拍卖|拍卖会|(拍卖.?方式|起拍价)[::]|竞买人资格|竞买人资质要求',
  3916. '产权交易': '(产权|资产|权证)的?(类型|类别|用途|性质|状态|信息|名称|编号|(基本)?情况)|(经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量)(挂牌|转让|出让)|竞价销售|销售结果|房屋所有权房产|免租期限|交易期限|(受让|转让|承租|出租)(人|方)|(店面|店铺|商铺|铺位?|门面|门市|食堂|饭堂|校舍|车位|停车场|厂?房|仓?库|馆|资产|物业|房产|房屋|场地|农田|鱼?塘)\w{,4}(处置|招租|出租|续租|租赁|转让)|(出租|转让|产权|资产)(项目|中标|成交|流标|废标)|出租(用途|类型)|转让底价|租赁(标的物|情况)|看[样货](时间|地[点址]|方式|仓库|验货)|最小加价|加价[幅梯]度|交易模式[::\s]*延时竞价销售|挂牌(开始|结束)时间|挂牌价格?',
  3917. '采招数据': '(采购|招标)(条件|范围|文件|内容)|(申请人|投标人|供应商|报价人|参选人)的?资格要求;|采购需求清单|最低价排序|竞争性采购方式|采购进行公开竞价|竞价模式[::\s]*一次报价|预算金额|代理银行资格选定' # |变更|答疑|澄清|中标|成交|合同|废标|流标 |(采购|招标|代理)(人|机构|单位)|
  3918. }
  3919. self.title_type_dic = { # ‘**2’ 为类别强相关关键词,不会被其他规则修正
  3920. '土地矿产': '(土地|用地|宗地|荒地|山地|海域|矿)(出让|出租|招租|租赁|承包|流转|使用权|经营权|征收|划拨|中标|成交)|供地结果|矿业权|探矿权|采矿权|(土地|用地|宗地|地块)(使用权)?(终止|中止|网上)?(挂牌|出让|拍卖|招拍|划拨)|征收土地|流转土地',
  3921. '土地矿产2': '(土地|用地|宗地|荒地|山地|海域|矿)(出让|出租|招租|租赁|承包|流转|使用权|经营权|征收|划拨|中标|成交)|供地结果|矿业权|探矿权|采矿权|(土地|用地|宗地|地块)(使用权)?(终止|中止|网上)?(挂牌|出让|拍卖|招拍|划拨)|征收土地|流转土地',
  3922. '拍卖出让': '(拍卖|变卖|拍(变)卖|流拍|竞拍|竞买)[\)\])】]?的?(公告|公示|预告|告知书)|拍卖|变卖|流拍|竞拍|第[一二三四五六七八九\d]次拍卖|拍卖会$',
  3923. '拍卖出让2': '(拍卖|变卖|拍(变)卖|流拍|竞拍|竞买)[\)\])】]?的?(公告|公示|预告|告知书)|拍卖|变卖|流拍|第[一二三四五六七八九\d]次拍卖|拍卖会$',
  3924. '产权交易': '经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让|废[旧弃]?(物资|设备|资源|金属|钢筋|料)处[置理]',
  3925. '产权交易2': '使用权|租赁权|股权|债权|排污权|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让|废[旧弃]?(物资|设备|资源|金属|钢筋|料)处[置理]',
  3926. # '采招数据': '(采购|招标|询价|议价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|征询|调研)的?(公告|公示|中标|成交|结果|$)|工程招标|定点服务|(设备|服务|\w{2})[直采]购|(建设|改造)项目|工程|拦标价|控制价|银行|资格选定|资金|公款|存款|存放|现金管理|招募|入围|入库',
  3927. '采招数据': '(采购|招标|询价|议价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|征询|调研)的?(公告|公示|中标|成交|结果|$)|工程招标|定点服务|(设备|服务|\w{2})[直采]购|(建设|改造)项目|拦标价|控制价|资格选定|资格认定|资金|公款|存款|现金管理|招募|入库|遴选.{,25}(服务|事务所|机构)',
  3928. # |竞价 采招/产权都有竞价方式 # 意向|需求|预公?告|报建|总承包|工程|施工|设计|勘察|代理|监理 |变更|答疑|澄清|中标|成交|合同|废标|流标
  3929. '新闻资讯': '(考试|面试|笔试)成绩|成绩的?(公告|公示|公布)|公开招聘|招聘(公告|简章|启事|合同制)|疫情防控\s{,5}(通知|情况|提示)|行政审批结果'
  3930. }
  3931. self.life_dic = {
  3932. '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
  3933. '采购意向neg': '发布政府采购意向|采购意向公告已于',
  3934. '招标预告': '(预计|计划)(招标|采购|发标|发包)(时间|日期)|采购(计划编号|需求方案|预告|预案)|(预|需求)公示|需求(方案|信息|论证|公告|公示)',
  3935. '招标公告': '(采购|招标|竞选|报名)条件|报名(时间|流程|方法|要求|\w{,5}材料)[:\s]|[^\w]成交规则|参加竞价采购交易资格|(申请人|投标人|供应商|报价人|参选人)的?资格(要求|条件)|获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)|评选方式:?\s*价格最低',
  3936. '资审结果': '资审及业绩公示|资审结果及业绩|资格后审情况报告|资格(后审|预审|审查)结果(公告|公示)|(预审|审查)工作已经?结束|未通过原因', #|资格
  3937. '招标答疑': '现澄清(为|如下)|答疑补遗|澄清内容如下|第[0-9一二三四五]次澄清|答疑澄清|(最高(投标)?限价|控制价|拦标价)公示', # |异议的回复
  3938. '公告变更': '第[\d一二]次变更|(更正|变更)(公告|公示|信息|内容|事项|原因|理由|日期|时间|如下)|原公告((主要)?(信息|内容)|发布时间)|(变更|更正)[前后]内容|现?在?(变更|更正|修改|更改)(内容)?为|(公告|如下|信息|内容|事项|结果|文件|发布|时间|日期)(更正|变更)',
  3939. '公告变更neg': '履约变更内容',
  3940. '候选人公示': '候选人公示|评标结果公示|中标候选人名单公示|现将中标候选人(进行公示|公[示布]如下)|(中标|中选)候选人(信息|情况)[::\s]',
  3941. '候选人公示neg': '中标候选人公示期|中标候选人公示前',
  3942. '中标信息': '供地结果信息|采用单源直接采购的?情况说明|[特现]?将\w{,4}(成交|中标|中选|选定结果|选取结果|入围结果|竞价结果)\w{,4}(进行公示|公[示布]如下)|(询价|竞价|遴选)?(成交|中标|中选)(公告|公示)|(成交|中标|中选|选定|选取|入围|询价)结果(如下|公告|公示)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|((中标|中选)(人|成交)|成交)\w{,3}(信息|情况)[::\s]',
  3943. '中标信息2': '\s(成交|中标|中选)(信息|日期|时间|总?金额|价格)[::\s]|(成交|中标|中选)价格\s*[\d.,]+(?万?元|(采购|招标|成交|中标|中选|评标)结果|单一来源(采购|招标)?的?(中标|成交|结果)|项目已结束|中标公示 ', # |单一来源采购原因|拟采取单一来源方式采购|单一来源采购公示
  3944. '中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让)(供应商|供货商|服务商|机构|企业|公司|单位|候选人|人)(信息[,:]?)?(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]|确定[\w()]{6,25}为中标人', # |唯一
  3945. '中标信息neg': '按项目控制价下浮\d%即为成交价|成交原则|不得确定为(中标|成交)|招标人按下列原则选择中标人|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)[:\s]|竞拍起止时间:|询价结果[\s\n::]*不公开|本项目已具备招标条件|现对该项目进行招标公告|发布\w{2}结果后\d天内送达|本次\w{2}结果不对外公示|供应商\s*资格要求|成交情况:\s*[流废]标|中标单位:本次招标拟?中标单位\d家|通知中标单位|影响(成交|中标)结果|确定为成交供应商|(成交|中标|中选)公[告示](发布|\w{,2})后|竞价成交后', # 503076535 按照服务方案的优劣 确定为成交供应商
  3946. # |确定成交供应商[:,\s]
  3947. '合同公告': '合同(公告|公示|信息|内容)|合同(编号|名称|主体|基本情况|完成(日期|时间))|(供应商乙方|乙方供应商):|合同总?金额|履约信息',
  3948. '废标公告': '(终止|中止|废标|流标|流采|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):?废标|((本|该)(项目|标段|合同|合同包|采购包|次)\w{,5})((失败|终止|流标|废标)|予以废标|(按|做|作)?(流标|废标|废置)处理)|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答|项目)(终止|中止|废标|流标|失败|作废|异常|撤销)',
  3949. '废标公告2': '(无效|中止|终止|废标|流标|失败|作废|异常|撤销)的?(原因|理由)|本项目因故取消|本(项目|次)(公开)?\w{2}失败|已终止\s*原因:|(人|人数|供应商|单位)(不足|未达\w{,3}数量)|已终止|不足[3三]家|无(废标)|成交情况:\s*[流废]标|现予以废置|报名未够三家',
  3950. '废标公告neg': '超过此报价将作为[废流]标处理|否则按[废流]标处理|终止规则:|成交规则:|视为流标|竞价失败的一切其他情形|是否废标:否|若不足三家公司参与|供应商数量:?\s*报名供应商不足三家|有效报价不足三家,\s*系统自动废标|如遇项目流[标采]' # 503076535 供应商数量: 报名供应商不足三家。
  3951. }
  3952. self.title_life_dic = {
  3953. '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示|意向公开',
  3954. '招标预告': '预公?告|预公示|报建公告|(批前|标前)公示|(供应|招标)计划表?$|(论证|征求|征集)(供应商)?意见|意见征询|需求评审公告|需求(公告|公示|意见)',
  3955. '公告变更': '第[\d一二]次变更|(变更|更正(事项)?|更改|延期|暂停)(招标|采购)?的?(公告|公示|通知)|变更$|更正$',
  3956. '招标答疑': '质疑|澄清|答疑(文件)?|补遗书?|(最高(投标)?限价|控制价|拦标价)(公示|公告|$)',
  3957. '废标公告': '(终止|中止|废标|废除|废置|流标|失败|作废|异常|撤销|撤回|取消成?交?|流拍|停止)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)|关于废置',
  3958. '合同公告': '(合同(成交|变更)?)(公告|公示|信息|公式|公开|签订)|合同备案|合同书|合同$', # |(履约|验收)(结果)?
  3959. '候选人公示': '候选人(变更)?公示|评标(结果)?(公[告示]|报告)|评审结果', #中标前公示|中标预公示|
  3960. '中标信息': '(中标|中选|中价|中租|成交)?|入选|确认)(候选人|人|供应商|记录|结果|变更|情况)?的?(公告|公示|结果)|未?入围(公示|公告)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|磋商|交易|出让|抽取|抽签)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书|中标$|项目中标|(项目|工程|服务|定点)的?结果公[告示]|超市直购订单', # |开标(记录|信息|情况)
  3961. '资审结果': '((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示',
  3962. '招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)|(资审|预审|后审)公告',
  3963. '开标记录': '开标记录|截标信息|评委名单公示|开标安排|开标数据表|开标信息|开标情况|开标一览表|开标结果|开标会|评审专家公示|开标日程',
  3964. '验收合同': '(验收|履约)(公告|公示)|(验收|履约)(结果|报告|意见|单)(公告|公示)|预留项目执行情况'
  3965. }
  3966. def load_life(self,life_model,config):
  3967. with tf.Graph().as_default() as graph:
  3968. output_graph_def = graph.as_graph_def()
  3969. with open(os.path.dirname(__file__)+life_model, 'rb') as f:
  3970. output_graph_def.ParseFromString(f.read())
  3971. tf.import_graph_def(output_graph_def, name='')
  3972. # print("%d ops in the final graph" % len(output_graph_def.node))
  3973. del output_graph_def
  3974. sess = tf.Session(graph=graph,config=config)
  3975. sess.run(tf.global_variables_initializer())
  3976. inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
  3977. prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
  3978. title = sess.graph.get_tensor_by_name('inputs/title:0')
  3979. mask = sess.graph.get_tensor_by_name('inputs/mask:0')
  3980. mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
  3981. # logit = sess.graph.get_tensor_by_name('output/logit:0')
  3982. softmax = sess.graph.get_tensor_by_name('output/softmax:0')
  3983. return sess, title, inputs, prob, softmax, mask, mask_title
  3984. def load_type(self,type_model):
  3985. with tf.Graph().as_default() as graph:
  3986. output_graph_def = graph.as_graph_def()
  3987. with open(os.path.dirname(__file__)+type_model, 'rb') as f:
  3988. output_graph_def.ParseFromString(f.read())
  3989. tf.import_graph_def(output_graph_def, name='')
  3990. # print("%d ops in the final graph" % len(output_graph_def.node))
  3991. del output_graph_def
  3992. sess = tf.Session(graph=graph)
  3993. sess.run(tf.global_variables_initializer())
  3994. inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
  3995. prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
  3996. title = sess.graph.get_tensor_by_name('inputs/title:0')
  3997. mask = sess.graph.get_tensor_by_name('inputs/mask:0')
  3998. mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
  3999. # logit = sess.graph.get_tensor_by_name('output/logit:0')
  4000. softmax = sess.graph.get_tensor_by_name('output/softmax:0')
  4001. return sess, title, inputs, prob, softmax, mask, mask_title
  4002. def predict_process(self, docid='', doctitle='', dochtmlcon=''):
  4003. # print('准备预处理')
  4004. def get_kw_senten(s, span=10):
  4005. doc_sens = []
  4006. tmp = 0
  4007. num = 0
  4008. end_idx = 0
  4009. for it in re.finditer(self.kws, s): # '|'.join(keywordset)
  4010. left = s[end_idx:it.end()].split()
  4011. right = s[it.end():].split()
  4012. tmp_seg = s[tmp:it.start()].split()
  4013. if len(tmp_seg) > span or tmp == 0:
  4014. doc_sens.append(' '.join(left[-span:] + right[:span]))
  4015. end_idx = it.end() + 1 + len(' '.join(right[:span]))
  4016. tmp = it.end()
  4017. num += 1
  4018. if num >= self.sentence_num:
  4019. break
  4020. if doc_sens == []:
  4021. doc_sens.append(s)
  4022. return doc_sens
  4023. def word2id(wordlist, max_len=self.sequen_len):
  4024. ids = [getIndexOfWords(w) for w in wordlist]
  4025. ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
  4026. assert len(ids) == max_len
  4027. return ids
  4028. cost_time = dict()
  4029. datas = []
  4030. datas_title = []
  4031. try:
  4032. segword_title = ' '.join(selffool.cut(doctitle)[0])
  4033. segword_content = dochtmlcon
  4034. except:
  4035. segword_content = ''
  4036. segword_title = ''
  4037. if isinstance(segword_content, float):
  4038. segword_content = ''
  4039. if isinstance(segword_title, float):
  4040. segword_title = ''
  4041. segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
  4042. replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
  4043. replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
  4044. segword_title = re.sub('[^\s\u4e00-\u9fa5]', '', segword_title)
  4045. segword_content = re.sub('[^\s\u4e00-\u9fa5]', '', segword_content)
  4046. doc_word_list = segword_content.split()
  4047. if len(doc_word_list) > self.sequen_len / 2:
  4048. doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
  4049. doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
  4050. else:
  4051. doc_sens = ' '.join(doc_word_list[:self.sequen_len])
  4052. # print('标题:',segword_title)
  4053. # print('正文:',segword_content)
  4054. datas.append(doc_sens.split())
  4055. datas_title.append(segword_title.split())
  4056. # print('完成预处理')
  4057. return datas, datas_title
  4058. def is_houxuan(self, title, content):
  4059. '''
  4060. 通过标题和中文内容判断是否属于候选人公示类别
  4061. :param title: 公告标题
  4062. :param content: 公告正文文本内容
  4063. :return: 1 是候选人公示 ;0 不是
  4064. '''
  4065. if re.search('候选人的?公示|评标结果|评审结果|中标公示', title): # (中标|成交|中选|入围)
  4066. if re.search('变更公告|更正公告|废标|终止|答疑|澄清', title):
  4067. return 0
  4068. return 1
  4069. if re.search('候选人的?公示', content[:100]):
  4070. if re.search('公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清', content[:100]):
  4071. return 0
  4072. return 1
  4073. else:
  4074. return 0
  4075. def predict(self, title='', list_sentence='', web_source_no='', original_docchannel=''):
  4076. not_extract_dic = {
  4077. 104: '招标文件',
  4078. 106: '法律法规',
  4079. 107: '新闻资讯',
  4080. 108: '拟建项目',
  4081. 109: '展会推广',
  4082. 110: '企业名录',
  4083. 111: '企业资质',
  4084. 112: '全国工程人员',
  4085. 113: '业主采购'
  4086. }
  4087. if original_docchannel in not_extract_dic:
  4088. return {'docchannel': {'docchannel':'', 'doctype':not_extract_dic[original_docchannel], "original_docchannel_id": str(original_docchannel)}}
  4089. if web_source_no in ['02104-7']:
  4090. return {'docchannel': {'docchannel':'', 'doctype':'采招数据'}}
  4091. if isinstance(list_sentence, list):
  4092. token_l = [it.tokens for it in list_sentence]
  4093. tokens = [it for l in token_l for it in l]
  4094. content = ' '.join(tokens[:500])
  4095. title = re.sub('[^\u4e00-\u9fa5]', '', title)
  4096. if len(title)>50:
  4097. title = title[:20]+title[-30:]
  4098. data_content, data_title = self.predict_process(docid='', doctitle=title[-50:], dochtmlcon=content) # 标题最多取50字
  4099. text_len = len(data_content[0]) if len(data_content[0])<self.sequen_len else self.sequen_len
  4100. title_len = len(data_title[0]) if len(data_title[0])<self.title_len else self.title_len
  4101. result = {'docchannel': {'docchannel':'', 'doctype':'', "original_docchannel_id": str(original_docchannel)}}
  4102. array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128))
  4103. array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128))
  4104. pred = self.type_sess.run(self.type_softmax,
  4105. feed_dict={
  4106. self.type_title: array_title,
  4107. self.type_content: array_content,
  4108. self.type_mask:[[0]*text_len+[1]*(self.sequen_len-text_len)],
  4109. self.type_mask_title:[[0]*title_len+[1]*(self.title_len-title_len)],
  4110. self.type_prob:1}
  4111. )
  4112. id = np.argmax(pred, axis=1)[0]
  4113. prob = pred[0][id]
  4114. result['docchannel']['doctype'] = self.id2type[id]
  4115. # print('公告类别:', self.id2type[id], '概率:',prob)
  4116. # if id == 0:
  4117. if result['docchannel']['doctype'] not in ['', '新闻资讯']:
  4118. pred = self.lift_sess.run(self.lift_softmax,
  4119. feed_dict={
  4120. self.lift_title: array_title,
  4121. self.lift_content: array_content,
  4122. self.mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
  4123. self.mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
  4124. self.lift_prob:1}
  4125. )
  4126. id = np.argmax(pred, axis=1)[0]
  4127. prob = pred[0][id]
  4128. result['docchannel']['docchannel'] = self.id2life[id]
  4129. # print('生命周期:纯模型预测',self.id2life[id], '概率:',prob)
  4130. # if id == 6:
  4131. if result['docchannel']['docchannel'] == '中标信息':
  4132. if self.is_houxuan(''.join([it for it in title if it.isalpha()]), ''.join([it for it in content if it.isalpha()])):
  4133. result['docchannel']['docchannel'] = '候选人公示'
  4134. # return '候选人公示', prob
  4135. # return [{'docchannel': '候选人公示'}]
  4136. return result
  4137. # return [{'docchannel':self.id2life[id]}]
  4138. # else:
  4139. # # return self.id2type[id], prob
  4140. # return [{'docchannel':self.id2type[id]}]
  4141. def predict_rule(self, title, content, channel_dic, prem_dic):
  4142. '''2022/2/10加入规则去除某些数据源及内容过短且不包含类别关键词的公告不做预测'''
  4143. hetong = '(合同|验收|履约)(公告|公示)|合同号?$' # 合同标题正则
  4144. zhongbiao_t = '(中标|中选|成交|入选|入围|结果|确认)(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选)结果|开标(记录|信息|情况)|单一来源|直接(选取|选定)|中标通知书|中标$'
  4145. zhongbiao_c = '(中标|中选|成交|拟选用|拟邀请|最终选定的?|拟定)(供应商|供货商|服务商|企业|公司|单位|(候选)?人)(名称)?[::]|[,。:.](供应商|供货商|服务商)(名称)?:|指定的中介服务机构:|建设服务单位:'
  4146. zhaobiao_t = '(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈)(公告|公示|$)'
  4147. title_cn = re.sub('[^\u4e00-\u9fa5]', '', title)
  4148. if len(re.sub('[^\u4e00-\u9fa5]', "", content))<50 and channel_dic['docchannel']['doctype'] != '新闻资讯':
  4149. if re.search(hetong, title_cn) != None:
  4150. channel_dic['docchannel']['docchannel'] = '合同公告'
  4151. elif re.search(zhongbiao_t, title_cn):
  4152. channel_dic['docchannel']['docchannel'] = '中标信息'
  4153. elif re.search(zhaobiao_t, title_cn):
  4154. channel_dic['docchannel']['docchannel'] = '招标公告'
  4155. else:
  4156. channel_dic['docchannel']['docchannel'] = ''
  4157. elif channel_dic['docchannel'].get('docchannel', '') == '招标公告' and 'win_tenderer' in json.dumps(prem_dic,
  4158. ensure_ascii=False):
  4159. if re.search(hetong, title_cn) != None:
  4160. channel_dic['docchannel']['docchannel'] = '合同公告'
  4161. log('正则把招标公告修改为合同公告')
  4162. elif re.search(zhongbiao_t, title_cn) or re.search(zhongbiao_t, content[:200]) or re.search(zhongbiao_c,
  4163. content):
  4164. channel_dic['docchannel']['docchannel'] = '中标信息'
  4165. log('正则把招标公告修改为中标信息')
  4166. elif channel_dic['docchannel'].get('docchannel', '') == '中标信息' and 'win_tenderer' not in json.dumps(prem_dic,
  4167. ensure_ascii=False):
  4168. if re.search(hetong, title_cn):
  4169. channel_dic['docchannel']['docchannel'] = '合同公告'
  4170. log('正则把中标信息修改为合同公告')
  4171. elif re.search(zhongbiao_t, title_cn) or re.search(zhongbiao_t, content[:200]) or re.search(zhongbiao_c,
  4172. content):
  4173. pass
  4174. elif re.search(zhaobiao_t, title_cn):
  4175. channel_dic['docchannel']['docchannel'] = '招标公告'
  4176. log('正则把中标信息修改为招标公告')
  4177. elif re.search('中标|成交|中选|入选|入围|结果|供应商|供货商|候选人', title_cn+content)==None:
  4178. channel_dic['docchannel']['docchannel'] = ''
  4179. log('正则把中标信息修改为空')
  4180. return channel_dic
  4181. def predict_merge(self, title, list_sentence, html, original_docchannel='', web_source_no=''):
  4182. '''
  4183. 正则,模型混合预测,返回公告类型及生命周期
  4184. :param title: 公告标题
  4185. :param content: 预处理后的返回的句子实体列表 list_sentence
  4186. :param html: 公告原文 html 内容
  4187. :param bidway: 招标方式
  4188. :return: {'docchannel': {'docchannel':'中标信息', 'doctype':'采招数据'}} 字典格式
  4189. '''
  4190. def cut_single_cn_space(text):
  4191. new_text = ""
  4192. for w in text.split():
  4193. if len(w) == 1 or re.search('^[\u4e00-\u9fa5][::]', w):
  4194. new_text += w
  4195. else:
  4196. new_text += ' ' + w
  4197. return new_text
  4198. def html2text(html):
  4199. ser = re.search('<div[^<>]*richTextFetch', html)
  4200. # if ser and len(re.sub('[^\u4e00-\u9fa5]', '', html[:ser.start()]))>500:
  4201. # html = html[:ser.start()]+'##richTextFetch##'
  4202. if ser:
  4203. if len(re.sub('[^\u4e00-\u9fa5]', '', html[:ser.start()])) > 200:
  4204. html = html[:ser.start()] + '##richTextFetch##'
  4205. else:
  4206. html = html[:ser.start() + 500]
  4207. text = re.sub('<[^<]*?>', '', html).replace('&nbsp;', ' ')
  4208. # text = re.sub('http[0-9a-zA-Z-.:/]+|[0-9a-zA-Z-./@]+', '', text)
  4209. text = re.sub('\s+', ' ', text)
  4210. # text = re.sub('[/|[()()]', '', text)
  4211. text = cut_single_cn_space(text)
  4212. return text[:20000]
  4213. def count_diffser(pattern, text):
  4214. num = 0
  4215. kw = []
  4216. for p in pattern.split(';'):
  4217. if re.search(p, text):
  4218. num += 1
  4219. kw.append(re.search(p, text).group(0))
  4220. return num, ';'.join(kw)
  4221. def is_single_source(bidway, title):
  4222. if re.search('单一来源|单一性采购', title):
  4223. return True
  4224. elif bidway == '单一来源':
  4225. return True
  4226. else:
  4227. return False
  4228. def get_type(title, text):
  4229. if re.search(self.title_type_dic['土地矿产'], title) or re.search(self.type_dic['土地矿产'],
  4230. text): # and re.search('(土地|用地|宗地|地块)(经营权)?(流转|承包|出租|招租|租赁|确权)', text)==None
  4231. if re.search(self.title_type_dic['采招数据'], text.strip().split(' ')[0] + title) \
  4232. and not re.search(self.title_type_dic['土地矿产2'], title):
  4233. return '采招数据', re.search(self.title_type_dic['采招数据'], text.strip().split(' ')[0] + title).group(0)
  4234. return '土地矿产', (re.search(self.title_type_dic['土地矿产'], title) or re.search(self.type_dic['土地矿产'], text)).group(0)
  4235. elif (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)):
  4236. if re.search(self.title_type_dic['采招数据'], text.strip().split(' ')[0] + title) \
  4237. and not re.search(self.title_type_dic['拍卖出让2'], title):
  4238. return '采招数据', re.search(self.title_type_dic['采招数据'], text.strip().split(' ')[0] + title).group(0)
  4239. return '拍卖出让', (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)).group(0)
  4240. elif re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text):
  4241. if re.search(self.title_type_dic['采招数据'], text.strip().split(' ')[0] + title) \
  4242. and not re.search(self.title_type_dic['产权交易2'], title):
  4243. return '采招数据', re.search(self.title_type_dic['采招数据'], text.strip().split(' ')[0] + title).group(0)
  4244. return '产权交易', (re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text)).group(0)
  4245. elif re.search(self.title_type_dic['采招数据'], title) or re.search(self.type_dic['采招数据'], title + text) or re.search("银行|资格选定|资格认定|资金|公款|存款|存放|现金管理|招募|入围|入库", title +text.strip().split(' ')[0]):
  4246. return '采招数据', (
  4247. re.search(self.title_type_dic['采招数据'], title) or re.search(self.type_dic['采招数据'], title + text) or re.search("银行|资格选定|资格认定|资金|公款|存款|存放|现金管理|招募|入围|入库", title +text.strip().split(' ')[0])).group(
  4248. 0)
  4249. elif re.search(self.title_type_dic['新闻资讯'], title):
  4250. if re.search(self.title_type_dic['采招数据'], title +text.strip().split(' ')[0]) or re.search("银行|资格选定|资格认定|资金|公款|存款|存放|现金管理|招募|入围|入库", title +text.strip().split(' ')[0]):
  4251. return '采招数据', (re.search(self.title_type_dic['采招数据'], title +text.strip().split(' ')[0]) or re.search("银行|资格选定|资格认定|资金|公款|存款|存放|现金管理|招募|入围|入库", title +text.strip().split(' ')[0])).group(0)
  4252. return '新闻资讯', re.search(self.title_type_dic['新闻资讯'], title).group(0)
  4253. else:
  4254. return '', '没有公告类型关键词,返回空'
  4255. def get_life(title, text):
  4256. title = re.sub('[-()()0-9a-z]|第?[二三四]次公?告?', '', title)
  4257. first_line = text.split()[0] if len(text.split()) > 2 else ''
  4258. if title.strip()[-2:] not in ['公告', '公示'] and 5 < len(first_line) < 50 and first_line[-2:] in ['公告', '公示']:
  4259. # print('title: ', title, first_line)
  4260. title += first_line
  4261. def count_score(l):
  4262. return len(l) + len(set(l)) * 2
  4263. life_kw_title = {}
  4264. life_kw_content = {}
  4265. life_score = {}
  4266. # msc = ""
  4267. # 查找标题每个类别关键词
  4268. for k, v in self.title_life_dic.items():
  4269. k2 = re.sub('[\da-z]', '', k)
  4270. if k2 not in life_kw_title:
  4271. life_kw_title[k2] = []
  4272. for it in re.finditer(v, title):
  4273. life_kw_title[k2].append(it.group(0))
  4274. # 查找正文每个类别关键词
  4275. for k, v in self.life_dic.items():
  4276. k2 = re.sub('[\da-z]', '', k)
  4277. if k2 not in life_kw_content:
  4278. life_kw_content[k2] = {'pos': [], 'neg': []}
  4279. for it in re.finditer(v, text):
  4280. if 'neg' not in k:
  4281. life_kw_content[k2]['pos'].append(it.group(0))
  4282. else:
  4283. life_kw_content[k2]['neg'].append(it.group(0))
  4284. for k2 in life_kw_content:
  4285. life_score[k2] = count_score(life_kw_content[k2]['pos']) - count_score(
  4286. life_kw_content[k2]['neg'])
  4287. life_kw_title = {k: v for k, v in life_kw_title.items() if v != []}
  4288. life_kw_content = {k: v for k, v in life_kw_content.items() if life_score[k] > 0}
  4289. msc = [life_kw_title, life_kw_content, life_score]
  4290. msc = json.dumps(msc, ensure_ascii=False)
  4291. max_score = 0
  4292. life_list = []
  4293. for k in life_score.keys():
  4294. if life_score[k] > max_score:
  4295. max_score = life_score[k]
  4296. life_list = [k]
  4297. elif life_score[k] == max_score and life_score[k] > 0:
  4298. life_list.append(k)
  4299. if '采购意向' in life_kw_title or '采购意向' in life_list:
  4300. if '中标信息' in life_kw_title or '中标信息' in life_list:
  4301. return '中标信息', msc
  4302. elif '候选人公示' in life_kw_title:
  4303. return '候选人公示', msc
  4304. elif set(['候选人公示', '合同公告']) & set(life_kw_title) != set():
  4305. return '', msc
  4306. return '采购意向', msc
  4307. elif '招标预告' in life_kw_title or '招标预告' in life_list:
  4308. if '中标信息' in life_kw_title or '中标信息' in life_list:
  4309. return '中标信息', msc
  4310. elif '候选人公示' in life_kw_title:
  4311. return '候选人公示', msc
  4312. elif set(['候选人公示', '合同公告']) & set(life_kw_title) != set():
  4313. return '', msc
  4314. return '招标预告', msc
  4315. elif '公告变更' in life_kw_title or '公告变更' in life_list:
  4316. if life_score.get('候选人公示', 0) > 3 or '候选人公示' in life_kw_title:
  4317. return '候选人公示', msc
  4318. elif life_score.get('合同公告', 0) > 3 or '合同公告' in life_kw_title:
  4319. return '合同公告', msc
  4320. elif life_score.get('中标信息', 0) > 3 or '中标信息' in life_kw_title:
  4321. return '中标信息', msc
  4322. elif '招标公告' in life_kw_title and re.search('变更|更正', title[-4:])==None and life_score.get('公告变更', 0) < 4:
  4323. return '招标公告', msc
  4324. return '公告变更', msc
  4325. elif '招标答疑' in life_kw_title or '招标答疑' in life_list:
  4326. if '招标公告' in life_kw_title and life_score.get('招标答疑', 0) < 4:
  4327. return '招标公告', msc
  4328. elif life_score.get('招标答疑', 0) < max_score:
  4329. if max_score > 3 and len(life_list) == 1:
  4330. return life_list[0], msc
  4331. return '', msc
  4332. return '招标答疑', msc
  4333. elif '开标记录' in life_kw_title:
  4334. return '开标记录', msc
  4335. elif '验收合同' in life_kw_title:
  4336. return '验收合同', msc
  4337. elif '候选人公示' in life_kw_title or '候选人公示' in life_list:
  4338. if '招标公告' in life_kw_title and '候选人公示' not in life_kw_title: # and life_score.get('招标公告', 0) > 3
  4339. return '招标公告', msc
  4340. elif '废标公告' in life_kw_title or life_score.get('废标公告', 0) > 5:
  4341. return '废标公告', msc
  4342. return '候选人公示', msc
  4343. elif '合同公告' in life_kw_title or '合同公告' in life_list:
  4344. if '招标公告' in life_kw_title and life_score.get('招标公告', 0) > 3:
  4345. return '招标公告', msc
  4346. elif '废标公告' in life_kw_title or life_score.get('废标公告', 0) > 5:
  4347. return '废标公告', msc
  4348. return '合同公告', msc
  4349. elif '中标信息' in life_kw_title or '中标信息' in life_list:
  4350. if '招标公告' in life_kw_title and '中标信息' not in life_kw_title and life_score.get('招标公告',0) >= life_score.get('中标信息',0): # (life_score.get('招标公告', 0)>2 or life_score.get('中标信息', 0)<4) 0.7886409793924245
  4351. return '招标公告', msc
  4352. elif '废标公告' in life_kw_title or life_score.get('废标公告', 0) > 5:
  4353. return '废标公告', msc
  4354. elif life_score.get('候选人公示', 0) > 3:
  4355. return '候选人公示', msc
  4356. elif life_score.get('合同公告', 0) > 5:
  4357. return '合同公告', msc
  4358. return '中标信息', msc
  4359. elif '废标公告' in life_kw_title or '废标公告' in life_list:
  4360. if life_score.get('招标公告', 0) > 3 and '废标公告' not in life_kw_title:
  4361. return '招标公告', msc
  4362. return '废标公告', msc
  4363. elif '资审结果' in life_kw_title or '资审结果' in life_list:
  4364. return '资审结果', msc
  4365. elif '招标公告' in life_kw_title or '招标公告' in life_list:
  4366. return '招标公告', msc
  4367. return '', msc
  4368. def get_model_inputs(list_sentence):
  4369. list_sentence = sorted(list_sentence, key=lambda x:x.sentence_index)
  4370. token_l = [it.tokens for it in list_sentence]
  4371. tokens = [it for l in token_l for it in l]
  4372. content = ' '.join(tokens[:500])
  4373. data_content, data_title = self.predict_process(docid='', doctitle=title[-50:],
  4374. dochtmlcon=content) # 标题最多取50字
  4375. text_len = len(data_content[0]) if len(data_content[0]) < self.sequen_len else self.sequen_len
  4376. title_len = len(data_title[0]) if len(data_title[0]) < self.title_len else self.title_len
  4377. array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128))
  4378. array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128))
  4379. return array_content, array_title ,text_len, title_len, content
  4380. def type_model_predict():
  4381. pred = self.type_sess.run(self.type_softmax,
  4382. feed_dict={
  4383. self.type_title: array_title,
  4384. self.type_content: array_content,
  4385. self.type_mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
  4386. self.type_mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
  4387. self.type_prob: 1}
  4388. )
  4389. id = np.argmax(pred, axis=1)[0]
  4390. prob = pred[0][id]
  4391. return id, prob
  4392. def life_model_predict():
  4393. pred = self.lift_sess.run(self.lift_softmax,
  4394. feed_dict={
  4395. self.lift_title: array_title,
  4396. self.lift_content: array_content,
  4397. self.mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
  4398. self.mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
  4399. self.lift_prob: 1}
  4400. )
  4401. id = np.argmax(pred, axis=1)[0]
  4402. prob = pred[0][id]
  4403. return id, prob
  4404. not_extract_dic = {
  4405. 104: '招标文件',
  4406. 106: '法律法规',
  4407. 107: '新闻资讯',
  4408. 108: '拟建项目',
  4409. 109: '展会推广',
  4410. 110: '企业名录',
  4411. 111: '企业资质',
  4412. 112: '全国工程人员',
  4413. 113: '业主采购'
  4414. }
  4415. origin_dic = {51: '公告变更',
  4416. 52: '招标公告',
  4417. 101: '中标信息',
  4418. 102: '招标预告',
  4419. 103: '招标答疑',
  4420. 104: '招标文件',
  4421. 105: '资审结果',
  4422. 106: '法律法规',
  4423. 107: '新闻资讯',
  4424. 108: '拟建项目',
  4425. 109: '展会推广',
  4426. 110: '企业名录',
  4427. 111: '企业资质',
  4428. 112: '全国工程',
  4429. 113: '业主采购',
  4430. 114: '采购意向',
  4431. 115: '拍卖出让',
  4432. 116: '土地矿产',
  4433. 117: '产权交易',
  4434. 118: '废标公告',
  4435. 119: '候选人公示',
  4436. 120: '合同公告'}
  4437. title = re.sub('[^\u4e00-\u9fa5]+|出租车', '', title)
  4438. if len(title) > 50:
  4439. title = title[:20] + title[-30:]
  4440. text = html2text(html)
  4441. self.origin_dic = origin_dic
  4442. self.title = title
  4443. self.text = text
  4444. if original_docchannel in not_extract_dic:
  4445. return {'docchannel': {'docchannel': '', 'doctype': not_extract_dic[original_docchannel], 'life_docchannel': origin_dic.get(original_docchannel, '原始类别')}}, '公告类别不在提取范围'
  4446. if web_source_no in ['02104-7', '04733', 'DX007628-6']: # 这些数据源无法识别
  4447. return {'docchannel': {'docchannel': origin_dic.get(original_docchannel, '原始类别'), 'doctype': '采招数据', 'life_docchannel': origin_dic.get(original_docchannel, '原始类别')}}, '此数据源公告分类不明确,返回数据源类别'
  4448. if original_docchannel == 303:
  4449. return {'docchannel': {'docchannel': '处罚公告', 'doctype': '处罚公告', 'life_docchannel': '处罚公告'}}, "源类别为处罚公告"
  4450. result = {'docchannel': {'docchannel': '', 'doctype': ''}}
  4451. doc_type, type_kw = get_type(title, text)
  4452. # print(doc_type, type_kw)
  4453. # doc_life, life_kw = get_life(title, text, prem_json, bidway, original_docchannel)
  4454. doc_life, life_kw = get_life(title, text)
  4455. if doc_type in self.title_type_dic:
  4456. result['docchannel']['doctype'] = doc_type
  4457. if doc_life in self.title_life_dic:
  4458. result['docchannel']['docchannel'] = doc_life
  4459. # print('channel正则预测结果:', result)
  4460. msc = '正则结果:类型:%s, 关键词:%s, 周期:%s, 关键词:%s'%(doc_type, type_kw,doc_life, life_kw)+'\n'+'模型结果:'
  4461. # print('类型:%s, 关键词:%s, 周期:%s, 关键词:%s'%(doc_type, type_kw,doc_life, life_kw))
  4462. if doc_type == "" or doc_life == "" or (doc_type != '采招数据' and origin_dic.get(original_docchannel, '原始类别') in ['招标公告', '中标信息', '招标预告', '采购意向']):
  4463. array_content, array_title, text_len, title_len, content = get_model_inputs(list_sentence)
  4464. if doc_type =="" or (doc_type != '采招数据' and origin_dic.get(original_docchannel, '原始类别') in ['招标公告', '中标信息', '招标预告', '采购意向']):
  4465. type_id, type_prob = type_model_predict()
  4466. type_model = self.id2type[type_id]
  4467. if type_model == '新闻资讯' and doc_life!='': # 修复bug 78584245 "docchannel": "合同公告", "doctype": "新闻资讯",
  4468. result['docchannel']['doctype'] = '采招数据'
  4469. msc += '模型结果为新闻资讯,生命周期不为空,改为采招数据;'
  4470. else:
  4471. result['docchannel']['doctype'] = type_model
  4472. msc += type_model + ' 概率:%.4f;'%type_prob
  4473. # print('公告类别:', self.id2type[id], '概率:',prob)
  4474. # if id == 0:
  4475. if doc_life=="" and result['docchannel']['doctype'] not in ['', '新闻资讯']:
  4476. if len(text)>150 and re.search(self.kws, content):
  4477. life_id, life_prob = life_model_predict()
  4478. if life_prob>=0.8:
  4479. life_model = self.id2life[life_id]
  4480. result['docchannel']['docchannel'] = life_model
  4481. msc += life_model + ' 概率:%.4f;\n'%life_prob
  4482. # msc = final_change(msc)
  4483. # print('channel ', msc)
  4484. return result, msc
  def final_change(self, result, prem, original_docchannel, msc):
      """Apply the final hand-written override rules to a channel prediction.

      Uses ``self.origin_dic``, ``self.title`` and ``self.text``, which must
      already be set on the instance, together with the keyword tables
      ``self.title_life_dic``, ``self.title_type_dic`` and ``self.type_dic``.

      The life-cycle rules form one if/elif chain, so at most one of them
      fires; the document-type rules after it are evaluated independently.

      :param result: channel result dict; ``result['docchannel']`` holds the
          ``'docchannel'`` (life cycle) and ``'doctype'`` entries and is
          modified in place (``'life_docchannel'`` is added)
      :param prem: extraction result; ``prem['prem']`` is scanned for the
          tenderee / agency names and the whole object is dumped to JSON to
          look for the text ``win_tenderer``
      :param original_docchannel: channel id given by the source site, used
          as a key of ``self.origin_dic``
      :param msc: remark text; a description of each applied rule is appended
      :return: tuple ``(result, msc)``
      """
      # Best effort: take the first tenderee / agency name found in prem.
      # Any problem with the prem structure simply leaves the names empty.
      tenderee = ""
      agency = ""
      try:
          for _package_name, package in prem['prem'].items():
              for role in package['roleList']:
                  if role['role_name'] == 'tenderee' and tenderee == "":
                      tenderee = role['role_text']
                  if role['role_name'] == 'agency' and agency == "":
                      agency = role['role_text']
      except Exception:
          pass

      origin_dic = self.origin_dic
      title = self.title
      text = self.text
      # Blank out the organisation names so that keywords inside them do not
      # influence the keyword rules below.
      for org_name in (tenderee, agency):
          if org_name:
              title = title.replace(org_name, " ")
              text = text.replace(org_name, " ")

      prem_json = json.dumps(prem, ensure_ascii=False)
      has_winner = bool(re.search('win_tenderer', prem_json))
      channel = result['docchannel']
      life = channel['docchannel']

      def origin_label():
          # Label of the source-site channel, '' when the id is unknown.
          return origin_dic.get(original_docchannel, '')

      def title_lacks(life_key):
          # True when the title has no keyword of the given life-cycle class.
          return re.search(self.title_life_dic[life_key], title) is None

      if (life in ['中标信息', '合同公告']
              and origin_label() in ['招标公告', '采购意向', '招标预告', '公告变更']
              and not has_winner
              and title_lacks('中标信息')):
          # Award / contract predicted, but no winner and the source says
          # "not an award": fall back to the source channel.
          channel['docchannel'] = origin_label()
          msc += '最终规则修改:中标公告、合同公告无中标人且原始为非中标,返回原类型'
      elif (life == '中标信息' and not has_winner
              and re.search("监督(抽查|检查)结果|抽查结果", title)):
          # Award predicted without a winner and the title looks like an
          # inspection report: treat as news.
          channel['doctype'] = "新闻资讯"
          channel['docchannel'] = ""
          msc += '最终规则修改:中标公告无中标人且包含新闻资讯关键词,返回新闻资讯类型'
      elif life == '废标公告' and has_winner and title_lacks('废标公告'):
          # Failed-bid predicted although a winner exists and the title has no
          # failed-bid keyword: contract notice if the title says so,
          # otherwise award.
          channel['docchannel'] = (
              '合同公告' if re.search(self.title_life_dic['合同公告'], title) else '中标信息')
          msc += '最终规则修改:预测为废标却有中标人且标题无废标关键词改为中标信息;'
      elif (life == '招标答疑' and title_lacks('招标答疑')
              and origin_label() in ['招标公告', '采购意向', '招标预告']):
          # Q&A predicted without a Q&A keyword in the title: trust the source.
          channel['docchannel'] = origin_label()
          msc += '最终规则修改:答疑公告标题无答疑关键且原始为招标,返回原始类别;'
      elif (life == '招标公告' and has_winner
              and (origin_label() == '中标信息' or re.search('直接采购', title))):
          # Tender predicted, but a winner exists and either the source says
          # award or the title mentions direct purchase: award.
          channel['docchannel'] = '中标信息'
          msc += '最终规则修改:预测为招标公告却有中标人且原始为中标改为中标信息;'
      elif life == '招标公告' and origin_label() in ['采购意向', '招标预告']:
          # Tender predicted, source says forecast / intention: trust the source.
          channel['docchannel'] = origin_label()
          msc += '最终规则修改:预测为招标,原始为预告、意向,返回原始类别'
      elif (life == '招标预告' and origin_label() in ['采购意向', '招标公告']
              and title_lacks('招标预告')):
          # Forecast predicted without a forecast keyword in the title.
          channel['docchannel'] = origin_label()
          msc += '最终规则修改:预测预告,原始为意向、招标且标题无预告关键词,返回原始类别'
      elif (life in ['招标答疑', '公告变更']
              and origin_label() in ['招标答疑', '公告变更']):
          # Both prediction and source are Q&A / change: trust the source.
          channel['docchannel'] = origin_label()
          msc += '最终规则修改:预测及原始均在答疑、变更,返回原始类别'
      elif (channel['doctype'] == '采招数据'
              and origin_label() in ['产权交易', '土地矿产']
              and re.search('产权|转让|受让|招租|出租|承租|竞价', text)):
          # Procurement predicted, source is property / land and the text has
          # property keywords: use the source label as document type.
          channel['doctype'] = origin_label()
          msc += '最终规则修改:预测为采招数据,原始为产权且有关键词,返回原始类别'
      elif (life == '废标公告'
              and origin_label() in ['招标公告', '采购意向', '招标预告']
              and title_lacks('废标公告')):
          # Failed-bid predicted without keyword, source says tender / forecast.
          channel['docchannel'] = origin_label()
          msc += '最终规则修改:废标公告原始为招标、预告且标题无废标关键期,返回原始类别;'
      elif (life in ['招标公告', '招标预告'] and has_winner
              and re.search('直购', title)):
          # Tender / forecast predicted, winner exists, title says direct buy.
          channel['docchannel'] = '中标信息'
          msc += "最终规则修改:预测为招标却有中标人且标题有直购关键词返回中标"
      elif life == '开标记录' and '开标结果' in title and has_winner:
          # Bid-opening result that already names a winner counts as award.
          msc += "最终规则修改:开标结果包含中标人的作为中标信息"
          channel['docchannel'] = '中标信息'

      # Document-type rules; doctype is re-read because the chain above may
      # have changed it.
      doctype = channel['doctype']
      property_types = ['产权交易', '土地矿产', '拍卖出让']
      if (doctype in property_types
              and origin_label() not in property_types
              and (re.search(self.title_type_dic['采招数据'], title)
                   or re.search('采购|询价|磋商', title)
                   or re.search('(采购|招投?标|投标)(信息|内容|项目|公告|数量|人|单位|方式)|(建设|工程|服务|施工|监理|勘察|设计)项目|(%s)'
                                % self.type_dic['采招数据'], text))):
          # Property-like type predicted, but the source is not a property
          # channel and procurement keywords are present.
          channel['doctype'] = '采招数据'
          msc += ' 最终规则修改:预测为非采招数据,原始为采招数据且有招标关键词,返回采招数据'
      elif doctype == '土地矿产' and origin_label() in ['拍卖出让', '产权交易']:
          if origin_label() == '拍卖出让' and (
                  re.search(self.title_type_dic['拍卖出让'], title)
                  or re.search(self.type_dic['拍卖出让'], text)):
              channel['doctype'] = '拍卖出让'
              msc += "最终规则修改:预测为土地矿产原始为拍卖且有拍卖关键词,返回拍卖"
          elif (re.search(self.title_type_dic['产权交易'], title)
                  or re.search(self.type_dic['产权交易'], text)):
              channel['doctype'] = '产权交易'
              msc += "最终规则修改:预测为土地矿产原始为产权交易且有产权交易关键词,返回产权交易"

      # Extra field of the new output format: the predicted life cycle when
      # there is one, otherwise the label of the source channel.
      if channel['docchannel'] != '':
          channel['life_docchannel'] = channel['docchannel']
      else:
          channel['life_docchannel'] = origin_dic.get(original_docchannel, '原始类别')
      return result, msc
  4616. # 保证金支付方式提取
  class DepositPaymentWay():
      """Extract how the bid deposit has to be paid from announcement text."""

      def __init__(self,):
          # Pattern 1: "<deposit> ... payment method" followed by up to 60
          # characters of the same clause (captured as group 3).
          self.pt = '(保证金的?(交纳|缴纳|应按下列|入账|支付)方式)[::]*([^,。]{,60})'
          # Pattern 2: "deposit must be paid by <...> method" (group 2 holds
          # at most 8 characters between the verb and "方式").
          self.pt2 = '保证金(必?须以|必?须?通过|以)(.{,8})方式'
          kws = ['银行转账', '公?对公方?式?转账', '对公转账', '柜台转账', '(线上|网上)自?行?(缴纳|交纳|缴退|收退)',
                 '网上银行支付', '现金存入', '直接缴纳', '支票', '汇票', '本票', '电汇', '转账', '汇款', '随机码',
                 '入账', '基本账户转出', '基本账户汇入', '诚信库中登记的账户转出',
                 '银行保函', '电子保函', '担保函', '保证保险', '合法担保机构出具的担保', '金融机构、担保机构出具的保函']
          # Longest keyword first, so the joined alternation tries the more
          # specific keyword before its shorter substrings.
          self.kws = sorted(kws, key=len, reverse=True)

      def predict(self,content):
          """Return ``{'deposit_patment_way': '<kw>;<kw>...'}`` for *content*.

          The first pattern that matches decides the result; the keywords
          found inside its captured fragment are joined with ``';'``.  The
          value is ``''`` when neither pattern matches or no keyword is found.
          """
          pay_way = {'deposit_patment_way':''}
          for pattern, group_no in ((self.pt, 3), (self.pt2, 2)):
              hit = re.search(pattern, content)
              if not hit:
                  continue
              fragment = hit.group(group_no)
              found = [m.group(0) for m in re.finditer('|'.join(self.kws), fragment)]
              pay_way['deposit_patment_way'] = ';'.join(found)
              break
          return pay_way
  4647. # 项目标签
  class ProjectLabel():
      """Keyword-driven project labels.

      ``keyword_list`` holds ``(label, keyword, keyword_exclusions,
      label_exclusions)`` tuples and ``kongjing_keyword_list`` holds
      ``(keyword, second_keyword, search_type, info_types)`` tuples; both are
      loaded from CSV files that sit next to this module.
      """

      def __init__(self, ):
          self.keyword_list = self.get_label_keywords()
          self.kongjing_keyword_list = self.get_kongjing_keywords()

      @staticmethod
      def _clean_cell(value):
          """Return a CSV cell as text, mapping empty and 'nan' cells to ''."""
          value = str(value)
          return value if value and value != 'nan' else ""

      @staticmethod
      def _split_terms(raw, sep):
          """Split *raw* on *sep* and drop empty terms.

          Empty terms must not reach the regex alternation built from the
          result: an empty alternative matches every text.
          """
          return [term for term in raw.split(sep) if term]

      @staticmethod
      def _dedupe_products(product):
          """De-duplicate a comma separated product string, keeping first-seen order."""
          return ",".join(dict.fromkeys(product.split(',')))

      @staticmethod
      def _record_hits(text, keyword, exclusion_pattern, label, hits):
          """Append ``(keyword, count)`` to ``hits[label]`` when *keyword* occurs
          in *text* and the exclusion pattern (if any) does not match."""
          if not text or keyword not in text:
              return
          if exclusion_pattern and re.search(exclusion_pattern, text):
              return
          hits.setdefault(label, []).append((keyword, text.count(keyword)))

      @staticmethod
      def _keep_top_labels(label_hits, limit):
          """Keep only the *limit* labels with the highest total match count."""
          if len(label_hits) <= limit:
              return
          ranked = sorted(label_hits,
                          key=lambda label: sum(hit[1] for hit in label_hits[label]),
                          reverse=True)
          for label in ranked[limit:]:
              del label_hits[label]

      def get_label_keywords(self):
          """Load ``project_label_keywords.csv`` from this module's directory.

          Blank rows and the header row (first cell ``'类型'``) are skipped.

          :return: list of ``(label, keyword, keyword_exclusions,
              label_exclusions)`` tuples taken from the first four columns
          """
          import csv
          path = os.path.dirname(__file__)+'/project_label_keywords.csv'
          key_word_list = []
          with open(path, 'r', encoding='utf-8', newline='') as f:
              for row in csv.reader(f):
                  if not row or row[0] == '类型':
                      continue
                  key_word_list.append((row[0], row[1],
                                        self._clean_cell(row[2]),
                                        self._clean_cell(row[3])))
          return key_word_list

      def get_kongjing_keywords(self):
          """Load ``kongjing_label_keywords.csv`` from this module's directory.

          Blank rows and the header row (first cell ``'关键词'``) are skipped.

          :return: list of ``(keyword, second_keyword, search_type,
              info_types)`` tuples taken from the first four columns
          """
          import csv
          path = os.path.dirname(__file__)+'/kongjing_label_keywords.csv'
          key_word_list = []
          with open(path, 'r', encoding='utf-8', newline='') as f:
              for row in csv.reader(f):
                  if not row or row[0] == '关键词':
                      continue
                  key_word_list.append((row[0], self._clean_cell(row[1]),
                                        row[2], self._clean_cell(row[3])))
          return key_word_list

      def predict(self, doctitle,product,project_name,prem):
          """Label a document from its title and its core fields.

          The core text is ``title,products,project name,sub-project names``;
          tenderee and agency names found in *prem* are blanked out of both
          the title and the core text before matching.

          :param doctitle: document title (``None`` / empty allowed)
          :param product: comma separated product words (``None`` allowed)
          :param project_name: project name (``None`` allowed)
          :param prem: sequence whose first element has a ``'prem'`` mapping;
              anything else is tolerated and simply yields no names
          :return: ``{"标题": {label: [(keyword, count), ...]},
              "核心字段": {label: [(keyword, count), ...]}}`` with at most ten
              labels per part
          """
          doctitle = doctitle if doctitle else ""
          product = self._dedupe_products(product if product else "")
          project_name = project_name if project_name else ""
          tenderee = ""
          agency = ""
          sub_project_names = []
          # Best effort: a malformed prem keeps whatever was collected so far.
          try:
              for _package_key, package in prem[0]['prem'].items():
                  name = package.get("name", "")
                  # A missing / non-text name must not break the join below.
                  sub_project_names.append(name if isinstance(name, str) else "")
                  for link in package['roleList']:
                      if link['role_name'] == 'tenderee' and tenderee == "":
                          tenderee = link['role_text']
                      if link['role_name'] == 'agency' and agency == "":
                          agency = link['role_text']
          except Exception:
              pass
          # Core fields: title + product words + project name + sub-project names.
          main_text = ",".join([doctitle, product, project_name,
                                ";".join(sub_project_names)])
          # Remove the tenderee / agency names.
          for org_name in (tenderee, agency):
              if org_name:
                  doctitle = doctitle.replace(org_name, " ")
                  main_text = main_text.replace(org_name, " ")

          doctitle_dict = {}
          main_text_dict = {}
          for item in self.keyword_list:
              label, key_word, key_exclusions, label_exclusions = item[0], item[1], item[2], item[3]
              # Label-level exclusion words: skip the keyword entirely when any
              # of them occurs in the core text.
              label_terms = self._split_terms(label_exclusions, '、')
              if label_terms and re.search("|".join(label_terms), main_text):
                  continue
              # Keyword-level exclusion words are checked per searched text.
              exclusion_pattern = "|".join(self._split_terms(key_exclusions, '、'))
              self._record_hits(doctitle, key_word, exclusion_pattern, label, doctitle_dict)
              self._record_hits(main_text, key_word, exclusion_pattern, label, main_text_dict)

          # Title part: most frequent keyword first, top ten labels only.
          for hits in doctitle_dict.values():
              hits.sort(key=lambda hit: hit[1], reverse=True)
          self._keep_top_labels(doctitle_dict, 10)

          # Core part: when at least one label was matched more than once,
          # labels with a single match in total are dropped.
          single_hit_labels = [label for label, hits in main_text_dict.items()
                               if sum(hit[1] for hit in hits) == 1]
          for hits in main_text_dict.values():
              hits.sort(key=lambda hit: hit[1], reverse=True)
          if len(single_hit_labels) < len(main_text_dict):
              for label in single_hit_labels:
                  main_text_dict.pop(label)
          self._keep_top_labels(main_text_dict, 10)
          return {"标题":doctitle_dict,"核心字段":main_text_dict}

      def predict_other(self,project_label,industry,doctitle,project_name,product,list_articles):
          """Add the ``空净通`` label to *project_label* when its keywords match.

          :param project_label: dict as returned by :meth:`predict`; its
              ``"核心字段"`` entry is updated in place
          :param industry: dict read as ``industry['industry']['class_name']``
              (missing keys give ``''``)
          :param doctitle: document title (``None`` allowed)
          :param project_name: project name (``None`` allowed)
          :param product: comma separated product words (``None`` allowed)
          :param list_articles: sequence whose first element has a ``content``
              string; only the part before ``'##attachment##'`` is searched
          :return: *project_label* (same object)
          """
          # Body text only, the attachment part is ignored.
          doctextcon = list_articles[0].content.split('##attachment##')[0]
          info_type = industry.get('industry',{}).get("class_name","")
          doctitle = doctitle if doctitle else ""
          product = self._dedupe_products(product if product else "")
          project_name = project_name if project_name else ""
          # The searched texts do not depend on the keyword, build them once.
          search_texts = {
              '正文': ",".join([doctextcon, doctitle, project_name, product]),
              '产品': ",".join([doctitle, project_name, product]),
          }
          keywords_list = []
          for item in self.kongjing_keyword_list:
              key_word, key_word2, search_type, info_types = item[0], item[1], item[2], item[3]
              if search_type == '行业':
                  # Industry rows compare the keyword with the industry class.
                  if info_type == key_word:
                      keywords_list.append(key_word)
                  continue
              search_text = search_texts.get(search_type, "")
              if key_word not in search_text:
                  continue
              if key_word2 and key_word2 not in search_text:
                  continue
              allowed_types = info_types.strip('|').split("|") if info_types else []
              if allowed_types and info_type not in allowed_types:
                  continue
              # Record the matched keyword (pair).
              keywords_list.append(key_word + '+' + key_word2 if key_word2 else key_word)
          if keywords_list:
              project_label["核心字段"]["空净通"] = [[word, 1] for word in keywords_list][:10]
          return project_label
  4817. # 产权分类二级标签
  class PropertyLabel():
      '''
      Second-level label for property-rights transaction documents.

      Categories in the built-in priority list (``score_idx``):
      股权, 债权, 知识产权, 矿权, 房产, 土地, 交通运输工具, 闲置物资、设备、材料;
      ``其他`` is returned when nothing matches.  ``经营权`` and ``租赁`` are
      additional keyword categories that take precedence over all others.
      '''

      def __init__(self, ):
          car = "比亚迪|奇瑞|奥迪|宝马|菲尼迪|雷克萨斯|三菱|铃木|马自达|奔驰|劳斯莱斯|北京现代|" \
                "宾利|兰博基尼|布加迪|保时捷|斯柯达|雪佛兰|别克|凯迪拉克|庞蒂亚克|克尔维特|福特|林肯|克莱斯勒|道奇|JEEP品牌"
          # category -> regex alternatives separated by '|'.  An alternative of
          # the form "a+b" is a combination: both parts must occur in the text
          # (see get_type).  Backslashes are doubled so that the non-raw
          # literals contain no invalid escape sequences.
          self.keywords_dict = {
              "房产": "房产|住宅|公寓|商铺|车位|写字楼|办公楼|别墅|综合楼|在建工程|厂房|车库|车房|房转让|房屋|商品房|商业用房|"
                    "宅基地|[\u4e00-\u9fa5]{,2}用房|店面|商业房|门[面市]房|仓库|铺位|地下室|\\d号?(房|室|门市|门面|商?铺|单元|户)|不动产|"
                    "自建房|铺面|商务楼|商住楼|阁楼|(杂物|储物|储藏)(房|间|室)|套房|[\\da-zA-Z](栋|棟|幢|层|座|号?楼|单元)\\d{1,4}(号|房|室|商?铺|户)|"
                    "[\\da-zA-Z](栋|棟|幢|层|座|号?楼|单元)\\d{2,}|门面+转让|楼+变卖|房地产",
              "交通运输工具": "车辆|轿车|汽车(?!用品|库|位|衡)|公车|客车|货车|面包车|SUV|新能源车|二手车|车辆|商用车|机动车|观光车|巴车|"
                        "船舶|四驱" + "|" + car,
              "股权": "\\d.?股|股权(?!交易中心)|\\d%(比例)?.?股|\\d万.?股|\\d.?元/股|增资(?!源)|扩股|股(转让|出售)|百分之[一二三四五六七八九十]{1,3}股",
              "债权": "债权|债权转让|债权人|债务人|原债权人|新债权人|金融资产",
              "土地": "住宅用地|商业用地|工业用地|国有[\u4e00-\u9fa5]{,3}[土用]地|集体土地|划拨|流转|地块编号|"
                    "土地使用权证|土地经营权|土地证|土地[发承]包|[\u4e00-\u9fa5]{,2}用地|土地\\d{1,3}(亩|公?顷)|\\d{1,3}(亩|公?顷)(使用|经营)权|"
                    "承包土地|(地块|土地)承包|水面经营权|[鱼水]塘|鱼池|(水面|旱田)[\u4e00-\u9fa5]{,3}[发承]包|水面资源|(水面|水田)[\u4e00-\u9fa5]{,3}权|"
                    "四荒|林地|林场|林木所有权|采伐权|水利设施所有权|水利设施使用权|海域|滩涂|林业产权|旱田|水田|机动田|机动地|耕地|荒地|农田|"
                    "苗圃地|塘口",
              "矿权": "矿权|矿业权|采矿许可|探矿权|采矿权|开采权|矿产资源处置|矿[\u4e00-\u9fa5]{1,3}开[发采]",
              "知识产权": "知识产权(?!局)|商标|专利|著作权|版权|商业秘密|科研成果",
              "闲置物资、设备、材料": "(废旧|报废|废|闲置|二手|淘汰)(物资|资产|机械|设备|仪器|汽车|车|钢铁|钢材|钢|金属|塑料|材料|导管|漆|渣|有色|品|[\u4e00-\u9fa5]{,2}车|偶头)|"
                            "(金属|机械|设备|仪器|汽车|钢铁|钢材|钢|塑料|有色|)废料|废液|废旧|报废|边角料|残次品|(热轧|冷轧|酸洗|镀铝|热镀|镀锌|镀镁)|"
                            "机[器械]设备|医疗设备|生产设备|办公设备|仪器|仪表|设备出租|设备租赁|拖拉机|收割机|插秧机|挖机|车床|挖掘机|电机|"
                            "戒指|弃渣|电解质块|茶杯|装置|花瓶|女表|手表|男表|硫磺|物资|书画|茶叶|油茶|红茶|[茗名]茶|白酒|红酒|酒水|酒品|名酒|毛石|[石金木铁矿铜锌铝钢]料|"
                            "零部件",
              "经营权": "经营权",
              "租赁": "房+租|市场+续约|资产+出租|租赁|续租|招租|出租|租金|房租"
          }
          self.cqjy_keywords = self.get_cqjy_keywords()
          # Priority order used when several categories match (first wins).
          self.score_idx = ["股权", "债权", "知识产权", "矿权", "房产", "土地", "交通运输工具", "闲置物资、设备、材料"]

      def get_cqjy_keywords(self):
          """Load ``property_label_products.csv`` from this module's directory.

          Blank rows and the header row (first cell ``'product'``) are skipped.

          :return: list of ``(category, keyword)`` tuples built from the second
              and first CSV column
          """
          import csv
          path = os.path.dirname(__file__)+'/property_label_products.csv'
          key_word_list = []
          with open(path, 'r', encoding='utf-8', newline='') as f:
              for row in csv.reader(f):
                  if not row or row[0] == 'product':
                      continue
                  key_word_list.append((row[1], row[0]))
          return key_word_list

      def get_type(self, text):
          """Match *text* against ``self.keywords_dict``.

          For every category the plain alternatives are tried first and the
          start offset of the LAST match is recorded.  Only when none of them
          matches are the ``a+b`` combinations tried; the first combination
          whose two parts both occur records the offset of the first ``b``.

          :return: list of ``(category, offset)`` in dictionary order
          """
          keyword_list = []
          for category, alternatives in self.keywords_dict.items():
              parts = alternatives.split("|")
              plain_pattern = "|".join(part for part in parts if '+' not in part)
              matches = list(re.finditer(plain_pattern, text))
              if matches:
                  keyword_list.append((category, matches[-1].start()))
                  continue
              # Combination keywords.
              for combo in parts:
                  if '+' not in combo:
                      continue
                  first, second = combo.split('+')
                  if not re.search(first, text):
                      continue
                  second_hit = re.search(second, text)
                  if second_hit:
                      keyword_list.append((category, second_hit.start()))
                      break
          return keyword_list

      def get_type2(self, text, cqjy_type_list):
          """Extend *cqjy_type_list* with categories found via ``self.cqjy_keywords``.

          Each category is added at most once and only when it is not already
          in the list.  Keywords containing ``+`` are two regex parts that
          must both be found; other keywords are plain substrings.

          :return: the same list object, with ``(category, offset)`` appended
          """
          have_type = [entry[0] for entry in cqjy_type_list]
          for category, key_word in ((item[0], item[1]) for item in self.cqjy_keywords):
              if category in have_type:
                  continue
              if '+' in key_word:
                  first, second = key_word.split('+')
                  if re.search(first, text) and re.search(second, text):
                      cqjy_type_list.append((category, re.search(second, text).start()))
                      have_type.append(category)
              elif key_word in text:
                  cqjy_type_list.append((category, text.index(key_word)))
                  have_type.append(category)
          return cqjy_type_list

      def predict(self, doctitle,product,project_name,prem,channel_dic):
          """Return the second-level property label of a document.

          The title, the project name and the product words are examined in
          that order (tenderee / agency names removed); the first of them that
          yields any category decides.  A project name with more than 9
          ``,``/``、`` separated parts or a product list with more than 15
          entries is ignored.

          :param channel_dic: channel result; only documents whose
              ``channel_dic['docchannel']['doctype']`` is 土地矿产, 拍卖出让 or
              产权交易 are labelled
          :param prem: sequence whose first element has a ``'prem'`` mapping;
              anything else is tolerated and simply yields no names
          :return: ``""`` for other document types, otherwise one category
              name, ``'其他'`` when nothing matched
          """
          docchannel = channel_dic['docchannel']['doctype']
          if docchannel not in ['土地矿产', '拍卖出让', '产权交易']:
              return ""
          doctitle = doctitle if doctitle else ""
          product = product if product else ""
          product = ",".join(dict.fromkeys(product.split(',')))  # de-duplicate product words
          project_name = project_name if project_name else ""
          tenderee = ""
          agency = ""
          # Best effort: a malformed prem just leaves the names empty.
          try:
              for _package_key, package in prem[0]['prem'].items():
                  for link in package['roleList']:
                      if link['role_name'] == 'tenderee' and tenderee == "":
                          tenderee = link['role_text']
                      if link['role_name'] == 'agency' and agency == "":
                          agency = link['role_text']
          except Exception:
              pass

          cqjy_type = []
          idx = 0
          for idx, text in enumerate([doctitle, project_name, product], start=1):
              if tenderee:
                  text = text.replace(tenderee, "")
              if agency:
                  text = text.replace(agency, "")
              cqjy_type = self.get_type(text)
              if not cqjy_type:
                  cqjy_type = self.get_type2(text, cqjy_type)
              if idx == 2 and len(re.split("[,、]", text)) > 9:  # project_name
                  cqjy_type = []
              if idx == 3 and len(text.split(",")) > 15:  # product
                  cqjy_type = []
              if cqjy_type:
                  break

          # Priority of each category; categories that only exist in the CSV
          # (not in score_idx) rank after all known ones instead of raising.
          rank = {}
          for position, name in enumerate(self.score_idx):
              rank.setdefault(name, position)

          def priority(entry):
              return rank.get(entry[0], len(rank))

          cqjy_type2 = [entry[0] for entry in cqjy_type]
          if cqjy_type:
              # Category precedence.
              if "租赁" in cqjy_type2:
                  cqjy_type2 = ['租赁']
              elif "经营权" in cqjy_type2:
                  cqjy_type2 = ['经营权']
              elif "股权" in cqjy_type2 or "债权" in cqjy_type2 or "知识产权" in cqjy_type2:
                  cqjy_type2 = [min(cqjy_type, key=priority)[0]]
              elif len(cqjy_type2) == 2 and "房产" in cqjy_type2 and "土地" in cqjy_type2:
                  cqjy_type2 = ['房产']
              elif idx in [1, 2]:
                  # Title / project name: the category matched furthest to the
                  # right wins.
                  cqjy_type2 = [max(cqjy_type, key=lambda entry: entry[1])[0]]
              else:
                  # Product words: the category with the best priority wins.
                  cqjy_type2 = [min(cqjy_type, key=priority)[0]]
          cqjy_type2 = ",".join(cqjy_type2)
          if not cqjy_type2:
              cqjy_type2 = '其他'
          return cqjy_type2
  4965. # 总价单价提取
  class TotalUnitMoney:
      """Flag money entities as total price or unit price."""

      def __init__(self):
          pass

      def predict(self, list_sentences, list_entitys):
          """Set ``is_total_money`` / ``is_unit_money`` on money entities.

          For every entity whose ``entity_type`` is ``'money'`` the text of its
          sentence is looked up through ``sentence_index`` in the sentence list
          of the same document.  Entities with ``label == 1`` are checked with
          ``extract_total_money``, all others with ``extract_unit_money``; a
          truthy result sets the corresponding flag to ``1``.

          :param list_sentences: per-document sentence sequences
          :param list_entitys: per-document entity sequences, parallel to
              *list_sentences*
          :return: ``None``; the entities are modified in place
          """
          for doc_no, doc_entities in enumerate(list_entitys):
              for _entity in doc_entities:
                  if _entity.entity_type != 'money':
                      continue
                  sentence_text = list_sentences[doc_no][_entity.sentence_index].sentence_text
                  # Label 1 is checked for a total price, any other money
                  # entity for a unit price.
                  is_total = _entity.label == 1
                  extractor = extract_total_money if is_total else extract_unit_money
                  found = extractor(sentence_text,
                                    _entity.entity_text,
                                    [_entity.wordOffset_begin, _entity.wordOffset_end])
                  if found:
                      setattr(_entity, 'is_total_money' if is_total else 'is_unit_money', 1)
  4992. # 行业分类
  4993. class IndustryPredictor():
  4994. def __init__(self,):
  4995. self.model_path = os.path.dirname(__file__)+ '/industry_model'
  4996. self.id2lb = {0: '专业施工', 1: '专用仪器仪表', 2: '专用设备修理', 3: '互联网信息服务', 4: '互联网安全服务', 5: '互联网平台', 6: '互联网接入及相关服务', 7: '人力资源服务',
  4997. 8: '人造原油', 9: '仓储业', 10: '仪器仪表', 11: '仪器仪表修理', 12: '会计、审计及税务服务', 13: '会议、展览及相关服务', 14: '住宅、商业用房',
  4998. 15: '体育场地设施管理', 16: '体育组织', 17: '体育设备', 18: '保险服务', 19: '信息处理和存储支持服务', 20: '信息技术咨询服务',
  4999. 21: '信息系统集成和物联网技术服务', 22: '修缮工程', 23: '健康咨询', 24: '公路旅客运输', 25: '其他专业咨询与调查', 26: '其他专业技术服务',
  5000. 27: '其他交通运输设备', 28: '其他公共设施管理', 29: '其他土木工程建筑', 30: '其他工程服务', 31: '其他建筑建材', 32: '其他运输业', 33: '农业和林业机械',
  5001. 34: '农业服务', 35: '农产品', 36: '农副食品,动、植物油制品', 37: '出版业', 38: '办公消耗用品及类似物品', 39: '办公设备', 40: '化学原料及化学制品',
  5002. 41: '化学纤维', 42: '化学药品和中药专用设备', 43: '医疗设备', 44: '医药品', 45: '卫星传输服务', 46: '卫生', 47: '印刷服务', 48: '图书和档案',
  5003. 49: '图书档案设备', 50: '图书馆与档案馆', 51: '土地管理业', 52: '地质勘查', 53: '地震服务', 54: '场馆、站港用房', 55: '城市公共交通运输',
  5004. 56: '塑料制品、半成品及辅料', 57: '天然石料', 58: '娱乐设备', 59: '婚姻服务', 60: '安全保护服务', 61: '安全生产设备', 62: '家具用具',
  5005. 63: '家用电器修理', 64: '工业、生产用房', 65: '工业与专业设计及其他专业技术服务', 66: '工矿工程建筑', 67: '工程技术与设计服务', 68: '工程机械',
  5006. 69: '工程监理服务', 70: '工程评价服务', 71: '工程造价服务', 72: '市场调查', 73: '广告业', 74: '广播', 75: '广播、电视、电影设备',
  5007. 76: '广播电视传输服务', 77: '废弃资源综合利用业', 78: '建筑涂料', 79: '建筑物、构筑物附属结构', 80: '建筑物拆除和场地准备活动', 81: '建筑装饰和装修业',
  5008. 82: '录音制作', 83: '影视节目制作', 84: '房地产中介服务', 85: '房地产开发经营', 86: '房地产租赁经营', 87: '房屋租赁', 88: '招标代理',
  5009. 89: '探矿、采矿、选矿和造块设备', 90: '政法、检测专用设备', 91: '教育服务', 92: '教育设备', 93: '文物及非物质文化遗产保护', 94: '文物和陈列品',
  5010. 95: '文艺创作与表演', 96: '文艺设备', 97: '新闻业', 98: '旅行社及相关服务', 99: '日杂用品', 100: '有色金属冶炼及压延产品', 101: '有色金属矿',
  5011. 102: '木材、板材等', 103: '木材采集和加工设备', 104: '机械设备', 105: '机械设备经营租赁', 106: '林业产品', 107: '林业服务', 108: '架线和管道工程建筑',
  5012. 109: '核工业专用设备', 110: '橡胶制品', 111: '殡葬服务', 112: '殡葬设备及用品', 113: '气象服务', 114: '水上交通运输设备', 115: '水上运输业',
  5013. 116: '水利和水运工程建筑', 117: '水工机械', 118: '水文服务', 119: '水资源管理', 120: '污水处理及其再生利用', 121: '汽车、摩托车修理与维护',
  5014. 122: '法律服务', 123: '洗染服务', 124: '测绘地理信息服务', 125: '海洋仪器设备', 126: '海洋工程建筑', 127: '海洋服务', 128: '消防设备',
  5015. 129: '清洁服务', 130: '渔业产品', 131: '渔业服务', 132: '炼焦和金属冶炼轧制设备', 133: '烟草加工设备', 134: '热力生产和供应', 135: '焦炭及其副产品',
  5016. 136: '煤炭采选产品', 137: '燃气生产和供应业', 138: '物业管理', 139: '特种用途动、植物', 140: '环保咨询', 141: '环境与生态监测检测服务',
  5017. 142: '环境污染防治设备', 143: '环境治理业', 144: '玻璃及其制品', 145: '理发及美容服务', 146: '生态保护', 147: '电信',
  5018. 148: '电力、城市燃气、蒸汽和热水、水', 149: '电力供应', 150: '电力工业专用设备', 151: '电力工程施工', 152: '电力生产', 153: '电子和通信测量仪器',
  5019. 154: '电工、电子专用生产设备', 155: '电影放映', 156: '电气安装', 157: '电气设备', 158: '电气设备修理', 159: '畜牧业服务', 160: '监控设备',
  5020. 161: '石油制品', 162: '石油和化学工业专用设备', 163: '石油和天然气开采产品', 164: '石油天然气开采专用设备', 165: '研究和试验发展', 166: '社会工作',
  5021. 167: '社会经济咨询', 168: '科技推广和应用服务业', 169: '科研、医疗、教育用房', 170: '管道和设备安装', 171: '粮油作物和饲料加工设备', 172: '纸、纸制品及印刷品',
  5022. 173: '纺织原料、毛皮、被服装具', 174: '纺织设备', 175: '绿化管理', 176: '缝纫、服饰、制革和毛皮加工设备', 177: '航空器及其配套设备', 178: '航空客货运输',
  5023. 179: '航空航天工业专用设备', 180: '节能环保工程施工', 181: '装卸搬运', 182: '计算机和办公设备维修', 183: '计算机设备', 184: '计量标准器具及量具、衡器',
  5024. 185: '货币处理专用设备', 186: '货币金融服务', 187: '质检技术服务', 188: '资本市场服务', 189: '车辆', 190: '边界勘界和联检专用设备', 191: '运行维护服务',
  5025. 192: '通信设备', 193: '通用设备修理', 194: '道路货物运输', 195: '邮政专用设备', 196: '邮政业', 197: '采矿业和制造业服务',
  5026. 198: '铁路、船舶、航空航天等运输设备修理', 199: '铁路、道路、隧道和桥梁工程建筑', 200: '铁路运输设备', 201: '防洪除涝设施管理', 202: '陶瓷制品',
  5027. 203: '雷达、无线电和卫星导航设备', 204: '非金属矿', 205: '非金属矿物制品工业专用设备', 206: '非金属矿物材料', 207: '食品加工专用设备', 208: '食品及加工盐',
  5028. 209: '餐饮业', 210: '饮料、酒精及精制茶', 211: '饮料加工设备', 212: '饲养动物及其产品', 213: '黑色金属冶炼及压延产品', 214: '黑色金属矿'}
  5029. self.industry_dic = {'专业施工': {'大类': '专业施工', '门类': '建筑业'},
  5030. '专用仪器仪表': {'大类': '专用设备', '门类': '零售批发'},
  5031. '专用设备修理': {'大类': '金属制品、机械和设备修理业', '门类': '金属制品、机械和设备修理业'},
  5032. '互联网信息服务': {'大类': '互联网和相关服务', '门类': '信息传输、软件和信息技术服务业'},
  5033. '互联网安全服务': {'大类': '互联网和相关服务', '门类': '信息传输、软件和信息技术服务业'},
  5034. '互联网平台': {'大类': '互联网和相关服务', '门类': '信息传输、软件和信息技术服务业'},
  5035. '互联网接入及相关服务': {'大类': '互联网和相关服务', '门类': '信息传输、软件和信息技术服务业'},
  5036. '人力资源服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  5037. '人造原油': {'大类': '炼焦产品、炼油产品', '门类': '零售批发'},
  5038. '仓储业': {'大类': '装卸搬运和运输代理业', '门类': '交通运输、仓储和邮政业'},
  5039. '仪器仪表': {'大类': '通用设备', '门类': '零售批发'},
  5040. '仪器仪表修理': {'大类': '金属制品、机械和设备修理业', '门类': '金属制品、机械和设备修理业'},
  5041. '会计、审计及税务服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  5042. '会议、展览及相关服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  5043. '住宅、商业用房': {'大类': '房屋建筑业', '门类': '建筑业'},
  5044. '体育场地设施管理': {'大类': '体育', '门类': '文化、体育和娱乐业'},
  5045. '体育组织': {'大类': '体育', '门类': '文化、体育和娱乐业'},
  5046. '体育设备': {'大类': '专用设备', '门类': '零售批发'},
  5047. '保险服务': {'大类': '保险业', '门类': '金融业'},
  5048. '信息处理和存储支持服务': {'大类': '软件和信息技术服务业', '门类': '信息传输、软件和信息技术服务业'},
  5049. '信息技术咨询服务': {'大类': '软件和信息技术服务业', '门类': '信息传输、软件和信息技术服务业'},
  5050. '信息系统集成和物联网技术服务': {'大类': '软件和信息技术服务业', '门类': '信息传输、软件和信息技术服务业'},
  5051. '修缮工程': {'大类': '修缮工程', '门类': '建筑业'},
  5052. '健康咨询': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  5053. '公路旅客运输': {'大类': '道路运输业', '门类': '交通运输、仓储和邮政业'},
  5054. '其他专业咨询与调查': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  5055. '其他专业技术服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  5056. '其他交通运输设备': {'大类': '专用设备', '门类': '零售批发'},
  5057. '其他公共设施管理': {'大类': '公共设施管理业', '门类': '水利、环境和公共设施管理业'},
  5058. '其他土木工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'},
  5059. '其他工程服务': {'大类': '工程服务', '门类': '科学研究和技术服务业'},
  5060. '其他建筑建材': {'大类': '建筑建材', '门类': '零售批发'},
  5061. '其他运输业': {'大类': '其他运输业', '门类': '交通运输、仓储和邮政业'},
  5062. '农业和林业机械': {'大类': '专用设备', '门类': '零售批发'},
  5063. '农业服务': {'大类': '农林牧副渔服务', '门类': '农林牧副渔服务'},
  5064. '农产品': {'大类': '农林牧渔业产品', '门类': '零售批发'},
  5065. '农副食品,动、植物油制品': {'大类': '食品、饮料和烟草原料', '门类': '零售批发'},
  5066. '出版业': {'大类': '新闻和出版业', '门类': '文化、体育和娱乐业'},
  5067. '办公消耗用品及类似物品': {'大类': '办公消耗用品及类似物品', '门类': '零售批发'},
  5068. '办公设备': {'大类': '通用设备', '门类': '零售批发'},
  5069. '化学原料及化学制品': {'大类': '基础化学品及相关产品', '门类': '零售批发'},
  5070. '化学纤维': {'大类': '基础化学品及相关产品', '门类': '零售批发'},
  5071. '化学药品和中药专用设备': {'大类': '专用设备', '门类': '零售批发'},
  5072. '医疗设备': {'大类': '专用设备', '门类': '零售批发'},
  5073. '医药品': {'大类': '医药品', '门类': '零售批发'},
  5074. '卫星传输服务': {'大类': '电信、广播电视和卫星传输服务', '门类': '信息传输、软件和信息技术服务业'},
  5075. '卫生': {'大类': '卫生', '门类': '卫生和社会工作'},
  5076. '印刷服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  5077. '图书和档案': {'大类': '图书和档案', '门类': '零售批发'},
  5078. '图书档案设备': {'大类': '通用设备', '门类': '零售批发'},
  5079. '图书馆与档案馆': {'大类': '文化艺术业', '门类': '文化、体育和娱乐业'},
  5080. '土地管理业': {'大类': '土地管理业', '门类': '水利、环境和公共设施管理业'},
  5081. '地质勘查': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  5082. '地震服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  5083. '场馆、站港用房': {'大类': '房屋建筑业', '门类': '建筑业'},
  5084. '城市公共交通运输': {'大类': '道路运输业', '门类': '交通运输、仓储和邮政业'},
  5085. '塑料制品、半成品及辅料': {'大类': '橡胶、塑料、玻璃和陶瓷制品', '门类': '零售批发'},
  5086. '天然石料': {'大类': '建筑建材', '门类': '零售批发'},
  5087. '娱乐设备': {'大类': '专用设备', '门类': '零售批发'},
  5088. '婚姻服务': {'大类': '居民服务业', '门类': '居民服务、修理和其他服务业'},
  5089. '安全保护服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  5090. '安全生产设备': {'大类': '专用设备', '门类': '零售批发'},
  5091. '家具用具': {'大类': '家具用具', '门类': '零售批发'},
  5092. '家用电器修理': {'大类': '机动车、电子产品和日用产品修理业', '门类': '居民服务、修理和其他服务业'},
  5093. '工业、生产用房': {'大类': '房屋建筑业', '门类': '建筑业'},
  5094. '工业与专业设计及其他专业技术服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  5095. '工矿工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'},
  5096. '工程技术与设计服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  5097. '工程机械': {'大类': '专用设备', '门类': '零售批发'},
  5098. '工程监理服务': {'大类': '工程服务', '门类': '科学研究和技术服务业'},
  5099. '工程评价服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  5100. '工程造价服务': {'大类': '工程服务', '门类': '科学研究和技术服务业'},
  5101. '市场调查': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  5102. '广告业': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  5103. '广播': {'大类': '广播、电视、电影和影视录音制作业', '门类': '文化、体育和娱乐业'},
  5104. '广播、电视、电影设备': {'大类': '通用设备', '门类': '零售批发'},
  5105. '广播电视传输服务': {'大类': '电信、广播电视和卫星传输服务', '门类': '信息传输、软件和信息技术服务业'},
  5106. '废弃资源综合利用业': {'大类': '废弃资源综合利用业', '门类': '废弃资源综合利用业'},
  5107. '建筑涂料': {'大类': '建筑建材', '门类': '零售批发'},
  5108. '建筑物、构筑物附属结构': {'大类': '建筑建材', '门类': '零售批发'},
  5109. '建筑物拆除和场地准备活动': {'大类': '建筑装饰和其他建筑业', '门类': '建筑业'},
  5110. '建筑装饰和装修业': {'大类': '建筑装饰和其他建筑业', '门类': '建筑业'},
  5111. '录音制作': {'大类': '广播、电视、电影和影视录音制作业', '门类': '文化、体育和娱乐业'},
  5112. '影视节目制作': {'大类': '广播、电视、电影和影视录音制作业', '门类': '文化、体育和娱乐业'},
  5113. '房地产中介服务': {'大类': '房地产业', '门类': '房地产业'},
  5114. '房地产开发经营': {'大类': '房地产业', '门类': '房地产业'},
  5115. '房地产租赁经营': {'大类': '房地产业', '门类': '房地产业'},
  5116. '房屋租赁': {'大类': '租赁业', '门类': '租赁和商务服务业'},
  5117. '招标代理': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  5118. '探矿、采矿、选矿和造块设备': {'大类': '专用设备', '门类': '零售批发'},
  5119. '政法、检测专用设备': {'大类': '专用设备', '门类': '零售批发'},
  5120. '教育服务': {'大类': '教育服务', '门类': '教育'},
  5121. '教育设备': {'大类': '专用设备', '门类': '零售批发'},
  5122. '文体设备和用品出租': {'大类': '租赁业', '门类': '租赁和商务服务业'},
  5123. '文物及非物质文化遗产保护': {'大类': '文化艺术业', '门类': '文化、体育和娱乐业'},
  5124. '文物和陈列品': {'大类': '文物和陈列品', '门类': '零售批发'},
  5125. '文艺创作与表演': {'大类': '文化艺术业', '门类': '文化、体育和娱乐业'},
  5126. '文艺设备': {'大类': '专用设备', '门类': '零售批发'},
  5127. '新闻业': {'大类': '新闻和出版业', '门类': '文化、体育和娱乐业'},
  5128. '旅行社及相关服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  5129. '日杂用品': {'大类': '日杂用品', '门类': '零售批发'},
  5130. '有色金属冶炼及压延产品': {'大类': '建筑建材', '门类': '零售批发'},
  5131. '有色金属矿': {'大类': '矿与矿物', '门类': '零售批发'},
  5132. '木材、板材等': {'大类': '建筑建材', '门类': '零售批发'},
  5133. '木材采集和加工设备': {'大类': '专用设备', '门类': '零售批发'},
  5134. '机械设备': {'大类': '通用设备', '门类': '零售批发'},
  5135. '机械设备经营租赁': {'大类': '租赁业', '门类': '租赁和商务服务业'},
  5136. '林业产品': {'大类': '农林牧渔业产品', '门类': '零售批发'},
  5137. '林业服务': {'大类': '农林牧副渔服务', '门类': '农林牧副渔服务'},
  5138. '架线和管道工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'},
  5139. '核工业专用设备': {'大类': '专用设备', '门类': '零售批发'},
  5140. '橡胶制品': {'大类': '橡胶、塑料、玻璃和陶瓷制品', '门类': '零售批发'},
  5141. '殡葬服务': {'大类': '居民服务业', '门类': '居民服务、修理和其他服务业'},
  5142. '殡葬设备及用品': {'大类': '专用设备', '门类': '零售批发'},
  5143. '气象服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  5144. '水上交通运输设备': {'大类': '专用设备', '门类': '零售批发'},
  5145. '水上运输业': {'大类': '水上运输业', '门类': '交通运输、仓储和邮政业'},
  5146. '水利和水运工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'},
  5147. '水工机械': {'大类': '专用设备', '门类': '零售批发'},
  5148. '水文服务': {'大类': '水利管理业', '门类': '水利、环境和公共设施管理业'},
  5149. '水资源管理': {'大类': '水利管理业', '门类': '水利、环境和公共设施管理业'},
  5150. '污水处理及其再生利用': {'大类': '水的生产和供应业', '门类': '电力、热力、燃气及水生产和供应业'},
  5151. '汽车、摩托车修理与维护': {'大类': '机动车、电子产品和日用产品修理业', '门类': '居民服务、修理和其他服务业'},
  5152. '法律服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  5153. '洗染服务': {'大类': '居民服务业', '门类': '居民服务、修理和其他服务业'},
  5154. '测绘地理信息服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  5155. '海洋仪器设备': {'大类': '专用设备', '门类': '零售批发'},
  5156. '海洋工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'},
  5157. '海洋服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  5158. '消防设备': {'大类': '专用设备', '门类': '零售批发'},
  5159. '清洁服务': {'大类': '其他服务业', '门类': '居民服务、修理和其他服务业'},
  5160. '渔业产品': {'大类': '农林牧渔业产品', '门类': '零售批发'},
  5161. '渔业服务': {'大类': '农林牧副渔服务', '门类': '农林牧副渔服务'},
  5162. '炼焦和金属冶炼轧制设备': {'大类': '专用设备', '门类': '零售批发'},
  5163. '烟草加工设备': {'大类': '专用设备', '门类': '零售批发'},
  5164. '热力生产和供应': {'大类': '电力、热力生产和供应业', '门类': '电力、热力、燃气及水生产和供应业'},
  5165. '焦炭及其副产品': {'大类': '炼焦产品、炼油产品', '门类': '零售批发'},
  5166. '煤炭采选产品': {'大类': '矿与矿物', '门类': '零售批发'},
  5167. '燃气生产和供应业': {'大类': '燃气生产和供应业', '门类': '电力、热力、燃气及水生产和供应业'},
  5168. '物业管理': {'大类': '房地产业', '门类': '房地产业'},
  5169. '特种用途动、植物': {'大类': '农林牧渔业产品', '门类': '零售批发'},
  5170. '环保咨询': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  5171. '环境与生态监测检测服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  5172. '环境污染防治设备': {'大类': '专用设备', '门类': '零售批发'},
  5173. '环境治理业': {'大类': '生态保护和环境治理业', '门类': '水利、环境和公共设施管理业'},
  5174. '玻璃及其制品': {'大类': '橡胶、塑料、玻璃和陶瓷制品', '门类': '零售批发'},
  5175. '理发及美容服务': {'大类': '居民服务业', '门类': '居民服务、修理和其他服务业'},
  5176. '生态保护': {'大类': '生态保护和环境治理业', '门类': '水利、环境和公共设施管理业'},
  5177. '电信': {'大类': '电信、广播电视和卫星传输服务', '门类': '信息传输、软件和信息技术服务业'},
  5178. '电力、城市燃气、蒸汽和热水、水': {'大类': '电力、城市燃气、蒸汽和热水、水', '门类': '零售批发'},
  5179. '电力供应': {'大类': '电力、热力生产和供应业', '门类': '电力、热力、燃气及水生产和供应业'},
  5180. '电力工业专用设备': {'大类': '专用设备', '门类': '零售批发'},
  5181. '电力工程施工': {'大类': '土木工程建筑业', '门类': '建筑业'},
  5182. '电力生产': {'大类': '电力、热力生产和供应业', '门类': '电力、热力、燃气及水生产和供应业'},
  5183. '电子和通信测量仪器': {'大类': '通用设备', '门类': '零售批发'},
  5184. '电工、电子专用生产设备': {'大类': '专用设备', '门类': '零售批发'},
  5185. '电影放映': {'大类': '广播、电视、电影和影视录音制作业', '门类': '文化、体育和娱乐业'},
  5186. '电气安装': {'大类': '建筑安装业', '门类': '建筑业'},
  5187. '电气设备': {'大类': '通用设备', '门类': '零售批发'},
  5188. '电气设备修理': {'大类': '金属制品、机械和设备修理业', '门类': '金属制品、机械和设备修理业'},
  5189. '畜牧业服务': {'大类': '农林牧副渔服务', '门类': '农林牧副渔服务'},
  5190. '监控设备': {'大类': '通用设备', '门类': '零售批发'},
  5191. '石油制品': {'大类': '炼焦产品、炼油产品', '门类': '零售批发'},
  5192. '石油和化学工业专用设备': {'大类': '专用设备', '门类': '零售批发'},
  5193. '石油和天然气开采产品': {'大类': '矿与矿物', '门类': '零售批发'},
  5194. '石油天然气开采专用设备': {'大类': '专用设备', '门类': '零售批发'},
  5195. '研究和试验发展': {'大类': '研究和试验发展', '门类': '科学研究和技术服务业'},
  5196. '社会工作': {'大类': '社会工作', '门类': '卫生和社会工作'},
  5197. '社会经济咨询': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  5198. '科技推广和应用服务业': {'大类': '科技推广和应用服务业', '门类': '科学研究和技术服务业'},
  5199. '科研、医疗、教育用房': {'大类': '房屋建筑业', '门类': '建筑业'},
  5200. '管道和设备安装': {'大类': '建筑安装业', '门类': '建筑业'},
  5201. '粮油作物和饲料加工设备': {'大类': '专用设备', '门类': '零售批发'},
  5202. '纸、纸制品及印刷品': {'大类': '纸、纸制品及印刷品', '门类': '零售批发'},
  5203. '纺织原料、毛皮、被服装具': {'大类': '纺织原料、毛皮、被服装具', '门类': '零售批发'},
  5204. '纺织设备': {'大类': '专用设备', '门类': '零售批发'},
  5205. '绿化管理': {'大类': '公共设施管理业', '门类': '水利、环境和公共设施管理业'},
  5206. '缝纫、服饰、制革和毛皮加工设备': {'大类': '专用设备', '门类': '零售批发'},
  5207. '航空器及其配套设备': {'大类': '专用设备', '门类': '零售批发'},
  5208. '航空客货运输': {'大类': '航空运输业', '门类': '交通运输、仓储和邮政业'},
  5209. '航空航天工业专用设备': {'大类': '专用设备', '门类': '零售批发'},
  5210. '节能环保工程施工': {'大类': '土木工程建筑业', '门类': '建筑业'},
  5211. '装卸搬运': {'大类': '装卸搬运和运输代理业', '门类': '交通运输、仓储和邮政业'},
  5212. '计算机和办公设备维修': {'大类': '机动车、电子产品和日用产品修理业', '门类': '居民服务、修理和其他服务业'},
  5213. '计算机设备': {'大类': '通用设备', '门类': '零售批发'},
  5214. '计量标准器具及量具、衡器': {'大类': '通用设备', '门类': '零售批发'},
  5215. '货币处理专用设备': {'大类': '专用设备', '门类': '零售批发'},
  5216. '货币金融服务': {'大类': '货币金融服务', '门类': '金融业'},
  5217. '质检技术服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  5218. '资本市场服务': {'大类': '资本市场服务', '门类': '金融业'},
  5219. '车辆': {'大类': '通用设备', '门类': '零售批发'},
  5220. '边界勘界和联检专用设备': {'大类': '专用设备', '门类': '零售批发'},
  5221. '运行维护服务': {'大类': '软件和信息技术服务业', '门类': '信息传输、软件和信息技术服务业'},
  5222. '通信设备': {'大类': '通用设备', '门类': '零售批发'},
  5223. '通用设备修理': {'大类': '金属制品、机械和设备修理业', '门类': '金属制品、机械和设备修理业'},
  5224. '道路货物运输': {'大类': '道路运输业', '门类': '交通运输、仓储和邮政业'},
  5225. '邮政专用设备': {'大类': '专用设备', '门类': '零售批发'},
  5226. '邮政业': {'大类': '邮政业', '门类': '交通运输、仓储和邮政业'},
  5227. '采矿业和制造业服务': {'大类': '采矿业和制造业服务', '门类': '农林牧副渔服务'},
  5228. '铁路、船舶、航空航天等运输设备修理': {'大类': '金属制品、机械和设备修理业', '门类': '金属制品、机械和设备修理业'},
  5229. '铁路、道路、隧道和桥梁工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'},
  5230. '铁路运输设备': {'大类': '专用设备', '门类': '零售批发'},
  5231. '防洪除涝设施管理': {'大类': '水利管理业', '门类': '水利、环境和公共设施管理业'},
  5232. '陶瓷制品': {'大类': '橡胶、塑料、玻璃和陶瓷制品', '门类': '零售批发'},
  5233. '雷达、无线电和卫星导航设备': {'大类': '通用设备', '门类': '零售批发'},
  5234. '非金属矿': {'大类': '矿与矿物', '门类': '零售批发'},
  5235. '非金属矿物制品工业专用设备': {'大类': '专用设备', '门类': '零售批发'},
  5236. '非金属矿物材料': {'大类': '建筑建材', '门类': '零售批发'},
  5237. '食品加工专用设备': {'大类': '专用设备', '门类': '零售批发'},
  5238. '食品及加工盐': {'大类': '食品、饮料和烟草原料', '门类': '零售批发'},
  5239. '餐饮业': {'大类': '餐饮业', '门类': '住宿和餐饮业'},
  5240. '饮料、酒精及精制茶': {'大类': '食品、饮料和烟草原料', '门类': '零售批发'},
  5241. '饮料加工设备': {'大类': '专用设备', '门类': '零售批发'},
  5242. '饲养动物及其产品': {'大类': '农林牧渔业产品', '门类': '零售批发'},
  5243. '黑色金属冶炼及压延产品': {'大类': '建筑建材', '门类': '零售批发'},
  5244. '黑色金属矿': {'大类': '矿与矿物', '门类': '零售批发'}}
  5245. self.sess = tf.Session(graph=tf.Graph())
  5246. self.get_model()
  5247. with open(os.path.dirname(__file__)+'/industry_rule_kw_json/tw_industry_keyword_org/tw_industry_keyword_org.json', 'r',
  5248. encoding='utf-8') as fp1:
  5249. self.json_data_industry = json.load(fp1)
  5250. with open(os.path.dirname(__file__)+'/industry_rule_kw_json/tw_company_classification_keyword/tw_company_classification_keyword.json', 'r',
  5251. encoding='utf-8') as fp2:
  5252. self.json_data_company = json.load(fp2)
  5253. with open(os.path.dirname(__file__)+'/industry_rule_kw_json/tw_custom_keyword/tw_custom_keyword.json', 'r', encoding='utf-8') as fp3:
  5254. self.json_data_custom = json.load(fp3)
  5255. '''下面补充行业关键词'''
  5256. d = {'id': 5592, 'pingmu': '工程', 'menlei': '建筑业', 'dalei': '建筑装饰和其他建筑业', 'xiaolei': '建筑物拆除和场地准备活动', 'key_word': '围蔽', 'key_word2': None, 'power': '6.00'}
  5257. self.json_data_industry.append(d)
  def get_model(self):
      '''
      Load the exported industry SavedModel into ``self.sess`` and cache its tensors.

      The model is read with the ``serve`` tag from the ``industry_model``
      directory that sits next to this file. The ``title``, ``project`` and
      ``product`` inputs and the ``outputs`` output of the default serving
      signature are stored on the instance under the same attribute names.
      '''
      model_dir = os.path.dirname(__file__)+'/industry_model'
      with self.sess.as_default() as session:
          with self.sess.graph.as_default():
              loaded = tf.saved_model.loader.load(session,
                                                  tags=['serve'],
                                                  export_dir=model_dir)
              serving_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
              serving = loaded.signature_def[serving_key]
              graph = session.graph
              # Resolve the signature's tensor names to concrete graph tensors once.
              self.title = graph.get_tensor_by_name(serving.inputs['title'].name)
              self.project = graph.get_tensor_by_name(serving.inputs['project'].name)
              self.product = graph.get_tensor_by_name(serving.inputs['product'].name)
              self.outputs = graph.get_tensor_by_name(serving.outputs['outputs'].name)
  def text2array(self, text, tenderee='', maxSententLen=20):
      '''
      Clean a piece of text, tokenise it and turn it into an embedding array.

      :param text: raw text (title, project name or product string)
      :param tenderee: tenderee name; every occurrence is blanked out of the text
      :param maxSententLen: number of trailing tokens kept per token list
      :return: result of ``embedding`` requested with shape
               ``(number of token lists, maxSententLen, 128)``
      '''
      # NOTE(review): as written both replacements map a character to itself;
      # presumably full-width parentheses were intended on the left - confirm.
      tenderee = tenderee.replace('(', '(').replace(')', ')')
      text = text.replace('(', '(').replace(')', ')')
      # Blank out announcement boilerplate (notice types, procurement methods, ...).
      boilerplate = '(废标|终止|综?合?评审|评标|开标|资审|履约|验收|成交|中标人?|中选人?|单一来源|合同|候选人|结果|变更|更正|答疑|澄清|意向|需求|采购|招标|询比?价|磋商|谈判|比选|比价|竞价|议价)的?(公告|预告|公示)?|关于为?|选取|定点|直接|邀请函?|通知书?|备案|公开|公示|公告|记录|竞争性'
      text = re.sub(boilerplate, ' ', text)
      # NOTE(review): with the default tenderee='' this str.replace puts a space
      # around every character of the text - confirm that this is intended
      # before changing it, since it affects what the model is fed.
      text = text.replace(tenderee, ' ')
      if text == "":
          text = ' '
      token_lists = selffool.cut(text)
      trimmed = []
      for tokens in token_lists:
          # Keep only tokens made purely of Chinese characters, then the last N.
          chinese_only = [tok for tok in tokens if re.search('^[\u4e00-\u9fa5]+$', tok)]
          trimmed.append(chinese_only[-maxSententLen:])
      return embedding(trimmed, shape=(len(trimmed), maxSententLen, 128))
  def process(self, title, project, product, tenderee):
      '''
      Convert the three text inputs into arrays via ``text2array``.

      The tenderee is passed along for the title and the project name only; the
      product text is converted with ``text2array``'s default tenderee.

      :return: tuple ``(title_array, project_array, product_array)``
      '''
      title_array = self.text2array(title, tenderee)
      project_array = self.text2array(project, tenderee)
      product_array = self.text2array(product)
      return title_array, project_array, product_array
  def predict_model(self, title, project, product, tenderee=''):
      '''
      Run the neural model and return its best class.

      :param title: document title
      :param project: project name
      :param product: product text
      :param tenderee: tenderee name forwarded to ``process``
      :return: tuple ``(label, score)`` - the ``id2lb`` entry of the arg-max class
               and the model output value at that index
      '''
      title_array, project_array, product_array = self.process(title, project, product, tenderee)
      feeds = {
          self.title: title_array,
          self.project: project_array,
          self.product: product_array,
      }
      # Only the first row of the model output is used.
      scores = self.sess.run(self.outputs, feed_dict=feeds)[0]
      best = np.argmax(scores)
      return self.id2lb[best], scores[best]
      # A top-2 variant would rank with np.argsort(-scores) and return the two
      # best labels followed by their two scores.
  def predict_rule(self, doctitle, tenderee, win_tenderer, project_name, product):
      '''
      Score industries with keyword rules.

      Three keyword sources contribute to one score table:
      ``self.json_data_industry`` (matched against title + project + product with
      the tenderee name removed), ``self.json_data_company`` (matched against the
      winning bidder name) and ``self.json_data_custom`` combined with the
      hard-coded rule table below.

      :param doctitle: document title (falsy values are treated as '')
      :param tenderee: tenderee name
      :param win_tenderer: winning bidder name
      :param project_name: project name
      :param product: product text
      :return: ``(industries, scores, keywords)`` - industries ordered by
               descending score, scores as strings rounded to two decimals and
               the keywords that scored; ``([""], [], [])`` when nothing scored
      '''
      doctitle = doctitle or ''
      tenderee = tenderee or ''
      win_tenderer = win_tenderer or ''
      project_name = project_name or ''
      product = product or ''

      text_ind = (doctitle + project_name + product).replace(tenderee, '')
      text_ind = text_ind.replace('墙面粉刷', '墙面 粉刷')
      text_com = win_tenderer
      # The +1 keeps the normalising length non-zero.
      ind_norm = len(text_ind) + 1
      com_norm = len(text_com) + 1

      totals = {}      # industry -> accumulated score
      matched = []     # keywords that contributed, in first-hit order

      def credit(industry, key_word, amount):
          # Only positive contributions are recorded.
          if amount > 0:
              if industry in totals:
                  totals[industry] += amount
              else:
                  totals[industry] = amount
              if key_word not in matched:
                  matched.append(key_word)

      # Keywords on the document content.
      if text_ind:
          for row in self.json_data_industry:
              industry = row['xiaolei']
              key_word = row['key_word']
              second_word = row['key_word2']
              weight = float(row['power']) if row['power'] else 0
              amount = weight * (text_ind.count(key_word) * len(key_word) / ind_norm)
              # A second keyword, when present, must also occur in the text.
              if second_word and text_ind.count(second_word) == 0:
                  amount = 0
              credit(industry, key_word, amount)

      # Keywords on the winning bidder name.
      if text_com:
          for row in self.json_data_company:
              industry = row['industry_type']
              key_word = row['company_word']
              weight = float(row['industry_rate']) if row['industry_rate'] else 0
              amount = weight * (text_com.count(key_word) * len(key_word) / com_norm)
              credit(industry, key_word, amount)

      # Custom keywords: [fields searched, pattern, accepted custom categories, industry].
      if text_ind:
          custom_ind = [
              ['tenderee', '医院|疾病预防', ['设备', '系统', '器'], '医疗设备'],
              ['tenderee', '学校|大学|小学|中学|学院|幼儿园', ['设备', '器'], '教育设备'],
              ['tenderee', '学校|大学|小学|中学|学院|幼儿园|医院', ['工程'], '科研、医疗、教育用房'],
              ['tenderee', '供电局|电网|国网|电力|电厂|粤电', ['设备', '器', '物资'], '电力工业专用设备'],
              ['tenderee', '公安|法院|检察院', ['设备', '器'], '政法、检测专用设备'],
              ['tenderee', '^中铁|^中交|^中建|中国建筑', ['材料'], '其他建筑建材'],
              ['doctextcon', '信息技术服务|系统开发|信息化|信息系统', ['监理'], '信息技术咨询服务'],
              ['doctextcon', '工程', ['消防'], '专业施工'],
              ['doctextcon', '铁路|航空|船舶|航天|广铁', ['维修'], '铁路、船舶、航空航天等运输设备修理'],
              ['doctextcon', '设备|仪|器', ['租赁'], '机械设备经营租赁'],
              ['doctextcon', '交通|铁路|公路|道路|桥梁', ['工程'], '铁路、道路、隧道和桥梁工程建筑'],
              ['win_tenderer', '电力', ['设备', '器'], '电力工业专用设备'],
              ['win_tenderer', '信息|网络科技', ['系统'], '信息系统集成和物联网技术服务'],
              ['tenderee,doctextcon', '铁路|广铁|铁道', ['设备', '器', '物资', '材料', '铁路'], '铁路运输设备'],
          ]
          # Whether a rule's pattern hits its subject does not depend on the
          # keyword row, so it is resolved once, in table order.
          live_rules = []
          for fields, ptn, accepted, target in custom_ind:
              subject = ''
              if 'tenderee' in fields:
                  subject += tenderee
              if 'win_tenderer' in fields:
                  subject += win_tenderer
              if 'doctextcon' in fields:
                  subject += text_ind
              if re.search(ptn, subject):
                  live_rules.append((len(subject), accepted, target))

          for row in self.json_data_custom:
              industry_custom = row['industry']
              key_word = row['company_word']
              weight = float(row['industry_rate'])
              for subject_len, accepted, target in live_rules:
                  if industry_custom not in accepted:
                      continue
                  amount = weight * (text_ind.count(key_word) * len(key_word) / subject_len)
                  credit(target, key_word, amount)

      ranked = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
      if not ranked:
          return [""], [], []
      industries = [name for name, _ in ranked]
      score_strings = [str(round(float(total), 2)) for _, total in ranked]
      return industries, score_strings, matched
  def predict_merge(self, pinmu_type, industry_lst):
      '''
      Decide whether the model result or the rule result is the final class.

      The first entry of ``industry_lst`` is the rule result. An ordered table of
      overrides is checked first; the first override whose two conditions hold
      decides the outcome. If none applies, the model result is used when it
      equals the second rule candidate (subject to two exclusion patterns),
      otherwise the rule result wins.

      :param pinmu_type: class predicted by the model
      :param industry_lst: classes predicted by the rules, best first; an empty
                           list or a leading "" means "no rule result"
      :return: the chosen class name
      '''
      industry_type = industry_lst[0] if industry_lst else ""
      if industry_type == "":
          return pinmu_type

      # Each override is (checked_first, exact, key, pattern):
      #   checked_first - 'rule' tests industry_type first, 'model' tests pinmu_type first
      #   exact         - True: the first value must equal key; False: key is a regex
      #   pattern       - regex searched in the *other* value
      # When both conditions hold, the *other* value is returned.
      overrides = [
          ('rule', True, '专用设备修理', '修理|维修|装修|修缮'),
          ('rule', True, '其他土木工程建筑', '工程|建筑|用房|施工|安装|质检|其他专业咨询与调查'),
          ('model', True, '专用设备修理', '工程|修理'),
          ('model', True, '信息系统集成和物联网技术服务',
           '卫星传输|信息处理和存储支持服务|信息技术咨询服务|运行维护服务|其他专业技术服务|医疗设备|医药品'),
          ('rule', True, '仪器仪表', '仪器|器具|医疗设备'),
          ('rule', True, '医药品', '医疗设备'),
          ('rule', False, '设备', '修理|维修'),
          ('rule', True, '社会工作', '工程'),
          ('rule', True, '信息系统集成和物联网技术服务', '信息处理|设备'),
          ('rule', True, '研究和试验发展', '其他专业咨询与调查|质检技术服务|信息系统集成|其他工程服务'),
          ('rule', True, '其他专业咨询与调查', '工程造价服务'),
          ('rule', True, '广告业', '印刷服务|影视节目制作|信息系统'),
          ('rule', True, '清洁服务', '工程|环境污染防治设备|修理'),
          ('rule', True, '其他公共设施管理', '信息系统'),
          ('rule', True, '其他专业技术服务', '工程技术与设计服务|质检技术服务|环境与生态监测检测服务'),
          ('rule', True, '机械设备经营租赁', '电信'),
          ('rule', True, '货币金融服务', '信息系统集成和物联网技术服务'),
          ('rule', True, '体育场地设施管理', '体育设备'),
          ('rule', True, '安全保护服务', '信息系统|监控设备|互联网安全服务'),
          ('rule', True, '互联网接入及相关服务', '通信设备'),
          ('rule', True, '卫生', '医疗设备|信息系统'),
          ('model', True, '研究和试验发展', '其他工程服务'),
          ('model', True, '办公设备', '教育设备'),
          ('model', False, '车辆|机械设备经营租赁', '公路旅客运输'),
      ]
      for checked_first, exact, key, pattern in overrides:
          if checked_first == 'rule':
              lead, other = industry_type, pinmu_type
          else:
              lead, other = pinmu_type, industry_type
          lead_hit = (lead == key) if exact else re.search(key, lead)
          if lead_hit and re.search(pattern, other):
              return other

      # The model agrees with the runner-up rule candidate: trust the model,
      # unless either side is one of the excluded categories.
      if (len(industry_lst) > 1 and pinmu_type == industry_lst[1]
              and re.search('会计|法律|物业|家具|印刷|互联网安全', industry_type) is None
              and re.search('其他|人力资源服务', pinmu_type) is None):
          return pinmu_type
      return industry_type
  def predict(self, title, project, product, prem, product_attrs):
      '''
      Predict the industry of a document by merging model and rule results.

      :param title: document title
      :param project: project name
      :param product: product text; replaced by the space-joined extracted
                      products when ``product_attrs`` contains any
      :param prem: list whose first item holds a ``'prem'`` dict with role lists
      :param product_attrs: list whose first item holds a ``'product_attrs'`` dict
      :return: ``{'industry': {'class_name', 'subclass', 'class'}}`` where the two
               parent levels come from ``self.industry_dic``
      '''
      def first_roles(prem_list):
          # Return the first non-empty tenderee and winning bidder texts;
          # any parsing problem is reported and whatever was found so far is kept.
          buyer = ""
          winner = ""
          try:
              for pack in prem_list[0]['prem'].values():
                  for link in pack['roleList']:
                      if link['role_name'] == 'tenderee' and buyer == "":
                          buyer = link['role_text']
                      elif link['role_name'] == 'win_tenderer' and winner == "":
                          winner = link['role_text']
          except Exception:
              print('解析prem 获取招标人、中标人出错')
          return buyer, winner

      # 2025-01-16: prefer the products found by the product-attribute extractor.
      attrs = product_attrs[0]['product_attrs']
      if attrs.get('data', []) != []:
          product = ' '.join([item['product'] for item in attrs['data']])

      tenderee, win_tenderer = first_roles(prem)
      model_label, _prob = self.predict_model(title, project, product, tenderee)
      rule_labels, _scores, _words = self.predict_rule(title, tenderee, win_tenderer, project, product)
      final_type = self.predict_merge(model_label, rule_labels)

      parents = self.industry_dic[final_type]
      return {'industry': {
          'class_name': final_type,
          'subclass': parents['大类'],
          'class': parents['门类']
      }}
  5500. class DistrictPredictor():
  def __init__(self):
      '''
      Load the pickled district tables that sit next to this file.

      ``district_tuple.pkl`` provides, in order: the province, city and district
      regex alternatives, the id table, and the full-name and short-name lookup
      tables. ``area_variance_dic.pkl`` provides the old-name to new-name mapping
      for renamed areas (added 2024-11-13).
      '''
      # An earlier implementation built short/full name -> id maps from
      # district_dic.pkl at start-up; the tables are now loaded pre-built.
      base_dir = os.path.dirname(__file__)
      with open(base_dir+'/district_tuple.pkl', 'rb') as f:
          district_tuple = pickle.load(f)
      (self.p_pro, self.p_city, self.p_dis,
       self.idx_dic, self.full_dic, self.short_dic) = district_tuple
      # One regex per administrative level, joined with '##'. A previous
      # single-regex form with optional city/district groups was dropped.
      self.pettern = "(?P<prov>%s)##(?P<city>%s)##(?P<dist>%s)" % (
          self.p_pro, self.p_city, self.p_dis)
      with open(base_dir + "/area_variance_dic.pkl", 'rb') as f:
          self.area_variance_dic = pickle.load(f)
  5532. @staticmethod
  5533. def find_whole_areas(text, pettern, area_variance_dic, full_dic, weight=1):
  5534. '''
  5535. 通过正则匹配字符串返回地址
  5536. :param pettern: 地址正则 广东省|广西省|...
  5537. :param text: 待匹配文本
  5538. :return:
  5539. '''
  5540. province_l, city_l, district_l = [], [], []
  5541. text = str(text).replace('(', '(').replace(')', ')')
  5542. text = re.sub('\d{2,4}年度?|[\d/-]{1,5}[月日]|\d+|[a-zA-Z0-9]+', ' ', text)
  5543. text = re.sub(
  5544. '复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间|福田汽车|中山(大学|公园|纪念堂)|孙中山|海天水泥|阳光采购|示范县|珠江城|西九龙站|广州路北|安阳山村|电信|联通|北京现代', # 570445994 广州路北侧 预测为 广州 路北
  5545. ' ', text) # 544151395 赤壁市老城区燃气管道老化更新改造
  5546. text = re.sub('珠海城市', '珠海', text) # 修复 426624023 珠海城市 预测为海城市
  5547. text = re.sub('怒江州', '怒江傈僳族自治州', text) # 修复 423589589 所属地域:怒江州 识别为广西 - 崇左 - 江州
  5548. text = re.sub('茂名滨海新区', '茂名市', text)
  5549. text = re.sub('中山([东南西][部区环]|黄圃|南头|东凤|小榄|石岐|翠亨|南朗)', '中山市', text)
  5550. text = re.sub('横州市', '横县', text) # 例:547363890 修复广西南宁横州 不在地区表问题
  5551. text = re.sub('广东中山', '广东中山市', text)
  5552. text = re.sub('朝阳柳城经济开发区', '朝阳市', text)
  5553. ser = re.search('海南(昌江|白沙|乐东|陵水|保亭|琼中)(黎族)?', text)
  5554. if ser and '黎族' not in ser.group(0):
  5555. text = text.replace(ser.group(0), ser.group(0) + '黎族')
  5556. for k, v in area_variance_dic.items(): # 20241113 根据地区变更信息替换文本
  5557. text = text.replace(k, v)
  5558. text = re.sub('\s+', ' ', text)
  5559. if re.search('[\u4e00-\u9fa5]', text) == None:
  5560. return province_l, city_l, district_l
  5561. for pettern in pettern.split('##'):
  5562. for it in re.finditer(pettern, text):
  5563. if it.group(0) == '站前': # 20240314 修复类似 中铁二局新建沪苏湖铁路工程站前VI标项目 错识别为 省份:辽宁, 城市:营口,区县:站前
  5564. continue
  5565. for k, v in it.groupdict().items():
  5566. if v != None:
  5567. if it.end() == it.end(k) and re.search('[省市区县州旗盟]$', v) == None and re.search(
  5568. '^([东南西北中一二三四五六七八九十大小]?(村|镇|街|路|道|社区|巷|坊)|酒店|宾馆|经济开发区|开发区|新区|公园|广场|医院|[大中小]学)',
  5569. # 城市不匹配为区的地址 修复 滨州北海经济开发区 北海新区 等提取为北海
  5570. text[it.end(k):]) != None:
  5571. continue
  5572. if k in ['prov']:
  5573. if v in full_dic['province']:
  5574. score = 2
  5575. else:
  5576. score = 1
  5577. if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站|地区|区域)'
  5578. , text[it.end(k):]) or re.search('^((%s)|\-%s)' % (v, v),
  5579. text[max(0, it.start(k) - 1):]):
  5580. score += 1
  5581. score += it.end(k) / len(text) / 10
  5582. province_l.append((v, score * weight))
  5583. elif k in ['city', 'city1']:
  5584. if v in full_dic['city']:
  5585. score = 2
  5586. else:
  5587. score = 1
  5588. if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站|地区|区域)'
  5589. , text[it.end(k):]) or re.search('^((%s)|\-%s)' % (v, v),
  5590. text[max(0, it.start(k) - 1):]):
  5591. score += 1
  5592. score += it.end(k) / len(text) / 10 # 优化 572840045 上海铁路公安局合肥公安处 这种表达
  5593. city_l.append((v, score * weight))
  5594. elif k in ['dist', 'dist1', 'dist2']:
  5595. if v in ['东区', '西区', '城区', '郊区', '矿区', '东至']:
  5596. continue
  5597. if v in full_dic['district'] and len(v) > 2:
  5598. score = 2
  5599. else:
  5600. score = 0.5
  5601. if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站|地区|区域)'
  5602. , text[it.end(k):]) or (
  5603. re.match('\s*%s' % v, text) and it.start(k) < 2) or re.search(
  5604. '^((%s)|\-%s)' % (v, v), text[max(0, it.start(k) - 1):]):
  5605. score += 0.5
  5606. # score += it.end(k) / len(text) / 10
  5607. if v == '昌江' and '景德镇' not in it.group(0):
  5608. district_l.append(('昌江黎族', score * weight))
  5609. else:
  5610. district_l.append((v, score * weight))
  5611. return province_l, city_l, district_l
  5612. @staticmethod
  5613. def merge_score(province_l, city_l, district_l, full_dic, short_dic, idx_dic, filter_short_dist=True):
  5614. '''
  5615. 合并分数,下级地区分数加到上级
  5616. :param province_l: 提取到的省份列表 [(name, score)]
  5617. :param city_l: 提取到的城市列表 [(name, score)]
  5618. :param district_l: 提取到的区县列表 [(name, score)]
  5619. :param filter_short_dist: 是否过滤不在省份下的区县简称权重
  5620. :return:
  5621. '''
  5622. pro_ids = dict()
  5623. city_ids = dict()
  5624. dis_ids = dict()
  5625. for pro in province_l:
  5626. name, score = pro
  5627. idx = full_dic['province'][name] if name in full_dic['province'] else short_dic['province'][name]
  5628. if idx not in pro_ids:
  5629. pro_ids[idx] = 0
  5630. pro_ids[idx] += score
  5631. tmp_pro = {}
  5632. for city in city_l:
  5633. name, score = city
  5634. if name in full_dic['city']:
  5635. for idx in full_dic['city'][name]:
  5636. if idx not in city_ids:
  5637. city_ids[idx] = 0
  5638. city_ids[idx] += score
  5639. pro_idx = idx_dic[idx]['省']
  5640. if pro_idx in tmp_pro:
  5641. tmp_pro[pro_idx] += score
  5642. else:
  5643. tmp_pro[pro_idx] = score
  5644. elif name in short_dic['city']:
  5645. for idx in short_dic['city'][name]:
  5646. if idx not in city_ids:
  5647. city_ids[idx] = 0
  5648. city_ids[idx] += score
  5649. pro_idx = idx_dic[idx]['省']
  5650. if pro_idx in tmp_pro:
  5651. tmp_pro[pro_idx] += score
  5652. else:
  5653. tmp_pro[pro_idx] = score
  5654. if set(tmp_pro) & set(pro_ids) != set():
  5655. for k, v in tmp_pro.items():
  5656. if k in pro_ids:
  5657. pro_ids[k] += v
  5658. else:
  5659. pro_ids[k] = v
  5660. else:
  5661. pro_ids.update(tmp_pro)
  5662. tmp_pro = {}
  5663. tmp_city = {}
  5664. for dis in district_l:
  5665. name, score = dis
  5666. if name in full_dic['district']:
  5667. for idx in full_dic['district'][name]:
  5668. if idx not in dis_ids:
  5669. dis_ids[idx] = 0
  5670. dis_ids[idx] += score
  5671. pro_idx = idx_dic[idx]['省']
  5672. if pro_idx in tmp_pro:
  5673. tmp_pro[pro_idx] += score
  5674. else:
  5675. tmp_pro[pro_idx] = score
  5676. city_idx = idx_dic[idx]['市']
  5677. if city_idx in tmp_city:
  5678. tmp_city[city_idx] += score
  5679. else:
  5680. tmp_city[city_idx] = score
  5681. elif name in short_dic['district']:
  5682. for idx in short_dic['district'][name]:
  5683. if idx not in dis_ids:
  5684. dis_ids[idx] = 0
  5685. dis_ids[idx] += score
  5686. pro_idx = idx_dic[idx]['省']
  5687. if filter_short_dist and score < 1: # pro_idx not in pro_ids
  5688. continue
  5689. if pro_idx in tmp_pro:
  5690. tmp_pro[pro_idx] += score
  5691. else:
  5692. tmp_pro[pro_idx] = score
  5693. city_idx = idx_dic[idx]['市']
  5694. if city_idx in tmp_city:
  5695. tmp_city[city_idx] += score
  5696. else:
  5697. tmp_city[city_idx] = score
  5698. if set(tmp_pro) & set(pro_ids) != set():
  5699. for k, v in tmp_pro.items():
  5700. if k in pro_ids:
  5701. pro_ids[k] += v
  5702. else:
  5703. pro_ids.update(tmp_pro)
  5704. if set(tmp_city) & set(city_ids) != set():
  5705. for k, v in tmp_city.items():
  5706. if k in city_ids:
  5707. city_ids[k] += v
  5708. else:
  5709. city_ids.update(tmp_city)
  5710. return pro_ids, city_ids, dis_ids
  5711. @staticmethod
  5712. def get_final_addr(pro_ids, city_ids, dis_ids, idx_dic):
  5713. '''
  5714. 先把所有匹配的全称、简称转为id,如果省份不为空,城市不为空且有城市属于省份的取该城市
  5715. :param province_l: 匹配到的所有省份
  5716. :param city_l: 匹配到的所有城市
  5717. :param district_l: 匹配到的所有区县
  5718. :return:
  5719. '''
  5720. big_area = ""
  5721. pred_pro = ""
  5722. pred_city = ""
  5723. pred_dis = ""
  5724. final_pro = ""
  5725. final_city = ""
  5726. prob = 0
  5727. max_score = 0
  5728. if len(pro_ids) >= 1:
  5729. pro_l = sorted([(k, v) for k, v in pro_ids.items()], key=lambda x: x[1], reverse=True)
  5730. scores = [it[1] for it in pro_l]
  5731. prob = max(scores) / sum(scores)
  5732. max_score = max(scores)
  5733. final_pro, score = pro_l[0]
  5734. if score >= 0.01:
  5735. pred_pro = idx_dic[final_pro]['返回名称']
  5736. big_area = idx_dic[final_pro]['大区']
  5737. if pred_pro != "" and len(city_ids) >= 1:
  5738. city_l = sorted([(k, v) for k, v in city_ids.items()], key=lambda x: x[1], reverse=True)
  5739. for it in city_l:
  5740. if idx_dic[it[0]]['省'] == final_pro:
  5741. final_city = it[0]
  5742. pred_city = idx_dic[final_city]['返回名称']
  5743. break
  5744. if final_city != "" and len(set(dis_ids)) >= 1:
  5745. dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
  5746. for it in dis_l:
  5747. if idx_dic[it[0]]['市'] == final_city:
  5748. pred_dis = idx_dic[it[0]]['返回名称']
  5749. elif pred_pro != "" and pred_city == "" and len(set(dis_ids)) >= 1: # 20241111 省份不为空,市为空,如果区县在省份下,补充对应的市县
  5750. dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
  5751. for it in dis_l:
  5752. if idx_dic[it[0]]['省'] == final_pro:
  5753. pred_city = idx_dic[idx_dic[it[0]]['市']]['返回名称']
  5754. pred_dis = idx_dic[it[0]]['返回名称']
  5755. return big_area, pred_pro, pred_city, pred_dis, prob, max_score
  5756. @staticmethod
  def get_ree_addr(prem):
      '''
      Return the first tenderee's name and address found in ``prem``.

      ``prem`` is read as a dict whose values carry a 'roleList' of dicts with the keys
      'role_name', 'role_text' and 'address'. Entries that do not have that shape are
      skipped instead of aborting the whole scan, so a later well-formed tenderee is
      still found. Missing or ``None`` name/address values are returned as "".

      :param prem: package name -> {'roleList': [...]} mapping
      :return: (tenderee, tenderee_address), both "" when no tenderee is found
      '''
      tenderee = ""
      tenderee_address = ""
      if not isinstance(prem, dict):
          print('解析prem 获取招标人、及地址出错')
          return tenderee, tenderee_address

      malformed = False
      for package in prem.values():
          role_list = package.get('roleList') if isinstance(package, dict) else None
          if not isinstance(role_list, (list, tuple)):
              malformed = True
              continue
          for link in role_list:
              if not isinstance(link, dict) or 'role_name' not in link:
                  malformed = True
                  continue
              if link['role_name'] == 'tenderee' and tenderee == "":
                  # `or ""` keeps None out of the result: callers test `ree in title`
                  # and format the address into text.
                  tenderee = link.get('role_text') or ""
                  tenderee_address = link.get('address') or ""
      if malformed:
          # Same diagnostic the original printed when parsing failed.
          print('解析prem 获取招标人、及地址出错')
      return tenderee, tenderee_address
  5769. @staticmethod
  def get_role_address(text):
      '''
      Extract the tenderee's address from ``text`` with three regular expressions.

      The patterns are tried in order and the 'addr' group of the first hit is returned:
      1. address directly after the tenderee:   招标人:xxx,地址:xxx
      2. tenderee, then agency, then address:   招标人:xxx,代理机构:xxx,地址:xxx
      3. a labelled tenderee address:           采购人地址:xxx

      :param text: text to search
      :return: the matched address, or '' when no pattern matches
      '''
      after_tenderee = r'(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
      after_agency = r'(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(招标|采购)?代理(人|机构)(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
      labelled = r'(采购|招标)(人|单位)(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
      for pattern in (after_tenderee, after_agency, labelled):
          hit = re.search(pattern, text)
          if hit:
              return hit.group('addr')
      return ''
  5786. @staticmethod
  def get_all_addr(list_entity):
      '''
      Collect the location texts and the tenderee/agency names from an entity list.

      Kept are 'location' entities whose text is longer than 2 characters, and 'org' /
      'company' entities whose label is 0 or 1. Duplicates are removed while keeping
      first-seen order; the previous ``set`` based de-duplication made the word order,
      and therefore the text handed to the area matcher, vary between runs.

      :param list_entity: iterable of objects with ``entity_type``, ``entity_text`` and
                          (for org/company entities) ``label`` attributes
      :return: (addresses, names), each a space-joined string
      '''
      addr_l = []
      tenderee_l = []
      for ent in list_entity:
          if ent.entity_type == 'location' and len(ent.entity_text) > 2:
              addr_l.append(ent.entity_text)
          elif ent.entity_type in ('org', 'company') and ent.label in (0, 1):
              tenderee_l.append(ent.entity_text)
      # dict.fromkeys de-duplicates and preserves insertion order.
      return ' '.join(dict.fromkeys(addr_l)), ' '.join(dict.fromkeys(tenderee_l))
  def predict_area(self, title, content, web_source_name, prem=None, addr_dic=None, list_entity=None):
      '''
      Predict area / province / city / district, widening the searched text in up to four rounds.

      Round 1 scans the title plus the project address. A further round runs only while
      the previous one is not confident: no city, top-province share below the round's
      threshold, or top-province score below 2. Each further round appends lower-weighted
      matches to the same candidate lists and re-resolves:
        round 2 (weight 0.8): tenderee name, tenderee address, contact and delivery address
        round 3 (weight 0.6): bid-opening and bid-submission address
        round 4 (weight 0.3): web source name plus all tenderee/agency names and locations
      If the project address names a province/city and round 2 disagrees with round 1 on
      the province, the round 1 result is kept and no further round runs.

      :param title: document title
      :param content: document text, used to find the tenderee address by rule
      :param web_source_name: name of the source site (round 4 only)
      :param prem: structure passed to ``self.get_ree_addr``; defaults to {}
      :param addr_dic: dict that may hold 'addr_project', 'addr_delivery', 'addr_bidopen',
                       'addr_bidsend' and 'addr_contact'; defaults to {}
      :param list_entity: entities passed to ``self.get_all_addr``; defaults to []
      :return: {'district': {'area', 'province', 'city', 'district', 'is_in_text'}}; levels
               that stay unresolved keep '全国' (area, province) / '未知' (city, district)
      '''
      # None instead of mutable literals as defaults; an explicit None is accepted too.
      prem = {} if prem is None else prem
      addr_dic = {} if addr_dic is None else addr_dic
      list_entity = [] if list_entity is None else list_entity

      area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
      addr_project = addr_dic.get('addr_project', '')
      addr_delivery = addr_dic.get('addr_delivery', '')
      addr_bidopen = addr_dic.get('addr_bidopen', '')
      addr_bidsend = addr_dic.get('addr_bidsend', '')
      addr_contact = addr_dic.get('addr_contact', '')
      in_content = False
      not_sure = True  # False once the project address has settled the province

      province_l, city_l, district_l = self.find_whole_areas(
          '%s %s' % (title, addr_project), self.pettern, self.area_variance_dic, self.full_dic)

      def resolve():
          # Score everything collected so far and pick the final location.
          pro_ids, city_ids, dis_ids = self.merge_score(
              province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
          return self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)

      def widen(extra_text, weight):
          # Add the matches of one more text, at the given weight, then re-resolve.
          more_pro, more_city, more_dis = self.find_whole_areas(
              extra_text, self.pettern, self.area_variance_dic, self.full_dic, weight=weight)
          province_l.extend(more_pro)
          city_l.extend(more_city)
          district_l.extend(more_dis)
          return resolve()

      big_area_1, pred_pro_1, pred_city_1, pred_dis_1, prob, max_score = resolve()
      big_area, pred_pro, pred_city, pred_dis = big_area_1, pred_pro_1, pred_city_1, pred_dis_1

      if pred_city_1 == "" or prob < 0.7 or max_score < 2:
          ree, addr = self.get_ree_addr(prem)
          # A tenderee already contained in the title was scanned in round 1; replace it
          # with a neutral token. `not ree` also covers a None name, which used to raise.
          if not ree or ree in title:
              ree = '##'
          rule_ree_addr = self.get_role_address(content)
          if rule_ree_addr:
              addr = rule_ree_addr
          big_area_2, pred_pro_2, pred_city_2, pred_dis_2, prob, max_score = widen(
              '%s %s %s %s' % (ree, addr, addr_contact, addr_delivery), 0.8)
          big_area, pred_pro, pred_city, pred_dis = big_area_2, pred_pro_2, pred_city_2, pred_dis_2

          # Prefer the project address when it names a province/city and disagrees.
          if re.search('省|市|自治', addr_project) and pred_pro_1 != '' and pred_pro_1 != pred_pro_2:
              not_sure = False
              big_area, pred_pro, pred_city, pred_dis = big_area_1, pred_pro_1, pred_city_1, pred_dis_1

          if not_sure and (pred_city_2 == "" or prob < 0.7 or max_score < 2):
              big_area_3, pred_pro_3, pred_city_3, pred_dis_3, prob, max_score = widen(
                  '%s %s' % (addr_bidopen, addr_bidsend), 0.6)
              big_area, pred_pro, pred_city, pred_dis = big_area_3, pred_pro_3, pred_city_3, pred_dis_3

              if pred_city_3 == "" or prob < 0.6 or max_score < 2:
                  all_addr, tenderees = self.get_all_addr(list_entity)
                  big_area_4, pred_pro_4, pred_city_4, pred_dis_4, prob, max_score = widen(
                      '%s %s %s' % (web_source_name, tenderees, all_addr), 0.3)
                  big_area, pred_pro, pred_city, pred_dis = big_area_4, pred_pro_4, pred_city_4, pred_dis_4
                  # Flag results whose province only appeared with the low-weight text
                  # and is still not confident.
                  if pred_pro_3 != pred_pro_4 and (prob < 0.6 or max_score < 2):
                      in_content = True

      # For the four municipalities the district is reported at city level.
      if pred_city in ['北京', '天津', '上海', '重庆']:
          pred_city = pred_dis
          pred_dis = ""
      if big_area != "":
          area_dic['area'] = big_area
      if pred_pro != "":
          area_dic['province'] = pred_pro
      if pred_city != "":
          area_dic['city'] = pred_city
      if pred_dis != "":
          area_dic['district'] = pred_dis
      area_dic['is_in_text'] = in_content
      return {'district': area_dic}
  def get_area(self, text, web_name, in_content=False):
      '''
      Predict area / province / city / district from ``text``, with ``web_name`` as a weak hint.

      Place names found in ``text`` are scored with weight 1 and those found in
      ``web_name`` with weight 0.01; both score tables are summed and resolved to a
      single location.

      Reads ``self.p_pro`` / ``self.p_city`` / ``self.p_dis`` (regex alternations of
      province / city / district names), ``self.full_dic`` and ``self.short_dic`` (name ->
      id lookups), ``self.idx_dic`` (id -> record) and ``self.area_variance_dic`` (text
      replacements applied before matching).

      :param text: text to scan for place names
      :param web_name: source-site name, scanned the same way at weight 0.01
      :param in_content: when truthy the result's 'is_in_text' flag is set to True
      :return: {'district': {'area', 'province', 'city', 'district', 'is_in_text'}}; levels
               that stay unresolved keep '全国' (area, province) / '未知' (city, district)
      '''
      p_pro, p_city, p_dis = self.p_pro, self.p_city, self.p_dis
      idx_dic, full_dic, short_dic = self.idx_dic, self.full_dic, self.short_dic

      # One combined pattern (province[city[district]] | city[district] | district), so
      # that e.g. 海南昌江 is read as 海南 + 昌江 rather than 海南 + 南昌 (20240703).
      area_re = re.compile(
          "((?P<prov>%s)(?P<city>%s)?(?P<dist>%s)?)|((?P<city1>%s)(?P<dist1>%s)?)|(?P<dist2>%s)" % (
              p_pro, p_city, p_dis, p_city, p_dis, p_dis))

      def by_score_desc(id_scores):
          # Highest score first; ties keep dict order because sorted() is stable.
          return sorted(id_scores.items(), key=lambda kv: kv[1], reverse=True)

      def get_final_addr(pro_ids, city_ids, dis_ids):
          '''
          Resolve id -> score tables to (big_area, province, city, district) names.

          The best province wins (reported only with a score of at least 0.01); the city
          and district are the best-scoring candidates belonging to it. Names are "" when
          unresolved.
          '''
          big_area = pred_pro = pred_city = pred_dis = ""
          final_pro = final_city = ""
          if pro_ids:
              final_pro, score = by_score_desc(pro_ids)[0]
              if score >= 0.01:
                  pred_pro = idx_dic[final_pro]['返回名称']
                  big_area = idx_dic[final_pro]['大区']
          if pred_pro != "" and city_ids:
              for city_id, _ in by_score_desc(city_ids):
                  if idx_dic[city_id]['省'] == final_pro:
                      final_city = city_id
                      pred_city = idx_dic[final_city]['返回名称']
                      break
          if final_city != "" and dis_ids:
              for dis_id, _ in by_score_desc(dis_ids):
                  if idx_dic[dis_id]['市'] == final_city:
                      pred_dis = idx_dic[dis_id]['返回名称']
                      # Stop at the best match; without the break the loop ended on the
                      # LOWEST-scoring district of the city.
                      break
          elif pred_pro != "" and pred_city == "" and dis_ids:
              # 20241111: province known, city unknown -> let a district of that province
              # supply both the city and the district.
              for dis_id, _ in by_score_desc(dis_ids):
                  if idx_dic[dis_id]['省'] == final_pro:
                      pred_city = idx_dic[idx_dic[dis_id]['市']]['返回名称']
                      pred_dis = idx_dic[dis_id]['返回名称']
                      break
          # For the four municipalities the district is reported at city level.
          if pred_city in ['北京', '天津', '上海', '重庆']:
              pred_city = pred_dis
              pred_dis = ""
          return big_area, pred_pro, pred_city, pred_dis

      def to_scores(group_list, max_len):
          # (name, begin, end) -> (name, score); longer and later matches score higher.
          return [(name, (e - b + e) / max_len / 2) for name, b, e in group_list]

      def find_whole_areas(text):
          '''Return (province_l, city_l, district_l), each a list of (name, begin, end).'''
          province_l, city_l, district_l = [], [], []
          for it in area_re.finditer(text):
              rest = text[it.end():]
              # A bare name (no 省/市/区/县/旗/盟 in the match) followed by a village /
              # street / hotel word is part of an address line, not an area mention.
              if re.search('[省市区县旗盟]', it.group(0)) is None and re.search(
                      '^([东南西北中一二三四五六七八九十大小]?(村|镇|街|路|道|社区)|酒店|宾馆)', rest):
                  continue
              # 20240314: "站前" in e.g. "...铁路工程站前VI标项目" is not 营口站前区.
              if it.group(0) == '站前':
                  continue
              for k, v in it.groupdict().items():
                  if v is None:
                      continue
                  span = (it.group(k), it.start(k), it.end(k))
                  tail = text[it.end(k):]
                  if k == 'prov':
                      province_l.append(span)
                  elif k in ('city', 'city1'):
                      # A city name followed by 开发区/新区 belongs to a zone name
                      # (e.g. 滨州北海经济开发区 is not 北海).
                      if re.search('^(经济开发区|开发区|新区)', tail):
                          continue
                      city_l.append(span)
                      # Branch offices, line sections, stations and projects named after
                      # the city count twice.
                      if re.search(r'^([分支](公司|局|行|校|院|干?线)|\w{,3}段|地铁|(火车|高铁)?站|\w{,3}项目)', tail):
                          city_l.append(span)
                  elif k in ('dist', 'dist1', 'dist2'):
                      if it.group(k) == '昌江' and '景德镇' not in it.group(0):
                          # 昌江 outside a 景德镇 match is looked up as 昌江黎族.
                          district_l.append(('昌江黎族', it.start(k), it.end(k)))
                      else:
                          district_l.append(span)
          return province_l, city_l, district_l

      def bump(table, key, amount, seed_factor):
          # Add to an existing candidate, or seed a new one at a reduced share.
          if key in table:
              table[key] += amount
          else:
              table[key] = amount * seed_factor

      def get_pro_city_dis_score(text, text_weight=1):
          '''Return (pro_ids, city_ids, dis_ids): id -> score tables scaled by text_weight.'''
          # Blank out words that merely contain a place name (e.g. doc 544151395).
          text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间', ' ', text)
          text = re.sub('珠海城市', '珠海', text)  # doc 426624023: 珠海城市 was read as 海城市
          text = re.sub('怒江州', '怒江傈僳族自治州', text)  # doc 423589589: 怒江州 was read as 江州
          text = re.sub('茂名滨海新区', '茂名市', text)
          text = re.sub('中山([东南西][部区环]|黄圃|南头|东凤|小榄|石岐|翠亨|南朗)', '中山市', text)
          text = re.sub('横州市', '横县', text)  # doc 547363890: 横州 is not in the area table
          ser = re.search('海南(昌江|白沙|乐东|陵水|保亭|琼中)(黎族)?', text)
          if ser and '黎族' not in ser.group(0):
              text = text.replace(ser.group(0), ser.group(0) + '黎族')
          for k, v in self.area_variance_dic.items():  # 20241113: apply recorded area renames
              text = text.replace(k, v)

          province_l, city_l, district_l = find_whole_areas(text)
          province_l = to_scores(province_l, max_len=len(text))
          city_l = to_scores(city_l, max_len=len(text))
          district_l = to_scores(district_l, max_len=len(text))

          pro_ids = dict()
          city_ids = dict()
          dis_ids = dict()

          for name, score in province_l:
              assert (name in full_dic['province'] or name in short_dic['province'])
              if name in full_dic['province']:
                  idx = full_dic['province'][name]
                  pro_ids[idx] = pro_ids.get(idx, 0) + (score + 1)  # full name: +1 bonus
              else:
                  idx = short_dic['province'][name]
                  pro_ids[idx] = pro_ids.get(idx, 0) + (score + 0)

          for name, score in city_l:
              if name in full_dic['city']:
                  # An ambiguous name (several ids) counts one tenth for each of them.
                  w = 0.1 if len(full_dic['city'][name]) > 1 else 1
                  for idx in full_dic['city'][name]:
                      gain = (score + 2) * w
                      city_ids[idx] = city_ids.get(idx, 0) + gain
                      bump(pro_ids, idx_dic[idx]['省'], gain, 0.5)
              elif name in short_dic['city']:
                  w = 0.1 if len(short_dic['city'][name]) > 1 else 1
                  for idx in short_dic['city'][name]:
                      gain = (score + 1) * w * idx_dic[idx]['权重']
                      city_ids[idx] = city_ids.get(idx, 0) + gain
                      bump(pro_ids, idx_dic[idx]['省'], gain, 0.5)

          for name, score in district_l:
              if name in full_dic['district']:
                  w = 0.1 if len(full_dic['district'][name]) > 1 else 1
                  for idx in full_dic['district'][name]:
                      gain = (score + 1) * w
                      dis_ids[idx] = dis_ids.get(idx, 0) + gain
                      bump(pro_ids, idx_dic[idx]['省'], gain, 0.5)
                      bump(city_ids, idx_dic[idx]['市'], gain, 0.5)
              elif name in short_dic['district']:
                  w = 0.1 if len(short_dic['district'][name]) > 1 else 1
                  for idx in short_dic['district'][name]:
                      weight = idx_dic[idx]['权重']
                      dis_ids[idx] = dis_ids.get(idx, 0) + (score + 0) * w
                      pro_idx = idx_dic[idx]['省']
                      city_idx = idx_dic[idx]['市']
                      # 20241111 / 20241015 (doc 536550843): a short district name never
                      # introduces a new province, and supports a city only when its
                      # city or province was already found.
                      if city_idx not in city_ids and pro_idx not in pro_ids:
                          continue
                      gain = (score + 0) * w * weight
                      if pro_idx in pro_ids:
                          pro_ids[pro_idx] += gain
                      if city_idx in city_ids:
                          city_ids[city_idx] += gain
                      elif pro_idx in pro_ids:
                          city_ids[city_idx] = gain * 0.1

          pro_ids = {k: v * text_weight for k, v in pro_ids.items()}
          city_ids = {k: v * text_weight for k, v in city_ids.items()}
          dis_ids = {k: v * text_weight for k, v in dis_ids.items()}
          return pro_ids, city_ids, dis_ids

      area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
      pro_ids, city_ids, dis_ids = get_pro_city_dis_score(text)
      # The source-site name only contributes at weight 0.01.
      web_scores = get_pro_city_dis_score(web_name, text_weight=0.01)
      for table, extra in zip((pro_ids, city_ids, dis_ids), web_scores):
          for k, v in extra.items():
              if k in table:
                  table[k] += v
              else:
                  table[k] = v

      big_area, pred_pro, pred_city, pred_dis = get_final_addr(pro_ids, city_ids, dis_ids)
      if big_area != "":
          area_dic['area'] = big_area
      if pred_pro != "":
          area_dic['province'] = pred_pro
      if pred_city != "":
          area_dic['city'] = pred_city
      if pred_dis != "":
          area_dic['district'] = pred_dis
      if in_content:
          area_dic['is_in_text'] = True
      return {'district': area_dic}
  def predict(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
      '''
      Predict the area of an announcement in up to three attempts.

      1. If a rule-extracted project address names a city/county/league/banner or an
         autonomous region, it is resolved on its own and returned when it yields both
         a province and a city.
      2. Otherwise "tenderee + best address + project name/title" is resolved. The
         address is the first non-empty one of: project address, tenderee contact
         address, the tenderee address from ``prem``, an address in the title, and the
         bid-opening address. Company names are removed from the project name first.
      3. If a province or city is still missing, all tenderee/agency names and locations
         from the entity list plus the title are resolved together with the source-site
         name; that result replaces the previous one when it adds a province, or adds a
         city for the same province.

      :param project_name: project name (converted with str())
      :param prem: sequence whose first item holds a 'prem' dict of packages with 'roleList'
      :param title: document title
      :param list_articles: sequence whose first item has a ``content`` string; text after
                            a '##attachment##' marker is only used when the body before it
                            is shorter than 200 characters
      :param web_source_name: name of the source site (converted with str())
      :param list_entitys: sequence whose first item is the entity list; may be empty
      :return: the dict produced by ``self.get_area``
      '''
      def first_addr(patterns, text):
          # 'addr' group of the first pattern that matches, '' when none does.
          for pattern in patterns:
              hit = re.search(pattern, text)
              if hit:
                  return hit.group('addr')
          return ''

      def get_ree_addr(prem):
          # First tenderee's name and address; None values are returned as "".
          tenderee = ""
          tenderee_address = ""
          try:
              for v in prem[0]['prem'].values():
                  for link in v['roleList']:
                      if link['role_name'] == 'tenderee' and tenderee == "":
                          tenderee = link['role_text'] or ""
                          tenderee_address = link['address'] or ""
          except (AttributeError, IndexError, KeyError, TypeError):
              print('解析prem 获取招标人、及地址出错')
          return tenderee, tenderee_address

      def get_role_address(text):
          # Tenderee address: 招标人:xxx,地址:xxx / 招标人:xxx,代理机构:xxx,地址:xxx / 采购人地址:xxx
          p3 = r'(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
          p4 = r'(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(招标|采购)?代理(人|机构)(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
          p5 = r'(采购|招标)(人|单位)(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
          return first_addr((p3, p4, p5), text)

      def get_project_addr(text):
          # Project / construction / delivery location.
          p1 = r'(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)(位于)?:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+([\w()]{,20}[,。])?|\w{2,15}[,。])'
          p2 = r'项目位于(?P<addr>\w{2}市\w{2,4}区)'
          return first_addr((p1, p2), text)

      def get_bid_addr(text):
          # Bid opening / submission / registration location.
          p2 = r'(磋商|谈判|开标|投标|评标|报名|递交|评审|发售|所属)(地址|地点|所在地区?|地域):(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
          return first_addr((p2,), text)

      def get_title_addr(text):
          p1 = r'(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
          return first_addr((p1,), text)

      def get_all_addr(list_entitys):
          # Locations and tenderee/agency names of the first entity list. An empty
          # container (including the "" default) yields empty strings instead of raising.
          tenderee_l = []
          addr_l = []
          for ent in (list_entitys[0] if list_entitys else []):
              if ent.entity_type == 'location' and len(ent.entity_text) > 2:
                  addr_l.append(ent.entity_text)
              elif ent.entity_type in ['org', 'company']:
                  if ent.label in [0, 1]:  # tenderee or agency
                      tenderee_l.append(ent.entity_text)
          return ' '.join(addr_l), ' '.join(tenderee_l)

      def usable(addr):
          # Reject placeholders such as "采购人指定地点" (doc 554024168).
          return addr != "" and re.search('(采购人|招标人)?指定地点', addr) is None

      raw_content = list_articles[0].content
      if '##attachment##' in raw_content:
          # partition() splits at the first marker only; unpacking split() into two names
          # raised ValueError when the marker occurred more than once.
          content, _, attachment = raw_content.partition('##attachment##')
          if len(content) < 200:
              content += attachment
      else:
          content = raw_content

      tenderee, tenderee_address = get_ree_addr(prem)
      pro_addr = get_project_addr(content)
      if usable(pro_addr):
          tenderee_address = pro_addr
      else:
          role_addr = get_role_address(content)
          if usable(role_addr):
              tenderee_address = role_addr
      if tenderee_address == "":
          title_addr = get_title_addr(title)
          if title_addr != "":
              tenderee_address = title_addr
          else:
              bid_addr = get_bid_addr(content)
              if bid_addr != "":
                  tenderee_address = bid_addr

      project_name = str(project_name)
      tenderee = str(tenderee)
      project_name = project_name + title if project_name not in title else title
      if len(project_name) > 3:
          # 2024/4/26: drop every company name found in the project name.
          entity_list = getNers([project_name], useselffool=False)
          for tup in entity_list[0]:
              if tup[2] in ['org', 'company']:
                  project_name = project_name.replace(tup[3], '')
      text1 = "{0} {1} {2}".format(tenderee, tenderee_address, project_name)
      web_source_name = str(web_source_name)  # some sources pass a non-string here
      # Avoid reading 合肥 / 路南 / 新会 out of these words.
      text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1)

      if pro_addr and re.search(r'\w{2,}([市县旗盟]|自治[区州县旗])', pro_addr):
          if re.search('[市县旗盟]', pro_addr) is None:
              # Incomplete project address (doc 486623506): add the first-pass text.
              pro_addr = text1 + ' ' + pro_addr
          rs = self.get_area(pro_addr, '')
          if rs['district']['province'] != '全国' and rs['district']['city'] != '未知':
              return rs

      # 2024/4/22: the first pass runs without the source-site name.
      rs = self.get_area(text1, '')
      if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
          all_addr, tenderees = get_all_addr(list_entitys)
          text2 = tenderees + " " + all_addr + ' ' + title
          text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
          rs2 = self.get_area(text2, web_source_name, in_content=True)
          if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':
              rs = rs2
          elif rs['district']['province'] == rs2['district']['province'] and rs2['district']['city'] != '未知':
              rs = rs2
      return rs
  class TableTag2List():
      '''Turn a soup <table> into a rectangular list of rows, repeating the content of
      cells that span several rows/columns: [[td, td, td], [td, td, td]].'''

      @staticmethod
      def _parse_span(raw):
          '''
          Parse a rowspan/colspan attribute value into a span of at least 1.

          Missing or non-numeric values give 1. "0" also gives 1: a zero span used to
          drop the cell and to stall the row index, so the next row overwrote this one.
          '''
          if raw and raw.isdecimal():
              return max(int(raw), 1)
          return 1

      def _cell_text(self, cell, text_process, return_kv):
          '''Return the value stored for ``cell``: ``[text, 0]`` when a ``text_process``
          callable is given, otherwise the plain text.'''
          if text_process != None:
              # Whitespace-free text, so a name split across <p> tags is joined (doc 370835008).
              td_text = re.sub(r'\s|\xa0', '', str(cell.get_text()))
              shown = cell.get_text().strip()
              if 'title' in cell.attrs and shown.endswith('...') and shown[:-3] in cell.attrs['title']:
                  # Text elided with "..."; the full text is in the title attribute (doc 215597851).
                  td_text = cell.attrs['title']
              elif len(td_text) > 30:
                  if return_kv:
                      td_text = cell.get_text().strip()
                  else:
                      td_text = re.sub('\xa0', '', text_process(cell, final=False))
              if td_text == "":
                  td_text = ' '
              return [td_text, 0]
          if return_kv:
              return cell.get_text().strip()
          # Strip control characters and backslashes, and map full-width parentheses to
          # ASCII ones (the replace calls as found mapped "(" to "(", i.e. did nothing).
          return (str(cell.get_text()).strip()
                  .replace("\x06", "").replace("\x05", "").replace("\x07", "")
                  .replace('\\', '').replace('\uff08', '(').replace('\uff09', ')')
                  .replace('?', '').replace('&nbsp', ''))

      def table2list(self, table, text_process=None, return_html_table=False,return_kv=False):
          '''
          Complete the table grid and return its cell contents.

          :param table: table element; must offer find_all('tr'), and its rows must offer
                        ``children`` and find_all(['td', 'th'], recursive=False)
          :param text_process: optional callable ``text_process(cell, final=False)``. When
                        given, every cell becomes ``[text, 0]`` (whitespace removed, long
                        texts taken from the callable unless ``return_kv``); when None,
                        cells are plain strings
          :param return_html_table: also return a grid of ``str(cell)`` of the same layout;
                        empty rows are dropped from both grids in that case
          :param return_kv: use the stripped raw cell text instead of the cleaned /
                        processed text
          :return: the grid, or ``(grid, html_grid)`` when ``return_html_table``. Both are
                   empty when a row has more than 20 direct cells, or when
                   ``text_process`` is None and a row grows beyond 50 columns.
          '''
          self._output = []
          html_table = []
          row_ind = 0
          for row in table.find_all('tr'):
              if len(row.find_all(['td', 'th'], recursive=False)) > 20:
                  log('未补全前表格列数大于20的不做表格处理')
                  return ([], []) if return_html_table else []
              col_ind = 0
              for cell in row.children:
                  if cell.name not in ('td', 'th'):
                      continue
                  row_span = self._parse_span(cell.get('rowspan'))
                  # Cap the column span: doc 335590254 has colspan=200 in its first row.
                  col_span = min(self._parse_span(cell.get('colspan')), 20)
                  # Skip slots already taken by cells spanning down from earlier rows.
                  while not self._check_cell_validity(row_ind, col_ind):
                      col_ind += 1
                  try:
                      text = self._cell_text(cell, text_process, return_kv)
                      self._insert(row_ind, col_ind, row_span, col_span, text)
                      if return_html_table:
                          html_table = self._insert_new(row_ind, col_ind, row_span, col_span, str(cell), html_table)
                  except UnicodeEncodeError as err:
                      raise Exception('Failed to decode text; you might want to specify kwargs transformer=unicode') from err
                  col_ind += col_span
                  # Element/candidate extraction ignores tables wider than 50 columns.
                  if col_ind > 50 and text_process == None:
                      return ([], []) if return_html_table else []
              # Spans are at least 1, so every <tr> advances the grid by exactly one row.
              row_ind += 1
          if return_html_table:
              self._output = [r for r in self._output if len(r) > 0]
              html_table = [r for r in html_table if len(r) > 0]
              return self._output, html_table
          return self._output

      def _check_validity(self, i, j, height, width):
          """
          check if a rectangle (i, j, height, width) can be put into self.output
          """
          return all(self._check_cell_validity(ii, jj) for ii in range(i, i+height) for jj in range(j, j+width))

      def _check_cell_validity(self, i, j):
          """
          check if a cell (i, j) can be put into self._output
          (the slot does not exist yet, or still holds the "" placeholder)
          """
          if i >= len(self._output):
              return True
          if j >= len(self._output[i]):
              return True
          return self._output[i][j] == ""

      def _insert(self, i, j, height, width, val):
          """write val into every free slot of the rectangle (i, j, height, width)"""
          for ii in range(i, i+height):
              for jj in range(j, j+width):
                  self._insert_cell(ii, jj, val)

      def _insert_cell(self, i, j, val):
          """grow self._output up to slot (i, j) with "" placeholders; fill it if free"""
          while i >= len(self._output):
              self._output.append([])
          while j >= len(self._output[i]):
              self._output[i].append("")
          if self._output[i][j] == "":
              self._output[i][j] = val

      def _insert_new(self, i, j, height, width, val, cell_list):
          """same as _insert, but on the given grid cell_list, which is returned"""
          for ii in range(i, i+height):
              for jj in range(j, j+width):
                  cell_list = self._insert_cell_new(ii, jj, val, cell_list)
          return cell_list

      def _insert_cell_new(self, i, j, val, cell_list):
          """same as _insert_cell, but on the given grid cell_list, which is returned"""
          while i >= len(cell_list):
              cell_list.append([])
          while j >= len(cell_list[i]):
              cell_list[i].append("")
          if cell_list[i][j] == "":
              cell_list[i][j] = val
          return cell_list
  def is_head_line(list_item):
      '''
      Ask the "form" predictor whether a table row is a header row.

      Every cell text is encoded and classified; the row counts as a header when more
      than 60% of its cells have a second prediction value above 0.6.

      :param list_item: cell texts of one row, e.g. ['技术参数、要求', '变更项']
      :return: True for a header row, otherwise False (also for an empty row, which
               previously ended in a division by zero)
      '''
      if not list_item:
          return False
      predictor = getPredictor("form")  # looked up once instead of once per cell
      x = [predictor.encode(item) for item in list_item]
      predict_y = predictor.predict(np.array(x), type="item")
      count = sum(1 for _, values in zip(list_item, list(predict_y)) if values[1] > 0.6)
      return count / len(list_item) > 0.6
  6427. class TablePremExtractor(object):
    def __init__(self):
        '''Build the per-element header regex table, load the known-header set and create the table parser.'''
        # Maps an element name to the regex that find_header() searches for in a header cell.
        # NOTE(review): as written here the 'bid_amount' pattern contains an unmatched ')'
        # ("(中标|成交|合同))?"), which makes re.search raise re.error; "标段(包)" and
        # "报价((...))" also look like full-width parentheses that were normalised to ASCII.
        # Presumably the original literals used full-width brackets - confirm against the
        # authoritative copy of this file before relying on these patterns.
        self.head_rule_dic = {
            'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码|代码)",
            'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$|^品目$",
            "project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
            "win_sort": "排名|排序|名次|推荐顺序",
            'win_or_not': '是否(建议|推荐)?(中标|成交|中选)|是否入围|是否入库|入围结论|未(中标|成交)原因|中标情况|^中标结果$',
            "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)(名称|$)|^(拟定|单一来源|邀请|拟?推荐(入选|入围)?)?供应商(名称)?$",
            "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
            "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
            "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格|中标存款金?额|中标资金|中标存款|存放金额|分配额度",
            "serviceTime": '合同期限|工期/交货期/服务期|工期\(交货期\)|合格工期|服务期限|工期' \
                           '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期限' \
                           '|合格工期|计划工期\(服务期\)|服务期|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
                           '|交货时间|保洁期限|维保期|管理年限|工期承诺|(服务|合同|施工|实施|工程|设计)的?(年限|期限|周期|期:)' \
                           '|计划工期|工期要求|服务期限?' \
                           '|投标工期|设计工期|合格服务周期|总工期|服务时间(范围)?|流转期限|维护期限|服务时限|交货期' \
                           '|完成时间|中标工期|项目周期|期限要求|周期|供货期|合同的?履行日期|计划周期' \
                           '|履约期限|合同的?约定完成时限|合同的?完成日期|承诺完成日期' \
                           '|合同起始日起|合同的?履约期|履约截止日期|承包期限|合同的?完成日期|特许经营期限' \
                           '|服务期间|服务履行期|委托(管理)?期限|经营期限|数量' \
                           '|(工期|服务期限?|交货期限?|服务履行期|合同期限?|履[行约]期限?)说明|存款期限?|(存款|存放|定存)(期|年)限' \
                           '|服务(有效期|年限)|本?合同有效期|协议有效期|项目期限'
        }
        # Known header texts, unpickled from header_set.pkl next to this module.
        # It is used with set operations (&, -, .add) elsewhere in this class.
        # NOTE(review): pickle.load executes whatever the file encodes - the file must be trusted.
        with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
            self.headerset = pickle.load(f)
        # Table parser; get_prem() calls self.tb.table2list(table) to turn a <table> into rows.
        self.tb = TableTag2List()
    def find_header(self, td_list, all_winner=False, first_line=False):
        '''
        Decide whether one table row is a header row and map its cells to element names.

        NOTE(review): the nesting below was reconstructed from a copy of this file whose
        indentation had been lost - confirm block boundaries against the original.

        :param td_list: cell texts of one table row
        :param all_winner: truthy when alternative bidder/supplier headers may stand in for
                           a missing 'tenderer' header (see the all_winner branches below)
        :param first_line: when True the "at least two known headers" requirement is waived
        :return: tuple ``(flag, contain_header, header_dic, not_sure_winner)`` where
                 ``flag`` is True when the row passed the header test,
                 ``contain_header`` is True when the row failed that test but still holds known headers,
                 ``header_dic`` maps element name -> ``(column index, cleaned cell text)`` and is
                 empty whenever the row is not usable for extraction,
                 ``not_sure_winner`` is True when the winner column was taken from a fallback
                 header and no ranking column exists.
        Side effect: short cell texts matching the 'bid_amount' rule are added to ``self.headerset``.
        '''
        fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元|拟|\s', '', it) for it in td_list]  # strip noise from the cells so they can be compared with the known-header set
        header_dic = dict()
        flag = False
        contain_header = False
        not_sure_winner = False  # True when the winner column comes from a wording that does not clearly say "winner"
        # Learn new header texts: any unknown short cell that matches the bid-amount rule.
        for text in set(fix_td_list) - self.headerset:
            if len(text)<10 and re.search(self.head_rule_dic['bid_amount'], text):
                self.headerset.add(text)
        # Header test: (first row, or >= 2 known headers) and (>= 60% of distinct cells are known headers, or the model says so).
        if len(set(fix_td_list))>0 and (first_line or len(set(fix_td_list) & self.headerset)>=2) and (len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6 or is_head_line(fix_td_list)):
            other_tenderer = ""   # fallback winner column: "中标/成交/入围... 人/单位..." wording
            other_tenderer2 = ""  # weaker fallback: "投标/响应... 人/单位", "供应商", "单位" wording
            flag = True
            for i in range(len(td_list)) :
                text = td_list[i]
                text = re.sub('\s|[((]排名不分先后[))]', '', text)
                text = re.sub('排名价', '', text)  # 2024-12-25: case 252208201, "排名价(元)" was wrongly matched as a ranking header
                text = re.sub('^人选', '入选', text)
                if text == '备选中标人':
                    text = '第二候选人'
                if len(re.sub('(([\w、×*/]{1,20}))$', '', text)) > 15:  # cells longer than 15 chars are not treated as headers
                    continue
                if re.search('未(中标|成交|中选|入围)原因', text):  # tables listing reasons for not winning are not extracted
                    return flag, contain_header, dict(), not_sure_winner
                num = 0  # number of element rules this single cell was assigned to
                for k, v in self.head_rule_dic.items():
                    if re.search('评分|得分|分数|分值', text):
                        continue
                    if re.search(v, text):
                        if k in ['tenderer'] and re.search('是否', text):
                            continue
                        if k == 'budget' and re.search('量', text):  # "预算工作量", "预算采购量" etc. are quantities, not budgets
                            continue
                        elif k == 'bid_amount' and re.search('分配方案|基准利率|BP值', text):  # case 517987084: "中标资金分配方案"
                            continue
                        elif k in header_dic:
                            # The element already has a column: only replace it in the cases below.
                            if k in ['budget', 'bid_amount'] and re.search('总(价|金?额)', text):  # a total price replaces a unit price
                                header_dic[k] = (i, text)
                                num += 1
                            elif k == 'project_code' and text != header_dic[k][1] and 'package_code' not in header_dic\
                                    and re.search(self.head_rule_dic['package_code'], re.sub('\s', '', ','.join(td_list)))==None:  # a second, different project-code column with no package column is used as the package code, e.g. case 472537470
                                header_dic['package_code'] = (i, text)
                            continue
                        header_dic[k] = (i, text)
                        num += 1
                    elif re.search('^((中标|成交|中选|入围|入选)(候选)?)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)(名称)?$', text) and re.search('未', text)==None:
                        other_tenderer = (i, text)
                    elif re.search('^((投标|应答|响应|候选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)|(存款|投标)?银行|供应商)(名称)?$|^机构名称$|^单位(名称)?$', text) and re.search('未', text)==None:
                        other_tenderer2 = (i, text)
                if num>1:
                    # One cell matched several elements: tolerated only for a combined project/package code cell.
                    if re.search(self.head_rule_dic['project_code'], text) and re.search(self.head_rule_dic['package_code'], text):  # case 528486798: "分标编号-包号"
                        continue
                    # print('header error: one td matched two headers:', header_dic)
                    return flag, contain_header, dict(), not_sure_winner
                if text == '单位':  # 2024-11-28: a separate column holding the money unit
                    header_dic['amount_unit'] = (i, text)
            # A bare "金额" header is ambiguous: assign it by which party column is present.
            if re.search(';金额((万?元))?;', ';'.join(td_list)):
                if 'tenderer' in header_dic and 'bid_amount' not in header_dic:
                    for i in range(len(td_list)):
                        text = td_list[i]
                        if re.search('^金额((万?元))?$',text):
                            header_dic['bid_amount'] = (i, text)
                            break
                elif 'tenderee' in header_dic and 'budget' not in header_dic:
                    for i in range(len(td_list)):
                        text = td_list[i]
                        if re.search('^金额((万?元))?$', text):
                            header_dic['budget'] = (i, text)
                            break
            if all_winner == 1 and 'tenderer' not in header_dic:  # announcement title indicates deposit / shortlist style: accept the fallback columns as winner
                if other_tenderer!="":
                    header_dic['tenderer'] = other_tenderer
                elif other_tenderer2!="":
                    header_dic['tenderer'] = other_tenderer2
                if 'win_sort' not in header_dic:
                    not_sure_winner = True
            elif 'tenderer' not in header_dic and 'win_or_not' in header_dic:
                if other_tenderer!="":
                    header_dic['tenderer'] = other_tenderer
                elif other_tenderer2!="":
                    header_dic['tenderer'] = other_tenderer2
            if all_winner == 1 and 'win_sort' in header_dic:  # in all_winner mode the ranking column is ignored
                header_dic.pop('win_sort')
            if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic) and (
                    'tenderer' in header_dic or'budget' in header_dic):  # a lot/project column plus a budget or winner column: extract
                return flag, contain_header, header_dic, not_sure_winner
            elif ('tenderer' in header_dic) and ('bid_amount' in header_dic):  # winner plus bid amount: extract
                if 'win_sort' in header_dic:  # ranked tables are left to the candidate extractor
                    return flag, contain_header, dict(), not_sure_winner
                elif re.search('^(候选)?供应商(名称)?', header_dic['tenderer'][1]) and 'win_or_not' not in header_dic and re.search('(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)', header_dic['bid_amount'][1])==None:  # only a plain supplier-name column without ranking or lot: drop to avoid bogus packages, case 334205629
                    # NOTE(review): the second pattern above has an unmatched ')' as written - confirm the original literal.
                    # print('supplier name only, no ranking or package code: dropped')
                    return flag, contain_header, dict(), not_sure_winner
                return flag,contain_header, header_dic, not_sure_winner
            elif 'tenderer' in header_dic and (re.search('(中标|中选|中价|成交|竞得)(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)',header_dic['tenderer'][1]) or all_winner):  # a winner column with an explicit "winning" keyword: extract
                return flag, contain_header, header_dic, not_sure_winner
            # elif 'tenderer' in header_dic and 'serviceTime' in header_dic:
            #     return flag, contain_header, header_dic, not_sure_winner
        elif len(set(fix_td_list) & self.headerset) >= 2 or (len(set(fix_td_list)) == 2 and len(set(td_list) & self.headerset) >= 1):  # at least two known headers, or a two-column row containing one known header
            contain_header = True
        return flag, contain_header, dict(), not_sure_winner
    def extract_from_df(self, df, headers, web_source_name, all_winner=False):
        '''
        Walk the data rows of one table and build the per-package result dict.

        NOTE(review): the nesting below was reconstructed from a copy of this file whose
        indentation had been lost - confirm block boundaries against the original,
        in particular the spots marked NOTE(review).

        :param df: DataFrame of data rows; cells are read by column index and ``.strip()`` is
                   called on them, so they are expected to be strings
        :param headers: element name -> ``(column index, header text)`` as built by find_header
        :param web_source_name: source site name; one site gets special zero-amount filtering
        :param all_winner: passed through from the caller; disables the blanking of
                           "candidate/shortlist" winner columns that have no ranking
        :return: dict mapping package key -> {'code', 'name', 'roleList', 'tendereeMoney',
                 'tendereeMoneyUnit'} plus an optional 'unit_tendereeMoney'; ``{}`` when the
                 table has no usable lot/party columns
        Reads ``self.headerset`` and ``self.nlp_enterprise`` (the latter is assigned in predict()).
        '''
        prem_dic = {}
        previous_package = ""  # package_code + project_code of the previous row
        multi_same_package = False  # the same normalised package key appeared again on non-consecutive rows
        package_fix2raw = dict()  # normalised package key -> raw package text
        link_set = set()  # row tuples already seen, used to skip duplicate rows
        tenderer_list = []  # every winner kept so far
        serviceTime_list = []
        # True when the "name" column is a goods/product name and there is no package or money column.
        not_package = True if 'project_name' in headers and re.search('(货物|商品|产品|通用|主要标的)(名称?|内容)', headers['project_name'][1]) and \
                              'package_code' not in headers and 'budget' not in headers and "bid_amount" not in headers else False
        if set(['project_code', 'package_code', 'tenderee', 'tenderer']) & set(headers) == set() and ('project_name' not in headers  # filter only when there is no project name, or the project name is a goods name
                                                                                                       or re.search('(货物|商品|产品|设备|通用|主要标的)(名称?|内容)', headers['project_name'][1])):  # 2024-01-31: goods name + ceiling price only was wrongly split into packages (case 396636683); keeps procurement-intention case 423647863
            # print('no package code and no role column: skipped')
            return {}
        have_bid_amount = False  # whether the bid-amount column contains any non-zero digit
        if "bid_amount" in headers and re.search('[1-9]+', '#'.join([it.strip() for it in df[headers['bid_amount'][0]]])):
            have_bid_amount = True
        for i in df.index:
            same_package = False  # same package as the previous row (typically a rowspan: one package, several items)
            project_code = df.loc[i, headers['project_code'][0]].strip() if "project_code" in headers else ""
            package_code_raw = df.loc[i, headers['package_code'][0]].strip() if "package_code" in headers else ""
            project_name = df.loc[i, headers['project_name'][0]].strip() if "project_name" in headers else ""
            tenderee = df.loc[i, headers['tenderee'][0]].strip() if "tenderee" in headers else ""
            tenderer = df.loc[i, headers['tenderer'][0]].strip() if "tenderer" in headers else ""
            budget_ = df.loc[i, headers['budget'][0]].strip() if "budget" in headers else ""
            bid_amount_ = df.loc[i, headers['bid_amount'][0]].strip() if "bid_amount" in headers else ""
            win_sort = df.loc[i, headers['win_sort'][0]].strip() if "win_sort" in headers else ""
            win_or_not = df.loc[i, headers['win_or_not'][0]].strip() if "win_or_not" in headers else ""
            serviceTime = df.loc[i, headers['serviceTime'][0]].strip() if "serviceTime" in headers else ""
            amount_unit = df.loc[i, headers['amount_unit'][0]].strip() if "amount_unit" in headers else ""
            if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_]) & self.headerset != set():  # any value is itself a header text: stop
                # print('a value is a header text, stop matching', set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset)
                break
            if len(set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort])- set(['', ' '])) < 2 and tenderer=='':  # row is empty or all values identical: stop
                # print('row empty or all identical, stop matching')
                break
            if re.search('详见', project_name):  # drop "see the tender documents"-style placeholders
                project_name = ""
            # A project name that is really just a lot label is moved to the package code.
            if package_code_raw == "" and re.search('第?[0-9一二三四五六七八九十a-zA-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))$|^(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zA-Z]{1,4}$', project_name):
                package_code_raw = project_name
                project_name = ""
            package_code = package_code_raw
            if re.search('合计|总计', package_code+project_code+project_name):  # skip total/subtotal rows
                continue
            if package_code + project_code == previous_package:  # case 208162730: one package buying several items
                same_package = True
                if previous_package!="":  # same non-empty package/project code as the previous row: drop the project name
                    project_name = ''
            previous_package = package_code + project_code
            if win_sort != "" and re.search('排名|排序|名次|推荐顺序', headers['win_sort'][1]):  # ranked tables belong to CandidateExtractor; avoids treating case 328485591 as multi-package
                break
            if win_or_not != "" and (re.search('(建议|推荐)(中标|成交|中选)|是|^(中标|成交|中选)', win_or_not)==None or re.search('\w', win_or_not)==None):  # 2024-04-02: case 252208201, row not marked as winning
                continue
            elif 'win_or_not' in headers and win_or_not == '':  # 2024-12-25: case 334753545, empty "win status" means not winning
                continue
            if "win_sort" in headers and win_sort == "":  # ranking column exists but is blank on this row: skip
                continue
            if win_sort == "" and "tenderer" in headers and re.search('候选|入围|入选', headers['tenderer'][1]) and re.search('推荐的?((中标|成交|中选)候选人|(候选|入围|入选)供应商)', headers['tenderer'][1])==None and all_winner == False:
                tenderer = ""
            if tenderer in ['采购失败', '废标']:  # failed procurement rows; avoids extracting only one package in case 353867205
                continue
            # tenderee = tenderee if self.is_role(tenderee) else ""
            # tenderer = tenderer if self.is_role(tenderer) else ""
            package = uniform_package_name(package_code) if package_code else '自增1'  # rows without a package code all share the auto key '自增1'
            if project_name != "" and package.startswith('自增'):
                pk_l = find_package(project_name)
                if len(pk_l)==1:
                    package = uniform_package_name(pk_l[0].group(0))
            elif re.search('[一二三四五六七八九十]+标段:|标段[一二三四五六七八九十]+:', tenderer) and package.startswith('自增'):
                pk_l = find_package(tenderer)
                if len(pk_l) == 1:
                    package = uniform_package_name(pk_l[0].group(0))
            # get_role / cut_repeat_name here are module-level helpers defined outside this class.
            tenderee = get_role(tenderee, self.nlp_enterprise) if tenderee!="" else tenderee
            tenderer = get_role(tenderer, self.nlp_enterprise) if tenderer!='' else tenderer
            tenderee = cut_repeat_name(tenderee)
            tenderer = cut_repeat_name(tenderer)
            if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
                break
            # Skip rows already seen; for goods tables the project name is left out of the key.
            if not_package:
                if (project_code, package_code, tenderee, tenderer, budget_, bid_amount_) in link_set:
                    continue
                link_set.add((project_code, package_code, tenderee, tenderer, budget_, bid_amount_))
            else:
                if (project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_) in link_set:
                    continue
                link_set.add((project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_))
            if project_code != "":
                uni_project_code= uniform_package_name(project_code)
                if uni_project_code != "" and uni_project_code!=package:
                    if package.startswith('自增'):  # no package code but a project code: use the project code as key
                        package = uni_project_code
                    else:
                        # print('recombined package key:', '%s_%s'%(uni_project_code, package))
                        package = '%s_%s'%(uni_project_code, package)  # both present: combine them into one key
            if package_code_raw!='':
                if multi_same_package == False and package not in package_fix2raw:  # remember the raw text behind each normalised key
                    package_fix2raw[package] = package_code_raw
                elif same_package == False:
                    # The normalised key repeats on a non-consecutive row: fall back to raw package text.
                    multi_same_package = True
            if multi_same_package:
                package = package_code_raw
            if package not in prem_dic or not same_package:
                prem_dic[package] = {
                    'code': '',
                    'name': '',
                    'roleList': [],
                    'tendereeMoney': 0,
                    'tendereeMoneyUnit': ""
                }
                # NOTE(review): these two assignments are assumed to sit inside this `if` - confirm.
                prem_dic[package]['code'] = project_code
                prem_dic[package]['name'] = project_name
            if budget_ != "":
                if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税|(六个月|一年|\w{2,3})期加点\d+BP', '', budget_)) > 5:  # more than 5 non-money characters in the money cell: abort
                    prem_dic.pop(package)
                    break
                budget_header = headers['budget'][1] if 'budget' in headers else ''
                if amount_unit!='' and re.search('^[万亿]?元|%|折[\w/]{,6}$', amount_unit) and re.search('元', budget_+budget_header)==None :  # 2024-11-28: price and unit are in separate columns, e.g. case 557953660
                    budget_ += amount_unit
                budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率|期加点\d+BP', budget_)==None else (0, '')
                if re.search('元[/每]', amount_unit) or re.search('单价', budget_header):
                    # A per-unit price is stored separately and not counted as the budget.
                    unit_tendereeMoney = budget
                    budget = 0
                else:
                    unit_tendereeMoney = 0
                if (re.search('费率|下浮率|[%%‰折]|优惠率',
                              budget_header + budget_) and budget < 100) or budget > 50000000000:  # a rate, or more than 50 billion: reset to 0
                    budget = 0
                if budget > 0:
                    if same_package and prem_dic[package]['tendereeMoney'] != budget:  # case 136839070: one package, several items with separate budgets
                        prem_dic[package]['tendereeMoney'] += budget
                    else:
                        prem_dic[package]['tendereeMoney'] = budget
                    prem_dic[package]['tendereeMoneyUnit'] = money_unit
                if unit_tendereeMoney > 0:
                    if 'unit_tendereeMoney' not in prem_dic[package]:
                        prem_dic[package]['unit_tendereeMoney'] = 0
                    if same_package and prem_dic[package]['unit_tendereeMoney'] != unit_tendereeMoney:  # case 136839070: one package, several items with separate budgets
                        prem_dic[package]['unit_tendereeMoney'] += unit_tendereeMoney
                    else:
                        prem_dic[package]['unit_tendereeMoney'] = unit_tendereeMoney
            if tenderee and not same_package:
                prem_dic[package]['roleList'].append({
                    "address": "",
                    "linklist": [],
                    "role_money": {
                        "discount_ratio": "",
                        "downward_floating_ratio": "",
                        "floating_ratio": "",
                        "money": 0,
                        "money_unit": ""
                    },
                    "role_name": "tenderee",
                    "role_text": tenderee,
                    "serviceTime": ""
                })
            if tenderer:
                if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税|(六个月|一年|\w{2,3})期加点\d+BP', '',
                              bid_amount_)) > 5:  # more than 5 non-money characters in the money cell: abort
                    prem_dic.pop(package)
                    break
                bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else ''
                if amount_unit != '' and re.search('^[万亿]?元|%|折[\w/]{,6}$', amount_unit) and bid_amount_!='' and re.search('元',
                                                                                                                        bid_amount_ + bid_amount_header) == None:
                    bid_amount_ += amount_unit
                bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and re.search('[%%‰折]|浮率|期加点\d+BP', bid_amount_)==None and 'bid_amount' in headers else (0, '')
                if re.search('元[/每]', amount_unit) or re.search('单价', bid_amount_header):
                    unit_price = bid_amount
                    bid_amount = 0
                else:
                    unit_price = 0
                if web_source_name == '河钢供应链管理平台' and 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and bid_amount == 0:  # for this source a row only counts as winning when its amount is non-zero
                    if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # package holds only code/name: discard
                        prem_dic.pop(package)
                    continue
                elif 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and have_bid_amount and bid_amount_ in ['/','','0','0.0']:  # some rows have real amounts, so a zero/blank amount means "did not win"
                    if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # package holds only code/name: discard
                        prem_dic.pop(package)
                    continue
                if (re.search('费率|下浮率|[%%‰折]|优惠率',
                              bid_amount_header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000:  # a rate, or more than 50 billion: reset to 0
                    bid_amount = 0
                if serviceTime:
                    # Prefix the cell with its header (adding ':' if missing) before running the service-time extractors.
                    serviceTime_text = headers['serviceTime'][1] + serviceTime if headers['serviceTime'][1][-1] in [':',':'] else headers['serviceTime'][1] + ':' + serviceTime
                    # print('serviceTime_text',serviceTime_text)
                    serviceTime = extract_servicetime(serviceTime_text)
                    serviceTime.sort(key=lambda x:x.get('begin_index',0))
                    serviceTime = extract_serviceTime(serviceTime[0]['body'],"") if serviceTime else ""
                    # print(serviceTime)
                if not same_package or len(prem_dic[package]['roleList'])==0:
                    role_dic = {
                        "address": "",
                        "linklist": [],
                        "role_money": {
                            "discount_ratio": "",
                            "downward_floating_ratio": "",
                            "floating_ratio": "",
                            "money": bid_amount,
                            "money_unit": money_unit
                        },
                        "role_name": "win_tenderer",
                        "role_text": tenderer,
                        "serviceTime": serviceTime
                    }
                    if unit_price > 0:
                        role_dic['role_money']['unit_price'] = unit_price
                    prem_dic[package]['roleList'].append(role_dic)
                elif prem_dic[package]['roleList'] and prem_dic[package]['roleList'][-1].get('role_name', '')=='win_tenderer':
                    # Another winner for the same package: record it on the existing winner entry.
                    if 'multi_winner' not in prem_dic[package]['roleList'][-1]:
                        prem_dic[package]['roleList'][-1]['multi_winner'] = prem_dic[package]['roleList'][-1]['role_text']
                        prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
                    elif tenderer not in prem_dic[package]['roleList'][-1]['multi_winner']:
                        prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
                    # NOTE(review): assumed to belong to this elif branch (additional winners only) - confirm.
                    if bid_amount != 0 or unit_price > 0:  # only stored when there is an amount
                        if 'other_winner_dic' not in prem_dic[package]['roleList'][-1]:
                            prem_dic[package]['roleList'][-1]['other_winner_dic'] = []
                        prem_dic[package]['roleList'][-1]['other_winner_dic'].append({'role_text': tenderer, "money": bid_amount, "money_unit": money_unit, "serviceTime": serviceTime})
                tenderer_list.append(tenderer)
                serviceTime_list.append(serviceTime)
            if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # package holds only code/name: discard it
                prem_dic.pop(package)
                # break  # disabled: in case 400084571 some failed packages would otherwise stop the scan
        if multi_same_package:  # normalised keys collided: switch back to the raw package texts
            for k, v in package_fix2raw.items():
                if k in prem_dic:
                    prem_dic[v] = prem_dic.pop(k)
        if len(tenderer_list)>2 and len(set(tenderer_list))==1 and "package_code" not in headers:  # no package column and a single repeated winner: treat as one package, e.g. cases 244355092, 281854766
            total_money = 0
            for v in prem_dic.values():
                for d in v['roleList']:
                    if d['role_name'] == "win_tenderer":
                        total_money += d['role_money']['money']
                        if 'other_winner_dic' in d:
                            for other in d['other_winner_dic']:
                                total_money += other.get('money', 0)
            return {'自增1': {
                'code': '',
                'name': '',
                'roleList': [{
                    "address": "",
                    "linklist": [],
                    "role_money": {
                        "discount_ratio": "",
                        "downward_floating_ratio": "",
                        "floating_ratio": "",
                        "money": total_money,
                        "money_unit": ''
                    },
                    "role_name": "win_tenderer",
                    "role_text": tenderer_list[0],
                    "serviceTime": serviceTime_list[0]
                }],
                'tendereeMoney': 0,
                'tendereeMoneyUnit': ""
            }}
        return prem_dic
  6811. def update_prem(self, rs_dic, tmp_dic):
  6812. '''
  6813. 合并更新 prem
  6814. :param rs_dic: 返回结果
  6815. :param tmp_dic: 待合并结果
  6816. :return:
  6817. '''
  6818. if '自增1' in tmp_dic and '自增1' not in rs_dic and len(tmp_dic)==len(rs_dic):
  6819. pass
  6820. else:
  6821. for pack in tmp_dic:
  6822. if pack in rs_dic:
  6823. for k in tmp_dic[pack]:
  6824. if rs_dic[pack][k] in ['', 0]:
  6825. rs_dic[pack][k] = tmp_dic[pack][k]
  6826. elif rs_dic[pack][k] == []:
  6827. rs_dic[pack][k] = tmp_dic[pack][k]
  6828. elif k == 'roleList' and len(rs_dic[pack][k])>0 and rs_dic[pack][k][0].get('role_money', {}).get('money', 0) == 0:
  6829. rs_dic[pack][k] = tmp_dic[pack][k]
  6830. else:
  6831. rs_dic[pack] = tmp_dic[pack]
    def get_prem(self, soup, web_source_name='', all_winner=False):
        '''
        Extract package results from every <table> under ``soup``.

        Tables are visited in reverse document order. For each table a header row is
        searched with find_header(); the rows following it are collected and passed to
        extract_from_df(); results are merged with update_prem().

        NOTE(review): the nesting below was reconstructed from a copy of this file whose
        indentation had been lost - confirm block boundaries against the original.

        :param soup: parsed document (BeautifulSoup tag); tables are removed from it while processing
        :param web_source_name: forwarded to extract_from_df
        :param all_winner: forwarded to find_header and extract_from_df
        :return: dict mapping package key -> package dict, merged over all tables
        '''
        tables = soup.find_all('table')
        tables.reverse()
        rs_dic = {}
        for table in tables:
            text = table.text.strip()
            pre_text = ""
            previous = None
            # Text of the element just before the table, used as context for the filters below.
            if table.findPreviousSibling() != None:
                previous = table.findPreviousSibling()
                pre_text = previous.text.strip()
                if pre_text == "" and table.findPreviousSibling().findPreviousSibling() != None:  # the previous tag is empty: look one tag further back
                    previous = table.findPreviousSibling().findPreviousSibling()
                    pre_text = previous.text.strip()
            if re.search('项目业主|业\s*主', text) and re.search('业\s*绩', text+pre_text):  # tables about past performance are removed and not processed
                tb_ex = table.extract()
                if previous:
                    sib = previous.extract()
                continue
            trs = self.tb.table2list(table)
            # table.extract()
            i = 0
            headers = ""
            table_prem = {}
            while i < len(trs) - 1:
                flag_, contain_header_, headers_, not_sure_winner = self.find_header(trs[i], all_winner, first_line=i==0)
                # A winner column taken from fallback wording is accepted only if the preceding text mentions winning.
                if flag_ and 'tenderer' in headers_ and not_sure_winner and re.search('中标|成交|中选|入围|入选', pre_text)==None:
                    # print('filtered:',headers_)
                    flag_ = False
                    headers_ = {}
                if flag_ and headers_ != dict():
                    table_items = []
                    headers = headers_
                    # Collect the data rows that follow header row i.
                    for j in range(i + 1, len(trs)):
                        if len(trs[j]) == len(trs[i]):
                            flag_2, contain_header_2, headers_2, not_sure_winner = self.find_header(trs[j], all_winner)
                            if flag_2 or contain_header_2:
                                if j == i+1 and flag_2:
                                    # Two consecutive header rows: keep the mapping with at least as many columns.
                                    if len(headers_)<=len(headers_2):
                                        headers = headers_2
                                    continue
                                elif trs[i] == trs[j]:  # the same header row repeated inside the table, e.g. case 514890585
                                    continue
                                break
                            elif ''.join(trs[j]).strip() == '':  # completely empty row, e.g. case 514890585
                                continue
                            else:
                                table_items.append(trs[j])
                        else:
                            # print('header and row have different column counts', len(trs[i]), len(trs[j]))
                            break
                    if len(table_items) > 0:
                        df = pd.DataFrame(table_items)
                        prem_ = self.extract_from_df(df, headers, web_source_name, all_winner)
                        # rs_dic.update(prem_)
                        # table_prem.update(prem_)
                        self.update_prem(table_prem, prem_)
                    # Resume scanning at the row where collection stopped (i is incremented just below).
                    i = j - 1
                i += 1
            if table_prem and 'project_code' not in headers and 'package_code' not in headers and '自增1' in table_prem and table.find_previous_sibling():  # no lot column in the table: look for a lot label in the previous sibling
                sib = table.find_previous_sibling()
                sib_text = sib.get_text()
                ser_sib = re.search('第?[0-9一二三四五六七八九十a-zA-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zA-Z]{1,4}|包名:[0-9一二三四五六七八九十]{1,4}', sib_text)
                if sib.name in ['p','div','dl','ol','ul','h1','h2','h3','h4','h5','h6'] and len(sib_text)<100 and ser_sib:
                    package_sib = ser_sib.group(0)
                    package_sib = uniform_package_name(package_sib)
                    table_prem[package_sib] = table_prem.pop('自增1')
            if table_prem:
                # rs_dic.update(table_prem)
                self.update_prem(rs_dic, table_prem)
            # NOTE(review): assumed to run for every processed table, not only those with results - confirm.
            table.extract()
        return rs_dic
  6904. def predict(self, html, nlp_enterprise, web_source_name="", all_winner=False):
  6905. html = re.sub("<html>|</html>|<body>|</body>","",html)
  6906. html = re.sub("##attachment##","",html)
  6907. soup = BeautifulSoup(html, 'lxml')
  6908. richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
  6909. self.nlp_enterprise = nlp_enterprise
  6910. in_attachment = False
  6911. if richText:
  6912. richText = richText.extract() # 过滤掉附件
  6913. del_tabel_achievement(soup) # 20240819 过滤掉业绩表格
  6914. prem = self.get_prem(soup, web_source_name, all_winner)
  6915. if prem == {} and richText:
  6916. del_tabel_achievement(richText) # 20240819 过滤掉业绩表格
  6917. prem = self.get_prem(richText, web_source_name, all_winner)
  6918. in_attachment = True
  6919. if len(prem) == 1: # 只有一个包且包号为1 或 长度大于2 的大概率为自动增加编号包,改为Project
  6920. k = list(prem)[0]
  6921. if k.startswith('自增'):
  6922. prem['Project'] = prem.pop(k)
  6923. return prem, in_attachment
  6924. class CandidateExtractor(object):
    def __init__(self):
        '''Build the per-element header regex table, the non-table candidate regex, the table parser and the known-header set.'''
        # Maps an element name to the regex that find_header() searches for in a header cell.
        # NOTE(review): as written here the 'bid_amount' pattern contains an unmatched ')'
        # ("(中标|成交|合同))?"), which makes re.search raise re.error; presumably the original
        # literal used full-width brackets - confirm against the authoritative copy of this file.
        self.head_rule_dic = {
            'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
            'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)",
            "project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)|^标的$",
            "win_sort": "排名|排序|名次|推荐顺序",
            'win_or_not': '是否(建议|推荐)?(中标|成交)|是否入围|是否入库|入围结论|^选择设备$',  # site-specific wording, e.g. case 577351909: "选择设备" 1 = winning, 0 = not winning
            "candidate": "((候选|入围|入选|投标|应答|响应)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位|^公司名称$|供应商单位名称$",  # added "投标个人/单位" for case 368295593
            "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格",
            "win_tenderer": "第一名|第一(中标|成交)?候选人",
            "second_tenderer": "第二名|第二(中标|成交)?候选人",
            "third_tenderer": "第三名|第三(中标|成交)?候选人",
        }
        # Regex for candidate labels outside tables (the bare string below is the original section marker).
        '''非表格候选人正则'''
        # self.p = '((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|应答人)|(通过)?名单)(名称|名单|全称|\d)?:$'
        self.p = '((候选|入围|入选|投标|报价|成交|中标|中选|供[货应]|应答|响应)(人|方|人?单位|机构|厂?商|商家|服务商|公司|企业)|(通过|入围)名单)(名称|名单|全称|\d)?[是为:]?$'
        self.tb = TableTag2List()
        # Known header texts, unpickled from header_set.pkl next to this module.
        # NOTE(review): pickle.load executes whatever the file encodes - the file must be trusted.
        with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
            self.headerset = pickle.load(f)
    def find_header(self, td_list):
        '''
        Decide whether one table row is a candidate-ranking header row and map its cells to element names.

        NOTE(review): the nesting below was reconstructed from a copy of this file whose
        indentation had been lost - confirm block boundaries against the original.

        :param td_list: cell texts of one table row
        :return: tuple ``(flag, contain_header, header_dic)`` where ``flag`` is True when the
                 row passed the header test, ``contain_header`` is True when it failed that
                 test but still holds known headers, and ``header_dic`` maps element name ->
                 ``(column index, cleaned cell text)``; it is non-empty only when a ranking is
                 available (candidate + ranking column, or first and second candidate columns)
        '''
        fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元|拟|\s', '', it) for it in td_list]  # strip noise from the cells so they can be compared with the known-header set
        header_dic = dict()
        flag = False
        contain_header = False
        # Header test: >= 2 known headers and (>= 60% of distinct cells are known headers, or the model says so).
        if len(set(fix_td_list) & self.headerset)>=2 and (len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6 or is_head_line(fix_td_list)):
            flag = True
            for i in range(len(td_list)) :
                text = td_list[i]
                text = re.sub('\s|[((]排名不分先后[))]', '', text)
                if len(text) > 15:  # cells longer than 15 chars are not treated as headers
                    continue
                if re.search('未(中标|成交)原因', text):  # tables listing reasons for not winning are not extracted
                    return flag, contain_header, dict()
                num = 0  # number of element rules this single cell was assigned to
                for k, v in self.head_rule_dic.items():
                    if k == 'candidate' and re.search('第[一二三]名|第[一二三](中标|成交)?候选人', text):
                        continue
                    if re.search('评分|得分|分数|分值', text):
                        continue
                    if re.search(v, text):
                        if k in ['candidate', 'win_tenderer', 'second_tenderer', 'third_tenderer'] and re.search('是否', text):
                            continue
                        elif k == 'win_or_not' and re.search('是否(中标|成交)候选人', text):  # case 584112560: second candidate was wrongly taken as first
                            continue
                        header_dic[k] = (i, text)
                        # if k != 'candidate':  # candidate may coincide with the top-three columns
                        num += 1
                # Side-by-side first/second candidate columns make a generic candidate column redundant.
                if 'win_tenderer'in header_dic and 'second_tenderer' in header_dic and 'candidate' in header_dic:
                    header_dic.pop('candidate')
                if num>1:
                    # print('header error: one td matched two headers:', header_dic)
                    return flag, contain_header, dict()
                if text == '单位':  # 2024-11-28: a separate column holding the money unit
                    header_dic['amount_unit'] = (i, text)
            if ('candidate' in header_dic and 'win_sort' in header_dic) or ('win_tenderer' in header_dic and 'second_tenderer' in header_dic):  # only headers that carry a ranking are returned for extraction
                return flag, contain_header, header_dic
        elif len(set(fix_td_list) & self.headerset) >= 2 or (len(set(fix_td_list)) == 2 and len(set(fix_td_list) & self.headerset) >= 1):  # at least two known headers, or a two-column row containing one known header
            contain_header = True
        return flag, contain_header, dict()
  6985. def is_role(self, text):
  6986. if len(text) > 25 or len(text) < 4:
  6987. return False
  6988. elif len(re.findall('有限责?任?公司', text)) > 1:
  6989. return False
  6990. elif re.search('[\w()]{4,}(有限责?任?公司|学校|学院|大学|中学|小学|医院|管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园|厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处)$', text):
  6991. return True
  6992. else:
  6993. ners = selffool.ner(text)
  6994. if len(ners[0]) == 1 and ('company' in ners[0][0] or 'org' in ners[0][0]):
  6995. return True
  6996. return False
  6997. def get_role(self, text, nlp_enterprise):
  6998. '''
  6999. 获取字符串text角色实体
  7000. :param text: 待获取实体字符串
  7001. :param nlp_enterprise: 公告中的角色实体列表
  7002. :return:
  7003. '''
  7004. text = re.sub('主报名人:|联合报名人:|联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]|(联合体(牵头|成员)单位)'
  7005. , ',', text)
  7006. text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
  7007. text = re.sub('[一二三四五六七八九十]+标段[::]|标段[一二三四五六七八九十]+[::]|第[一二三四五六七八九十]+名[::]', '',
  7008. text) # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
  7009. text = re.sub('1[3-9]\d{9}|\d{3}-\d{8}|\d{4}-\d{7}', '', text) # 2024/4/23 去除电话
  7010. if text in nlp_enterprise:
  7011. return text
  7012. if len(text) > 50 or len(text)<4:
  7013. return ''
  7014. ners = getNers([text], useselffool=True)
  7015. roles = []
  7016. if ners:
  7017. for ner in ners[0]:
  7018. if ner[2] in ['org', 'company', 'location']:
  7019. roles.append(ner[3])
  7020. if roles and len(''.join(roles)) > len(text)*0.8:
  7021. return roles[0]
  7022. else:
  7023. return ''
    def extract_from_df(self, df, headers):
        """Extract bidder roles and bid amounts from the body rows of one table.

        :param df: DataFrame holding the table body; cells are read with
            ``df.loc[row, column_index]`` and handled as strings.
        :param headers: mapping of recognised field name (``package_code``,
            ``project_code``, ``candidate``, ``bid_amount`` ...) to a sequence
            whose item ``[0]`` is the column index inside ``df``; for
            ``bid_amount`` item ``[1]`` is used as the header text.
        :return: ``(prem_dic, candidate_set)``. ``prem_dic`` maps a package
            name to a dict with keys ``code``, ``name``, ``roleList``,
            ``tendereeMoney`` and ``tendereeMoneyUnit``; ``candidate_set``
            collects every candidate name recognised by ``self.get_role``.
        """
        # NOTE(review): the nesting below was rebuilt from statement order; confirm the
        # placement of the `break` checks after `line_num += 1` against the original file.
        prem_dic = {}
        link_set = set()  # row signatures already processed, used to skip duplicate rows
        candidate_set = set()
        role_dic = dict() # roles found when first/second/third candidates sit side by side in columns
        findtop3 = False  # a row holding the side-by-side candidate names was seen
        findmoney = False  # a row holding the side-by-side bid prices was seen
        line_num = 0  # side-by-side rows that were neither a name row nor a price row
        line_package = None  # package title taken from a row whose cells are all identical
        package_flag = 0  # 0: no package column, 1: package column, 2/3: prefix with project code/name
        if "package_code" in headers:
            package_flag = 1
            if len(df)!=len(set(df[headers["package_code"][0]])): # package codes repeat: check whether they must be combined with another field
                if "project_code" in headers and df[headers["project_code"][0]][0] != df[headers["package_code"][0]][0]:
                    package_flag = 2
                elif "project_name" in headers and find_package(df[headers["package_code"][0]][0]):
                    package_flag = 3
        for i in df.index:
            # Read every recognised column of this row; missing columns default to "".
            package_code_raw = df.loc[i, headers['package_code'][0]].strip() if "package_code" in headers else ""
            project_code = df.loc[i, headers['project_code'][0]].strip() if "project_code" in headers else ""
            project_name = df.loc[i, headers['project_name'][0]].strip() if "project_name" in headers else ""
            candidate_ = df.loc[i, headers['candidate'][0]].strip() if "candidate" in headers else ""
            win_or_not = df.loc[i, headers['win_or_not'][0]].strip() if "win_or_not" in headers else ""
            # budget_ = df.loc[i, headers['budget'][0]] if "budget" in headers else ""
            bid_amount_ = df.loc[i, headers['bid_amount'][0]].strip() if "bid_amount" in headers else ""
            win_sort = df.loc[i, headers['win_sort'][0]].strip() if "win_sort" in headers else ""
            win_tenderer = df.loc[i, headers['win_tenderer'][0]].strip() if "win_tenderer" in headers else ""
            second_tenderer = df.loc[i, headers['second_tenderer'][0]].strip() if "second_tenderer" in headers else ""
            third_tenderer = df.loc[i, headers['third_tenderer'][0]].strip() if "third_tenderer" in headers else ""
            amount_unit = df.loc[i, headers['amount_unit'][0]].strip() if "amount_unit" in headers else ""
            if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # row contains header text: stop matching; win_sort is left out to avoid missing doc 367940050
                # print('包含表头, 停止匹配')
                break
            if len(set([package_code_raw, candidate_,win_sort, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) - set(['', ' '])) < 2: # everything empty or identical: stop matching
                # print('全部为空或内容一样 停止匹配')
                # A row whose cells are all the same package title is remembered and applied to later rows.
                if len(set(df.loc[i,:]))==1 and re.search('^第?([一二三四五六七八九十]{1,3}|[a-zA-Z0-9-]{,9})?[分子]?(标[段包项]?|包[组件标]?|合同[包段])([一二三四五六七八九十]{1,3}|[a-zA-Z0-9-]{,9})?$', win_sort):
                    line_package = win_sort
                    continue
                else:
                    break
            if candidate_ != "" and win_sort == "" and headers['candidate'][0] > 0: # some headers carry no ranking column; the rank sits in the column before the candidate
                col_indx = headers['candidate'][0] -1
                pre_col = df.loc[i, col_indx]
                if col_indx > 0 and pre_col == candidate_:
                    # The previous cell repeats the candidate, so look one column further left.
                    pre_col = df.loc[i, col_indx - 1]
                if re.search('第[一二三]名|第[一二三](中标)?候选人', pre_col):
                    win_sort = pre_col
            package_code = package_code_raw
            if package_code == '' and line_package:
                package_code = line_package
            # candidate = candidate_ if self.is_role(candidate_) else ""
            # tenderer = tenderer if self.is_role(tenderer) else ""
            candidate = self.get_role(candidate_, self.nlp_enterprise)
            # if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
            # break
            if(candidate_,win_tenderer, second_tenderer,third_tenderer, bid_amount_,package_code,project_code,win_sort) in link_set:
                continue
            link_set.add((candidate_, win_tenderer, second_tenderer, third_tenderer, bid_amount_,package_code,project_code,win_sort))
            package = package_code
            if package == "" and project_code != "": # fix for doc 395747178: several projects but only one was extracted
                package = project_code
            package = uniform_package_name(package) if package !="" else "Project"
            if package_flag == 2 and project_code != "":
                project_code_pk = uniform_package_name(project_code)
                package = "%s_%s"%(project_code_pk, package)
            elif package_flag == 3 and project_name != "":
                # Only the first package mention found in the project name is used as prefix.
                for iter in find_package(project_name):
                    project_name_pk = uniform_package_name(iter.group(0))
                    package = "%s_%s"%(project_name_pk, package)
                    break
            if candidate:
                # Both branches currently record the candidate; the distinction is kept for the disabled rule below.
                if win_or_not and re.search('否|未入围', win_or_not):
                    candidate_set.add(candidate)
                # elif re.search('^((建议|推荐)(中标|成交)|是)$', win_or_not) and win_sort in ['', '参与投标单位及排名'] and win_tenderer=='':
                # win_sort = '第一名'
                # candidate_set.add(candidate)
                else:
                    candidate_set.add(candidate)
            if win_tenderer and second_tenderer: # third_tenderer is not required: doc 128778062 lists only first and second candidates
                # Side-by-side layout: the first or second cell labels the row as names or as prices.
                if re.search("(候选人|投标人|单位|公司)名?称?$", df.loc[i, 0]) or re.search("(候选人|投标人|单位|公司)名?称?", df.loc[i, 1]):
                    findtop3 = True
                    for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
                                          [win_tenderer, second_tenderer, third_tenderer]):
                        text = self.get_role(text, self.nlp_enterprise)
                        if text:
                            # if self.is_role(text):
                            if type not in role_dic:
                                role_dic[type] = dict()
                            role_dic[type]['role_text'] = text
                            candidate_set.add(text)
                elif re.search('投标报价|报价$', df.loc[i, 0]) or re.search('投标报价|报价$', df.loc[i, 1]):
                    findmoney = True
                    header = df.loc[i, 0] if re.search('投标报价|报价$', df.loc[i, 0]) else df.loc[i, 1]
                    for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
                                          [win_tenderer, second_tenderer, third_tenderer]):
                        if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '',
                                      text)) > 5: # more than 5 non-amount characters in the amount cell: abort
                            break
                        if amount_unit != '' and re.search('^[万亿]?元|%|折[\w/]{,6}$', amount_unit) and re.search('元', text+header)==None: # append the amount unit that lives in its own column
                            text += amount_unit
                        money, money_unit = money_process(text, header)
                        if re.search('元[/每]', amount_unit) or re.search('单价', header):
                            # The value is a unit price, not a total.
                            unit_price = money
                            money = 0
                        else:
                            unit_price = 0
                        if (re.search('费率|下浮率|[%%‰折]|优惠率', header+text) and money < 100) or money > 50000000000: # rates, or amounts above 50 billion, are reset to 0
                            money = 0
                        if money > 0:
                            if type not in role_dic:
                                role_dic[type] = dict()
                            role_dic[type]['money'] = money
                            role_dic[type]['money_unit'] = money_unit
                        if unit_price > 0:
                            if type not in role_dic:
                                role_dic[type] = dict()
                            role_dic[type]['unit_price'] = unit_price
                            role_dic[type]['money_unit'] = money_unit
                else:
                    line_num += 1
                if findtop3 and findmoney:
                    break
                if line_num > 3:
                    break
            elif candidate and win_sort:
                # One candidate per row: map the rank text to a role name.
                role_type = ""
                if re.search('第[一1]|^[一1]$', win_sort):
                    role_type = "win_tenderer"
                    if win_or_not in ['否', '未中标', '0']: # fix for a site-specific wording (doc 577351909): a selection flag of 0 means not awarded
                        role_type = ''
                elif re.search('第[二2]|^[二2]$', win_sort):
                    role_type = "second_tenderer"
                    if win_or_not in ['是', '1']:
                        role_type = "win_tenderer"
                elif re.search('第[三3]|^[三3]$', win_sort):
                    role_type = "third_tenderer"
                if role_type != "":
                    if package not in prem_dic:
                        prem_dic[package] = {
                            'code': '',
                            'name': '',
                            'roleList': [],
                            'tendereeMoney': 0,
                            'tendereeMoneyUnit': ""
                        }
                    prem_dic[package]['code'] = project_code
                    prem_dic[package]['name'] = project_name
                    if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '', bid_amount_))> 5: # more than 5 non-amount characters in the amount cell: abort
                        break
                    header = headers['bid_amount'][1] if "bid_amount" in headers else ''
                    if amount_unit != '' and re.search('^[万亿]?元|%|折[\w/]{,6}$', amount_unit) and re.search('元',
                                                                                                            bid_amount_ + header) == None: # append the amount unit that lives in its own column
                        bid_amount_ += amount_unit
                    bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if "bid_amount" in headers else (0, "")
                    if re.search('元[/每]', amount_unit) or re.search('单价', header):
                        unit_price = bid_amount
                        bid_amount = 0
                    else:
                        unit_price = 0
                    if (re.search('费率|下浮率|[%%‰折]|优惠率',
                                  header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000: # rates, or amounts above 50 billion, are reset to 0
                        bid_amount = 0
                    tmp_role_dic = {
                        "address": "",
                        "linklist": [],
                        "role_money": {
                            "discount_ratio": "",
                            "downward_floating_ratio": "",
                            "floating_ratio": "",
                            "money": bid_amount,
                            "money_unit": money_unit
                        },
                        "role_name": role_type,
                        "role_text": candidate,
                        "serviceTime": ""
                    }
                    if unit_price > 0:
                        tmp_role_dic['role_money']['unit_price'] = unit_price
                    prem_dic[package]['roleList'].append(tmp_role_dic)
                    if len(prem_dic[package]['roleList']) == 0: # drop entries that only carry project code and name
                        prem_dic.pop(package)
        if role_dic and prem_dic == dict():
            # Nothing came from the row-per-candidate layout: fall back to the side-by-side roles,
            # filed under the package name computed for the last processed row.
            if package not in prem_dic:
                prem_dic[package] = {
                    'code': '',
                    'name': '',
                    'roleList': [],
                    'tendereeMoney': 0,
                    'tendereeMoneyUnit': ""
                }
            for role_type, v in role_dic.items():
                role_text = v.get('role_text', '')
                if role_text == "":
                    continue
                money = v.get('money', 0)
                money_unit = v.get('money_unit', '')
                prem_dic[package]['roleList'].append({
                    "address": "",
                    "linklist": [],
                    "role_money": {
                        "discount_ratio": "",
                        "downward_floating_ratio": "",
                        "floating_ratio": "",
                        "money": money,
                        "money_unit": money_unit
                    },
                    "role_name": role_type,
                    "role_text": role_text,
                    "serviceTime": ""
                })
            if len(prem_dic[package]['roleList']) == 0: # drop entries that only carry project code and name
                prem_dic.pop(package)
        return prem_dic, candidate_set
    def get_prem(self, soup):
        """Collect package/role information from every ``<table>`` under ``soup``.

        Tables are visited in reverse document order. Each table is turned into
        a list of rows with ``self.tb.table2list``; every header row found by
        ``self.find_header`` starts a section whose following rows are handed
        to ``self.extract_from_df``. Each visited table is removed from the
        tree afterwards.

        :param soup: parsed HTML node offering ``find_all``.
        :return: ``(rs_dic, candidate_set)`` merged over all tables.
        """
        # NOTE(review): nesting was rebuilt from statement order; confirm the level of
        # `i = j - 1` and `table.extract()` against the original file.
        tables = soup.find_all('table')
        tables.reverse()
        rs_dic = {}
        candidate_set = set()
        for table in tables:
            trs = self.tb.table2list(table)
            i = 0
            headers = ""
            while i < len(trs) - 1:
                flag_, contain_header_, headers_ = self.find_header(trs[i])
                if flag_ and headers_ != dict():
                    table_items = []
                    headers = headers_
                    # Gather body rows until the next header-like row or a row with a different column count.
                    for j in range(i + 1, len(trs)):
                        if len(trs[j]) == len(trs[i]):
                            flag_, contain_header_, headers_ = self.find_header(trs[j])
                            if flag_ or contain_header_:
                                break
                            else:
                                table_items.append(trs[j])
                        else:
                            # print('表头,内容 列数不一致', len(trs[i]), len(trs[j]))
                            break
                    if len(table_items) >= 1:
                        df = pd.DataFrame(table_items)
                        prem_, candidate_set_ = self.extract_from_df(df, headers)
                        # print('prem_: ', prem_)
                        rs_dic.update(prem_)
                        candidate_set.update(candidate_set_)
                    # Resume scanning at row j (the `i += 1` below brings i back to j).
                    i = j - 1
                i += 1
            if rs_dic and 'package_code' not in headers and 'Project' in rs_dic and table.find_previous_sibling(): # table without a package column: look for the package name in the previous sibling tag
                sib = table.find_previous_sibling()
                sib_text = sib.get_text()
                ser_sib = re.search('第?[0-9一二三四五六七八九十a-zA-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zA-Z]{1,4}|包名:[0-9一二三四五六七八九十]{1,4}', sib_text)
                if sib.name in ['p', 'div'] and len(sib_text)<100 and ser_sib:
                    # Re-key the anonymous 'Project' entry under the package named just above the table.
                    package_sib = ser_sib.group(0)
                    package_sib = uniform_package_name(package_sib)
                    rs_dic[package_sib] = rs_dic.pop('Project')
            table.extract()
        return rs_dic, candidate_set
  7279. def get_candidates_from_text(self, list_sentences, list_entitys):
  7280. candidates = set()
  7281. tenderee_or_agency = set()
  7282. sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
  7283. for ent in list_entitys[0]:
  7284. if ent.entity_type in ['org', 'company']:
  7285. sen_index = ent.sentence_index
  7286. text = sentences[sen_index].sentence_text
  7287. b = ent.wordOffset_begin
  7288. e = ent.wordOffset_end
  7289. if ent.label in [2,3,4]: # 直接加实体预测的候选人, 否则规则检查是否为候选人
  7290. candidates.add(ent.entity_text)
  7291. elif isinstance(b, int) and isinstance(e, int) and ent.label in [5]:
  7292. foreword = text[max(0, b - 10):b]
  7293. if re.search(self.p, foreword):
  7294. candidates.add(ent.entity_text)
  7295. if ent.label in [0, 1] and ent.values[ent.label]>0.5:
  7296. tenderee_or_agency.add(ent.entity_text)
  7297. candidates -= tenderee_or_agency # 2024/05/10 463166661 把 四川省第二中医医院作为候选人 过滤掉为招标或代理角色 的候选人
  7298. return candidates
  7299. def predict(self, html, list_sentences, list_entitys, nlp_enterprise):
  7300. self.nlp_enterprise = nlp_enterprise
  7301. html = html.replace('比选申请单位', '中标候选人') # 82347769
  7302. html = re.sub("<html>|</html>|<body>|</body>","",html)
  7303. html = re.sub("##attachment##","",html)
  7304. soup = BeautifulSoup(html, 'lxml')
  7305. richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
  7306. in_attachment = False
  7307. if richText:
  7308. richText = richText.extract() # 过滤掉附件
  7309. del_tabel_achievement(soup) # 20240819 过滤掉业绩表格 例:500817166
  7310. prem, candidate_set = self.get_prem(soup)
  7311. if prem == {} and richText:
  7312. del_tabel_achievement(richText) # 20240819 过滤掉业绩表格
  7313. prem, candidate_set = self.get_prem(richText)
  7314. in_attachment = True
  7315. candidate_set2 = self.get_candidates_from_text(list_sentences, list_entitys)
  7316. candidate_set.update(candidate_set2)
  7317. return prem, {'candidate': ','.join(candidate_set)}, in_attachment
  7318. def role_special_predictor(web_source_name, content, nlp_enterprise):
  7319. if web_source_name == '中国电子科技集团有限公司电子采购平台':
  7320. ser = re.search(',(\w{5,30}),发布时间:\d+', content)
  7321. if ser and ser.group(1) in nlp_enterprise:
  7322. return ser.group(1)
  7323. elif web_source_name == '高校仪器设备竞价网':
  7324. ser = re.search('--(\w{5,30}),申购单主题', content)
  7325. if ser and ser.group(1) in nlp_enterprise:
  7326. return ser.group(1)
  7327. elif web_source_name == '台泥阳光采购平台':
  7328. ser = re.search(',(\w{5,30})招标公告,', content)
  7329. if ser and ser.group(1) in nlp_enterprise:
  7330. return ser.group(1)
  7331. class WebsourceTenderee():
  7332. def __init__(self):
  7333. with open(os.path.dirname(__file__)+'/websource_tenderee.pkl', 'r', encoding='utf-8') as f:
  7334. self.webno2ree = json.load(f)
  7335. def get_websource_tenderee(self, web_source_no, web_source_name, prem):
  7336. '''
  7337. 通过数据源唯一招标人召回调整prem中的招标人,
  7338. :param web_source_no:
  7339. :param prem:
  7340. :return:
  7341. '''
  7342. p = '(医院|学院|学校|中学|小学|大学|幼儿园|保健院|党校|银行|研究院|血站|红十字会|防治院|研究所)'
  7343. web_ree = self.webno2ree.get(web_source_no, '')
  7344. if web_source_no.startswith('18591-') and web_ree == "":
  7345. web_ree = '中国人民解放军总医院'
  7346. elif web_source_no.startswith('Y00484-') and web_ree == "":
  7347. web_ree = '航空总医院'
  7348. if web_ree == "" and re.search('\w{2,8}(大学|医院)$', web_source_name): # 20240524 大学、医院类站源没唯一招标人默认为站源名称
  7349. web_ree = web_source_name
  7350. if web_ree != '':
  7351. if 'Project' in prem[0]['prem']:
  7352. find_tenderee = False
  7353. for d in prem[0]['prem']['Project']['roleList']:
  7354. if d['role_name'] == 'tenderee':
  7355. find_tenderee = True
  7356. if d['role_text'] == "":
  7357. d['role_text'] = web_ree
  7358. elif re.search('大学$', web_ree) and re.search('学院$', d['role_text']) and web_ree not in d['role_text']:
  7359. d['role_text'] = web_ree
  7360. elif d.get('role_prob', 0) < 0.8 and get_business_data(d['role_text'])[0] == False: # 20240201 概率低于0.8且没有工商数据的替换为站源招标人
  7361. d['role_text'] = web_ree
  7362. # elif re.search(p, web_ree) and (re.search(p, d['role_text'])==None and len(d['role_text'])<6): # 数据源唯一招标人以医院等结尾,角色中无相关关键词的,替换为数据源招标人
  7363. # d['role_text'] = web_ree
  7364. # elif re.search('有限(责任)?公司', web_ree) and (re.search('有限(责任)?公司', d['role_text'])==None and len(d['role_text'])<6):
  7365. # d['role_text'] = web_ree
  7366. break
  7367. if not find_tenderee: # 没招标人的添加
  7368. prem[0]['prem']['Project']['roleList'].append({'role_name': 'tenderee',
  7369. 'role_text': '%s' % web_ree,
  7370. 'role_money': {'money': 0, 'money_unit': '',
  7371. 'floating_ratio': '',
  7372. 'downward_floating_ratio': '',
  7373. 'discount_ratio': ''},
  7374. 'linklist': [],
  7375. 'serviceTime': '',
  7376. 'address': ''})
  7377. else:
  7378. prem[0]['prem']['Project'] = {'code': '',
  7379. 'tendereeMoney': 0,
  7380. 'roleList': [
  7381. {'role_name': 'tenderee',
  7382. 'role_text': '%s' % web_ree,
  7383. 'role_money': {'money': 0, 'money_unit': '', 'floating_ratio': '',
  7384. 'downward_floating_ratio': '', 'discount_ratio': ''},
  7385. 'linklist': [],
  7386. 'serviceTime': '',
  7387. 'address': ''}
  7388. ]}
  7389. tenderee_l = [d2['role_text'] for v in prem[0]['prem'].values() for d2 in v['roleList'] if
  7390. d2['role_name'] == 'tenderee']
  7391. winner_l = [d2['role_text'] for v in prem[0]['prem'].values() for d2 in v['roleList'] if
  7392. d2['role_name'] == 'win_tenderer']
  7393. if set(tenderee_l) & set(winner_l) and web_ree in tenderee_l: # 删除与站源招标人冲突的中标人
  7394. for k in prem[0]['prem']:
  7395. prem[0]['prem'][k]['roleList'] = [d for d in prem[0]['prem'][k]['roleList'] if
  7396. not (d['role_name'] == 'win_tenderer' and d['role_text'] in tenderee_l)]
  7397. return prem
  7398. def get_header_line(list_item):
  7399. '''
  7400. 判断列表内文本哪些是表头,哪些不是
  7401. :param list_item: [ '批复结果', '许可/同意', '批复文号',]
  7402. :return:
  7403. '''
  7404. rs = []
  7405. x = []
  7406. for item in list_item:
  7407. x.append(getPredictor("form").encode(item))
  7408. predict_y = getPredictor("form").predict(np.array(x), type="item")
  7409. for item, values in zip(list_item, list(predict_y)):
  7410. item = str(item).replace(' ', '')
  7411. lb = 1 if values[1] > 0.5 else 0
  7412. if item in ['许可/同意', '办结(通过)', '办结(准予许可)','批准', '合格', '民间投资', '备案']:
  7413. lb = 0
  7414. elif item in ['环境影响评价机构', '建设单位或地方政府作出的相关环保承诺', '环境影响评价技术服务机构', '报告全本'] or re.search('^比例\d{1,2}%$', item):
  7415. lb = 1
  7416. elif lb == 0 and item in header_set:
  7417. lb = 1
  7418. rs.append(lb)
  7419. return rs
  7420. class ApprovalPredictor():
    def __init__(self):
        """Set up the rule tables used to extract approval-project fields.

        ``other_part`` holds full extraction patterns whose named group
        ``main`` is the value. The remaining ``*_type`` tables hold the label
        wording expected in front of an entity of the matching kind.
        ``head_rule_dic`` merges all label patterns (for ``other_part`` only
        the text before the first ``:``) for matching table headers.
        """
        self.other_part = {
            "project_name": "((项目|工程|采购|招标|计划|建设|规划)名称?|生产建设项目|申请项目):(?P<main>[^:。]{5,50})[,。]([\w()]{2,15}:|$)?", # project name
            "project_code": "(立案号|项目(统一)?代码|(项目|工程|采购|招标|计划|任务|备案|索引)([编代][号码]|号)):?(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-]{5,30}号?)([\w()]{2,15}:|$)?", # project code
            # NOTE(review): as written this pattern has one more ')' than '(' (see "收件号))?");
            # possibly a full-width closing parenthesis was intended -- confirm against the original file.
            "doc_num": "((环评|\w{,3})(审[批查核]|批[复准]|立项|[定知文]书|[公发批]文|用地|决定|备案|核准|许可|确认|受理|申请报告|文[件书]|意见书|办件)[文编证]?号|综合受理号|文书?号|合格书号|申报号|(办件|事项)[编代][号码]|收件号))?为?:?(?P<main>[()〔〕【】\[\]0-9]{,8}([\w()〔〕【】]{2,15})?[()〔〕【】\[\]a-zA-Z0-9-.]{3,30}号?)[,。]?([\w()]{2,15}:|$)?", # document number
            "pro_type": "((申[报请]|审核备|项目|立项)(类型|种类)|项目所属行业|行业(分类|归属)|产业领域|项目行业):(?P<main>[^:。]{2,30})[,。]([\w()]{2,15}:|$)?", # project type
            "year_limit": "((建设|工程|服务|项目)(起止|\w{,2})?(年限|期限|时长|工期)):(约|超过|大概|建设工期|共计|合计)?(?P<main>[\d一二三四五六七八九十]+个月|\d{1,3}(日?历?天|小时)|20\d{2}[年/-](\d{1,2}[月/-]?)?(\d{1,2}日?)?([至—-]+20\d{2}[年/-](\d{1,2}[月/-]?)?(\d{1,2}日?)?)?)[(,。]([\w()]{2,15}:|$)?", # construction period
            # NOTE(review): as written the parentheses of this pattern do not balance either
            # (one extra ')'); a full-width parenthesis may have been lost -- confirm.
            "construction_scale": "([\d一二三四五六七八九十]{1,2}、|([\d一二三四五六七八九十]{1,2}))?(工程|项目|\w{,4})?((建设内容[及和](建设)?规模|建设规模[及和](主要)?(建设)?内容|(建设|招标|采购))?内容|(建设|工程|项目)(主要)?(规模|内容|概况|面积)([及和](主要)?(规模|内容|概况|面积))?(如下|为)?)|^规模(情况)?):(?P<main>[^:。]{2,500})[,。]?([\w()]{2,30}:|$)?", # construction scale; doc 56924861: "主要环境影响及预防或者减轻不良环境影响的对策和措施:"
            "approval_items": "((审[批查核]|批[复准]|申请|监管|受理)(事项|内容|名称)|事项名称|事项审批):(?P<main>[^:。]{2,150})[,。]([\w()]{2,15}:|$)?", # approval item
            "properties": "((建设|工程|项目)性质):(?P<main>[^:。]{2,50})[,。]([\w()]{2,15}:|$)?", # construction nature
            "approval_result": "((审[批查核]|批[复准]|核[发准]|许可|抽查|备案)(结果|决定|结论|状态|回复|意见)|(办[理件]|,)(状态|意见|结果)|项目(当前|目前)?状态):(?P<main>[^:。]{2,20})[,。]([\w()]{2,15}:|$)?", # approval result
            "phone": "(联系)?电话:(?P<main>1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|" # contact phone; the pieces below are concatenated into one pattern
                     '\+86.?1[3-9]\d{9}|'
                     '0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?[-—-―]\d{1,4}|'
                     '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=1[3-9]\d{9})|'
                     '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?)|'
                     '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=[2-9]\d{6,7})|'
                     '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?|'
                     '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?[2-9]\d{6}\d?-?\d{,4}|'
                     '400\d{7}转\d{1,4}|'
                     '[2-9]\d{6,7})[,。]([\w()]{2,15}:|$)?'
        }
        # Label wording expected in front of an organisation entity, per role.
        self.role_type = {
            "declare_company": "(申[请报]|填报|呈报)(人|部门|机关|单位|企业|公司|机构|组织)", # declaring unit
            # NOTE(review): "询价))?" leaves an unmatched ')' as written; possibly an optional
            # full-width ')' was intended (cf. "项目(法人)单位") -- confirm.
            "construct_company": "(业主|建设|用地|委托|发包|产权|项目|法人|采购|招标|询价))?(部门|机关|单位|企业|公司|方|业主|人)|主送机关|法人单位|甲方", # construction unit
            # NOTE(review): "承办))?" leaves an unmatched ')' as written -- confirm as above.
            "approver": "(审[批查核议图]|许可|批[复准](用地)?|发证|管理|办理|受理|核[发准]|备案|承办))?(部门|机关|单位|企业|公司|机构)|实施主体", # approving department
            "evaluation_agency": "(环境|环保)?(影响)?(环评|评价|评估)(机构|单位|公司)" , # environmental assessment agency
            "compilation_unit": "编制单位", # compiling unit, added 2024-07-01
            "publisher": "(发布|发文|公示|公告)(人|部门|机关|单位|企业|公司|机构|组织)" # publishing body, added 2024-07-03
        }
        # Label wording expected in front of a person entity.
        self.person_type = {
            "legal_person": "项目法人|法定代表人|企业法人" # project legal person
        }
        # Label wording expected in front of a time entity.
        self.date_type = {
            "time_declare": "(申[请报]|填报|呈报)(时间|日期)", # declaration time
            "time_commencement": "(开工|动工|(项目|建设|工程|施工)开始)(时间|日期)", # commencement time
            "time_completion": "(竣工|完工|验收|(项目|建设|工程|施工)(完成|结束))(备案)?(时间|日期)", # completion time
            "time_approval": "(审[批查核查议]|许可|批[复准](用地)?|发证|管理|办理|受理|核[发准]|备案|决定)(时间|日期)", # approval time, added 2024-07-01
            "time_release": "(发布|发文|公告|生成|成文)(时间|日期)" # release time
        }
        # Label wording expected in front of a location entity.
        self.addr_type = {
            "project_addr": "((建设|工程|项目|施工|地块|用地)\w{,2}(地址|地点|位置|所在地)|[宗土]地坐落)" # construction address
        }
        # Label wording expected in front of a money entity.
        self.money_type = {
            "total_tendereeMoney": "(项目|概算|投资)金额|项目投资|总投资|总预算|总概算|投资(规模|总额|估算|概算)|批复概算|投资额|项目概算", # total investment
        }
        self.head_rule_dic = {**self.role_type, **self.person_type, **self.date_type, **self.addr_type, **self.money_type}
        # For the full extraction patterns only the label part (text before the first ':') is a header rule.
        self.head_rule_dic.update({k: v.split(':')[0] for k,v in self.other_part.items()})
        self.tb = TableTag2List()
  7473. def recursive_text(self, tag):
  7474. '''
  7475. 递归获取 soup 节点文本
  7476. :param tag:
  7477. :return:
  7478. '''
  7479. texts = []
  7480. for child in tag.children:
  7481. if child.name:
  7482. if child.name in ['p'] and len(child.find_all('br'))>2:
  7483. texts.extend(self.recursive_text(child))
  7484. if child.name in ["td", "th", "p", "li", "h1", "h2", "h3", "h4", "h5",
  7485. "h6"] and child.get_text().strip():
  7486. texts.append(re.sub('\s', '', child.get_text().strip().replace(':', ':').replace('(', '(').replace(')', ')')))
  7487. else:
  7488. texts.extend(self.recursive_text(child))
  7489. else:
  7490. if child.strip():
  7491. texts.append(re.sub('\s', '', child.strip().replace(':', ':').replace('(', '(').replace(')', ')')))
  7492. return texts
  7493. def get_table_info(self, df, nlp_enterprise):
  7494. def get_header_index(datas):
  7495. '''
  7496. 根据表格表头判断结果0/1 得到哪些行和列是表头
  7497. :param datas: 表格内容表头判断结果数据[[1,1,1,1],[0,0,0,0]]
  7498. :return: 表头所在的行和列序号
  7499. '''
  7500. header_row = []
  7501. header_col = []
  7502. df_h = pd.DataFrame(datas) # 表头判断数据 , columns=columns
  7503. for i in df_h.index:
  7504. line = df_h.loc[i].values
  7505. if sum(line) == len(line):
  7506. header_row.append((i, sum(line) / len(line)))
  7507. elif sum(line) / len(line) > 0.8:
  7508. header_row.append((i, sum(line) / len(line)))
  7509. elif len(line) > 3 and len(re.findall('11', ''.join([str(it) for it in line]))) > len(
  7510. re.findall('10', ''.join([str(it) for it in line]))):
  7511. header_row.append((i, sum(line) / len(line)))
  7512. for i in df_h.columns:
  7513. col = df_h[i].values
  7514. if sum(col) == len(col):
  7515. header_col.append((i, sum(col) / len(col)))
  7516. elif sum(col) / len(col) > 0.8:
  7517. header_col.append((i, sum(col) / len(col)))
  7518. elif len(col) > 3 and len(re.findall('11', ''.join([str(it) for it in line]))) > len(
  7519. re.findall('10', ''.join([str(it) for it in line]))):
  7520. header_col.append((i, sum(col) / len(col)))
  7521. return header_row, header_col
  7522. def get_header(l, head_rule_dic):
  7523. header_dic = {}
  7524. for i in range(len(l)):
  7525. text = l[i].replace(' ', '') # 修复54969575 项目 名称 被空格分割
  7526. num = 0
  7527. tmp_dic = {}
  7528. for k, v in head_rule_dic.items():
  7529. if re.search(v, text):
  7530. tmp_dic[k] = i
  7531. num += 1
  7532. for k, v in tmp_dic.items():
  7533. if k not in header_dic:
  7534. header_dic[k] = v
  7535. return header_dic
  7536. result_l = []
  7537. datas = []
  7538. for i in df.index:
  7539. line = get_header_line(df.loc[i].values)
  7540. datas.append(line)
  7541. header_row, header_col = get_header_index(datas)
  7542. if len(header_col) == 1 and header_col[0][0] > 1: # 列表头不可能在第1列后面开始
  7543. header_col = []
  7544. if len(header_row) >= 1 and len(header_col) == 0: # 有行表头无列表头
  7545. i = 0
  7546. while i < len(header_row):
  7547. idx, ratio = header_row[i]
  7548. if idx + 1 >= len(df):
  7549. break
  7550. header_dic = get_header(df.loc[idx].values, self.head_rule_dic)
  7551. i += 1
  7552. range_from = idx + 1
  7553. range_to = len(df)
  7554. if i < len(header_row):
  7555. next_header = i
  7556. for j in range(i, len(header_row)):
  7557. idx2, ratio2 = header_row[j]
  7558. if idx2 - idx == 1:
  7559. header_dic2 = get_header(df.loc[idx2].values, self.head_rule_dic)
  7560. if set(df.loc[idx].values) & set(df.loc[idx2].values) != set():
  7561. header_dic.update(header_dic2)
  7562. else:
  7563. header_dic = header_dic2
  7564. range_from = idx2 + 1
  7565. range_to = len(df)
  7566. next_header = j + 1
  7567. idx = idx2
  7568. else:
  7569. range_from = idx + 1
  7570. range_to = idx2
  7571. next_header = j
  7572. break
  7573. i = next_header
  7574. if len(header_dic) >= 2 and 'project_name' in header_dic:
  7575. for index in range(range_from, range_to):
  7576. if len(set(df.loc[index, :])) <= 2: # 修复 56873031 补全内容跟表头错误连接
  7577. continue
  7578. tmp_dic = {}
  7579. for k, v in header_dic.items():
  7580. if k.startswith('time_'):
  7581. content = timeFormat(df.loc[index, v], default_first_day=False) if k in [
  7582. 'time_completion'] else timeFormat(df.loc[index, v])
  7583. elif k in self.role_type:
  7584. content = get_role(df.loc[index, v], nlp_enterprise)
  7585. elif k == 'moneysource':
  7586. content = turnMoneySource(df.loc[index, v])
  7587. else:
  7588. content = df.loc[index, v]
  7589. if content != '':
  7590. tmp_dic[k] = content
  7591. if len(tmp_dic) > 1 and 'project_name' in tmp_dic and tmp_dic not in result_l:
  7592. result_l.append(tmp_dic)
  7593. elif len(header_row) == 0 and len(header_col) >= 1:
  7594. return result_l # 不提取列向表格,容易出错 例 53489774 作多标段
  7595. i = 0
  7596. while i < len(header_col):
  7597. idx, ratio = header_col[i]
  7598. if idx + 1 >= len(df.columns):
  7599. break
  7600. header_dic = get_header(df[idx].values, self.head_rule_dic)
  7601. i += 1
  7602. range_from = idx + 1
  7603. range_to = len(df.columns)
  7604. if i < len(header_col):
  7605. next_header = i
  7606. for j in range(i, len(header_col)):
  7607. idx2, ratio2 = header_col[j]
  7608. if idx2 - idx == 1:
  7609. header_dic2 = get_header(df[idx2].values, self.head_rule_dic)
  7610. if set(df[idx].values) & set(df[idx2].values) != set():
  7611. header_dic.update(header_dic2)
  7612. else:
  7613. header_dic = header_dic2
  7614. range_from = idx2 + 1
  7615. range_to = len(df.columns)
  7616. next_header = j + 1
  7617. idx = idx2
  7618. else:
  7619. range_from = idx + 1
  7620. range_to = idx2
  7621. next_header = j
  7622. break
  7623. i = next_header
  7624. if len(header_dic) >= 2 and 'project_name' in header_dic:
  7625. for index in range(range_from, range_to):
  7626. if len(set(df.loc[:, index])) <= 2:
  7627. continue
  7628. tmp_dic = {}
  7629. for k, v in header_dic.items():
  7630. if k.startswith('time_'):
  7631. content = timeFormat(df.loc[v, index], default_first_day=False) if k in [
  7632. 'time_completion'] else timeFormat(df.loc[v, index])
  7633. elif k in self.role_type:
  7634. content = get_role(df.loc[v, index], nlp_enterprise)
  7635. elif k == 'moneysource':
  7636. content = turnMoneySource(df.loc[v, index])
  7637. else:
  7638. content = df.loc[v, index]
  7639. if content != '':
  7640. tmp_dic[k] = content
  7641. if len(tmp_dic) > 2 and 'project_name' in tmp_dic and tmp_dic not in result_l:
  7642. result_l.append(tmp_dic)
  7643. elif len(header_row) == 1 and len(header_col) == 1:
  7644. pass
  7645. return result_l
  7646. def predict_table(self, html, nlp_enterprise=[]):
  7647. html = re.sub("<html>|</html>|<body>|</body>", "", html)
  7648. html = re.sub("##attachment##", "", html)
  7649. soup = BeautifulSoup(html, 'lxml')
  7650. richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
  7651. self.nlp_enterprise = nlp_enterprise
  7652. if richText:
  7653. richText = richText.extract() # 过滤掉附件
  7654. tables = soup.find_all('table')
  7655. if len(tables) == 0 and richText:
  7656. tables = richText.find_all('table')
  7657. tables.reverse()
  7658. data_list = []
  7659. for table in tables:
  7660. trs = self.tb.table2list(table)
  7661. if len(trs) > 1 and len(set(trs[0])) > 0 and len(set([len(tr) for tr in trs])) == 1: # 表格两行以上且每行列数一样才处理
  7662. df = pd.DataFrame(trs)
  7663. rs_l = self.get_table_info(df, nlp_enterprise)
  7664. for d in rs_l: # 53338603 项目名称+建设内容才是唯一
  7665. if d not in data_list:
  7666. data_list.append(d)
  7667. if rs_l:
  7668. table.extract()
  7669. return data_list
  7670. def predict(self, list_sentences, list_entitys, html, nlp_enterprise=[], span=12):
  7671. tabel_rs = self.predict_table(html, nlp_enterprise) # 表格多项目提取
  7672. soup = BeautifulSoup(html, 'lxml')
  7673. texts_list = self.recursive_text(soup)
  7674. rs_dic = {k: "" for k in
  7675. self.other_part.keys() | self.role_type.keys() | self.date_type.keys() | self.addr_type.keys() | self.money_type.keys() | self.person_type.keys()}
  7676. rs_dic['moneysource'] = ""
  7677. sentences = [it.sentence_text for it in sorted(list_sentences[0], key=lambda x: x.sentence_index)]
  7678. entities = [[] for _ in range(len(sentences))]
  7679. rs_l = []
  7680. found_key = 0
  7681. code_name_set = set() # 项目编号、名称集合
  7682. org_set = set() # 保存可能为审批部门的角色
  7683. not_sure_role = '' # 不确定角色, 例:单位名称:长沙驰能新能源开发有限公司眉县分公司
  7684. for entity in list_entitys[0]:
  7685. entities[entity.sentence_index].append(entity)
  7686. for i in range(len(sentences)):
  7687. multi_project = {k: "" for k in
  7688. self.other_part.keys() | self.role_type.keys() | self.date_type.keys() | self.addr_type.keys() | self.money_type.keys() | self.person_type.keys()}
  7689. multi_project['moneysource'] = ''
  7690. text = sentences[i]
  7691. for entity in entities[i]:
  7692. b, e = entity.wordOffset_begin, entity.wordOffset_end
  7693. if entity.entity_type in ['org', 'company']:
  7694. flag = 1
  7695. role_l = []
  7696. for k, v in self.role_type.items():
  7697. ser = re.search(v, sentences[entity.sentence_index][max(0, b - span):b])
  7698. if ser:
  7699. role_l.append((k, ser.end()))
  7700. if role_l:
  7701. role_l = sorted(role_l, key=lambda x: x[1]) # 解决 400064746000 表格某个为空导致两个表头相近提取错误 申报单位名称:备案机关:海门经济技术开发区管理委员会,备案证号:海开审备〔2024〕346号
  7702. k, _ = role_l[-1]
  7703. if rs_dic[k] == '':
  7704. rs_dic[k] = entity.entity_text
  7705. multi_project[k] = entity.entity_text
  7706. found_key = 1
  7707. flag = 0
  7708. if not_sure_role == entity.entity_text:
  7709. not_sure_role = ''
  7710. elif re.search('(,|^)单位名称:', sentences[entity.sentence_index][max(0, b - span):b]):
  7711. not_sure_role = entity.entity_text
  7712. if flag and entity.entity_type == "org" and re.search('(局|委员会|委|厅)$', entity.entity_text):
  7713. org_set.add(entity.entity_text)
  7714. elif entity.entity_type in ['person']:
  7715. for k, v in self.person_type.items():
  7716. if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
  7717. if rs_dic[k] == '':
  7718. rs_dic[k] = entity.entity_text
  7719. multi_project[k] = entity.entity_text
  7720. found_key = 1
  7721. break
  7722. elif entity.entity_type in ['time']:
  7723. time_l = []
  7724. for k, v in self.date_type.items():
  7725. ser = re.search(v, sentences[entity.sentence_index][max(0, b - span):b])
  7726. if ser:
  7727. time_l.append((k, ser.end()))
  7728. if time_l:
  7729. time_l = sorted(time_l, key=lambda x: x[1])
  7730. k, end = time_l[-1]
  7731. time = timeFormat(entity.entity_text, default_first_day=False) if k in [
  7732. 'time_completion'] else timeFormat(entity.entity_text)
  7733. if time == "":
  7734. continue
  7735. if rs_dic[k] == '':
  7736. rs_dic[k] = time
  7737. multi_project[k] = time
  7738. found_key = 1
  7739. elif entity.entity_type in ['location']:
  7740. for k, v in self.addr_type.items():
  7741. if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
  7742. if rs_dic[k] == '':
  7743. rs_dic[k] = entity.entity_text
  7744. multi_project[k] = entity.entity_text
  7745. found_key = 1
  7746. elif entity.entity_type in ['money']:
  7747. for k, v in self.money_type.items():
  7748. if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
  7749. if rs_dic[k] == '':
  7750. rs_dic[k] = entity.entity_text
  7751. multi_project[k] = entity.entity_text
  7752. found_key = 1
  7753. elif entity.entity_type in ['moneysource']:
  7754. rs_dic['moneysource'] = turnMoneySource(entity.entity_text)
  7755. multi_project['moneysource'] = turnMoneySource(entity.entity_text)
  7756. elif entity.entity_type in ['code']:
  7757. k = 'project_code'
  7758. v = self.other_part[k].split(':', maxsplit=1)[0]
  7759. if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
  7760. if rs_dic[k] == '':
  7761. rs_dic[k] = entity.entity_text
  7762. multi_project[k] = entity.entity_text
  7763. found_key = 1
  7764. elif entity.entity_type in ['name']:
  7765. k = 'project_name'
  7766. v = self.other_part[k].split(':', maxsplit=1)[0]
  7767. if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
  7768. if rs_dic[k] == '':
  7769. rs_dic[k] = entity.entity_text
  7770. multi_project[k] = entity.entity_text
  7771. found_key = 1
  7772. for k, v in self.other_part.items(): # 规则提取非实体类信息
  7773. ser = re.search(v, text)
  7774. if ser:
  7775. if rs_dic[k] == '' or (k == 'project_name' and ',审批事项:' in rs_dic[k]): # 修复 54087410 项目名称包含错误
  7776. rs_dic[k] = ser.group('main')
  7777. multi_project[k] = ser.group('main')
  7778. found_key = 1
  7779. for k, v in self.date_type.items(): # 规则补充时间实体
  7780. if multi_project[k] != '':
  7781. continue
  7782. ser = re.search(v+':?(?P<main>20\d{2}-\d{1,2}(-\d{1,2})?|20\d{2}/\d{1,2}(/\d{1,2})?|20\d{2}\.\d{1,2}(\.\d{1,2})?|20\d{2}(0[1-9]|1[0-2])(0[1-9]|[1-2][0-9]|3[0-1])?)', text)
  7783. if ser:# 规则补充实体识别不到的日期时间
  7784. time = timeFormat(ser.group('main'), default_first_day=False) if k in ['time_completion'] else timeFormat(ser.group('main'))
  7785. if time == "":
  7786. continue
  7787. if rs_dic[k] == '':
  7788. rs_dic[k] = time
  7789. multi_project[k] = time
  7790. found_key = 1
  7791. for k, v in self.addr_type.items(): # 规则补充地址实体 400063690529 实体不完整 建设地点:湖北省-咸宁市-通城县 通城县大坪乡沙口村15组(通城经济开发区)
  7792. ser = re.search(v + ':?(?P<main>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])',text)
  7793. if ser:
  7794. if rs_dic[k] == '' or len(rs_dic[k]) < len(ser.group('main')):
  7795. rs_dic[k] = ser.group('main')
  7796. if len(multi_project[k]) < len(ser.group('main')):
  7797. multi_project[k] = ser.group('main')
  7798. found_key = 1
  7799. for k, v in self.role_type.items(): # 规则补充公司实体
  7800. if multi_project[k] != '':
  7801. continue
  7802. ser = re.search('(%s):(?P<main>[\w()]{6,30}(局|发改|超市|棋牌室|店|(个体工商户)))[,。]'%self.role_type[k], text)
  7803. if ser:
  7804. if rs_dic[k] == '':
  7805. rs_dic[k] = ser.group('main')
  7806. multi_project[k] = ser.group('main')
  7807. if (multi_project['project_code'] != "" or multi_project['project_name'] != "") and multi_project['project_code']+multi_project['project_name'] not in code_name_set:
  7808. code_name_set.add(multi_project['project_code']+multi_project['project_name'])
  7809. if len(set([k for k,v in multi_project.items() if v!=''])-set(['project_name', 'project_code']))<2: # 除了包其他要素少于两个的不作为多包
  7810. continue
  7811. district = getPredictor('district').get_area(
  7812. multi_project['approver'] + multi_project['project_name'] + multi_project['project_addr'], '')
  7813. if district['district']['province'] != '全国':
  7814. multi_project['area'] = district['district']['area']
  7815. multi_project['province'] = district['district']['province']
  7816. multi_project['city'] = district['district']['city']
  7817. multi_project['district'] = district['district']['district']
  7818. multi_project = {k: v for k, v in multi_project.items() if v != ''}
  7819. rs_l.append(multi_project)
  7820. if not_sure_role != '' and rs_dic.get('construct_company', '') == '' and not_sure_role not in org_set: # 补充,单位名称:这种作为建设单位 例:400069851014
  7821. rs_dic['construct_company'] = not_sure_role
  7822. if len(tabel_rs) > 1:
  7823. rs_dic_key = [k for k, v in rs_dic.items() if v != '']
  7824. keys = set(["approver", "publisher", "time_release", "phone", "doc_num"]) & set(rs_dic_key) - set(tabel_rs[0].keys())
  7825. if keys:
  7826. for d in tabel_rs:
  7827. for k in keys:
  7828. d[k] = rs_dic[k]
  7829. return tabel_rs
  7830. if len(rs_l)>1 and len(set(rs_l[0].keys()))>2 and set(rs_l[0].keys())==set(rs_l[1].keys()):
  7831. for k in self.role_type.keys(): # 多项目无建设单位等通过整篇提取补充
  7832. if rs_dic.get(k, '') != '' and k not in rs_l[0].get(k, '') == '':
  7833. for d in rs_l:
  7834. if d.get(k, '') == '':
  7835. d[k] = rs_dic[k]
  7836. return rs_l
  7837. if found_key == 1:
  7838. district = getPredictor('district').get_area(
  7839. rs_dic['approver'] + rs_dic['project_name'] + rs_dic['project_addr'], '')
  7840. if district['district']['province'] != '全国':
  7841. rs_dic['area'] = district['district']['area']
  7842. rs_dic['province'] = district['district']['province']
  7843. rs_dic['city'] = district['district']['city']
  7844. rs_dic['district'] = district['district']['district']
  7845. if len(org_set) == 1 and rs_dic['approver'] == "":
  7846. rs_dic['approver'] == org_set.pop()
  7847. n = 0
  7848. scale_l = [] # 保存以建设规模开头的文本,如果只有一个且比原来长的替换为此文本,避免提取不完成情况
  7849. for text in texts_list: # 补充纠正内容
  7850. for k, v in self.other_part.items():
  7851. kw = v.split(':')[0]
  7852. if re.search('^(%s)$'%kw, text) and rs_dic[k]=='': # 处理非表格表头内容 排列数据 例:400064764198,web_no: XM0016-5
  7853. if n >1 and n+2 < len(texts_list) and get_header_line(texts_list[n-2:n+3]) == [1,0,1,0,1]:
  7854. rs_dic[k] = texts_list[n+1]
  7855. elif n in [0,1] and n+2 < len(texts_list) and get_header_line(texts_list[n:n+3]) == [1,0,1]:
  7856. rs_dic[k] = texts_list[n + 1]
  7857. elif n >1 and n+2 == len(texts_list) and get_header_line(texts_list[n-2:n+2]) == [1,0,1,0]:
  7858. rs_dic[k] = texts_list[n + 1]
  7859. elif k == 'construction_scale' and re.search('^(?[一二三四五六七八九十][)、]', text) and n+1 < len(texts_list): # 大纲 例:53375037
  7860. rs_dic[k] = texts_list[n + 1]
  7861. if k == 'construction_scale' and len(rs_dic.get(k, '')) < len(text):
  7862. ser = re.search('^(%s):(?P<main>.+)'%kw, text)
  7863. if ser:
  7864. rs_dic[k] = ser.group('main')
  7865. n += 1
  7866. if 0<len(rs_dic['construction_scale'])<len(text) and rs_dic['construction_scale'][-1] not in [',', '。'] and text.find(rs_dic['construction_scale'])==0:
  7867. scale_l.append(text)
  7868. if len(scale_l)==1 and len(scale_l[0])>len(rs_dic['construction_scale']): # 规则补充不完整规模信息 例:53334434
  7869. rs_dic['construction_scale'] = scale_l[0]
  7870. if 0<len(rs_dic['construction_scale'])<8 and re.search('([编代][号码]|名称|时间|日期|金额|单位|机构)$', rs_dic['construction_scale']):
  7871. rs_dic['construction_scale'] = ''
  7872. for k, v in rs_dic.items(): # 限制最大长度
  7873. if len(v)>500:
  7874. v = v[:500]+'...后面省略%d字'%(len(v)-500)
  7875. rs_dic[k] = v
  7876. if v == 'null':
  7877. rs_dic[k] = ''
  7878. rs_dic = {k: v for k, v in rs_dic.items() if v != ''}
  7879. return [rs_dic]
  7880. return []
  def add_ree2approval(self, approval, prem):
      """Fill empty ``construct_company`` fields with the tenderee name.

      The first entry of ``prem["Project"]["roleList"]`` whose ``role_name``
      is ``"tenderee"`` supplies the name.  Approval records are then filled
      in order; filling stops at the first record that already carries a
      non-empty ``construct_company``.

      :param approval: list of approval dicts, updated in place
      :param prem: extraction result; only its ``"Project"`` entry is read
      :return: the same ``approval`` list
      """
      roles = prem["Project"]['roleList'] if "Project" in prem else []
      ree = next(
          (role["role_text"] for role in roles if role["role_name"] == "tenderee"),
          '',
      )
      if ree == '':
          return approval
      for record in approval:
          if record.get('construct_company', '') != '':
              # an already-populated record ends the back-filling
              break
          record['construct_company'] = ree
      return approval
  def add_codename2approval(self, approval, codeName):
      """Complete a single approval record from the announcement-level code/name.

      Only applies when ``approval`` holds exactly one record and ``codeName``
      is non-empty.  A missing ``project_code`` is taken from the first item of
      ``codeName[0]['code']``; a missing ``project_name`` from
      ``codeName[0]['name']``.  Existing keys are never overwritten.

      :param approval: list of approval dicts, updated in place
      :param codeName: list whose first element is a dict with optional
          ``'code'`` (list) and ``'name'`` (str) entries
      :return: the same ``approval`` list
      """
      if len(approval) != 1 or not codeName:
          return approval
      record = approval[0]
      if 'project_code' not in record:
          codes = codeName[0].get('code', [])
          if codes != []:
              record['project_code'] = codes[0]
      if 'project_name' not in record:
          name = codeName[0].get('name', '')
          if name != '':
              record['project_name'] = name
      return approval
  class BiddingScore():
      """Rule-based extractor of per-bidder scoring records from HTML tables.

      ``predict`` parses the HTML, turns every ``<table>`` into a grid of cell
      texts and hands rectangular grids to ``get_table_info``.  That method
      locates header rows or header columns, maps header cells to field names
      with ``head_rule_dic`` and reads one record per bidder.
      """

      def __init__(self):
          # field name -> regex that recognises the header cell of that field
          self.head_rule_dic = {
              "tenderer": r"((候选|入围|入选|投标|应答|响应)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位",  # '投标个人/单位' added for doc 368295593
              "score_price": r"(价格|报价|单价|总价|经济)(部分|\w{,2})?([得评]分|评审)",
              "score_technical": r"技术(部分|\w{,2})?标?([得评]分|评审)",
              "score_commercial": r"商务(部分|\w{,2})?标?([得评]分|评审)",
              "score_integrity": r"诚信(部分|\w{,2})?([得评]分|评审)",
              "score_comprehensive": "(综合(标|评估)?|总|最终)得?分$",
              "ranking": "(得分)?排名",
              "qualification_review": "资格性审查|是否通过资格",
              "compliance_review": "符合性审查|是否通过符合"
          }
          self.tb = TableTag2List()

      @staticmethod
      def _looks_like_header(flags):
          """Return True when a row/column of 0/1 header flags counts as a header.

          A line qualifies when more than 80% of its cells are flagged (this
          includes the all-flagged case), or, for lines longer than three
          cells, when the flag string contains more "11" than "10" pairs.
          """
          if sum(flags) / len(flags) > 0.8:
              return True
          if len(flags) > 3:
              joined = ''.join([str(it) for it in flags])
              return len(re.findall('11', joined)) > len(re.findall('10', joined))
          return False

      @staticmethod
      def _get_header_index(datas):
          """Find which rows and columns of the table are headers.

          :param datas: per-cell header flags, e.g. ``[[1,1,1,1],[0,0,0,0]]``
          :return: ``(header_row, header_col)``; each is a list of
              ``(index, flagged_ratio)`` tuples
          """
          header_row = []
          header_col = []
          df_h = pd.DataFrame(datas)
          for i in df_h.index:
              line = df_h.loc[i].values
              if BiddingScore._looks_like_header(line):
                  header_row.append((i, sum(line) / len(line)))
          for i in df_h.columns:
              col = df_h[i].values
              # The pattern test must look at this column's own flags; the
              # previous code reused the last row here.
              if BiddingScore._looks_like_header(col):
                  header_col.append((i, sum(col) / len(col)))
          return header_row, header_col

      @staticmethod
      def _get_header(l, head_rule_dic):
          """Map field names to the position of the first cell matching their rule.

          :param l: sequence of header cell texts
          :param head_rule_dic: field name -> regex
          :return: dict field name -> cell position
          """
          header_dic = {}
          for i, text in enumerate(l):
              for k, v in head_rule_dic.items():
                  if re.search(v, text) and k not in header_dic:
                      header_dic[k] = i
          return header_dic

      @staticmethod
      def _get_score(text):
          """Return the stripped text when it looks like a score, else ``''``.

          Accepted: one or two digits, an optional two-digit fraction, then any
          run of digits, commas, semicolons or dots.
          """
          text = text.strip()
          if re.search(r'^\d{1,2}(\.\d{2})?[\d,,;\.]*$', text):
              return text
          return ''

      def _iter_header_groups(self, headers, limit, cells_at):
          """Yield ``(header_dic, range_from, range_to)`` for each header group.

          Directly adjacent header lines are merged into one group: their field
          maps are combined when the two lines share at least one cell text,
          otherwise the later line replaces the earlier one.  The data range of
          a group runs up to the next non-adjacent header line, or to ``limit``.

          :param headers: list of ``(index, ratio)`` header lines
          :param limit: number of lines along the scanned axis
          :param cells_at: callable returning the cell texts of line ``index``
          """
          i = 0
          while i < len(headers):
              idx, _ = headers[i]
              if idx + 1 >= limit:
                  break
              header_dic = self._get_header(cells_at(idx), self.head_rule_dic)
              i += 1
              range_from = idx + 1
              range_to = limit
              if i < len(headers):
                  next_header = i
                  for j in range(i, len(headers)):
                      idx2, _ = headers[j]
                      if idx2 - idx != 1:
                          # next header is further away: data stops before it
                          range_from = idx + 1
                          range_to = idx2
                          next_header = j
                          break
                      header_dic2 = self._get_header(cells_at(idx2), self.head_rule_dic)
                      if set(cells_at(idx)) & set(cells_at(idx2)) != set():
                          header_dic.update(header_dic2)
                      else:
                          header_dic = header_dic2
                      range_from = idx2 + 1
                      range_to = limit
                      next_header = j + 1
                      idx = idx2
                  i = next_header
              yield header_dic, range_from, range_to

      def _extract_record(self, header_dic, cell_at, nlp_enterprise):
          """Read one data line into a ``field -> value`` dict, dropping empty values.

          :param header_dic: field name -> position along the header line
          :param cell_at: callable returning the cell text at a position
          :param nlp_enterprise: forwarded unchanged to ``get_role``
          """
          record = {}
          for k, pos in header_dic.items():
              raw = cell_at(pos)
              if k.startswith('score'):
                  content = self._get_score(raw)
              elif k == 'tenderer':
                  content = get_role(raw, nlp_enterprise)
              elif k == 'ranking':
                  content = raw if re.search(r'^第?[\d一二三四五六七八九十]+名?$', raw) else ''
              else:
                  content = raw
              if content != '':
                  record[k] = content
          return record

      def get_table_info(self, df, nlp_enterprise):
          """Extract bidder score records from one table.

          :param df: DataFrame of cell texts; ``predict`` builds it with
              ``pd.DataFrame(trs)``, so rows and columns carry default integer
              labels
          :param nlp_enterprise: forwarded to ``get_role`` for the bidder cell
          :return: list of distinct record dicts, each containing ``'tenderer'``
          """
          result_l = []
          datas = [get_header_line(df.loc[i].values) for i in df.index]
          header_row, header_col = self._get_header_index(datas)
          if len(header_col) == 1 and header_col[0][0] > 1:
              # a lone header column beyond column index 1 is not accepted
              header_col = []
          if header_row and not header_col:
              # header rows only: every following row is one bidder
              groups = self._iter_header_groups(header_row, len(df), lambda r: df.loc[r].values)
              for header_dic, range_from, range_to in groups:
                  if len(header_dic) >= 2 and 'tenderer' in header_dic:
                      for index in range(range_from, range_to):
                          tmp_dic = self._extract_record(
                              header_dic, lambda c, r=index: df.loc[r, c], nlp_enterprise)
                          if len(tmp_dic) > 1 and 'tenderer' in tmp_dic and tmp_dic not in result_l:
                              result_l.append(tmp_dic)
          elif header_col and not header_row:
              # header columns only: every following column is one bidder
              wanted = set(['tenderer', 'score_technical', 'score_commercial', 'score_price', 'score_comprehensive'])
              groups = self._iter_header_groups(header_col, len(df.columns), lambda c: df[c].values)
              for header_dic, range_from, range_to in groups:
                  if len(header_dic.keys() & wanted) >= 2 and 'tenderer' in header_dic:
                      for index in range(range_from, range_to):
                          tmp_dic = self._extract_record(
                              header_dic, lambda r, c=index: df.loc[r, c], nlp_enterprise)
                          if len(tmp_dic) > 2 and 'tenderer' in tmp_dic and tmp_dic not in result_l:
                              result_l.append(tmp_dic)
          # tables with both header rows and header columns are not handled
          return result_l

      def predict(self, html, nlp_enterprise=[]):
          """Extract bidder score records from all tables of an HTML document.

          Tables are processed in reverse document order and removed from the
          soup afterwards.  When the same bidder appears more than once, the
          record with the most fields is kept.

          :param html: HTML text of the announcement
          :param nlp_enterprise: forwarded to ``get_table_info``; also stored
              on ``self.nlp_enterprise``
          :return: list of record dicts, one per bidder
          """
          html = re.sub("<html>|</html>|<body>|</body>", "", html)
          html = re.sub("##attachment##", "", html)
          soup = BeautifulSoup(html, 'lxml')
          richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
          self.nlp_enterprise = nlp_enterprise
          if richText:
              richText = richText.extract()  # take the attachment part out of the main text
          tables = soup.find_all('table')
          if len(tables) == 0 and richText:
              # no table in the main text: fall back to the attachment part
              tables = richText.find_all('table')
          tables.reverse()
          rs_dic = {}
          for table in tables:
              trs = self.tb.table2list(table)
              # only rectangular grids with at least 2 rows and 2 columns
              if len(trs) > 1 and len(trs[0]) > 1 and len(set([len(tr) for tr in trs])) == 1:
                  df = pd.DataFrame(trs)
                  rs_l = self.get_table_info(df, nlp_enterprise)
                  for d in rs_l:
                      if d['tenderer'] not in rs_dic:
                          rs_dic[d['tenderer']] = d
                      elif len(d) > len(rs_dic[d['tenderer']]):
                          rs_dic[d['tenderer']] = d
              table.extract()
          return list(rs_dic.values())
  class EntityTypeRulePredictor():
      """Rule-based typing of location / time / code entities by their label text.

      Each ``pattern_*`` attribute is a regex for the label that precedes a
      value (for example "开标地点:").  ``predict`` checks the text right
      before each entity against these labels and then refines the addresses
      with whole-article searches.
      """

      def __init__(self):
          # NOTE(review): pattern_addr_bidopen and pattern_addr_delivery each
          # carried one dangling ")?" with no opening parenthesis, which makes
          # `re` raise on first use.  The dangling token was removed, so the
          # leading label group is mandatory - confirm the intended grouping.
          self.pattern_addr_bidopen = '([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(会议)?地[点址区]([((]网址[))])?[:为]'
          self.pattern_addr_bidsend = r'((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)地[点址区]([((]网址[))])?[:为]'
          self.pattern_addr_delivery = '(交货|交付|收货|提货|交接|送货(安装)?|送达|到货|供货|卸货)((期|时间)[及和、])?(地[点址区]?|区域)[:为]'
          # "银行所属区域:北京市西城区" must not count as a project address
          self.pattern_addr_project = r'(项目|施工|实施|建设|工程|服务|展示|看样|拍卖)(实施|服务|现场)?(地[点址区]|位置|所在地区?)(位于)?[:为]|项目位于|[^\w]所[属在](区域|地区?):|存放地[点址]?[:为]'
          self.pattern_addr_contact = '(联系|收件人?|邮寄)地[点址区][:为]|行政区:'
          self.pattern_time_planned = '(计划|预计|预期)(招标|采购|发标|发包)时间|招标(公告|文件)(预计|预期|计划)发布时间'
          self.pattern_code_investment = '投资(审批)?项目[编代]码[:为]'
          self.pattern_addr_dic = {'addr_bidopen': self.pattern_addr_bidopen,
                                   'addr_bidsend': self.pattern_addr_bidsend,
                                   'addr_delivery': self.pattern_addr_delivery,
                                   'addr_project': self.pattern_addr_project,
                                   'addr_contact': self.pattern_addr_contact}

      def predict(self, list_entitys, list_sentences, list_articles):
          """Classify entities of the first document and refine addresses.

          Only index 0 of each argument is read.  Entities must expose
          ``entity_type``, ``entity_text``, ``wordOffset_begin`` and
          ``sentence_index``; sentences ``sentence_text``; the article
          ``content``.

          :return: ``(addr_dic, time_dic, code_investment)`` - address type ->
              text, ``{'time_planned': text}`` when found, and the investment
              project code (``''`` when none)
          """
          addr_dic = {}
          time_dic = {}
          code_investment = ''
          for entity in list_entitys[0]:
              if entity.entity_type not in ('location', 'time', 'code'):
                  continue
              b = entity.wordOffset_begin
              sentance_text = list_sentences[0][entity.sentence_index].sentence_text
              if entity.entity_type == 'location':
                  for k, v in self.pattern_addr_dic.items():
                      # the label may be separated from the entity, so the
                      # trailing ":"/"为" is not required here
                      v = v.replace('[:为]', '')
                      if re.search(v, sentance_text[max(0, b-10): b]) and len(entity.entity_text) > 2:
                          addr_dic[k] = entity.entity_text
              elif entity.entity_type == 'time':
                  if re.search(self.pattern_time_planned, sentance_text[max(0, b-12): b]):
                      time_dic['time_planned'] = entity.entity_text
              else:
                  # only the first matching code entity is kept
                  if code_investment == '' and re.search(self.pattern_code_investment, sentance_text[max(0, b-12): b]):
                      code_investment = entity.entity_text

          content = list_articles[0].content
          region_hint = r'\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]'
          region_or_platform_hint = r'\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|平台|公司'
          # (address type, label pattern, value template, plausibility check)
          article_rules = (
              ('addr_bidopen', self.pattern_addr_bidopen, r'(%s)(?P<addr>[\w():\.-]{5,100})[,。]', region_or_platform_hint),
              ('addr_bidsend', self.pattern_addr_bidsend, r'(%s)(?P<addr>[\w():\.-]{5,100})[,。]', region_or_platform_hint),
              ('addr_delivery', self.pattern_addr_delivery, r'(%s)(?P<addr>[\w()-]{5,100})[,。]', region_hint),
              ('addr_project', self.pattern_addr_project, r'(%s)(?P<addr>[\w()-]{5,100})[,。]', region_hint),
          )
          for key, label, template, hint in article_rules:
              ser = re.search(template % label, content)
              # accept the article-level text only when it looks like an
              # address and contains the entity-level value (if any)
              if ser and re.search(hint, ser.group('addr')) and addr_dic.get(key, '') in ser.group('addr'):
                  addr_dic[key] = ser.group('addr')
          ser_code = re.search(r'(%s)(?P<code>[\da-zA-Z()-]{5,30})[,。]' % self.pattern_code_investment, content)
          if ser_code and code_investment == '':
              code_investment = ser_code.group('code')
          return addr_dic, time_dic, code_investment
  # getSavedModel: export the Keras form model as a TensorFlow SavedModel.
  # Creates a new tf.Graph, loads "../form/model/model_form.model_item.hdf5"
  # with the custom metrics precision / recall / f1_score, and writes the
  # Keras backend session to "./h5_savedmodel/" via tf.saved_model.simple_save.
  # The signature exposes model.input as "image" and model.output as "scores".
  # Takes no arguments and returns nothing.
  8162. def getSavedModel():
  8163. #predictor = FormPredictor()
  8164. graph = tf.Graph()
  8165. with graph.as_default():
  8166. model = tf.keras.models.load_model("../form/model/model_form.model_item.hdf5",custom_objects={"precision":precision,"recall":recall,"f1_score":f1_score})
  8167. #print(tf.graph_util.remove_training_nodes(model))
  8168. tf.saved_model.simple_save(
  8169. tf.keras.backend.get_session(),
  8170. "./h5_savedmodel/",
  8171. inputs={"image": model.input},
  8172. outputs={"scores": model.output}
  8173. )
  def getBiLSTMCRFModel(MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights):
      """Build and compile an Embedding -> BiLSTM -> Dense -> CRF tagging model.

      :param MAX_LEN: not used by this function
      :param vocab: vocabulary; only its length is used (embedding input size)
      :param EMBED_DIM: embedding dimension
      :param BiRNN_UNITS: total BiLSTM width; each direction gets half of it
      :param chunk_tags: tag set; only its length is used (output size)
      :param weights: optional initial embedding matrix; when given, the
          embedding starts from it and stays trainable
      :return: the compiled model (its summary is printed)
      """
      tag_count = len(chunk_tags)
      embedding_options = {"mask_zero": True}
      if weights is not None:
          embedding_options["weights"] = [weights]
          embedding_options["trainable"] = True

      token_ids = layers.Input(shape=(None,),dtype="int32")
      embedded = layers.embeddings.Embedding(len(vocab),EMBED_DIM,**embedding_options)(token_ids)
      recurrent = layers.Bidirectional(layers.LSTM(BiRNN_UNITS//2,return_sequences=True))(embedded)
      tag_scores = layers.TimeDistributed(layers.Dense(tag_count))(recurrent)
      crf = CRF(tag_count,sparse_target=True)
      crf_out = crf(tag_scores)

      model = models.Model(input=[token_ids],output = [crf_out])
      model.summary()
      model.compile(optimizer = 'adam', loss = crf.loss_function, metrics = [crf.accuracy])
      return model
  8198. import h5py
  def h5_to_graph(sess,graph,h5file):
      """Copy every weight stored in a Keras HDF5 file into ``graph``.

      All datasets below the file's ``model_weights`` group are read, and each
      one is assigned to the tensor of ``graph`` that has the same name
      (group path joined with "/").  Names and shapes are printed while
      reading and assigning.

      Changes from the earlier version: the file is closed once the weights
      have been read; datasets are read with ``ds[()]`` because
      ``Dataset.value`` no longer exists in h5py 3; the never-called
      ``getValue`` helper and the unused ``layer_names`` lookup were dropped.

      :param sess: TensorFlow session used to run the assign ops
      :param graph: graph containing the target tensors
      :param h5file: path of the HDF5 weight file
      """
      def readGroup(gr,parent_name,data):
          """Depth-first walk of ``gr``; append ``[name, array]`` per dataset."""
          for subkey in gr:
              print(subkey)
              if parent_name != subkey:
                  _name = subkey if parent_name == "" else parent_name+"/"+subkey
              else:
                  # a child named like its parent does not extend the path
                  _name = parent_name
              child = gr[subkey]
              if isinstance(child, h5py.Group):
                  readGroup(child,_name,data)
              else:
                  data.append([_name,child[()]])
                  print(_name,child.shape)

      list_name_value = []
      # read everything into memory first so the file can be closed right away
      with h5py.File(h5file,'r') as f:
          readGroup(f["model_weights"], "", list_name_value)

      for name, value in list_name_value:
          print(name,graph.get_tensor_by_name(name),np.shape(value))
          sess.run(tf.assign(graph.get_tensor_by_name(name),value))
  def initialize_uninitialized(sess):
      """Initialise the not-yet-initialised global variables whose name contains "Adam".

      Other uninitialised variables are left untouched.  The names of the
      selected variables are printed.

      :param sess: TensorFlow session used for the checks and the initialiser
      """
      candidates = tf.global_variables()
      initialised_flags = sess.run([tf.is_variable_initialized(var) for var in candidates])
      adam_vars = [
          var
          for var, initialised in zip(candidates, initialised_flags)
          if not initialised and re.search("Adam",var.name) is not None
      ]
      print([str(var.name) for var in adam_vars]) # only for testing
      if adam_vars:
          sess.run(tf.variables_initializer(adam_vars))
  # save_codename_model: export the project code/name tagger as a SavedModel.
  # Loads the word-vector matrix from 'codename_w2v_matrix.pk', builds the
  # network with BiLSTM_CRF_tfmodel on the default graph, runs the global
  # initializer, restores the checkpoint at `filepath` with tf.train.Saver and
  # writes "./codename_savedmodel_tf/".
  # Signature inputs: "inputs", "inputs_length", "keepprob";
  # outputs: "logits", "trans".
  # `vocabpath` and `classlabelspath` are assigned but unused: the lines that
  # loaded them are commented out.
  8277. def save_codename_model():
  8278. # filepath = "../projectCode/models/model_project_"+str(60)+"_"+str(200)+".hdf5"
  8279. filepath = "../../dl_dev/projectCode/models_tf/59-L0.471516189943-F0.8802154826344823-P0.8789179683459191-R0.8815168335321886/model.ckpt"
  8280. vocabpath = "../projectCode/models/vocab.pk"
  8281. classlabelspath = "../projectCode/models/classlabels.pk"
  8282. # vocab = load(vocabpath)
  8283. # class_labels = load(classlabelspath)
  8284. w2v_matrix = load('codename_w2v_matrix.pk')
  8285. graph = tf.get_default_graph()
  8286. with graph.as_default() as g:
  8287. ''''''
  8288. # model = getBiLSTMCRFModel(None, vocab, 60, 200, class_labels,weights=None)
  8289. #model = models.load_model(filepath,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score,"CRF":CRF,"loss":CRF.loss_function})
  8290. sess = tf.Session(graph=g)
  8291. # sess = tf.keras.backend.get_session()
  8292. char_input, logits, target, keepprob, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
  8293. #with sess.as_default():
  # variables are initialised first, then overwritten by the checkpoint restore
  8294. sess.run(tf.global_variables_initializer())
  8295. # print(sess.run("time_distributed_1/kernel:0"))
  8296. # model.load_weights(filepath)
  8297. saver = tf.train.Saver()
  8298. saver.restore(sess, filepath)
  8299. # print("logits",sess.run(logits))
  8300. # print("#",sess.run("time_distributed_1/kernel:0"))
  8301. # x = load("codename_x.pk")
  8302. #y = model.predict(x)
  8303. # y = sess.run(model.output,feed_dict={model.input:x})
  8304. # for item in np.argmax(y,-1):
  8305. # print(item)
  8306. tf.saved_model.simple_save(
  8307. sess,
  8308. "./codename_savedmodel_tf/",
  8309. inputs={"inputs": char_input,
  8310. "inputs_length":length,
  8311. 'keepprob':keepprob},
  8312. outputs={"logits": logits,
  8313. "trans":trans}
  8314. )
  # save_role_model: export the role model held by PREMPredict().model_role.
  # Builds the Keras model with getModel(), opens a session on the model's
  # graph, runs the global initializer, copies the HDF5 weights from
  # model_role.model_role_file with h5_to_graph, and writes "./role_savedmodel/".
  # Signature inputs: "input0".."input2" (model.input[0..2]); output: "outputs".
  8315. def save_role_model():
  8316. '''
  8317. @summary: Save the model as a SavedModel so it can be deployed and called on the PAI platform
  8318. '''
  8319. model_role = PREMPredict().model_role
  8320. with model_role.graph.as_default():
  8321. model = model_role.getModel()
  8322. sess = tf.Session(graph=model_role.graph)
  8323. print(type(model.input))
  8324. sess.run(tf.global_variables_initializer())
  8325. h5_to_graph(sess, model_role.graph, model_role.model_role_file)
  # getModel() is called a second time before the export
  8326. model = model_role.getModel()
  8327. tf.saved_model.simple_save(sess,
  8328. "./role_savedmodel/",
  8329. inputs={"input0":model.input[0],
  8330. "input1":model.input[1],
  8331. "input2":model.input[2]},
  8332. outputs={"outputs":model.output}
  8333. )
  # save_money_model: export the money model as a SavedModel.
  # Loads "<this file's directory>/../money/models/model_money_word.h5" with
  # models.load_model (custom metrics precision / recall / f1_score) using a
  # new graph and session, prints the model summary and its weights, and
  # writes "./money_savedmodel2/".
  # Signature inputs: "input0".."input2" (model.input[0..2]); output: "outputs".
  8334. def save_money_model():
  8335. model_file = os.path.dirname(__file__)+"/../money/models/model_money_word.h5"
  8336. graph = tf.Graph()
  8337. with graph.as_default():
  8338. sess = tf.Session(graph=graph)
  8339. with sess.as_default():
  8340. # model = model_money.getModel()
  8341. # model.summary()
  8342. # sess.run(tf.global_variables_initializer())
  8343. # h5_to_graph(sess, model_money.graph, model_money.model_money_file)
  8344. model = models.load_model(model_file,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
  8345. model.summary()
  8346. print(model.weights)
  8347. tf.saved_model.simple_save(sess,
  8348. "./money_savedmodel2/",
  8349. inputs = {"input0":model.input[0],
  8350. "input1":model.input[1],
  8351. "input2":model.input[2]},
  8352. outputs = {"outputs":model.output}
  8353. )
  # save_person_model: export the person model held by EPCPredict().model_person.
  # Loads sample data from "person_x.pk", builds the model, initialises the
  # variables, loads the weights with model_person.load_weights(), runs one
  # prediction on the sample data and prints its argmax, then writes
  # "./person_savedmodel/".
  # Signature inputs: "input0", "input1" (model.input[0..1]); output: "outputs".
  8354. def save_person_model():
  8355. model_person = EPCPredict().model_person
  8356. with model_person.graph.as_default():
  8357. x = load("person_x.pk")
  # reorder the axes to (1,0,2,3) so that _data[0] and _data[1] feed the two model inputs
  8358. _data = np.transpose(np.array(x),(1,0,2,3))
  8359. model = model_person.getModel()
  8360. sess = tf.Session(graph=model_person.graph)
  8361. with sess.as_default():
  8362. sess.run(tf.global_variables_initializer())
  8363. model_person.load_weights()
  8364. #h5_to_graph(sess, model_person.graph, model_person.model_person_file)
  8365. predict_y = sess.run(model.output,feed_dict={model.input[0]:_data[0],model.input[1]:_data[1]})
  8366. #predict_y = model.predict([_data[0],_data[1]])
  8367. print(np.argmax(predict_y,-1))
  8368. tf.saved_model.simple_save(sess,
  8369. "./person_savedmodel/",
  8370. inputs={"input0":model.input[0],
  8371. "input1":model.input[1]},
  8372. outputs = {"outputs":model.output})
  # save_form_model: export the "item" model of FormPredictor as a SavedModel.
  # Builds the model with getModel("item") on the predictor's graph, runs the
  # global initializer, copies the HDF5 weights from model_form.model_file_item
  # with h5_to_graph, and writes "./form_savedmodel/".
  # Signature input: "inputs" (model.input); output: "outputs" (model.output).
  8373. def save_form_model():
  8374. model_form = FormPredictor()
  8375. with model_form.graph.as_default():
  8376. model = model_form.getModel("item")
  8377. sess = tf.Session(graph=model_form.graph)
  8378. sess.run(tf.global_variables_initializer())
  8379. h5_to_graph(sess, model_form.graph, model_form.model_file_item)
  8380. tf.saved_model.simple_save(sess,
  8381. "./form_savedmodel/",
  8382. inputs={"inputs":model.input},
  8383. outputs = {"outputs":model.output})
  # save_codesplit_model: export the project-code model as a SavedModel.
  # Loads "../../dl_dev/projectCode/models/model_code.hdf5" with
  # models.load_model in a new graph, opens a session, runs the global
  # initializer, copies the HDF5 weights again with h5_to_graph, and writes
  # "./codesplit_savedmodel/".
  # Signature inputs: "input0".."input2" (model_code.input[0..2]); output: "outputs".
  8384. def save_codesplit_model():
  8385. filepath_code = "../../dl_dev/projectCode/models/model_code.hdf5"
  8386. graph = tf.Graph()
  8387. with graph.as_default():
  8388. model_code = models.load_model(filepath_code, custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
  8389. sess = tf.Session()
  8390. sess.run(tf.global_variables_initializer())
  8391. h5_to_graph(sess, graph, filepath_code)
  8392. tf.saved_model.simple_save(sess,
  8393. "./codesplit_savedmodel/",
  8394. inputs={"input0":model_code.input[0],
  8395. "input1":model_code.input[1],
  8396. "input2":model_code.input[2]},
  8397. outputs={"outputs":model_code.output})
  # save_timesplit_model: export the time-label classifier as a SavedModel.
  # Loads '../time/model_label_time_classify.model.hdf5' with
  # models.load_model in a new graph, opens a session as a context manager,
  # runs the global initializer, copies the HDF5 weights with h5_to_graph, and
  # writes "./timesplit_model/".
  # Signature inputs: "input0", "input1" (time_model.input[0..1]); output: "outputs".
  8398. def save_timesplit_model():
  8399. filepath = '../time/model_label_time_classify.model.hdf5'
  8400. with tf.Graph().as_default() as graph:
  8401. time_model = models.load_model(filepath, custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
  8402. with tf.Session() as sess:
  8403. sess.run(tf.global_variables_initializer())
  8404. h5_to_graph(sess, graph, filepath)
  8405. tf.saved_model.simple_save(sess,
  8406. "./timesplit_model/",
  8407. inputs={"input0":time_model.input[0],
  8408. "input1":time_model.input[1]},
  8409. outputs={"outputs":time_model.output})
  8410. if __name__=="__main__":
  8411. #save_role_model()
  8412. # save_codename_model()
  8413. # save_money_model()
  8414. #save_person_model()
  8415. #save_form_model()
  8416. #save_codesplit_model()
  8417. # save_timesplit_model()
  8418. '''
  8419. # with tf.Session(graph=tf.Graph()) as sess:
  8420. # from tensorflow.python.saved_model import tag_constants
  8421. # meta_graph_def = tf.saved_model.loader.load(sess, [tag_constants.SERVING], "./person_savedModel")
  8422. # graph = tf.get_default_graph()
  8423. # signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
  8424. # signature = meta_graph_def.signature_def
  8425. # input0 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input0"].name)
  8426. # input1 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input1"].name)
  8427. # outputs = sess.graph.get_tensor_by_name(signature[signature_key].outputs["outputs"].name)
  8428. # x = load("person_x.pk")
  8429. # _data = np.transpose(x,[1,0,2,3])
  8430. # y = sess.run(outputs,feed_dict={input0:_data[0],input1:_data[1]})
  8431. # print(np.argmax(y,-1))
  8432. '''
  8433. # MAX_LEN = 1000
  8434. # # vocabpath = os.path.dirname(__file__) + "/codename_vocab.pk"
  8435. # # vocab = load(vocabpath)
  8436. # # word2index = dict((w, i) for i, w in enumerate(np.array(vocab)))
  8437. # # index_unk = word2index.get("<unk>")
  8438. # # sentence = "招标人:广州市重点公共建设项目管理中心,联系人:李工,联系方式:020-22905689,招标代理:广东重工建设监理有限公司," \
  8439. # # "代理联系人:薛家伟,代理联系方式:13535014481,招标监督机构:广州市重点公共建设项目管理中心,监督电话:020-22905690," \
  8440. # # "备注:以上为招标公告简要描述,招标公告详细信息请查看“招标公告”附件,"
  8441. # # sentence = sentence*5
  8442. # # list_sentence = [sentence]*200
  8443. # # # print(list_sentence)
  8444. # # x = [[word2index.get(word, index_unk) for word in sentence] for sentence in
  8445. # # list_sentence]
  8446. # # x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x]
  8447. # # # print(x_len)
  8448. # # x = pad_sequences(x, maxlen=MAX_LEN, padding="post", truncating="post")
  8449. # #
  8450. # # requests_result = requests.post(API_URL + "/predict_codeName", json={"inouts": x.tolist(), "inouts_len": x_len},
  8451. # # verify=True)
  8452. # # # predict_y = json.loads(requests_result.text)['result']
  8453. # # print("cost_time:", json.loads(requests_result.text)['cost_time'])
  8454. # # print(MAX_LEN, len(sentence), len(list_sentence))
  8455. # # requests_result = requests.post(API_URL + "/predict_codeName", json={"inouts": x.tolist(), "inouts_len": x_len},
  8456. # # verify=True)
  8457. # # # predict_y = json.loads(requests_result.text)['result']
  8458. # # print("cost_time:", json.loads(requests_result.text)['cost_time'])
  8459. # # print(MAX_LEN, len(sentence), len(list_sentence))
  8460. # docid = ""
  8461. # title = ''
  8462. # with open('d:/html/2.html', 'r', encoding='utf-8') as f:
  8463. # html = f.read()
  8464. # product_attr = ProductAttributesPredictor()
  8465. # rs = product_attr.predict(docid='', html=html, page_time="")
  8466. # print(rs)
  8467. docid = ""
  8468. title = '甘肃省妇幼保健院(甘肃省中心医院)(第二期)采购结果公告'
  8469. with open('d:/html/2.html', 'r', encoding='utf-8') as f:
  8470. html = f.read()
  8471. tb_extract = TablePremExtractor()
  8472. rs = tb_extract.predict(html, [
  8473. "江苏中联铸本混凝土有限公司",
  8474. "鼓楼区协荣机械设备经销部"
  8475. ], web_source_name = '', all_winner=False)
  8476. print('标段数:',len(rs[0]))
  8477. print(rs)
  8478. # bdscore = BiddingScore()
  8479. # rs = bdscore.predict(html)
  8480. # print(type(rs), len(rs))
  8481. # print(rs)
  8482. # # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]
  8483. # # # ids = [42078089, 51828144, 54913238, 60511017, 67472815, 69042200, 75804469, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 90157660, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770]
  8484. # # # ids = [42078089, 51828144, 60511017, 69042200, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770]
  8485. # # # ids = [ 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770]
  8486. # # # ids = [37756133, 39743626, 42068246, 51176657, 70624901, 75687028, 85489552, 95342532, 97337474, 109601526, 111464967, 112548665, 116223553, 117329696, 117850214, 120619166, 121717252, 122345499, 128511969, 133403846, 133602236, 136564970, 137772969, 138020374, 140929169, 147414295, 152659064, 155485083, 186412244, 195546784, 196135909, 202981523, 214647448, 216377830, 217957372, 218789230, 225050691, 228064464, 228590691, 236342514, 237352780, 239814252]
  8487. # # # ids = [51176657, 70624901, 85489552, 95342532, 109601526, 111464967, 112548665, 116223553, 117329696, 117850214, 120619166, 121717252, 122345499, 128511969, 133403846, 133602236, 136564970, 137772969, 138020374, 140929169, 147414295, 152659064, 155485083, 186412244, 195546784, 196135909, 202981523, 214647448, 216377830, 217957372, 218789230, 225050691, 228064464, 228590691, 236342514, 237352780, 239814252]
  8488. # ids = [31995310, 33586422, 34213587, 36093749, 37238528, 37739743, 39150739, 39281429, 40038908, 40289771, 40581071, 40591331, 42200293, 42739447, 42923948, 43351479, 44237678, 44506815, 44592013, 45106514, 45469037, 48411467, 51822565, 52127391, 54236264, 54706723, 54894477, 54898083, 55934378, 56104538, 56218948, 59606477, 60116927, 60638934, 61523351, 61685037, 61706106, 62187765, 62203118, 62843892, 63850238, 64139401, 65707507, 66072846, 66137391, 66738991, 67676932, 67902417, 69795866, 70868740, 71180456, 71796375, 77613620, 77641817, 77748144, 77761818, 78250390, 78606698, 78717682, 78854831, 79597122, 79597366, 79819968, 80377018, 82461832, 84018089, 84134439, 84815332, 85123470, 85123525, 85456789, 87474450, 88129399, 88288685, 88329278, 88342999, 88747517, 89632339, 89861712, 89985134, 91538446, 93323837, 94609104, 95522891, 97476802, 97629540, 98662744, 100207494, 100558146, 100755026, 101009561, 101275254, 101348782, 101462933, 101857772, 102924005, 103432276, 103459091, 104062674, 106601819, 106812124, 107065735, 107559314, 108201680, 108455612, 108544389, 108832580, 108995821, 109196083, 110726641, 110780095, 111234020, 111588327, 111656418, 111797176, 111993708, 114376859, 115869547, 117725909, 118032923, 118349683, 119080451, 119224972, 120120112, 120304657, 120830324, 122331341, 122856799, 123439110, 123641276, 123733047, 123733333, 123874242, 123918651, 124253086, 124942182, 125372140, 125464462, 125568385, 126185770, 126305386, 126512513, 126840529, 126844209, 126902118, 127254675, 127510817, 127670247, 128441465, 128498056, 129557176, 129833289, 129875792, 130121559, 130554345, 130556979, 131051006, 131142204, 131480539, 133743564, 133834740, 133984477, 134796953, 135533772, 135986763, 136777096, 137403576, 137864604, 138148591, 139840028, 139974803, 140105753, 145439181, 149105875, 150129836, 150828866, 152675649, 153688731, 155564708, 155599250, 155600699, 156728197, 161246902, 161775170, 162476194, 162914022, 162963943, 164007344, 
164775490, 165339842, 175705079, 176218853, 176944891, 178251502, 178372090, 179732253, 180379187, 181626147, 184044160, 184404217, 186383436, 188468811, 192103014, 192574092, 192754157, 193358322, 195686462, 195868255, 196060419, 199113788, 201588003, 201874243, 201879319, 204796942, 205348530, 206735492, 208308899, 210310963, 210313993, 212124901, 212363133, 212389173, 213573782, 213818877, 214044075, 214989980, 215356671, 215367201, 215646443, 216212563, 216377823, 216490415, 217483041, 217486509, 218429429, 219181483, 219411056, 219971724, 220400698, 220780247, 221398716, 222545237, 223267606, 223906281, 224074580, 224383778, 224995705, 225390819, 227536610, 227829175, 227908020, 227980430, 229421942, 229862241, 230217038, 230227848, 230391553, 230592027, 233836843, 234465556, 235108306, 235217324, 235995802, 236010068, 236359727, 236419142, 236997002, 238069580, 238106585, 238534142, 238567209, 238839802, 239260141, 240214254, 240263848, 240535275, 240680028]
  8489. # df = pd.read_csv('E:\产品单价数量/待预测数据html内容4.csv')
  8490. # print('公告数:', len(df), len(ids))
  8491. # df = df[df['docid'].isin(ids)]
  8492. # ids = []
  8493. # for docid,html in zip(df['docid'],df['dochtmlcon']):
  8494. # product_attr = ProductAttributesPredictor()
  8495. # rs, _ = product_attr.predict(docid='', html=html, page_time="")
  8496. # # print(docid, rs)
  8497. # # print(docid, rs[0]['product_attrs']['header_col'])
  8498. # # print('*'*20)
  8499. # if rs[0]['product_attrs']['header_col'] == []:
  8500. # ids.append(docid)
  8501. # print(docid, rs[0]['product_attrs']['header_col'])
  8502. # print('*' * 20)
  8503. # else:
  8504. # print(docid, rs[0]['product_attrs']['header_col'])
  8505. # print('*' * 20)
  8506. # print(len(ids), ids)
  8507. # role = RoleRulePredictor()
  8508. # labels = []
  8509. # keywords = []
  8510. # # df = pd.read_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果.xlsx')
  8511. # df = pd.read_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果60000-90000.xlsx')
  8512. # columns = ['docid', 'type', 'label', 'value', 'front', 'behind',
  8513. # 'front6', 'entity_text', 'behind6', 'front6_reverse', 'rule_label', 'keyword', 'pos']
  8514. # print(df.columns)
  8515. # df.fillna('', inplace=True)
  8516. # for front, center, behind, entity_text in zip(df['front'], df['entity_text'], df['behind'], df['entity_text']):
  8517. # front = str(front)
  8518. # behind = str(behind)
  8519. # label, _prob, _flag, keyword = role.rule_predict(front, center, behind, entity_text)
  8520. # labels.append(label)
  8521. # keywords.append(keyword)
  8522. # df['rule_label'] = pd.Series(labels)
  8523. # df['keyword'] = pd.Series(keywords)
  8524. # df['front6'] = df['front'].apply(lambda x: str(x)[-6:])
  8525. # df['behind6'] = df['behind'].apply(lambda x: str(x)[:6])
  8526. # df['pos'] = df.apply(lambda x: 1 if x['label']==x['rule_label'] else 0, axis=1)
  8527. # # df.to_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果_rule_predict.xlsx', index=False, columns=columns)
  8528. # df.to_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果60000-90000_rule_predict.xlsx', index=False, columns=columns)
  8529. # print(get_header_line(['环评项目登记号','/','环评批文文号','金环许[2023]126号','环评批文日期']))
  8530. # print(get_header_line(['序号', '项目名称', '建设地点', '建设单位', '环评机构', '项目概况', '主要环境影响及预防或者减轻不良环境影响的对策和措施', '建设单位或地方政府作出的相关环保承诺', '公众反馈意见的联系方式']))