
Update the role model and money model; rework money preprocessing and money linking; add total investment and channel info to the returned results

bidi, 3 years ago
commit 4340c0cbeb

+ 16 - 8
BiddingKG/dl/channel/channel_predictor.py

@@ -11,6 +11,7 @@ import copy
 import tensorflow as tf
 import fool
 import re
+import os
 import time
 
 word_model = getModel_w2v()
@@ -23,7 +24,7 @@ sentence_num = 10
 kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
 
 class DocChannel():
-  def __init__(self, life_model='model/channel.pb', type_model='model/doctype.pb'):
+  def __init__(self, life_model='/model/channel.pb', type_model='/model/doctype.pb'):
     self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
     self.mask, self.mask_title = self.load_life(life_model)
     self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\
@@ -36,7 +37,7 @@ class DocChannel():
   def load_life(self,life_model):
     with tf.Graph().as_default() as graph:
       output_graph_def = graph.as_graph_def()
-      with open(life_model, 'rb') as f:
+      with open(os.path.dirname(__file__)+life_model, 'rb') as f:
         output_graph_def.ParseFromString(f.read())
         tf.import_graph_def(output_graph_def, name='')
         print("%d ops in the final graph" % len(output_graph_def.node))
@@ -55,7 +56,7 @@ class DocChannel():
   def load_type(self,type_model):
     with tf.Graph().as_default() as graph:
       output_graph_def = graph.as_graph_def()
-      with open(type_model, 'rb') as f:
+      with open(os.path.dirname(__file__)+type_model, 'rb') as f:
         output_graph_def.ParseFromString(f.read())
         tf.import_graph_def(output_graph_def, name='')
         print("%d ops in the final graph" % len(output_graph_def.node))
@@ -172,7 +173,6 @@ class DocChannel():
       # words = [it for sen in sen_words for it in sen]
       # segword_content = ' '.join(words)
       segword_title = ' '.join(fool.cut(doctitle)[0])
-
       segword_content = dochtmlcon
       # segword_title = doctitle
 
@@ -217,8 +217,12 @@ class DocChannel():
     else:
       return 0
 
-  def predict(self, title, content):
+  def predict(self, title='', content=''):
     # print('准备预测')
+    if isinstance(content, list):
+      token_l = [it.tokens for it in content]
+      tokens = [it for l in token_l for it in l]
+      content = ' '.join(tokens)
     data_content, data_title = self.predict_process(docid='', doctitle=title, dochtmlcon=content)
     pred = self.type_sess.run(self.type_softmax,
                                     feed_dict={self.type_title:[[embedding_matrix[i] for i in l] for l in data_title],
@@ -241,10 +245,14 @@ class DocChannel():
       prob = pred[0][id]
       if id == 6:
         if self.is_houxuan(''.join([it for it in title if it.isalpha()]), ''.join([it for it in content if it.isalpha()])):
-          return '候选人公示', prob
-      return self.id2life[id], prob
+          # return '候选人公示', prob
+          return [{'docchannel': '候选人公示'}]
+      # return self.id2life[id], prob
+      return [{'docchannel':self.id2life[id]}]
     else:
-      return self.id2type[id], prob
+      # return self.id2type[id], prob
+      return [{'docchannel':self.id2type[id]}]
+
 
   def predict_batch(self, title_content_list):
     # print('准备预测')

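The reworked predict() above now accepts either a plain string or the pipeline's sentence objects (anything exposing a .tokens list), and returns a list of dicts instead of a (label, prob) tuple. A minimal usage sketch under those assumptions; the FakeSentence stand-in is hypothetical and the printed label depends on the model:

from BiddingKG.dl.channel.channel_predictor import DocChannel

channel = DocChannel()  # resolves model/channel.pb and model/doctype.pb next to the module

# Plain-string input
print(channel.predict(title='某项目中标公告', content='经评审,确定中标人为某公司'))
# e.g. [{'docchannel': '中标公告'}]

# Sentence-object input: token lists are flattened and joined with spaces
class FakeSentence:  # hypothetical stand-in for the pipeline's sentence objects
    def __init__(self, tokens):
        self.tokens = tokens

sents = [FakeSentence(['经', '评审']), FakeSentence(['确定', '中标人'])]
print(channel.predict(title='某项目中标公告', content=sents))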
+ 2 - 1
BiddingKG/dl/common/Utils.py

@@ -419,7 +419,8 @@ def getUnifyMoney(money):
     money = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",money)
     result = Decimal(0)
     chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
-    chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","圆","元","角","分"]
+    # chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","圆","元","角","分"]
+    chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾", "圆", "元", "角", "分", '十', '百', '千']
     
     LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$")
     BigMoneypattern = re.compile("^零?(?P<BigMoney>[%s])$"%("".join(chnDigits)))

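The extra factor units matter because the lowercase multipliers 十/百/千 survive the character filter above but previously had no value in chnFactorUnits. A standalone sketch of the conversion idea, assuming simple amounts; this is an illustration, not the repo's getUnifyMoney, and it skips the 亿-over-万 nesting the real function must handle:

from decimal import Decimal

DIGIT = {d: i for i, d in enumerate('零壹贰叁肆伍陆柒捌玖')}
SMALL = {'拾': 10, '十': 10, '佰': 100, '百': 100, '仟': 1000, '千': 1000}
BIG = {'万': 10 ** 4, '萬': 10 ** 4, '亿': 10 ** 8, '兆': 10 ** 12}

def unify(text):
    total = section = digit = Decimal(0)
    for ch in text:
        if ch in DIGIT:
            digit = Decimal(DIGIT[ch])
        elif ch in SMALL:  # a bare 十 means 10, hence the default of 1
            section += (digit if digit else Decimal(1)) * SMALL[ch]
            digit = Decimal(0)
        elif ch in BIG:    # close out the current section (no 亿/万 nesting here)
            total = (total + section + digit) * BIG[ch]
            section = digit = Decimal(0)
        # 元/圆/角/分 and any other characters are ignored in this sketch
    return total + section + digit

print(unify('壹佰贰拾叁万'))  # 1230000
print(unify('十万元'))        # 100000: '十' now counts as a factor unit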
+ 1 - 1
BiddingKG/dl/interface/Entitys.py

@@ -162,7 +162,7 @@ class Entity():
         self.pointer_address = None
         self.pointer_tendereeMoney = None
         self.person_phone = person_phone
-        
+        self.notes = ''  # 2021/7/20 新增,保存金额大小写,单位等备注
         
     def set_Role(self,role_label,role_values):
         self.label = int(role_label)

+ 113 - 45
BiddingKG/dl/interface/Preprocessing.py

@@ -199,7 +199,7 @@ def tableToText(soup):
             '''
             @summary: 计算每个节点受到的挤压度来判断是否需要染色
             '''
-            #print("B",inner_table[index])
+            ## print("B",inner_table[index])
             min_presure = 3
             list_dye = []
             first = None
@@ -260,7 +260,7 @@ def tableToText(soup):
                             dye_set.add((inner_table[index][h][0],dye_type))
                             key_set.add(inner_table[index][h][0])
                     begin = end
-                #print("E",inner_table[index])
+                ## print("E",inner_table[index])
 
 
 
@@ -388,17 +388,17 @@ def tableToText(soup):
         
         for item,values in zip(list_item,list(predict_y)):
             _dict[item] = values[1]
-            # print("##",item,values)
-        #print(_dict)
+            # # print("##",item,values)
+        ## print(_dict)
         for i in range(height):
             for j in range(width):
                 item = inner_table[i][j][0]
                 inner_table[i][j][1] = 1 if _dict[item]>prob_min else (1 if re.search(pat_head,item) is not None and len(item)<8 else 0)
 
-        # print("=====")
+        # # print("=====")
         # for item in inner_table:
-        #     print(item)
-        # print("======")
+        #     # print(item)
+        # # print("======")
         
         repairTable(inner_table)
         head_list = sliceTable(inner_table)
@@ -422,10 +422,10 @@ def tableToText(soup):
                 if re.search(pat_head,_item) is not None and len(item)<8:
                     inner_table[_h][_w][1] = 1
 
-        # print("=====")
+        # # print("=====")
         # for item in inner_table:
-        #     print(item)
-        # print("======")
+        #     # print(item)
+        # # print("======")
 
         repairTable(inner_table)
         head_list = sliceTable(inner_table)
@@ -470,7 +470,7 @@ def tableToText(soup):
                 else:
                     is_head = False
             
-            #print(temp_item,form_prob)
+            ## print(temp_item,form_prob)
             if len(inner_table[i][0][0])>40:
                 is_long_value = True
             if is_head or is_long_value or is_same_value:
@@ -751,12 +751,12 @@ def tableToText(soup):
                                     pack_text += head+cell["text"]+","
                                 elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题,if 改elif
                                     #排名替换为同一种表达
-                                    print("====",head)
+                                    # print("====",head)
                                     rank_text += head+cell["text"]+","
-                                    #print(rank_text)
+                                    ## print(rank_text)
                                 elif re.search(entityPattern,head) is not None:
                                     entity_text += head+cell["text"]+","
-                                    #print(entity_text)
+                                    ## print(entity_text)
                                 else:
                                     if re.search(moneyPattern,head) is not None and entity_text!="":
                                         money_text += head+cell["text"]+","
@@ -788,10 +788,10 @@ def tableToText(soup):
                                 elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题,if 改elif
                                     #排名替换为同一种表达
                                     rank_text += head+cell["text"]+","
-                                    #print(rank_text)
+                                    ## print(rank_text)
                                 elif re.search(entityPattern,head) is not None:
                                     entity_text += head+cell["text"]+","
-                                    #print(entity_text)
+                                    ## print(entity_text)
                                 else:
                                     text_line += head+cell["text"]+","
                                 text_set.add(str(head+cell["text"]))
@@ -862,10 +862,10 @@ def tableToText(soup):
                 #                 elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题,if 改elif
                 #                     #排名替换为同一种表达
                 #                     rank_text += head+inner_table[i][j][0]+","
-                #                     #print(rank_text)
+                #                     ## print(rank_text)
                 #                 elif re.search(entityPattern,head) is not None:
                 #                     entity_text += head+inner_table[i][j][0]+","
-                #                     #print(entity_text)
+                #                     ## print(entity_text)
                 #                 else:
                 #                     text_line += head+inner_table[i][j][0]+","
                 #                 text_set.add(str(head+inner_table[i][j][0]))
@@ -924,10 +924,10 @@ def tableToText(soup):
                 #                     continue
                 #                 if re.search(rankPattern,head) is not None:
                 #                     rank_text += head+inner_table[i][j][0]+","
-                #                     #print(rank_text)
+                #                     ## print(rank_text)
                 #                 elif re.search(entityPattern,head) is not None:
                 #                     entity_text += head+inner_table[i][j][0]+","
-                #                     #print(entity_text)
+                #                     ## print(entity_text)
                 #                 else:
                 #                     text_line += head+inner_table[i][j][0]+","
                 #                 text_set.add(str(head+inner_table[i][j][0]))
@@ -952,22 +952,22 @@ def tableToText(soup):
             #inner_table,head_list = setHead_inline(inner_table)
             inner_table,head_list = setHead_initem(inner_table,pat_head)
             # inner_table,head_list = setHead_incontext(inner_table,pat_head)
-            # print(inner_table)
+            # # print(inner_table)
             # for begin in range(len(head_list[:-1])):
             #     for item in inner_table[head_list[begin]:head_list[begin+1]]:
-            #         print(item)
-            #     print("====")
+            #         # print(item)
+            #     # print("====")
 
             removeFix(inner_table)
             
-            # print("----")
-            # print(head_list)
+            # # print("----")
+            # # print(head_list)
             # for item in inner_table:
-            #     print(item)
+            #     # print(item)
 
 
             tbody.string = getTableText(inner_table,head_list)
-            #print(tbody.string)
+            ## print(tbody.string)
             tbody.name = "turntable"
             return inner_table
         return None
@@ -998,9 +998,9 @@ def tableToText(soup):
 
 #数据清洗
 def segment(soup,final=True):
-    # print("==")
-    # print(soup)
-    # print("====")
+    # # print("==")
+    # # print(soup)
+    # # print("====")
     #segList = ["tr","div","h1", "h2", "h3", "h4", "h5", "h6", "header"]
     subspaceList = ["td",'a',"span","p"]
     if soup.name in subspaceList:
@@ -1223,7 +1223,7 @@ def union_ner(list_ner):
         if i not in union_index_set:
             result_list.append(list_ner[i])
     for item in union_index:
-        #print(str(list_ner[item[0]][3])+str(list_ner[item[1]][3]))
+        ## print(str(list_ner[item[0]][3])+str(list_ner[item[1]][3]))
         result_list.append((list_ner[item[0]][0],list_ner[item[1]][1],'company',str(list_ner[item[0]][3])+str(list_ner[item[1]][3])))
     return result_list
                 
@@ -1358,8 +1358,8 @@ def union_ner(list_ner):
 #                     index = 0
 #                     for i in range(len(all_match)):
 #                         if len(all_match[i][0])>0:
-#                             # print("===",all_match[i])
-#                             #print(all_match[i][0])
+#                             # # print("===",all_match[i])
+#                             ## print(all_match[i][0])
 #                             unit = ""
 #                             entity_text = all_match[i][3]
 #                             if pattern_key in ["key_word","front_m"]:
@@ -1570,6 +1570,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
         #限流执行
         key_nerToken = "nerToken"
         start_time = time.time()
+        found_yeji = 0 # 2021/8/6 增加判断是否正文包含评标结果 及类似业绩判断用于过滤后面的金额
+        # found_pingbiao = False
         ner_entitys_all = getNers(sentences,useselffool=useselffool)
         if key_nerToken not in cost_time:
             cost_time[key_nerToken] = 0
@@ -1627,10 +1629,16 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
             entity_type = "money"
             #money_patten_str = "(([1-9][\d,,]*(?:\.\d+)?[百千万亿]?[\(\)()元整]+)|([零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,})|(?:[¥¥]+,?|报价|标价)[(\(]?([万])?元?[)\)]?[::]?.{,7}?([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]?)|([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]?)[\((]?([万元]{1,2}))*"
 
+            # list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
+            #                       "key_word":"((?P<text_key_word>(?:[¥¥]+,?|[单报标限]价|金额|价格|标的基本情况|CNY|成交结果:)(?:[,(\(]*\s*(?P<unit_key_word_before>[万元]*(?P<filter_unit2>[台个只]*))\s*[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,8}?))(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)(?:[(\(]?(?P<filter_>[%])*\s*(?P<unit_key_word_behind>[万元]*(?P<filter_unit1>[台个只]*))\s*[)\)]?))",
+            #                       "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万元]+)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())",
+            #                       "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只]*))[\))]?)"}
             list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                                  "key_word":"((?P<text_key_word>(?:[¥¥]+,?|[单报标限]价|金额|价格|标的基本情况|CNY|成交结果:)(?:[,(\(]*\s*(?P<unit_key_word_before>[万元]*(?P<filter_unit2>[台个只]*))\s*[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,8}?))(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)(?:[(\(]?(?P<filter_>[%])*\s*(?P<unit_key_word_behind>[万元]*(?P<filter_unit1>[台个只]*))\s*[)\)]?))",
+                                  "key_word":"((?P<text_key_word>(?:[¥¥]+,?|[单报标限]价|金额|成交报?价|价格|标的基本情况|CNY|成交结果:)(?:[,(\(]*\s*(?P<unit_key_word_before>[万元]*(?P<filter_unit2>[台个只]*))\s*(/?费率)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号]{,8}?))(第[123一二三]名[::])?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万元]*(?P<filter_unit1>[台个只吨斤棵株页亩方条米]*))\s*[)\)]?))",
                                   "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万元]+)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())",
                                   "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万元]+)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())",
-                                  "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只]*))[\))]?)"}
+                                  "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
+            # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。
+
             pattern_money = re.compile("%s|%s|%s|%s"%(list_money_pattern["cn"],list_money_pattern["key_word"],list_money_pattern["behind_m"],list_money_pattern["front_m"]))
             set_begin = set()
             # for pattern_key in list_money_pattern.keys():
@@ -1641,8 +1649,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 # index = 0
                 # for i in range(len(all_match)):
                 #     if len(all_match[i][0])>0:
-                #         print("===",all_match[i])
-                #         #print(all_match[i][0])
+                #         # print("===",all_match[i])
+                #         ## print(all_match[i][0])
                 #         unit = ""
                 #         entity_text = all_match[i][3]
                 #         if pattern_key in ["key_word","front_m"]:
@@ -1689,18 +1697,32 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 #
                 #     else:
                 #         index += 1
-            all_match = re.finditer(pattern_money, sentence_text)
+
+            # if re.search('评标结果|候选人公示', sentence_text):
+            #     found_pingbiao = True
+            if re.search('业绩', sentence_text):
+                found_yeji += 1
+            if found_yeji >= 2: # 过滤掉业绩后面的所有金额
+                all_match = []
+            else:
+                all_match = re.finditer(pattern_money, sentence_text)
             index = 0
             for _match in all_match:
                 if len(_match.group())>0:
                     # print("===",_match.group())
-                    # print(_match.groupdict())
+                    # # print(_match.groupdict())
+                    notes = ''  # 2021/7/20 新增备注金额大写或金额单位 if 金额大写 notes=大写 elif 单位 notes=单位
                     unit = ""
                     entity_text = ""
                     text_beforeMoney = ""
                     filter = ""
                     filter_unit = False
                     notSure = False
+                    if re.search('业绩', sentence_text[:_match.span()[0]]):  # 2021/7/21过滤掉业绩后面金额
+                        # print('金额在业绩后面: ', _match.group(0))
+                        found_yeji += 1
+                        break
+
                     for k,v in _match.groupdict().items():
                         if v!="" and v is not None:
                             if k=='text_key_word':
@@ -1715,8 +1737,33 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                                 filter = v
                             if re.search("filter_unit",k) is not None:
                                 filter_unit = True
+
+                    if re.search('(^\d{2,},\d{4,}万?$)|(^\d{2,},\d{2}万?$)', entity_text.strip()):  # 2021/7/19 修正OCR识别小数点为逗号
+                        if re.search('[幢栋号楼层]', sentence_text[_match.span()[0]-2:_match.span()[0]]):
+                            entity_text = re.sub('\d+,', '', entity_text)
+                        else:
+                            entity_text = entity_text.replace(',', '.')
+                        # print(' 修正OCR识别小数点为逗号')
+
                     if entity_text.find("元")>=0:
                         unit = ""
+                    if unit == "":  #2021/7/21 有明显金额特征的补充单位,避免被过滤
+                        if ('¥' in text_beforeMoney or '¥' in text_beforeMoney):
+                            unit = '元'
+                            # print('明显金额特征补充单位 元')
+                        elif re.search('[单报标限]价|金额|价格[::]+$', text_beforeMoney.strip()) and \
+                                re.search('\d{5,}',entity_text) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}',entity_text)==None:
+                            unit = '元'
+                            # print('明显金额特征补充单位 元')
+                        elif re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7})$)|(^\d{,3}(,\d{3})+$)',entity_text):
+                            unit = '元'
+                            # print('明显金额特征补充单位 元')
+                    if unit.find("万") >= 0 and entity_text.find("万") >= 0:  #2021/7/19修改为金额文本有万,不计算单位
+                        # print('修正金额及单位都有万, 金额:',entity_text, '单位:',unit)
+                        unit = "元"
+                    if re.search('.*万元万元', entity_text):  #2021/7/19 修正两个万元
+                        # print(' 修正两个万元',entity_text)
+                        entity_text = entity_text.replace('万元万元','万元')
                     else:
                         if filter_unit:
                             continue
@@ -1742,15 +1789,36 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                     entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
 
                     entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",entity_text)
+                    # print('转换前金额:', entity_text, '单位:', unit)
+                    if re.search('总投资', sentence_text[_match.span()[0] - 6:_match.span()[0]]):  # 2021/8/5过滤掉总投资金额
+                        # print('总投资金额: ', _match.group(0))
+                        notes = '总投资'
+                    if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆]', entity_text) != None:
+                        notes = '大写'
+                        # print("补充备注:notes = 大写")
+                    elif re.search('单价', sentence_text[_match.span()[0]:_match.span()[1]]):
+                        notes = '单价'
+                        # print("补充备注:单价 ",sentence_text[_match.span()[0]-2:_match.span()[1]])
                     if len(unit)>0:
-                        entity_text = str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0]))
+                        if unit.find('万')>=0 and len(entity_text.split('.')[0])>=8: # 2021/7/19 修正万元金额过大的情况
+                            # print('修正单位万元金额过大的情况 金额:', entity_text, '单位:', unit)
+                            entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(unit[0])/10000)
+                        else:
+                            # print('str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0])):')
+                            entity_text = str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0]))
                     else:
-                        entity_text = str(getUnifyMoney(entity_text))
+                        if entity_text.find('万')>=0 and entity_text.split('.')[0].isdigit() and len(entity_text.split('.')[0])>=8:
+                            entity_text = str(getUnifyMoney(entity_text)/10000)
+                            # print('修正金额字段含万 过大的情况')
+                        else:
+                            entity_text = str(getUnifyMoney(entity_text))
 
                     if float(entity_text)<100 or float(entity_text)>100000000000:
+                        # print('过滤掉金额:float(entity_text)<100 or float(entity_text)>100000000000', entity_text, unit)
                         continue
 
                     if notSure and unit=="" and float(entity_text)>100*10000:
+                        # print('过滤掉金额 notSure and unit=="" and float(entity_text)>100*10000:', entity_text, unit)
                         continue
 
                     _exists = False
@@ -1762,7 +1830,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                     if not _exists:
                         if float(entity_text)>1:
                             list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,begin_index_temp,end_index_temp))
-
+                            list_sentence_entitys[-1].notes = notes  # 2021/7/20 新增金额备注
                 else:
                     index += 1
 
@@ -1824,7 +1892,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 end_index = changeIndexFromWordToWords(tokens, end_index_temp)
                 if begin_index is None or end_index is None:
                     continue
-                print(begin_index_temp,end_index_temp,begin_index,end_index)
+                # print(begin_index_temp,end_index_temp,begin_index,end_index)
                 entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                 entity_text = bidway['body']
                 list_sentence_entitys.append(
@@ -1949,7 +2017,7 @@ def getPredictTable():
             df_data["docid"].append(item["docid"])
             df_data["json_table"].append(item["json_table"])
         except Exception as e:
-            print(e)
+            # print(e)
             break
     df_1 = pd.DataFrame(df_data)
     df_1.to_csv("../form/websource_67000_table.csv",columns=["docid","json_table"])
@@ -1965,7 +2033,7 @@ if __name__=="__main__":
             f.write(segment(tableToText(BeautifulSoup(content,"lxml"))))
     '''
     # content = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
-    # print(segment(tableToText(BeautifulSoup(content,"lxml"))))
+    # # print(segment(tableToText(BeautifulSoup(content,"lxml"))))
     getPredictTable()

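Two of the changes above can be exercised in isolation: the widened behind_m alternative (extra filter units such as 吨/亩/米) and the OCR decimal-comma repair. A sketch using the regexes quoted in this diff; the sample strings are made up, and the full pattern_money in the repo combines four alternatives:

import re

# "behind_m" alternative from the updated list_money_pattern above
behind_m = re.compile("(()()(?P<money_behind_m>[0-9][\\d,,]*(?:\\.\\d+)?(?:,?)[百千万亿]*)"
                      "[\\((]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\\))]?)")
m = behind_m.search("预算金额为1,200万元(不含税)")
print(m.group('money_behind_m'), m.group('unit_behind_m'))  # 1,200万 元

# OCR comma repair: '19,20万' is assumed to have been '19.20万'
amount = '19,20万'
if re.search('(^\\d{2,},\\d{4,}万?$)|(^\\d{2,},\\d{2}万?$)', amount):
    amount = amount.replace(',', '.')
print(amount)  # 19.20万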
+ 15 - 1
BiddingKG/dl/interface/extract.py

@@ -91,8 +91,13 @@ def predict(doc_id,text,title=""):
     list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
     cost_time["punish"] = time.time()-start_time
 
+    start_time = time.time()
+    list_channel_dic = predictor.getPredictor("channel").predict(title=title, content=list_sentences[0])
+    cost_time["channel"] = time.time()-start_time
+
     #print(prem)
-    data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
+    # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
+    data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
     data_res["cost_time"] = cost_time
     data_res["success"] = True
 
@@ -122,4 +127,13 @@ def test(name,content):
 
 
 if __name__=="__main__":
+    import pandas as pd
+    df = pd.read_excel('G:\公告金额/170角色金额原模型预测错误数据_new3为新预测中标金额_predict0812.xlsx')
+    # for i in range(50):
+    i = 246
+    doc_id = df.loc[i, 'docid']
+    text = df.loc[i, 'dochtmlcon']
+    title = df.loc[i, 'doctitle']
+    rs = predict(doc_id,text,title)
+    print(rs)
     pass

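With the channel predictor merged in, the dict returned by predict() gains a docchannel field alongside the existing code/prem/punish results. A hypothetical shape, with only the keys visible in this diff spelled out and union_result assumed to merge the per-predictor dicts:

data_res = {
    # ...fields merged from the codeName, prem and punish predictors...
    'docchannel': '中标公告',                         # newly merged channel label
    'cost_time': {'channel': 0.05, 'punish': 0.12},  # per-stage timings, now incl. channel
    'success': True,
}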
+ 112 - 37
BiddingKG/dl/interface/getAttributes.py

@@ -2,6 +2,7 @@
 
 from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date
 from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
+from decimal import Decimal
 import re
 import copy
 import math
@@ -240,11 +241,11 @@ def get_legal_comba(list_entity,dict_role_combination):
         # recursive_package(dict_role_combination[packageName], set(), {}, _list_all_selution)
         _list_all_selution = circle_package(dict_role_combination[packageName])
         '''
-        print("===1")
-        print(packageName)
+        # print("===1")
+        # print(packageName)
         for item in _list_all_selution:
-            print(item)
-        print("===2")
+            # print(item)
+        # print("===2")
         '''
         #去除包含子集
         list_all_selution_simple = []
@@ -759,8 +760,16 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
     def addMoneyByEntity(packDict,packageName,entity,money,money_prob):
         for i in range(len(packDict[packageName]["roleList"])):
             if packDict[packageName]["roleList"][i].entity_text==entity:
-                if money_prob>packDict[packageName]["roleList"][i].money_prob:
-                    packDict[packageName]["roleList"][i].money = money
+                # if money_prob>packDict[packageName]["roleList"][i].money_prob:
+                #     packDict[packageName]["roleList"][i].money = money
+                #     packDict[packageName]["roleList"][i].money_prob = money_prob
+                if packDict[packageName]["roleList"][i].money_prob==0 :  # 2021/7/20第一次更新金额
+                    packDict[packageName]["roleList"][i].money = money.entity_text
+                    packDict[packageName]["roleList"][i].money_prob = money_prob
+                elif money_prob>packDict[packageName]["roleList"][i].money_prob+0.2 or money.notes in ['大写']: # 2021/7/20改为优先选择大写金额,
+                    # print('已连接金额概率:money_prob:',packDict[packageName]["roleList"][i].money_prob)
+                    # print('链接金额备注 ',money.notes, money.entity_text, money.values)
+                    packDict[packageName]["roleList"][i].money = money.entity_text
                     packDict[packageName]["roleList"][i].money_prob = money_prob
         return packDict
     
@@ -778,6 +787,15 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                 return True
     
     p_entity = 0
+
+    # 2021/7/19 顺序比较金额,前面是后面的一万倍则把前面金额/10000
+    money_list = [it for it in list_entity if it.entity_type=="money"]
+    for i in range(len(money_list)-1):
+        for j in range(1, len(money_list)):
+            if (float(money_list[i].entity_text) > 5000000000 or money_list[j].notes=='大写') and \
+                    Decimal(money_list[i].entity_text)/Decimal(money_list[j].entity_text)==10000:
+                money_list[i].entity_text = str(Decimal(money_list[i].entity_text)/10000)
+                # print('连接前修改大于50亿金额:前面是后面的一万倍则把前面金额/10000')
     
     #遍历所有实体
     while(p_entity<len(list_entity)):
@@ -818,28 +836,29 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                                 if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
                                     _entity.pointer_person = entity
     '''
-        #金额往前找实体
-        if entity.entity_type=="money":
-            if entity.values[entity.label]>=on_value:
-                p_entity_money= p_entity
-                entity_money = list_entity[p_entity_money]
-                if len(PackageSet)>0:
-                    packagePointer,_ = getPackage(PackageList,entity_money.sentence_index,entity_money.begin_index,"money-"+str(entity_money.entity_text)+"-"+str(entity_money.label))
-                    if packagePointer is None:
-                        packageName_entity = "Project"
-                    else:
-                        packageName_entity = packagePointer.entity_text
-                else:
-                    packageName_entity = "Project"
-                while(p_entity_money>0):
-                    entity_before = list_entity[p_entity_money]
-                    if entity_before.entity_type in ['org','company']:
-                        if str(entity_before.label)=="1":
-                            addMoneyByEntity(PackDict, packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label])
-                            #add pointer_money
-                            entity_before.pointer_money = entity_money
-                        break
-                    p_entity_money -= 1
+        # #金额往前找实体
+        # if entity.entity_type=="money":
+        #     if entity.values[entity.label]>=on_value:
+        #         p_entity_money= p_entity
+        #         entity_money = list_entity[p_entity_money]
+        #         if len(PackageSet)>0:
+        #             packagePointer,_ = getPackage(PackageList,entity_money.sentence_index,entity_money.begin_index,"money-"+str(entity_money.entity_text)+"-"+str(entity_money.label))
+        #             if packagePointer is None:
+        #                 packageName_entity = "Project"
+        #             else:
+        #                 packageName_entity = packagePointer.entity_text
+        #         else:
+        #             packageName_entity = "Project"
+        #         while(p_entity_money>0):
+        #             entity_before = list_entity[p_entity_money]
+        #             if entity_before.entity_type in ['org','company']:
+        #                 if str(entity_before.label)=="1":
+        #                     addMoneyByEntity(PackDict, packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label])
+        #                     #add pointer_money
+        #                     entity_before.pointer_money = entity_money
+        #                 break
+        #             p_entity_money -= 1
+
 
 
         #如果实体属于角色集合,则往后找属性
@@ -882,9 +901,23 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                                 else:
                                     packageName_entity = "Project"
                                 if str(entity.label) in ["2","3","4"]:
-                                    addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
+                                    # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
+                                    if entity_after.notes == '单价':
+                                        addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
+                                                         0.5)
+                                        entity.pointer_money = entity_after
+                                        # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
+                                    else:
+                                        addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
+                                                         entity_after.values[entity_after.label])
+                                        entity.pointer_money = entity_after
+                                        # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
+                                        break # 2021/7/16 新增,找到中标金额,非单价即停止,不再往后找金额
                                     #add pointer_money
                                     #add pointer_money
-                                    entity.pointer_money = entity_after
+                                    # entity.pointer_money = entity_after
+                                    # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
+                                    # if entity_after.notes!='单价':
+                                    #     break  # 2021/7/16 新增,找到中标金额即停止,不再往后找金额
                         '''
                     if entity_after.entity_type=="person":
                         if entity_after.values[entity_after.label]>=on_value_person:
@@ -1034,6 +1067,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
     p_entity = len(list_entity)-1
 
     set_tenderer_money = set()
+    list_tenderer_money = []  #2021/7/16 新增列表,倒序保存所有中标金额
+
     #遍历所有实体
     while(p_entity>=0):
         entity = list_entity[p_entity]
@@ -1041,7 +1076,9 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
             if entity.values[entity.label]>=on_value:
                 if str(entity.label)=="1":
                     set_tenderer_money.add(float(entity.entity_text))
-                if str(entity.label)=="0":
+                    list_tenderer_money.append(float(entity.entity_text))  # 2021/7/16 新增列表,倒序保存所有中标金额
+                # if str(entity.label)=="0":
+                if str(entity.label)=="0" and entity.notes!='总投资':
                     '''
                     if p_entity>0:
                         p_before = list_entity[p_entity-1]
@@ -1056,8 +1093,10 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                         packageName = packagePointer.entity_text
                         
                     if packageName=="Project":
-                        if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
-                            PackDict["Project"]["tendereeMoney"] = float(entity.entity_text) 
+                        # if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
+                        #     PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
+                        if entity.values[entity.label]>on_value:
+                            PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
                     else:
                         PackDict[packageName]["tendereeMoney"] = float(entity.entity_text)
                         #add pointer_tendereeMoney
@@ -1091,6 +1130,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
     #只找到一个中标人和中标金额
     if len(set_tenderer_money)==1 and len(set_tenderer_role)==1:
         list(set_tenderer_role)[0].money = list(set_tenderer_money)[0]
+        # print('一个中标人一个金额:', list(set_tenderer_money)[0])
     #找到一个中标人和多个招标金额
     if len(set_tenderer_money)>1 and len(set_tenderer_role)==1:
         _maxMoney = 0
@@ -1101,8 +1141,15 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                 _maxMoney = _m
         if _sumMoney/_maxMoney==2:
             list(set_tenderer_role)[0].money = _maxMoney
+            # print('一人多金额分项合计 取最大金额:', _maxMoney)
         else:
-            list(set_tenderer_role)[0].money = _maxMoney
+            # list(set_tenderer_role)[0].money = _maxMoney
+            if min(list_tenderer_money)>200000 and list_tenderer_money[-1]/min(list_tenderer_money)>9000:
+                list(set_tenderer_role)[0].money = min(list_tenderer_money)
+                # print('一人多金额 且最小的大于20万第一个金额比最小金额大几千倍的最小中标金额:', min(list_tenderer_money))
+            else:
+                list(set_tenderer_role)[0].money = list_tenderer_money[-1]  # 2021/7/16 修改 不是单价合计方式取第一个中标金额
+                # print('一人多金额 取第一个中标金额:', list_tenderer_money[-1])
     #每个包都只找到一个金额
     _flag_pack_money = True
     for k,v in dict_pack_tenderer_money.items():
@@ -1111,6 +1158,31 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
     if _flag_pack_money and len(PackageSet)==len(dict_pack_tenderer_money.keys()):
         for k,v in dict_pack_tenderer_money.items():
             v[0].money = list(v[1])[0]
+            # print('k,v in dict_pack_tenderer_money.items', k, v)
+
+    # 2021/7/16 #增加判断中标金额是否远大于招标金额逻辑
+    for pack in PackDict.keys():
+        for i in range(len(PackDict[pack]["roleList"])):
+            if PackDict[pack]["tendereeMoney"] > 0:
+                # print('金额数据类型:',type(PackDict[pack]["roleList"][i].money))
+                if float(PackDict[pack]["roleList"][i].money) >10000000 and \
+                        float(PackDict[pack]["roleList"][i].money)/float(PackDict[pack]["tendereeMoney"])>=1000:
+                    PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
+                    # print('招标金额校正中标金额')
+
+    # 2021/7/19 #增加判断中标金额是否远大于第二三中标金额
+    for pack in PackDict.keys():
+        tmp_moneys = []
+        for i in range(len(PackDict[pack]["roleList"])):
+            if float(PackDict[pack]["roleList"][i].money) >100000:
+                tmp_moneys.append(float(PackDict[pack]["roleList"][i].money))
+        if len(tmp_moneys)>2 and max(tmp_moneys)/min(tmp_moneys)>1000:
+            for i in range(len(PackDict[pack]["roleList"])):
+                if float(PackDict[pack]["roleList"][i].money)/min(tmp_moneys)>1000:
+                    PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
+                    # print('通过其他中标人投标金额校正中标金额')
+
+
     for pack in PackDict.keys():
         for i in range(len(PackDict[pack]["roleList"])):
             PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
@@ -1148,7 +1220,7 @@ def getPackageRoleMoney(list_sentence,list_entity):
     RoleList,RoleSet,PackageList,PackageSet = theRole
     '''
     for item in PackageList:
-        print(item)
+        # print(item)
     '''
     # print("=2")
     PackDict = initPackageAttr(RoleList, PackageSet)
@@ -1185,7 +1257,8 @@ def getOtherAttributes(list_entity):
                   "time_bidopen":"",
                   "time_bidopen":"",
                   "time_bidclose":"",
                   "time_bidclose":"",
                   "serviceTime":"",
                   "serviceTime":"",
-                  "product":[]}
+                  "product":[],
+                  "total_tendereeMoney":0}
     for entity in list_entity:
         if entity.entity_type == 'bidway':
             dict_other["bidway"] = turnBidWay(entity.entity_text)
@@ -1203,6 +1276,8 @@ def getOtherAttributes(list_entity):
             dict_other["person_review"].append(entity.entity_text)
         elif entity.entity_type=='product':
             dict_other["product"].append(entity.entity_text)
+        elif entity.entity_type=='money' and entity.notes=='总投资' and dict_other["total_tendereeMoney"]<float(entity.entity_text):
+            dict_other["total_tendereeMoney"] = float(entity.entity_text)
     dict_other["product"] = list(set(dict_other["product"]))
     dict_other["product"] = list(set(dict_other["product"]))
     return dict_other
     return dict_other
 
 
@@ -1241,7 +1316,7 @@ if __name__=="__main__":
     for row in rows:
         
         count += 1
-        print(count)
+        # print(count)
         doc_id = row[0]
         
         roleList = getPackageRoleMoney(doc_id)
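The two correction passes added above both guard against 元/万元 unit mix-ups: an award amount at least 1000x the tender budget (or 1000x a co-winner's bid) is assumed to have been parsed in yuan where wan-yuan was meant, and is divided by 10000. A minimal standalone sketch of that heuristic, using plain floats instead of the repo's role objects (the function and variable names below are illustrative only):

def correct_award_money(award, tenderee_money):
    # Assumed heuristic from the diff above: an award over 10,000,000 that is
    # also >=1000x the budget was likely parsed in yuan instead of wan-yuan.
    if tenderee_money > 0 and award > 10000000 and award / tenderee_money >= 1000:
        return award / 10000
    return award

assert correct_award_money(50000000.0, 5000.0) == 5000.0  # 5000万 mis-parsed as 元
assert correct_award_money(4800.0, 5000.0) == 4800.0      # plausible value kept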

+ 4 - 3
BiddingKG/dl/interface/modelFactory.py

@@ -9,7 +9,7 @@ import sys
 sys.path.append(os.path.abspath("../.."))
 from keras import models
 from keras import layers
-from keras_contrib.layers import CRF
+# from keras_contrib.layers import CRF
 from keras.preprocessing.sequence import pad_sequences
 from keras import optimizers,losses,metrics
 from BiddingKG.dl.common.Utils import *
@@ -77,7 +77,7 @@ class Model_role_classify_word():
     def encode(self,tokens,begin_index,end_index,entity_text,**kwargs):
         _span = spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=12,center_include=True,word_flag=True,text=entity_text)
         # print(_span)
-        _encode_span = encodeInput(_span, word_len=50, word_flag=True,userFool=False)
+        _encode_span = encodeInput(_span, word_len=20, word_flag=True,userFool=False)
         # print(_encode_span)
         return _encode_span
     
@@ -109,6 +109,7 @@ class Model_money_classify():
         with self.sess_money.as_default() as sess:
           with sess.graph.as_default():
             meta_graph_def = tf.saved_model.loader.load(sess,tags=["serve"],export_dir=os.path.dirname(__file__)+"/money_savedmodel")
+            # meta_graph_def = tf.saved_model.loader.load(sess,tags=["serve"],export_dir=os.path.dirname(__file__)+"/money_savedmodel_bilstmonly")
             signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
             signature_def = meta_graph_def.signature_def
             
@@ -135,7 +136,7 @@ class Model_money_classify():
     def encode(self,tokens,begin_index,end_index,**kwargs):
         _span = spanWindow(tokens=tokens, begin_index=begin_index, end_index=end_index, size=10, center_include=True, word_flag=True)
         # print(_span)
-        return encodeInput(_span, word_len=50, word_flag=True,userFool=False)
+        return encodeInput(_span, word_len=30, word_flag=True,userFool=False)
         return embedding_word(_span,shape=(3,100,60))
     
     def predict(self,x):
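The word_len changes above (50 → 20 for the role classifier, 50 → 30 for the money classifier) shrink the fixed-length id sequences fed to the models. A sketch of the truncate-or-pad contract that word_len controls; pad_to is a hypothetical stand-in for the repo's encodeInput, shown only to make the shape behaviour concrete:

def pad_to(ids, word_len):
    # Truncate long spans and zero-pad short ones to exactly word_len ids.
    return ids[:word_len] + [0] * max(0, word_len - len(ids))

assert pad_to([3, 1, 4, 1, 5], 3) == [3, 1, 4]
assert pad_to([3, 1], 4) == [3, 1, 0, 0]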

BIN
BiddingKG/dl/interface/money_savedmodel/saved_model.pb


BIN
BiddingKG/dl/interface/money_savedmodel/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/interface/money_savedmodel/variables/variables.index


+ 219 - 31
BiddingKG/dl/interface/predictor.py

@@ -6,6 +6,7 @@ Created on 2018年12月26日
 
 import os
 import sys
+import fool
 sys.path.append(os.path.abspath("../.."))
 # from keras.engine import topology
 # from keras import models
@@ -28,7 +29,8 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
                   "form":{"predictor":None,"Lock":RLock()},
                   "form":{"predictor":None,"Lock":RLock()},
                   "time":{"predictor":None,"Lock":RLock()},
                   "time":{"predictor":None,"Lock":RLock()},
                   "punish":{"predictor":None,"Lock":RLock()},
                   "punish":{"predictor":None,"Lock":RLock()},
-                  "product":{"predictor":None,"Lock":RLock()}}
+                  "product":{"predictor":None,"Lock":RLock()},
+                  "channel": {"predictor": None, "Lock": RLock()}}
 
 
 def getPredictor(_type):
@@ -51,6 +53,8 @@ def getPredictor(_type):
                     dict_predictor[_type]["predictor"] = Punish_Extract()
                     dict_predictor[_type]["predictor"] = Punish_Extract()
                 if _type=="product":
                 if _type=="product":
                     dict_predictor[_type]["predictor"] = ProductPredictor()
                     dict_predictor[_type]["predictor"] = ProductPredictor()
+                if _type == "channel":
+                    dict_predictor[_type]["predictor"] = DocChannel()
             return dict_predictor[_type]["predictor"]
     raise NameError("no this type of predictor")
 
@@ -1286,13 +1290,189 @@ class ProductPredictor():
                                                  wordOffset_end=end)
                                 list_entity.append(_entity)
                                 temp_list.append(sentence.sentence_text[start:end])
-                        item["product"] = list(set(temp_list))
-                        result.append(item)
+                        # item["product"] = list(set(temp_list))
+                        # result.append(item)
                         if _begin_index+_LEN >= len(list_sentence):
                             break
                         _begin_index += _LEN
+                    item["product"] = list(set(temp_list))
+                    result.append(item) # bug fix: append once per item, after the sentence loop
                 return result
 
+# docchannel category extraction
+class DocChannel():
+  def __init__(self, life_model='/channel_savedmodel/channel.pb', type_model='/channel_savedmodel/doctype.pb'):
+    self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
+    self.mask, self.mask_title = self.load_life(life_model)
+    self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\
+    self.type_mask, self.type_mask_title = self.load_type(type_model)
+    self.sequen_len = 200  # 150 200
+    self.title_len = 30
+    self.sentence_num = 10
+    self.kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
+
+    lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
+    lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+    self.id2type = {k: v for k, v in enumerate(lb_type)}
+    self.id2life = {k: v for k, v in enumerate(lb_life)}
+
+  def load_life(self,life_model):
+    with tf.Graph().as_default() as graph:
+      output_graph_def = graph.as_graph_def()
+      with open(os.path.dirname(__file__)+life_model, 'rb') as f:
+        output_graph_def.ParseFromString(f.read())
+        tf.import_graph_def(output_graph_def, name='')
+        print("%d ops in the final graph" % len(output_graph_def.node))
+        del output_graph_def
+        sess = tf.Session(graph=graph)
+        sess.run(tf.global_variables_initializer())
+        inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+        prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+        title = sess.graph.get_tensor_by_name('inputs/title:0')
+        mask = sess.graph.get_tensor_by_name('inputs/mask:0')
+        mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
+        # logit = sess.graph.get_tensor_by_name('output/logit:0')
+        softmax = sess.graph.get_tensor_by_name('output/softmax:0')
+        return sess, title, inputs, prob, softmax, mask, mask_title
+
+  def load_type(self,type_model):
+    with tf.Graph().as_default() as graph:
+      output_graph_def = graph.as_graph_def()
+      with open(os.path.dirname(__file__)+type_model, 'rb') as f:
+        output_graph_def.ParseFromString(f.read())
+        tf.import_graph_def(output_graph_def, name='')
+        print("%d ops in the final graph" % len(output_graph_def.node))
+        del output_graph_def
+        sess = tf.Session(graph=graph)
+        sess.run(tf.global_variables_initializer())
+        inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+        prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+        title = sess.graph.get_tensor_by_name('inputs/title:0')
+        mask = sess.graph.get_tensor_by_name('inputs/mask:0')
+        mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
+        # logit = sess.graph.get_tensor_by_name('output/logit:0')
+        softmax = sess.graph.get_tensor_by_name('output/softmax:0')
+        return sess, title, inputs, prob, softmax, mask, mask_title
+
+  def predict_process(self, docid='', doctitle='', dochtmlcon=''):
+    # print('准备预处理')
+    def get_kw_senten(s, span=10):
+      doc_sens = []
+      tmp = 0
+      num = 0
+      end_idx = 0
+      for it in re.finditer(self.kws, s):  # '|'.join(keywordset)
+        left = s[end_idx:it.end()].split()
+        right = s[it.end():].split()
+        tmp_seg = s[tmp:it.start()].split()
+        if len(tmp_seg) > span or tmp == 0:
+          doc_sens.append(' '.join(left[-span:] + right[:span]))
+          end_idx = it.end() + 1 + len(' '.join(right[:span]))
+          tmp = it.end()
+          num += 1
+          if num >= self.sentence_num:
+            break
+      if doc_sens == []:
+        doc_sens.append(s)
+      return doc_sens
+
+    def word2id(wordlist, max_len=self.sequen_len):
+      ids = [getIndexOfWords(w) for w in wordlist]
+      ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
+      assert len(ids) == max_len
+      return ids
+
+    cost_time = dict()
+    datas = []
+    datas_title = []
+    try:
+      segword_title = ' '.join(fool.cut(doctitle)[0])
+      segword_content = dochtmlcon
+    except:
+      segword_content = ''
+      segword_title = ''
+    if isinstance(segword_content, float):
+      segword_content = ''
+    if isinstance(segword_title, float):
+      segword_title = ''
+    segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
+      replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
+      replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
+    segword_title = re.sub('[^\s\u4e00-\u9fa5]', '', segword_title)
+    segword_content = re.sub('[^\s\u4e00-\u9fa5]', '', segword_content)
+    doc_word_list = segword_content.split()
+    if len(doc_word_list) > self.sequen_len / 2:
+      doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
+      doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
+    else:
+      doc_sens = ' '.join(doc_word_list[:self.sequen_len])
+    datas.append(doc_sens.split())
+    datas_title.append(segword_title.split())
+    # print('完成预处理')
+    return datas, datas_title
+
+  def is_houxuan(self, title, content):
+    '''
+    Judge from the title and the Chinese body text whether the notice belongs to the winning-candidate publicity (候选人公示) category
+    :param title: notice title
+    :param content: notice body text
+    :return: 1 if it is a winning-candidate publicity; 0 otherwise
+    '''
+    if re.search('候选人的?公示|评标结果|评审结果|中标公示', title):  # (中标|成交|中选|入围)
+      if re.search('变更公告|更正公告|废标|终止|答疑|澄清', title):
+        return 0
+      return 1
+    if re.search('候选人的?公示', content[:100]):
+      if re.search('公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清', content[:100]):
+        return 0
+      return 1
+    else:
+      return 0
+
+  def predict(self, title='', content=''):
+    # print('准备预测')
+    if isinstance(content, list):
+      token_l = [it.tokens for it in content]
+      tokens = [it for l in token_l for it in l]
+      content = ' '.join(tokens)
+
+    data_content, data_title = self.predict_process(docid='', doctitle=title[:50], dochtmlcon=content) # title capped at 50 characters
+    text_len = len(data_content[0]) if len(data_content[0])<self.sequen_len else self.sequen_len
+    title_len = len(data_title[0]) if len(data_title[0])<self.title_len else self.title_len
+
+    array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128))
+    array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128))
+    pred = self.type_sess.run(self.type_softmax,
+                                    feed_dict={
+                                              self.type_title: array_title,
+                                              self.type_content: array_content,
+                                              self.type_mask:[[0]*text_len+[1]*(self.sequen_len-text_len)],
+                                              self.type_mask_title:[[0]*title_len+[1]*(self.title_len-title_len)],
+                                              self.type_prob:1}
+                            )
+    id = np.argmax(pred, axis=1)[0]
+    prob = pred[0][id]
+    if id == 0:
+      pred = self.lift_sess.run(self.lift_softmax,
+                                      feed_dict={
+                                                self.lift_title: array_title,
+                                                self.lift_content: array_content,
+                                                self.mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
+                                                self.mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
+                                                self.lift_prob:1}
+                              )
+      id = np.argmax(pred, axis=1)[0]
+      prob = pred[0][id]
+      if id == 6:
+        if self.is_houxuan(''.join([it for it in title if it.isalpha()]), ''.join([it for it in content if it.isalpha()])):
+          # return '候选人公示', prob
+          return [{'docchannel': '候选人公示'}]
+      # return self.id2life[id], prob
+      return [{'docchannel':self.id2life[id]}]
+    else:
+      # return self.id2type[id], prob
+      return [{'docchannel':self.id2type[id]}]
+
 def getSavedModel():
     #predictor = FormPredictor()
     graph = tf.Graph()
@@ -1493,20 +1673,28 @@ def save_role_model():
                                    )
     
 def save_money_model():
-    model_money = PREMPredict().model_money
-    with model_money.graph.as_default():
-        model = model_money.getModel()
-        sess = tf.Session(graph=model_money.graph)
-        model.summary()
-        sess.run(tf.global_variables_initializer())
-        h5_to_graph(sess, model_money.graph, model_money.model_money_file)
-        tf.saved_model.simple_save(sess,
-                                   "./money_savedmodel/",
-                                   inputs = {"input0":model.input[0],
-                                             "input1":model.input[1],
-                                             "input2":model.input[2]},
-                                   outputs = {"outputs":model.output}
-                                   )
+    model_file = os.path.dirname(__file__)+"/../money/models/model_money_word.h5"
+    graph = tf.Graph()
+    with graph.as_default():
+
+        sess = tf.Session(graph=graph)
+
+        with sess.as_default():
+            # model = model_money.getModel()
+            # model.summary()
+            # sess.run(tf.global_variables_initializer())
+            # h5_to_graph(sess, model_money.graph, model_money.model_money_file)
+
+            model = models.load_model(model_file,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
+            model.summary()
+            print(model.weights)
+            # tf.saved_model.simple_save(sess,
+            #                            "./money_savedmodel2/",
+            #                            inputs = {"input0":model.input[0],
+            #                                      "input1":model.input[1],
+            #                                      "input2":model.input[2]},
+            #                            outputs = {"outputs":model.output}
+            #                            )
     
 
 def save_person_model():
@@ -1582,23 +1770,23 @@ def save_timesplit_model():
 if __name__=="__main__":
 if __name__=="__main__":
     #save_role_model()
     #save_role_model()
     # save_codename_model()
     # save_codename_model()
-    #save_money_model()
+    save_money_model()
     #save_person_model()
     #save_form_model()
     #save_codesplit_model()
     # save_timesplit_model()
     '''
-    with tf.Session(graph=tf.Graph()) as sess:
-        from tensorflow.python.saved_model import tag_constants
-        meta_graph_def = tf.saved_model.loader.load(sess, [tag_constants.SERVING], "./person_savedModel")
-        graph = tf.get_default_graph()
-        signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-        signature = meta_graph_def.signature_def
-        input0 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input0"].name)
-        input1 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input1"].name)
-        outputs = sess.graph.get_tensor_by_name(signature[signature_key].outputs["outputs"].name)
-        x = load("person_x.pk")
-        _data = np.transpose(x,[1,0,2,3])
-        y = sess.run(outputs,feed_dict={input0:_data[0],input1:_data[1]})
-        print(np.argmax(y,-1))
+    # with tf.Session(graph=tf.Graph()) as sess:
+    #     from tensorflow.python.saved_model import tag_constants
+    #     meta_graph_def = tf.saved_model.loader.load(sess, [tag_constants.SERVING], "./person_savedModel")
+    #     graph = tf.get_default_graph()
+    #     signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+    #     signature = meta_graph_def.signature_def
+    #     input0 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input0"].name)
+    #     input1 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input1"].name)
+    #     outputs = sess.graph.get_tensor_by_name(signature[signature_key].outputs["outputs"].name)
+    #     x = load("person_x.pk")
+    #     _data = np.transpose(x,[1,0,2,3])
+    #     y = sess.run(outputs,feed_dict={input0:_data[0],input1:_data[1]})
+    #     print(np.argmax(y,-1))
     '''
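A hedged usage sketch for the DocChannel predictor added to predictor.py above, assuming BiddingKG is importable and the two .pb files exist under channel_savedmodel; the sample strings are illustrative only:

from BiddingKG.dl.interface.predictor import getPredictor

channel = getPredictor("channel")  # built lazily under the type's RLock
result = channel.predict(title='某医院设备采购中标公告', content='...公告正文...')
print(result)                      # e.g. [{'docchannel': '中标信息'}]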

BIN
BiddingKG/dl/interface/role_savedmodel/saved_model.pb


BIN
BiddingKG/dl/interface/role_savedmodel/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/interface/role_savedmodel/variables/variables.index


+ 230 - 0
BiddingKG/dl/money/money_keras.py

@@ -0,0 +1,230 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/7/27 0027 15:05 
+
+import os
+import sys
+import h5py
+from keras import models,layers,losses,optimizers
+sys.path.append(os.path.abspath("../../.."))
+import pandas as pd
+import math
+from keras.callbacks import ModelCheckpoint
+from BiddingKG.dl.common.Utils import *
+import tensorflow as tf
+from keras.models import load_model
+
+lb = ['招标金额','中标金额','其他金额']
+id2lb = {k:v for k,v in enumerate(lb)}
+lb2id = {v:k for k,v in id2lb.items()}
+seq_len = 30
+
+def labeling(label, out_len=3):
+    out = np.zeros((out_len))
+    out[label] = 1
+    return out
+
+
+def getTrainData(percent=0.9):
+    df = pd.read_excel('traindata/2兼职标注数据_test22.xlsx')
+    df2 = pd.read_excel('traindata/原金额模型标注数据.xls')
+    df = df.append(df2, ignore_index=True)
+    df.dropna(subset=['left'], inplace=True)
+    df.fillna('', inplace=True)
+    if 'relabel' in df.columns:
+        df['label'] = df.apply(lambda x:x['relabel'] if x['relabel']!="" else x['label'], axis=1)
+        print('更新标注完成')
+    for i in df.index:
+        if df.loc[i, 'label'] not in lb:
+            print('标签错误:',df.loc[i, 'label'])
+    df['label'] = df['label'].apply(lambda x:lb2id.get(x, 0))
+
+    print('总样本:', len(df))
+    train_x = []
+    train_y = []
+    test_x = []
+    test_y = []
+
+    for before, text, after, label in zip(df["left"], df["center"], df["right"], df["label"]):
+        before = str(before) if str(before) != "nan" else ""
+        text = str(text)
+        after = str(after) if str(after) != "nan" else ""
+
+        x = encodeInput([before, text, after], word_len=seq_len, word_flag=True, userFool=False)
+        y = labeling(label)
+        if np.random.random() < percent:
+            train_x.append(x)
+            train_y.append(y)
+        else:
+            test_x.append(x)
+            test_y.append(y)
+    return np.transpose(np.array(train_x), (1, 0, 2)), np.array(train_y), np.transpose(np.array(test_x),
+                                                                                       (1, 0, 2)), np.array(test_y)
+def word2id(df):
+    train_x = []
+    train_y = []
+    test_x = []
+    test_y = []
+
+    for before, text, after, label in zip(df["left"], df["center"], df["right"], df["label"]):
+        before = str(before) if str(before) != "nan" else ""
+        text = str(text)
+        after = str(after) if str(after) != "nan" else ""
+
+        x = encodeInput([before, text, after], word_len=seq_len, word_flag=True, userFool=False)
+        y = labeling(label)
+        train_x.append(x)
+        train_y.append(y)
+    return np.transpose(np.array(train_x), (1, 0, 2)), np.array(train_y)
+
+
+def train():
+    # pk_file = "traindata/all_data.pk"
+    # if os.path.exists(pk_file):
+    #     train_x, train_y, test_x, test_y = load(pk_file)
+    # else:
+    #     train_x, train_y, test_x, test_y = getTrainData()
+    #     save([train_x, train_y, test_x, test_y], pk_file)
+
+    df_train = pd.read_excel('traindata/df_train.xlsx')
+    df_test = pd.read_excel('traindata/df_test.xlsx')
+    train_x, train_y = word2id(df_train)
+    test_x, test_y = word2id(df_test)
+    with tf.Session() as sess:
+        vocab, matrix = getVocabAndMatrix(getModel_word())
+        model = getBiLSTMModel(input_shape=(3, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=3)
+        print("loading weights")
+        # model.load_weights("log/ep378-loss0.178-val_loss0.117-f1_score0.965.h5",by_name=True, skip_mismatch=True)
+
+        callback = ModelCheckpoint(
+            filepath="log/" + "ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1_score{val_f1_score:.3f}.h5",
+            monitor="val_loss", save_best_only=True, save_weights_only=True, mode="min")
+        model.fit(x=[train_x[0], train_x[1], train_x[2]], y=train_y, batch_size=128, epochs=600,callbacks=[callback],
+                  validation_data=[[test_x[0], test_x[1], test_x[2]], test_y]) #
+
+
+def test(_span = [':预算金额1000000元,中标金额', '1151元', ';']):
+    input = encodeInput(_span, word_len=seq_len, word_flag=True, userFool=False)
+    print(input)
+    graph = tf.get_default_graph()
+    with graph.as_default():
+        sess = tf.Session(graph=graph)
+        with sess.as_default():
+            vocab, matrix = getVocabAndMatrix(getModel_word())
+            model = getBiLSTMModel(input_shape=(3, seq_len, 60), vocab=vocab, embedding_weights=matrix,
+                                   classes=3)
+            model.load_weights("log/ep007-loss0.079-val_loss0.099-f1_score0.966.h5", by_name=True, skip_mismatch=True)
+            logit = model.predict([np.array([input[0]]), np.array([input[1]]), np.array([input[2]])])
+            print(logit)
+            return logit
+
+def get_savedModel():
+    sess = tf.Session(graph=tf.Graph())
+    with sess.as_default():
+        with sess.graph.as_default():
+            vocab, matrix = getVocabAndMatrix(getModel_word())
+            model = getBiLSTMModel(input_shape=(3, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=3)
+            sess.run(tf.global_variables_initializer())
+            # model.load_weights(filepath="log/ep009-loss0.057-val_loss0.076-f1_score0.978.h5")
+            # model.load_weights(filepath="log/ep007-loss0.079-val_loss0.099-f1_score0.966.h5") # 2021/7/27调整模型30字最优模型
+            model.load_weights(filepath="log/ep029-loss0.081-val_loss0.094-f1_score0.971.h5") # 2021/08/06 调整模型30字最优模型
+            tf.saved_model.simple_save(session=sess,
+                                       # export_dir="money_savedmodel20210727_3",
+                                       export_dir="money_savedmodel20210806",
+                                       inputs={"input0": model.input[0],
+                                               "input1": model.input[1],
+                                               "input2": model.input[2]},
+                                       outputs={"outputs": model.output})
+
+def tensorboard_model():
+    with tf.Session(graph=tf.Graph()).as_default() as sess:
+        with sess.graph.as_default():
+            tf.saved_model.loader.load(sess, tags=["serve"], export_dir="money_savedmodel1")
+            tf.summary.FileWriter(graph=sess.graph, logdir="log2")
+
+def getBiLSTMModel(input_shape,vocab,embedding_weights,classes,use_am=False):
+    assert len(input_shape)==3
+    list_input = []
+    for i in range(input_shape[0]):
+        list_input.append(layers.Input(shape=(input_shape[1],),dtype=tf.int32,name="input%d"%(i)))
+    print("list_input",list_input)
+    list_embedding = []
+
+    embedding_input = list_input
+    embedding = layers.Embedding(len(vocab),input_shape[2],
+                                 weights=[embedding_weights] if embedding_weights is not None else None,
+                                 trainable=True,name="char_embeding")
+    for i in range(len(embedding_input)):
+        print(i)
+        list_embedding.append(embedding(embedding_input[i]))
+        print(list_embedding)
+
+    list_w2v = list_embedding
+    list_lstm = []
+
+    list_lstm.append(layers.Bidirectional(layers.LSTM(32, dropout=0.5, recurrent_dropout=0.5))(list_w2v[0]))
+    list_lstm.append(layers.Bidirectional(layers.LSTM(8, dropout=0.5, recurrent_dropout=0.5))(list_w2v[1]))
+    list_lstm.append(layers.Bidirectional(layers.LSTM(16, dropout=0.5, recurrent_dropout=0.5))(list_w2v[2]))
+
+    concat = layers.concatenate(list_lstm)
+    dropout = layers.Dropout(0.5)(concat)
+
+    out = layers.Dense(classes,activation="softmax")(dropout)
+
+    model = models.Model(list_input,out)
+    model.compile(optimizer=optimizers.Adam(lr=0.001),loss=losses.categorical_crossentropy,metrics=[precision,recall,f1_score])
+    model.summary()
+
+    return model
+def verification():
+    graph = tf.get_default_graph()
+    with graph.as_default():
+        sess = tf.Session(graph=graph)
+        with sess.as_default():
+            vocab, matrix = getVocabAndMatrix(getModel_word())
+            model = getBiLSTMModel(input_shape=(3, seq_len, 60), vocab=vocab, embedding_weights=matrix,
+                                   classes=3)
+            model.load_weights("log/ep029-loss0.081-val_loss0.094-f1_score0.971.h5", by_name=True, skip_mismatch=True)
+
+            df_val = pd.read_excel('traindata/df_val_predict.xlsx')
+            val_x, val_y = word2id(df_val)
+            logit = model.predict([val_x[0], val_x[1], val_x[2]])
+            lg = np.argmax(logit, axis=-1)
+            df_val['pred_kera'] = pd.DataFrame(lg)
+            df_val['prob_kera'] = pd.DataFrame(np.amax(logit, axis=1))
+            df_val['tf=kera'] = df_val.apply(lambda x:1 if x['pred_kera']==x['pred_tf'] else 0, axis=1)
+            df_val['tf=lb'] = df_val.apply(lambda x:1 if x['label']==x['pred_tf'] else 0, axis=1)
+            df_val['kera=lb'] = df_val.apply(lambda x:1 if x['pred_kera']==x['label'] else 0, axis=1)
+            df_val.to_excel('traindata/df_val_predict2.xlsx')
+
+
+
+            df = pd.read_excel('traindata/2兼职标注数据_test22.xlsx')
+            df.fillna('', inplace=True)
+            df.reset_index(drop=True, inplace=True)
+            preds = []
+            if 'relabel' in df.columns:
+                df['label'] = df.apply(lambda x:x['relabel'] if x['relabel']!="" else x['label'], axis=1)
+                print('更新标注完成')
+            for left, center, right, label in zip(df['left'], df['center'], df['right'], df['label']):
+                _span=[left, center, right]
+                input = encodeInput(_span, word_len=seq_len, word_flag=True, userFool=False)
+                logit = model.predict([np.array([input[0]]), np.array([input[1]]), np.array([input[2]])])
+                lg = np.argmax(logit, axis=-1)[0]
+                prob = logit[0][lg]
+                lg = id2lb.get(lg, '')
+                preds.append(lg)
+                # if lg != label:
+                #     print(left, '###', center, '###', right)
+                #     print('预测类别:%s, 预测:%.4f, 标签:%s'%(lg, prob, label))
+                #     print()
+            df['pred'] = pd.DataFrame(preds)
+            df.to_excel('traindata/2兼职标注数据_test22_predict.xlsx')
+
+if __name__ == "__main__":
+    # train()
+    verification()
+    # test(_span=['预算金额:50万,中标金额:','100.600万','元,'])
+    # get_savedModel()
+    # tensorboard_model()
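money_keras.py encodes each sample as three fixed-length spans (left context, amount, right context) and one-hot labels over the three money classes; train_x is transposed to (3, n, seq_len) so that train_x[0..2] line up with the model's three inputs. A self-contained sketch of those two conventions:

import numpy as np

def labeling(label, out_len=3):
    # One-hot row per label id, matching the lb2id table above.
    out = np.zeros(out_len)
    out[label] = 1
    return out

assert list(labeling(1)) == [0.0, 1.0, 0.0]   # '中标金额' -> index 1
batch = np.zeros((5, 3, 30))                  # (samples, spans, seq_len=30)
assert np.transpose(batch, (1, 0, 2)).shape == (3, 5, 30)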

+ 185 - 0
BiddingKG/dl/role/role_train.py

@@ -0,0 +1,185 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/7/28 0028 11:32 
+
+import os
+import sys
+import h5py
+from keras import models,layers,losses,optimizers
+sys.path.append(os.path.abspath("../../.."))
+import pandas as pd
+import math
+from keras.callbacks import ModelCheckpoint
+from BiddingKG.dl.interface.modelFactory import Model_role_classify_word
+from BiddingKG.dl.common.Utils import *
+import tensorflow as tf
+
+seq_len = 20
+lb2id = {'招标人':0,
+         '代理人':1,
+         '中标人':2,
+         '第二候选人':3,
+         '第三候选人':4,
+         '其他角色':5}
+def getBiLSTMModel(input_shape,vocab,embedding_weights,classes,use_am=False):
+    # assert len(input_shape)==3
+    list_input = []
+    for i in range(input_shape[0]):
+        list_input.append(layers.Input(shape=(input_shape[1],),dtype=tf.int32,name="input%d"%(i)))
+    list_embedding = []
+
+    embedding_input = list_input
+    embedding = layers.Embedding(len(vocab),input_shape[2],
+                                 weights=[embedding_weights] if embedding_weights is not None else None,
+                                 mask_zero=True,trainable=True,name="char_embeding")
+    for i in range(len(embedding_input)):
+        list_embedding.append(embedding(embedding_input[i]))
+
+    list_w2v = list_embedding
+    list_lstm = []
+
+    list_lstm.append(layers.Bidirectional(layers.LSTM(64, dropout=0.2, recurrent_dropout=0.5))(list_w2v[0])) #dropout=0.2, recurrent_dropout=0.5
+    list_lstm.append(layers.Bidirectional(layers.LSTM(8, dropout=0.2, recurrent_dropout=0.5))(list_w2v[1]))
+    list_lstm.append(layers.Bidirectional(layers.LSTM(16, dropout=0.2, recurrent_dropout=0.5))(list_w2v[2]))
+    concat = layers.concatenate(list_lstm)
+
+    concat = layers.Dropout(0.5)(concat)
+    out = layers.Dense(classes,activation="softmax")(concat)
+    model = models.Model(list_input,out)
+    model.compile(optimizer=optimizers.Adam(lr=0.001),loss=losses.categorical_crossentropy,metrics=[precision,recall,f1_score])
+    model.summary()
+
+    return model
+
+def labeling(label, out_len=6):
+    out = np.zeros((out_len))
+    out[label] = 1
+    return out
+
+def word2id(df, seq_len=seq_len):
+    train_x = []
+    train_y = []
+    test_x = []
+    test_y = []
+    # print(set(df['label']))
+    # print(set(lb2id))
+    if set(df['label']) == set(lb2id):
+        df['label'] = df['label'].apply(lambda x:lb2id[x])
+
+    for before, text, after, label in zip(df["left"], df["center"], df["right"], df["label"]):
+        before = str(before) if str(before) != "nan" else ""
+        text = str(text)
+        after = str(after) if str(after) != "nan" else ""
+
+        x = encodeInput([before, text, after], word_len=seq_len, word_flag=True, userFool=False)
+        y = labeling(label)
+        train_x.append(x)
+        train_y.append(y)
+    return np.transpose(np.array(train_x), (1, 0, 2)), np.array(train_y)
+    # return train_x, np.array(train_y)
+
+def train():
+    df_train = pd.read_excel('traindata/df_train.xlsx')
+    df_test = pd.read_excel('traindata/df_test.xlsx')
+    train_x, train_y = word2id(df_train)
+    test_x, test_y = word2id(df_test)
+    with tf.Session() as sess:
+        vocab, matrix = getVocabAndMatrix(getModel_word())
+        model = getBiLSTMModel(input_shape=(3, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=6)
+        print("loading weights")
+        # model.load_weights("log/ep378-loss0.178-val_loss0.117-f1_score0.965.h5",by_name=True, skip_mismatch=True)
+
+        callback = ModelCheckpoint(
+            filepath="log/" + "ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1_score{val_f1_score:.3f}.h5",
+            monitor="val_loss", save_best_only=True, save_weights_only=True, mode="min")
+        model.fit(x=[train_x[0], train_x[1], train_x[2]], y=train_y, batch_size=128, epochs=600, callbacks=[callback],
+                  validation_data=[[test_x[0], test_x[1], test_x[2]], test_y])
+def test():
+    # df_val = pd.read_excel('traindata/df_val.xlsx')
+    # df_val = pd.read_excel('traindata/兼职标注数据_test29.xlsx')
+    # df_val = pd.read_excel('traindata/兼职标注数据_test3_predict.xlsx')
+    df_val = pd.read_excel('traindata/兼职标注数据_test22_待测试数据.xlsx')
+    df_val.reset_index(drop=True, inplace=True)
+    val_x, val_y = word2id(df_val, seq_len=seq_len)
+    # val_x = np.transpose(np.array(train_x), (1, 0, 2))
+
+    old_x, old_y = word2id(df_val, seq_len=50)
+    old_x = np.transpose(np.array(old_x), (1, 0, 2))
+    role_old = Model_role_classify_word()
+
+    with tf.Session() as sess:
+        vocab, matrix = getVocabAndMatrix(getModel_word())
+        model = getBiLSTMModel(input_shape=(3, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=6)
+        print("loading weights")
+        # model.load_weights("log/ep378-loss0.178-val_loss0.117-f1_score0.965.h5",by_name=True, skip_mismatch=True)
+        # model.load_weights("log/ep006-loss0.174-val_loss0.234-f1_score0.917.h5",by_name=True, skip_mismatch=True)
+        # model.load_weights("log/ep010-loss0.107-val_loss0.114-f1_score0.966.h5",by_name=True, skip_mismatch=True)
+        model.load_weights("log/ep014-loss0.091-val_loss0.110-f1_score0.968.h5",by_name=True, skip_mismatch=True)
+
+        lg_old = role_old.predict(old_x)
+        df_val['pred_old'] = pd.DataFrame(np.argmax(lg_old, axis=1))
+        df_val['prob_old'] = pd.DataFrame(np.amax(lg_old, axis=1))
+
+        logit = model.predict([val_x[0], val_x[1], val_x[2]])
+        print('新模型预测结果',logit[:3])
+        print('旧模型预测结果:',lg_old[:3])
+        df_val['pred_new'] = pd.DataFrame(np.argmax(logit, axis=-1))
+        df_val['prob_new'] = pd.DataFrame(np.amax(logit, axis=1))
+        # df_val['new=new3'] = df_val.apply(lambda x: 1 if x['pred_new3'] == x['pred_new2'] else 0, axis=1)
+        df_val['new=old'] = df_val.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
+        df_val['old=lb'] = df_val.apply(lambda x: 1 if x['label'] == x['pred_old'] else 0, axis=1)
+        df_val['new=lb'] = df_val.apply(lambda x: 1 if x['pred_new'] == x['label'] else 0, axis=1)
+        # df_val.to_excel('traindata/df_val_predict.xlsx')
+        # df_val.to_excel('traindata/兼职标注数据_test29_predict.xlsx')
+        # df_val.to_excel('traindata/兼职标注数据_test3_predict.xlsx')
+        df_val.to_excel('traindata/兼职标注数据_test22_待测试数据_predict.xlsx')
+        print('')
+
+def get_savedModel():
+    sess = tf.Session(graph=tf.Graph())
+    with sess.as_default():
+        with sess.graph.as_default():
+            vocab, matrix = getVocabAndMatrix(getModel_word())
+            model = getBiLSTMModel(input_shape=(3, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=6)
+            sess.run(tf.global_variables_initializer())
+            # model.load_weights(filepath="log/ep009-loss0.057-val_loss0.076-f1_score0.978.h5")
+            # model.load_weights(filepath="log/ep010-loss0.107-val_loss0.114-f1_score0.966.h5")  #7月30日训练最优模型20字
+            model.load_weights(filepath="log/ep015-loss0.090-val_loss0.113-f1_score0.967.h5") #8月5日调整部分招标人标注后重新训练结果20字
+            tf.saved_model.simple_save(session=sess,
+                                       export_dir="role_savedmodel2021-8-5",
+                                       inputs={"input0": model.input[0],
+                                               "input1": model.input[1],
+                                               "input2": model.input[2]},
+                                       outputs={"outputs": model.output})
+
+def predict_pb():
+    df_val = pd.read_excel('traindata/df_val.xlsx')
+    old_x, old_y = word2id(df_val, seq_len=20)
+    # old_x = np.transpose(np.array(old_x), (1, 0, 2))
+
+    sess_role = tf.Session()
+    with sess_role.as_default() as sess:
+        with sess_role.graph.as_default():
+            meta_graph_def = tf.saved_model.loader.load(sess=sess_role, tags=["serve"],
+                                                        export_dir="role_savedmodel2021-8-5")
+            signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+            signature_def = meta_graph_def.signature_def
+
+            input0 = sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name)
+            input1 = sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name)
+            input2 = sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name)
+            output = sess_role.graph.get_tensor_by_name(
+                signature_def[signature_key].outputs["outputs"].name)
+            model_role = [[input0, input1, input2], output]
+            lg_old = sess_role.run(output, feed_dict={input0:old_x[0],
+                                                      input1:old_x[1],
+                                                      input2:old_x[2]})
+            print(lg_old[:3])
+
+
+if __name__ == "__main__":
+    # train()
+    # test()
+    get_savedModel()
+    # predict_pb()
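Decoding the role classifier's softmax output back to labels mirrors the lb2id table defined at the top of role_train.py; a small sketch with a fabricated logits row:

import numpy as np

lb2id = {'招标人': 0, '代理人': 1, '中标人': 2, '第二候选人': 3, '第三候选人': 4, '其他角色': 5}
id2lb = {v: k for k, v in lb2id.items()}

logits = np.array([[0.05, 0.10, 0.70, 0.05, 0.05, 0.05]])  # fabricated scores
assert id2lb[int(np.argmax(logits, axis=-1)[0])] == '中标人'
assert float(np.amax(logits, axis=1)[0]) == 0.70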

+ 103 - 77
BiddingKG/dl/test/测试所有提取信息.py

@@ -30,6 +30,7 @@ import BiddingKG.dl.interface.getAttributes as getAttributes
 import BiddingKG.dl.entityLink.entityLink as entityLink
 import BiddingKG.dl.complaint.punish_predictor as punish_predictor
 # import BiddingKG.dl.complaint.punish_rule as punish_predictor
+import BiddingKG.dl.channel.channel_predictor as channel_predictor
 import json
 
 '''
@@ -53,11 +54,12 @@ row = cursor.fetchall()[0]
 codeNamePredict = predictor.CodeNamePredict()
 premPredict = predictor.PREMPredict()
 epcPredict = predictor.EPCPredict()
-roleRulePredict = predictor.RoleRulePredictor()
+# roleRulePredict = predictor.RoleRulePredictor()
 timePredict = predictor.TimePredictor()
 # punish = punish_rule.Punish_Extract()
 punish = punish_predictor.Punish_Extract()
 productPredict = predictor.ProductPredictor()
+channelPredict = channel_predictor.DocChannel()
 
 # 自定义jsonEncoder
 class MyEncoder(json.JSONEncoder):
@@ -97,14 +99,30 @@ def predict(doc_id, text, title=""):
     print("getPREMs")
     print("getPREMs")
     list_punish_dic = punish.get_punish_extracts(list_articles, list_sentences, list_entitys)
     list_punish_dic = punish.get_punish_extracts(list_articles, list_sentences, list_entitys)
     product = productPredict.predict(list_sentences,list_entitys)
     product = productPredict.predict(list_sentences,list_entitys)
+    channel = channelPredict.predict(title, list_sentences[0])
+
+    total_tendereeMoney_list = []
+    for entity in list_entitys[0]:
+        if entity.notes == '总投资':
+            total_tendereeMoney_list.append(entity.entity_text)
+    total_tendereeMoney = max([float(money) for money in total_tendereeMoney_list]) if len(total_tendereeMoney_list)>=1 else 0  # bug fix: take the max of the amounts themselves, not of a one-element list containing the list
 
     for entitys in list_entitys:
         for entity in entitys:
             print(entity.entity_text, entity.entity_type, entity.label, entity.values, entity.sentence_index,
                   entity.begin_index, entity.end_index, entity.wordOffset_begin, entity.wordOffset_end,entity.sentence_index)
     # print(prem)
-    return json.dumps(Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic),product)[0],
-                      cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)   # list_punish_dic
+    # return json.dumps(Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic),product)[0],
+    #                   cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)   # list_punish_dic
+    # return json.dumps(Preprocessing.union_result(
+    #     Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic),product), channel)[0],
+    #                   cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)   # list_punish_dic
+    return json.dumps(Preprocessing.union_result(Preprocessing.union_result(
+        Preprocessing.union_result(
+            Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic), product), [{'total_tendereeMoney':total_tendereeMoney}]
+    ),
+        channel),
+            cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)   # list_punish_dic
 
 
 def predict_back(doc_id, html):
@@ -189,84 +207,92 @@ def test(name, content):
 
 
 if __name__ == "__main__":
-    from tablestore import *
-    endpoint = 'https://bxkc-ots.cn-hangzhou.ots.aliyuncs.com'
-    access_key_id = 'LTAI4GJxbioV1y2WM3XcZTmP'
-    access_key_secret = 'B3VITMoCnKtTQE6eAkDkat6UNFHped'
-    instance_name = 'bxkc-ots'
-    ots_client = OTSClient(endpoint, access_key_id, access_key_secret, instance_name)
-
-    def get_data(query, max_rows, table_name='document',
-                 index_name='document_index',
-                 column_names=['docid', 'dochtmlcon','doctitle', 'info_type', 'page_time'],
-                 sorters=[FieldSort("page_time", SortOrder.DESC), FieldSort("docid", SortOrder.DESC)]):
-        '''
-        从阿里云ots查询数据
-        :param query: 查询命令
-        :param max_rows: 最大返回多少数据
-        :param table_name: 表名
-        :param index_name: 表索引名
-        :param column_names: 返回字段名
-        :param sorters: 排序规则列表
-        :return: 处理后的数据列表
-        '''
-        next_token = None
-        data = []
-        all_rows = []
-        rows, next_token, total_count, is_all_succeed = \
-            ots_client.search(table_name,
-                              index_name,
-                              SearchQuery(query,
-                                          next_token=next_token,
-                                          sort=Sort(sorters=sorters),  # ASC升序
-                                          limit=100,
-                                          get_total_count=True),
-                              ColumnsToGet(column_names=column_names,
-                                           return_type=ColumnReturnType.SPECIFIED))
-        all_rows.extend(rows)
-        while next_token:
-            rows, next_token, total_count, is_all_succeed = \
-                ots_client.search(table_name,
-                                  index_name,
-                                  SearchQuery(query,
-                                              next_token=next_token,
-                                              sort=None,
-                                              limit=100,
-                                              get_total_count=True),
-                                  ColumnsToGet(column_names=column_names,
-                                               return_type=ColumnReturnType.SPECIFIED))
-            all_rows.extend(rows)
-            if len(all_rows) > max_rows:
-                print('已获取%d条数据' % len(all_rows))
-                break
-
-        if all_rows:
-            for row in all_rows:
-                tmp = []
-                tmp.append(row[0][1][1])
-                for tup in row[1]:
-                    tmp.append(tup[1])
-                data.append(tmp)
-        return data
-
-
-    bool_query = TermQuery('docid','124113339')
-    # bool_query = BoolQuery(
-    #     must_queries=[TermsQuery(field_name='info_type', column_values=['办公设备', '计算机设备']),
-    #                   RangeQuery('page_time', range_from='2020-11-01', range_to='2020-11-31')]
-    # )
-
-    data = get_data(bool_query, 1)
-    print(data)
-    docid = str(data[0][0])
-    html = data[0][1]
-    title = data[0][2]
+    # from tablestore import *
+    # endpoint = 'https://bxkc-ots.cn-hangzhou.ots.aliyuncs.com'
+    # access_key_id = 'LTAI4GJxbioV1y2WM3XcZTmP'
+    # access_key_secret = 'B3VITMoCnKtTQE6eAkDkat6UNFHped'
+    # instance_name = 'bxkc-ots'
+    # ots_client = OTSClient(endpoint, access_key_id, access_key_secret, instance_name)
+    #
+    # def get_data(query, max_rows, table_name='document',
+    #              index_name='document_index',
+    #              column_names=['docid', 'dochtmlcon','doctitle', 'info_type', 'page_time'],
+    #              sorters=[FieldSort("page_time", SortOrder.DESC), FieldSort("docid", SortOrder.DESC)]):
+    #     '''
+    #     从阿里云ots查询数据
+    #     :param query: 查询命令
+    #     :param max_rows: 最大返回多少数据
+    #     :param table_name: 表名
+    #     :param index_name: 表索引名
+    #     :param column_names: 返回字段名
+    #     :param sorters: 排序规则列表
+    #     :return: 处理后的数据列表
+    #     '''
+    #     next_token = None
+    #     data = []
+    #     all_rows = []
+    #     rows, next_token, total_count, is_all_succeed = \
+    #         ots_client.search(table_name,
+    #                           index_name,
+    #                           SearchQuery(query,
+    #                                       next_token=next_token,
+    #                                       sort=Sort(sorters=sorters),  # ASC升序
+    #                                       limit=100,
+    #                                       get_total_count=True),
+    #                           ColumnsToGet(column_names=column_names,
+    #                                        return_type=ColumnReturnType.SPECIFIED))
+    #     all_rows.extend(rows)
+    #     while next_token:
+    #         rows, next_token, total_count, is_all_succeed = \
+    #             ots_client.search(table_name,
+    #                               index_name,
+    #                               SearchQuery(query,
+    #                                           next_token=next_token,
+    #                                           sort=None,
+    #                                           limit=100,
+    #                                           get_total_count=True),
+    #                               ColumnsToGet(column_names=column_names,
+    #                                            return_type=ColumnReturnType.SPECIFIED))
+    #         all_rows.extend(rows)
+    #         if len(all_rows) > max_rows:
+    #             print('已获取%d条数据' % len(all_rows))
+    #             break
+    #
+    #     if all_rows:
+    #         for row in all_rows:
+    #             tmp = []
+    #             tmp.append(row[0][1][1])
+    #             for tup in row[1]:
+    #                 tmp.append(tup[1])
+    #             data.append(tmp)
+    #     return data
+    #
+    #
+    # bool_query = TermQuery('docid','124113339')
+    # # bool_query = BoolQuery(
+    # #     must_queries=[TermsQuery(field_name='info_type', column_values=['办公设备', '计算机设备']),
+    # #                   RangeQuery('page_time', range_from='2020-11-01', range_to='2020-11-31')]
+    # # )
+    #
+    # data = get_data(bool_query, 1)
+    # print(data)
+    # docid = str(data[0][0])
+    # html = data[0][1]
+    # title = data[0][2]
     # text = '中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天。\
     # 投诉处理公告,投诉人:张三。文章编号:京财采投字(2018)第42号。政府采购项目招标方式:公开招标,联系人:黎明。\
     # 建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,\
     # 二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,'
-    # docid = ""
-    # html = '首都医科大学附属北京地坛医院1.5T核磁共振、16排CT和血管造影机维保服务医疗设备维修和保养服务采购项目政府采购合同公告'
+    docid = ""
+    # title = '招标公告'
+    # html = '招标人:广州市人民医院。代理人:广州医疗代理服务公司。招标金额:3000元,总投资:5万元。中标人:比地科技有限公司,中标金额:1万元。'
+    html = """, [ 正在 公告 ] 公司 2020 - 2021 年度 打印 制作 服务 ( 重新 招标 ) ( 变更 采购 方式 ) , 公司 2020 - 2021 年度 打印 制作 服务 ( 重新 招标 ) ( 变更 采购 方式 ) 采购 结果 公告 , 项目 名称 , 公司 2020 - 2021 年度 打印 制作 服务 项目 编号 , 20200803030110070001 采购 组织 人 , 中 节能 建筑 节能 有限公司 河南 分公司 采购 方式 , 谈判 采购 成交 信息 , 序号 , 标段 ( 包 ) 编号 , 标段 ( 包 ) 名称 , 成交 供应商 , 成交 金额 20200803030110070001001 公司 2020 - 2021 年度 打印 制作 服务 ( 重新 招标 ) ( 变更 采购 方式 ) 郑州市 上街区 永达 文印部 null 元 公告 起 止 时间 2021年 04月 14日 - 2021年 04月 17日 ,
+"""
+    title = """[ 正在 公告 ] 公司 2020 - 2021 年度 打印 制作 服务 ( 重新 招标 ) ( 变更 采购 方式 ) ,
+"""
+    html = html.replace(' ', '')
+    title = title.replace(' ', '')
+    # html = '首都医科大学附属北京地坛医院1.5T核磁共振、16排CT和血管造影机维保服务医疗设备维修和保养服务采购项目政府采购中标候选人公示,中标人:广州比地科技有限公司,中标金额:6000万元'
     # html = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
 
     a = time.time()