
Update the role model and money model; rework money preprocessing and money linking; add total investment and channel info to the returned results

bidi, 3 years ago
commit 4340c0cbeb

+ 16 - 8
BiddingKG/dl/channel/channel_predictor.py

@@ -11,6 +11,7 @@ import copy
 import tensorflow as tf
 import fool
 import re
+import os
 import time
 
 word_model = getModel_w2v()
@@ -23,7 +24,7 @@ sentence_num = 10
 kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
 
 class DocChannel():
-  def __init__(self, life_model='model/channel.pb', type_model='model/doctype.pb'):
+  def __init__(self, life_model='/model/channel.pb', type_model='/model/doctype.pb'):
     self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
     self.mask, self.mask_title = self.load_life(life_model)
     self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\
@@ -36,7 +37,7 @@ class DocChannel():
   def load_life(self,life_model):
     with tf.Graph().as_default() as graph:
       output_graph_def = graph.as_graph_def()
-      with open(life_model, 'rb') as f:
+      with open(os.path.dirname(__file__)+life_model, 'rb') as f:
         output_graph_def.ParseFromString(f.read())
         tf.import_graph_def(output_graph_def, name='')
         print("%d ops in the final graph" % len(output_graph_def.node))
@@ -55,7 +56,7 @@ class DocChannel():
   def load_type(self,type_model):
     with tf.Graph().as_default() as graph:
       output_graph_def = graph.as_graph_def()
-      with open(type_model, 'rb') as f:
+      with open(os.path.dirname(__file__)+type_model, 'rb') as f:
         output_graph_def.ParseFromString(f.read())
         tf.import_graph_def(output_graph_def, name='')
         print("%d ops in the final graph" % len(output_graph_def.node))
@@ -172,7 +173,6 @@ class DocChannel():
       # words = [it for sen in sen_words for it in sen]
       # segword_content = ' '.join(words)
       segword_title = ' '.join(fool.cut(doctitle)[0])
-
       segword_content = dochtmlcon
       # segword_title = doctitle
 
@@ -217,8 +217,12 @@ class DocChannel():
     else:
       return 0
 
-  def predict(self, title, content):
+  def predict(self, title='', content=''):
     # print('准备预测')
+    if isinstance(content, list):
+      token_l = [it.tokens for it in content]
+      tokens = [it for l in token_l for it in l]
+      content = ' '.join(tokens)
     data_content, data_title = self.predict_process(docid='', doctitle=title, dochtmlcon=content)
     pred = self.type_sess.run(self.type_softmax,
                                     feed_dict={self.type_title:[[embedding_matrix[i] for i in l] for l in data_title],
@@ -241,10 +245,14 @@ class DocChannel():
       prob = pred[0][id]
       if id == 6:
         if self.is_houxuan(''.join([it for it in title if it.isalpha()]), ''.join([it for it in content if it.isalpha()])):
-          return '候选人公示', prob
-      return self.id2life[id], prob
+          # return '候选人公示', prob
+          return [{'docchannel': '候选人公示'}]
+      # return self.id2life[id], prob
+      return [{'docchannel':self.id2life[id]}]
     else:
-      return self.id2type[id], prob
+      # return self.id2type[id], prob
+      return [{'docchannel':self.id2type[id]}]
+
 
   def predict_batch(self, title_content_list):
     # print('准备预测')

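The reworked predict() above now accepts either a plain string or the pipeline's sentence objects (anything exposing a .tokens list), and returns a list of dicts instead of a (label, prob) tuple. A minimal usage sketch under those assumptions; the FakeSentence stand-in is hypothetical and the printed label depends on the model:

from BiddingKG.dl.channel.channel_predictor import DocChannel

channel = DocChannel()  # resolves model/channel.pb and model/doctype.pb next to the module

# Plain-string input
print(channel.predict(title='某项目中标公告', content='经评审,确定中标人为某公司'))
# e.g. [{'docchannel': '中标公告'}]

# Sentence-object input: token lists are flattened and joined with spaces
class FakeSentence:  # hypothetical stand-in for the pipeline's sentence objects
    def __init__(self, tokens):
        self.tokens = tokens

sents = [FakeSentence(['经', '评审']), FakeSentence(['确定', '中标人'])]
print(channel.predict(title='某项目中标公告', content=sents))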
+ 2 - 1
BiddingKG/dl/common/Utils.py

@@ -419,7 +419,8 @@ def getUnifyMoney(money):
     money = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",money)
     result = Decimal(0)
     chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
-    chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","圆","元","角","分"]
+    # chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","圆","元","角","分"]
+    chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾", "圆", "元", "角", "分", '十', '百', '千']
     
     LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$")
     BigMoneypattern = re.compile("^零?(?P<BigMoney>[%s])$"%("".join(chnDigits)))

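The extra factor units matter because the lowercase multipliers 十/百/千 survive the character filter above but previously had no value in chnFactorUnits. A standalone sketch of the conversion idea, assuming simple amounts; this is an illustration, not the repo's getUnifyMoney, and it skips the 亿-over-万 nesting the real function must handle:

from decimal import Decimal

DIGIT = {d: i for i, d in enumerate('零壹贰叁肆伍陆柒捌玖')}
SMALL = {'拾': 10, '十': 10, '佰': 100, '百': 100, '仟': 1000, '千': 1000}
BIG = {'万': 10 ** 4, '萬': 10 ** 4, '亿': 10 ** 8, '兆': 10 ** 12}

def unify(text):
    total = section = digit = Decimal(0)
    for ch in text:
        if ch in DIGIT:
            digit = Decimal(DIGIT[ch])
        elif ch in SMALL:  # a bare 十 means 10, hence the default of 1
            section += (digit if digit else Decimal(1)) * SMALL[ch]
            digit = Decimal(0)
        elif ch in BIG:    # close out the current section (no 亿/万 nesting here)
            total = (total + section + digit) * BIG[ch]
            section = digit = Decimal(0)
        # 元/圆/角/分 and any other characters are ignored in this sketch
    return total + section + digit

print(unify('壹佰贰拾叁万'))  # 1230000
print(unify('十万元'))        # 100000: '十' now counts as a factor unit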
+ 1 - 1
BiddingKG/dl/interface/Entitys.py

@@ -162,7 +162,7 @@ class Entity():
         self.pointer_address = None
         self.pointer_tendereeMoney = None
         self.person_phone = person_phone
-        
+        self.notes = ''  # 2021/7/20 新增,保存金额大小写,单位等备注
         
     def set_Role(self,role_label,role_values):
         self.label = int(role_label)

+ 113 - 45
BiddingKG/dl/interface/Preprocessing.py

@@ -199,7 +199,7 @@ def tableToText(soup):
             '''
             @summary: 计算每个节点受到的挤压度来判断是否需要染色
             '''
-            #print("B",inner_table[index])
+            ## print("B",inner_table[index])
             min_presure = 3
             list_dye = []
             first = None
@@ -260,7 +260,7 @@ def tableToText(soup):
                             dye_set.add((inner_table[index][h][0],dye_type))
                             key_set.add(inner_table[index][h][0])
                     begin = end
-                #print("E",inner_table[index])
+                ## print("E",inner_table[index])
 
 
 
@@ -388,17 +388,17 @@ def tableToText(soup):
         
         for item,values in zip(list_item,list(predict_y)):
             _dict[item] = values[1]
-            # print("##",item,values)
-        #print(_dict)
+            # # print("##",item,values)
+        ## print(_dict)
         for i in range(height):
             for j in range(width):
                 item = inner_table[i][j][0]
                 inner_table[i][j][1] = 1 if _dict[item]>prob_min else (1 if re.search(pat_head,item) is not None and len(item)<8 else 0)
 
-        # print("=====")
+        # # print("=====")
         # for item in inner_table:
-        #     print(item)
-        # print("======")
+        #     # print(item)
+        # # print("======")
         
         repairTable(inner_table)
         head_list = sliceTable(inner_table)
@@ -422,10 +422,10 @@ def tableToText(soup):
                 if re.search(pat_head,_item) is not None and len(item)<8:
                     inner_table[_h][_w][1] = 1
 
-        # print("=====")
+        # # print("=====")
         # for item in inner_table:
-        #     print(item)
-        # print("======")
+        #     # print(item)
+        # # print("======")
 
         repairTable(inner_table)
         head_list = sliceTable(inner_table)
@@ -470,7 +470,7 @@ def tableToText(soup):
                 else:
                     is_head = False
             
-            #print(temp_item,form_prob)
+            ## print(temp_item,form_prob)
             if len(inner_table[i][0][0])>40:
                 is_long_value = True
             if is_head or is_long_value or is_same_value:
@@ -751,12 +751,12 @@ def tableToText(soup):
                                     pack_text += head+cell["text"]+","
                                 elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题,if 改elif
                                     #排名替换为同一种表达
-                                    print("====",head)
+                                    # print("====",head)
                                     rank_text += head+cell["text"]+","
-                                    #print(rank_text)
+                                    ## print(rank_text)
                                 elif re.search(entityPattern,head) is not None:
                                     entity_text += head+cell["text"]+","
-                                    #print(entity_text)
+                                    ## print(entity_text)
                                 else:
                                     if re.search(moneyPattern,head) is not None and entity_text!="":
                                         money_text += head+cell["text"]+","
@@ -788,10 +788,10 @@ def tableToText(soup):
                                 elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题,if 改elif
                                     #排名替换为同一种表达
                                     rank_text += head+cell["text"]+","
-                                    #print(rank_text)
+                                    ## print(rank_text)
                                 elif re.search(entityPattern,head) is not None:
                                     entity_text += head+cell["text"]+","
-                                    #print(entity_text)
+                                    ## print(entity_text)
                                 else:
                                     text_line += head+cell["text"]+","
                                 text_set.add(str(head+cell["text"]))
@@ -862,10 +862,10 @@ def tableToText(soup):
                 #                 elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题,if 改elif
                 #                     #排名替换为同一种表达
                 #                     rank_text += head+inner_table[i][j][0]+","
-                #                     #print(rank_text)
+                #                     ## print(rank_text)
                 #                 elif re.search(entityPattern,head) is not None:
                 #                     entity_text += head+inner_table[i][j][0]+","
-                #                     #print(entity_text)
+                #                     ## print(entity_text)
                 #                 else:
                 #                     text_line += head+inner_table[i][j][0]+","
                 #                 text_set.add(str(head+inner_table[i][j][0]))
@@ -924,10 +924,10 @@ def tableToText(soup):
                 #                     continue
                 #                 if re.search(rankPattern,head) is not None:
                 #                     rank_text += head+inner_table[i][j][0]+","
-                #                     #print(rank_text)
+                #                     ## print(rank_text)
                 #                 elif re.search(entityPattern,head) is not None:
                 #                     entity_text += head+inner_table[i][j][0]+","
-                #                     #print(entity_text)
+                #                     ## print(entity_text)
                 #                 else:
                 #                     text_line += head+inner_table[i][j][0]+","
                 #                 text_set.add(str(head+inner_table[i][j][0]))
@@ -952,22 +952,22 @@ def tableToText(soup):
             #inner_table,head_list = setHead_inline(inner_table)
             inner_table,head_list = setHead_initem(inner_table,pat_head)
             # inner_table,head_list = setHead_incontext(inner_table,pat_head)
-            # print(inner_table)
+            # # print(inner_table)
             # for begin in range(len(head_list[:-1])):
             #     for item in inner_table[head_list[begin]:head_list[begin+1]]:
-            #         print(item)
-            #     print("====")
+            #         # print(item)
+            #     # print("====")
 
             removeFix(inner_table)
             
-            # print("----")
-            # print(head_list)
+            # # print("----")
+            # # print(head_list)
             # for item in inner_table:
-            #     print(item)
+            #     # print(item)
 
 
             tbody.string = getTableText(inner_table,head_list)
-            #print(tbody.string)
+            ## print(tbody.string)
             tbody.name = "turntable"
             return inner_table
         return None
@@ -998,9 +998,9 @@ def tableToText(soup):
 
 #数据清洗
 def segment(soup,final=True):
-    # print("==")
-    # print(soup)
-    # print("====")
+    # # print("==")
+    # # print(soup)
+    # # print("====")
     #segList = ["tr","div","h1", "h2", "h3", "h4", "h5", "h6", "header"]
     subspaceList = ["td",'a',"span","p"]
     if soup.name in subspaceList:
@@ -1223,7 +1223,7 @@ def union_ner(list_ner):
         if i not in union_index_set:
             result_list.append(list_ner[i])
     for item in union_index:
-        #print(str(list_ner[item[0]][3])+str(list_ner[item[1]][3]))
+        ## print(str(list_ner[item[0]][3])+str(list_ner[item[1]][3]))
         result_list.append((list_ner[item[0]][0],list_ner[item[1]][1],'company',str(list_ner[item[0]][3])+str(list_ner[item[1]][3])))
     return result_list
                 
@@ -1358,8 +1358,8 @@ def union_ner(list_ner):
 #                     index = 0
 #                     for i in range(len(all_match)):
 #                         if len(all_match[i][0])>0:
-#                             # print("===",all_match[i])
-#                             #print(all_match[i][0])
+#                             # # print("===",all_match[i])
+#                             ## print(all_match[i][0])
 #                             unit = ""
 #                             entity_text = all_match[i][3]
 #                             if pattern_key in ["key_word","front_m"]:
@@ -1570,6 +1570,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
         #限流执行
         key_nerToken = "nerToken"
         start_time = time.time()
+        found_yeji = 0 # 2021/8/6 增加判断是否正文包含评标结果 及类似业绩判断用于过滤后面的金额
+        # found_pingbiao = False
         ner_entitys_all = getNers(sentences,useselffool=useselffool)
         if key_nerToken not in cost_time:
             cost_time[key_nerToken] = 0
@@ -1627,10 +1629,16 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
             entity_type = "money"
             #money_patten_str = "(([1-9][\d,,]*(?:\.\d+)?[百千万亿]?[\(\)()元整]+)|([零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,})|(?:[¥¥]+,?|报价|标价)[(\(]?([万])?元?[)\)]?[::]?.{,7}?([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]?)|([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]?)[\((]?([万元]{1,2}))*"
 
+            # list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
+            #                       "key_word":"((?P<text_key_word>(?:[¥¥]+,?|[单报标限]价|金额|价格|标的基本情况|CNY|成交结果:)(?:[,(\(]*\s*(?P<unit_key_word_before>[万元]*(?P<filter_unit2>[台个只]*))\s*[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,8}?))(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)(?:[(\(]?(?P<filter_>[%])*\s*(?P<unit_key_word_behind>[万元]*(?P<filter_unit1>[台个只]*))\s*[)\)]?))",
+            #                       "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万元]+)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())",
+            #                       "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只]*))[\))]?)"}
             list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                                  "key_word":"((?P<text_key_word>(?:[¥¥]+,?|[单报标限]价|金额|价格|标的基本情况|CNY|成交结果:)(?:[,(\(]*\s*(?P<unit_key_word_before>[万元]*(?P<filter_unit2>[台个只]*))\s*[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,8}?))(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)(?:[(\(]?(?P<filter_>[%])*\s*(?P<unit_key_word_behind>[万元]*(?P<filter_unit1>[台个只]*))\s*[)\)]?))",
+                                  "key_word":"((?P<text_key_word>(?:[¥¥]+,?|[单报标限]价|金额|成交报?价|价格|标的基本情况|CNY|成交结果:)(?:[,(\(]*\s*(?P<unit_key_word_before>[万元]*(?P<filter_unit2>[台个只]*))\s*(/?费率)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号]{,8}?))(第[123一二三]名[::])?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万元]*(?P<filter_unit1>[台个只吨斤棵株页亩方条米]*))\s*[)\)]?))",
                                   "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万元]+)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())",
                                   "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万元]+)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())",
-                                  "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只]*))[\))]?)"}
+                                  "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
+            # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。
+
             pattern_money = re.compile("%s|%s|%s|%s"%(list_money_pattern["cn"],list_money_pattern["key_word"],list_money_pattern["behind_m"],list_money_pattern["front_m"]))
             set_begin = set()
             # for pattern_key in list_money_pattern.keys():
@@ -1641,8 +1649,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 # index = 0
                 # for i in range(len(all_match)):
                 #     if len(all_match[i][0])>0:
-                #         print("===",all_match[i])
-                #         #print(all_match[i][0])
+                #         # print("===",all_match[i])
+                #         ## print(all_match[i][0])
                 #         unit = ""
                 #         entity_text = all_match[i][3]
                 #         if pattern_key in ["key_word","front_m"]:
@@ -1689,18 +1697,32 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 #
                 #     else:
                 #         index += 1
-            all_match = re.finditer(pattern_money, sentence_text)
+
+            # if re.search('评标结果|候选人公示', sentence_text):
+            #     found_pingbiao = True
+            if re.search('业绩', sentence_text):
+                found_yeji += 1
+            if found_yeji >= 2: # 过滤掉业绩后面的所有金额
+                all_match = []
+            else:
+                all_match = re.finditer(pattern_money, sentence_text)
             index = 0
             for _match in all_match:
                 if len(_match.group())>0:
                     # print("===",_match.group())
-                    # print(_match.groupdict())
+                    # # print(_match.groupdict())
+                    notes = ''  # 2021/7/20 新增备注金额大写或金额单位 if 金额大写 notes=大写 elif 单位 notes=单位
                     unit = ""
                     entity_text = ""
                     text_beforeMoney = ""
                     filter = ""
                     filter_unit = False
                     notSure = False
+                    if re.search('业绩', sentence_text[:_match.span()[0]]):  # 2021/7/21过滤掉业绩后面金额
+                        # print('金额在业绩后面: ', _match.group(0))
+                        found_yeji += 1
+                        break
+
                     for k,v in _match.groupdict().items():
                         if v!="" and v is not None:
                             if k=='text_key_word':
@@ -1715,8 +1737,33 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                                 filter = v
                             if re.search("filter_unit",k) is not None:
                                 filter_unit = True
+
+                    if re.search('(^\d{2,},\d{4,}万?$)|(^\d{2,},\d{2}万?$)', entity_text.strip()):  # 2021/7/19 修正OCR识别小数点为逗号
+                        if re.search('[幢栋号楼层]', sentence_text[_match.span()[0]-2:_match.span()[0]]):
+                            entity_text = re.sub('\d+,', '', entity_text)
+                        else:
+                            entity_text = entity_text.replace(',', '.')
+                        # print(' 修正OCR识别小数点为逗号')
+
                     if entity_text.find("元")>=0:
                         unit = ""
+                    if unit == "":  #2021/7/21 有明显金额特征的补充单位,避免被过滤
+                        if ('¥' in text_beforeMoney or '¥' in text_beforeMoney):
+                            unit = '元'
+                            # print('明显金额特征补充单位 元')
+                        elif re.search('[单报标限]价|金额|价格[::]+$', text_beforeMoney.strip()) and \
+                                re.search('\d{5,}',entity_text) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}',entity_text)==None:
+                            unit = '元'
+                            # print('明显金额特征补充单位 元')
+                        elif re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7})$)|(^\d{,3}(,\d{3})+$)',entity_text):
+                            unit = '元'
+                            # print('明显金额特征补充单位 元')
+                    if unit.find("万") >= 0 and entity_text.find("万") >= 0:  #2021/7/19修改为金额文本有万,不计算单位
+                        # print('修正金额及单位都有万, 金额:',entity_text, '单位:',unit)
+                        unit = "元"
+                    if re.search('.*万元万元', entity_text):  #2021/7/19 修正两个万元
+                        # print(' 修正两个万元',entity_text)
+                        entity_text = entity_text.replace('万元万元','万元')
                     else:
                         if filter_unit:
                             continue
@@ -1742,15 +1789,36 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                     entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
 
                     entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",entity_text)
+                    # print('转换前金额:', entity_text, '单位:', unit)
+                    if re.search('总投资', sentence_text[_match.span()[0] - 6:_match.span()[0]]):  # 2021/8/5过滤掉总投资金额
+                        # print('总投资金额: ', _match.group(0))
+                        notes = '总投资'
+                    if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆]', entity_text) != None:
+                        notes = '大写'
+                        # print("补充备注:notes = 大写")
+                    elif re.search('单价', sentence_text[_match.span()[0]:_match.span()[1]]):
+                        notes = '单价'
+                        # print("补充备注:单价 ",sentence_text[_match.span()[0]-2:_match.span()[1]])
                     if len(unit)>0:
-                        entity_text = str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0]))
+                        if unit.find('万')>=0 and len(entity_text.split('.')[0])>=8: # 2021/7/19 修正万元金额过大的情况
+                            # print('修正单位万元金额过大的情况 金额:', entity_text, '单位:', unit)
+                            entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(unit[0])/10000)
+                        else:
+                            # print('str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0])):')
+                            entity_text = str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0]))
                     else:
-                        entity_text = str(getUnifyMoney(entity_text))
+                        if entity_text.find('万')>=0 and entity_text.split('.')[0].isdigit() and len(entity_text.split('.')[0])>=8:
+                            entity_text = str(getUnifyMoney(entity_text)/10000)
+                            # print('修正金额字段含万 过大的情况')
+                        else:
+                            entity_text = str(getUnifyMoney(entity_text))
 
                     if float(entity_text)<100 or float(entity_text)>100000000000:
+                        # print('过滤掉金额:float(entity_text)<100 or float(entity_text)>100000000000', entity_text, unit)
                         continue
 
                     if notSure and unit=="" and float(entity_text)>100*10000:
+                        # print('过滤掉金额 notSure and unit=="" and float(entity_text)>100*10000:', entity_text, unit)
                         continue
 
                     _exists = False
@@ -1762,7 +1830,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                     if not _exists:
                         if float(entity_text)>1:
                             list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,begin_index_temp,end_index_temp))
-
+                            list_sentence_entitys[-1].notes = notes  # 2021/7/20 新增金额备注
                 else:
                     index += 1
 
@@ -1824,7 +1892,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 end_index = changeIndexFromWordToWords(tokens, end_index_temp)
                 if begin_index is None or end_index is None:
                     continue
-                print(begin_index_temp,end_index_temp,begin_index,end_index)
+                # print(begin_index_temp,end_index_temp,begin_index,end_index)
                 entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                 entity_text = bidway['body']
                 list_sentence_entitys.append(
@@ -1949,7 +2017,7 @@ def getPredictTable():
             df_data["docid"].append(item["docid"])
             df_data["json_table"].append(item["json_table"])
         except Exception as e:
-            print(e)
+            # print(e)
             break
     df_1 = pd.DataFrame(df_data)
     df_1.to_csv("../form/websource_67000_table.csv",columns=["docid","json_table"])
@@ -1965,7 +2033,7 @@ if __name__=="__main__":
             f.write(segment(tableToText(BeautifulSoup(content,"lxml"))))
     '''
     # content = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
-    # print(segment(tableToText(BeautifulSoup(content,"lxml"))))
+    # # print(segment(tableToText(BeautifulSoup(content,"lxml"))))
     getPredictTable()

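Two of the changes above can be exercised in isolation: the widened behind_m alternative (extra filter units such as 吨/亩/米) and the OCR decimal-comma repair. A sketch using the regexes quoted in this diff; the sample strings are made up, and the full pattern_money in the repo combines four alternatives:

import re

# "behind_m" alternative from the updated list_money_pattern above
behind_m = re.compile("(()()(?P<money_behind_m>[0-9][\\d,,]*(?:\\.\\d+)?(?:,?)[百千万亿]*)"
                      "[\\((]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\\))]?)")
m = behind_m.search("预算金额为1,200万元(不含税)")
print(m.group('money_behind_m'), m.group('unit_behind_m'))  # 1,200万 元

# OCR comma repair: '19,20万' is assumed to have been '19.20万'
amount = '19,20万'
if re.search('(^\\d{2,},\\d{4,}万?$)|(^\\d{2,},\\d{2}万?$)', amount):
    amount = amount.replace(',', '.')
print(amount)  # 19.20万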
+ 15 - 1
BiddingKG/dl/interface/extract.py

@@ -91,8 +91,13 @@ def predict(doc_id,text,title=""):
     list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
     cost_time["punish"] = time.time()-start_time
 
+    start_time = time.time()
+    list_channel_dic = predictor.getPredictor("channel").predict(title=title, content=list_sentences[0])
+    cost_time["channel"] = time.time()-start_time
+
     #print(prem)
-    data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
+    # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
+    data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
     data_res["cost_time"] = cost_time
     data_res["success"] = True
 
@@ -122,4 +127,13 @@ def test(name,content):
 
 
 if __name__=="__main__":
+    import pandas as pd
+    df = pd.read_excel('G:\公告金额/170角色金额原模型预测错误数据_new3为新预测中标金额_predict0812.xlsx')
+    # for i in range(50):
+    i = 246
+    doc_id = df.loc[i, 'docid']
+    text = df.loc[i, 'dochtmlcon']
+    title = df.loc[i, 'doctitle']
+    rs = predict(doc_id,text,title)
+    print(rs)
     pass

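With the channel predictor merged in, the dict returned by predict() gains a docchannel field alongside the existing code/prem/punish results. A hypothetical shape, with only the keys visible in this diff spelled out and union_result assumed to merge the per-predictor dicts:

data_res = {
    # ...fields merged from the codeName, prem and punish predictors...
    'docchannel': '中标公告',                         # newly merged channel label
    'cost_time': {'channel': 0.05, 'punish': 0.12},  # per-stage timings, now incl. channel
    'success': True,
}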
+ 112 - 37
BiddingKG/dl/interface/getAttributes.py

@@ -2,6 +2,7 @@
 
 from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date
 from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
+from decimal import Decimal
 import re
 import copy
 import math
@@ -240,11 +241,11 @@ def get_legal_comba(list_entity,dict_role_combination):
         # recursive_package(dict_role_combination[packageName], set(), {}, _list_all_selution)
         _list_all_selution = circle_package(dict_role_combination[packageName])
         '''
-        print("===1")
-        print(packageName)
+        # print("===1")
+        # print(packageName)
         for item in _list_all_selution:
-            print(item)
-        print("===2")
+            # print(item)
+        # print("===2")
         '''
         #去除包含子集
         list_all_selution_simple = []
@@ -759,8 +760,16 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
     def addMoneyByEntity(packDict,packageName,entity,money,money_prob):
         for i in range(len(packDict[packageName]["roleList"])):
             if packDict[packageName]["roleList"][i].entity_text==entity:
-                if money_prob>packDict[packageName]["roleList"][i].money_prob:
-                    packDict[packageName]["roleList"][i].money = money
+                # if money_prob>packDict[packageName]["roleList"][i].money_prob:
+                #     packDict[packageName]["roleList"][i].money = money
+                #     packDict[packageName]["roleList"][i].money_prob = money_prob
+                if packDict[packageName]["roleList"][i].money_prob==0 :  # 2021/7/20第一次更新金额
+                    packDict[packageName]["roleList"][i].money = money.entity_text
+                    packDict[packageName]["roleList"][i].money_prob = money_prob
+                elif money_prob>packDict[packageName]["roleList"][i].money_prob+0.2 or money.notes in ['大写']: # 2021/7/20改为优先选择大写金额,
+                    # print('已连接金额概率:money_prob:',packDict[packageName]["roleList"][i].money_prob)
+                    # print('链接金额备注 ',money.notes, money.entity_text, money.values)
+                    packDict[packageName]["roleList"][i].money = money.entity_text
                     packDict[packageName]["roleList"][i].money_prob = money_prob
         return packDict
     
@@ -778,6 +787,15 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                 return True
     
     p_entity = 0
+
+    # 2021/7/19 顺序比较金额,前面是后面的一万倍则把前面金额/10000
+    money_list = [it for it in list_entity if it.entity_type=="money"]
+    for i in range(len(money_list)-1):
+        for j in range(1, len(money_list)):
+            if (float(money_list[i].entity_text) > 5000000000 or money_list[j].notes=='大写') and \
+                    Decimal(money_list[i].entity_text)/Decimal(money_list[j].entity_text)==10000:
+                money_list[i].entity_text = str(Decimal(money_list[i].entity_text)/10000)
+                # print('连接前修改大于50亿金额:前面是后面的一万倍则把前面金额/10000')
     
     #遍历所有实体
     while(p_entity<len(list_entity)):
@@ -818,28 +836,29 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                                 if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
                                     _entity.pointer_person = entity
     '''
-        #金额往前找实体
-        if entity.entity_type=="money":
-            if entity.values[entity.label]>=on_value:
-                p_entity_money= p_entity
-                entity_money = list_entity[p_entity_money]
-                if len(PackageSet)>0:
-                    packagePointer,_ = getPackage(PackageList,entity_money.sentence_index,entity_money.begin_index,"money-"+str(entity_money.entity_text)+"-"+str(entity_money.label))
-                    if packagePointer is None:
-                        packageName_entity = "Project"
-                    else:
-                        packageName_entity = packagePointer.entity_text
-                else:
-                    packageName_entity = "Project"
-                while(p_entity_money>0):
-                    entity_before = list_entity[p_entity_money]
-                    if entity_before.entity_type in ['org','company']:
-                        if str(entity_before.label)=="1":
-                            addMoneyByEntity(PackDict, packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label])
-                            #add pointer_money
-                            entity_before.pointer_money = entity_money
-                        break
-                    p_entity_money -= 1
+        # #金额往前找实体
+        # if entity.entity_type=="money":
+        #     if entity.values[entity.label]>=on_value:
+        #         p_entity_money= p_entity
+        #         entity_money = list_entity[p_entity_money]
+        #         if len(PackageSet)>0:
+        #             packagePointer,_ = getPackage(PackageList,entity_money.sentence_index,entity_money.begin_index,"money-"+str(entity_money.entity_text)+"-"+str(entity_money.label))
+        #             if packagePointer is None:
+        #                 packageName_entity = "Project"
+        #             else:
+        #                 packageName_entity = packagePointer.entity_text
+        #         else:
+        #             packageName_entity = "Project"
+        #         while(p_entity_money>0):
+        #             entity_before = list_entity[p_entity_money]
+        #             if entity_before.entity_type in ['org','company']:
+        #                 if str(entity_before.label)=="1":
+        #                     addMoneyByEntity(PackDict, packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label])
+        #                     #add pointer_money
+        #                     entity_before.pointer_money = entity_money
+        #                 break
+        #             p_entity_money -= 1
+
 
 
         #如果实体属于角色集合,则往后找属性
@@ -882,9 +901,23 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                                 else:
                                     packageName_entity = "Project"
                                 if str(entity.label) in ["2","3","4"]:
-                                    addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
+                                    # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
+                                    if entity_after.notes == '单价':
+                                        addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
+                                                         0.5)
+                                        entity.pointer_money = entity_after
+                                        # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
+                                    else:
+                                        addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
+                                                         entity_after.values[entity_after.label])
+                                        entity.pointer_money = entity_after
+                                        # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
+                                        break # 2021/7/16 新增,找到中标金额,非单价即停止,不再往后找金额
                                     #add pointer_money
                                     #add pointer_money
-                                    entity.pointer_money = entity_after
+                                    # entity.pointer_money = entity_after
+                                    # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
+                                    # if entity_after.notes!='单价':
+                                    #     break  # 2021/7/16 新增,找到中标金额即停止,不再往后找金额
                         '''
                     if entity_after.entity_type=="person":
                         if entity_after.values[entity_after.label]>=on_value_person:
@@ -1034,6 +1067,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
     p_entity = len(list_entity)-1
 
     set_tenderer_money = set()
+    list_tenderer_money = []  #2021/7/16 新增列表,倒序保存所有中标金额
+
     #遍历所有实体
     while(p_entity>=0):
         entity = list_entity[p_entity]
@@ -1041,7 +1076,9 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
             if entity.values[entity.label]>=on_value:
                 if str(entity.label)=="1":
                     set_tenderer_money.add(float(entity.entity_text))
-                if str(entity.label)=="0":
+                    list_tenderer_money.append(float(entity.entity_text))  # 2021/7/16 新增列表,倒序保存所有中标金额
+                # if str(entity.label)=="0":
+                if str(entity.label)=="0" and entity.notes!='总投资':
                     '''
                     if p_entity>0:
                         p_before = list_entity[p_entity-1]
@@ -1056,8 +1093,10 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                         packageName = packagePointer.entity_text
                         
                     if packageName=="Project":
-                        if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
-                            PackDict["Project"]["tendereeMoney"] = float(entity.entity_text) 
+                        # if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
+                        #     PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
+                        if entity.values[entity.label]>on_value:
+                            PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
                     else:
                         PackDict[packageName]["tendereeMoney"] = float(entity.entity_text)
                         #add pointer_tendereeMoney
@@ -1091,6 +1130,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
     #只找到一个中标人和中标金额
     if len(set_tenderer_money)==1 and len(set_tenderer_role)==1:
         list(set_tenderer_role)[0].money = list(set_tenderer_money)[0]
+        # print('一个中标人一个金额:', list(set_tenderer_money)[0])
     #找到一个中标人和多个招标金额
     if len(set_tenderer_money)>1 and len(set_tenderer_role)==1:
         _maxMoney = 0
@@ -1101,8 +1141,15 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                 _maxMoney = _m
         if _sumMoney/_maxMoney==2:
             list(set_tenderer_role)[0].money = _maxMoney
+            # print('一人多金额分项合计 取最大金额:', _maxMoney)
         else:
-            list(set_tenderer_role)[0].money = _maxMoney
+            # list(set_tenderer_role)[0].money = _maxMoney
+            if min(list_tenderer_money)>200000 and list_tenderer_money[-1]/min(list_tenderer_money)>9000:
+                list(set_tenderer_role)[0].money = min(list_tenderer_money)
+                # print('一人多金额 且最小的大于20万第一个金额比最小金额大几千倍的最小中标金额:', min(list_tenderer_money))
+            else:
+                list(set_tenderer_role)[0].money = list_tenderer_money[-1]  # 2021/7/16 修改 不是单价合计方式取第一个中标金额
+                # print('一人多金额 取第一个中标金额:', list_tenderer_money[-1])
     #每个包都只找到一个金额
     _flag_pack_money = True
     for k,v in dict_pack_tenderer_money.items():
@@ -1111,6 +1158,31 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
     if _flag_pack_money and len(PackageSet)==len(dict_pack_tenderer_money.keys()):
         for k,v in dict_pack_tenderer_money.items():
             v[0].money = list(v[1])[0]
+            # print('k,v in dict_pack_tenderer_money.items', k, v)
+
+    # 2021/7/16 #增加判断中标金额是否远大于招标金额逻辑
+    for pack in PackDict.keys():
+        for i in range(len(PackDict[pack]["roleList"])):
+            if PackDict[pack]["tendereeMoney"] > 0:
+                # print('金额数据类型:',type(PackDict[pack]["roleList"][i].money))
+                if float(PackDict[pack]["roleList"][i].money) >10000000 and \
+                        float(PackDict[pack]["roleList"][i].money)/float(PackDict[pack]["tendereeMoney"])>=1000:
+                    PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
+                    # print('招标金额校正中标金额')
+
+    # 2021/7/19 #增加判断中标金额是否远大于第二三中标金额
+    for pack in PackDict.keys():
+        tmp_moneys = []
+        for i in range(len(PackDict[pack]["roleList"])):
+            if float(PackDict[pack]["roleList"][i].money) >100000:
+                tmp_moneys.append(float(PackDict[pack]["roleList"][i].money))
+        if len(tmp_moneys)>2 and max(tmp_moneys)/min(tmp_moneys)>1000:
+            for i in range(len(PackDict[pack]["roleList"])):
+                if float(PackDict[pack]["roleList"][i].money)/min(tmp_moneys)>1000:
+                    PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
+                    # print('通过其他中标人投标金额校正中标金额')
+
+
     for pack in PackDict.keys():
         for i in range(len(PackDict[pack]["roleList"])):
             PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
@@ -1148,7 +1220,7 @@ def getPackageRoleMoney(list_sentence,list_entity):
     RoleList,RoleSet,PackageList,PackageSet = theRole
     '''
     for item in PackageList:
-        print(item)
+        # print(item)
     '''
     # print("=2")
     PackDict = initPackageAttr(RoleList, PackageSet)
@@ -1185,7 +1257,8 @@ def getOtherAttributes(list_entity):
                   "time_bidopen":"",
                   "time_bidopen":"",
                   "time_bidclose":"",
                   "time_bidclose":"",
                   "serviceTime":"",
                   "serviceTime":"",
-                  "product":[]}
+                  "product":[],
+                  "total_tendereeMoney":0}
     for entity in list_entity:
         if entity.entity_type == 'bidway':
             dict_other["bidway"] = turnBidWay(entity.entity_text)
@@ -1203,6 +1276,8 @@ def getOtherAttributes(list_entity):
             dict_other["person_review"].append(entity.entity_text)
         elif entity.entity_type=='product':
             dict_other["product"].append(entity.entity_text)
+        elif entity.entity_type=='money' and entity.notes=='总投资' and dict_other["total_tendereeMoney"]<float(entity.entity_text):
+            dict_other["total_tendereeMoney"] = float(entity.entity_text)
     dict_other["product"] = list(set(dict_other["product"]))
     dict_other["product"] = list(set(dict_other["product"]))
     return dict_other
     return dict_other
 
 
@@ -1241,7 +1316,7 @@ if __name__=="__main__":
     for row in rows:
         
         count += 1
-        print(count)
+        # print(count)
         doc_id = row[0]
         
         roleList = getPackageRoleMoney(doc_id)
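The two correction passes added above both guard against 元/万元 unit mix-ups: an award amount at least 1000x the tender budget (or 1000x a co-winner's bid) is assumed to have been parsed in yuan where wan-yuan was meant, and is divided by 10000. A minimal standalone sketch of that heuristic, using plain floats instead of the repo's role objects (the function and variable names below are illustrative only):

def correct_award_money(award, tenderee_money):
    # Assumed heuristic from the diff above: an award over 10,000,000 that is
    # also >=1000x the budget was likely parsed in yuan instead of wan-yuan.
    if tenderee_money > 0 and award > 10000000 and award / tenderee_money >= 1000:
        return award / 10000
    return award

assert correct_award_money(50000000.0, 5000.0) == 5000.0  # 5000万 mis-parsed as 元
assert correct_award_money(4800.0, 5000.0) == 4800.0      # plausible value kept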

+ 4 - 3
BiddingKG/dl/interface/modelFactory.py

@@ -9,7 +9,7 @@ import sys
 sys.path.append(os.path.abspath("../.."))
 from keras import models
 from keras import layers
-from keras_contrib.layers import CRF
+# from keras_contrib.layers import CRF
 from keras.preprocessing.sequence import pad_sequences
 from keras import optimizers,losses,metrics
 from BiddingKG.dl.common.Utils import *
@@ -77,7 +77,7 @@ class Model_role_classify_word():
     def encode(self,tokens,begin_index,end_index,entity_text,**kwargs):
         _span = spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=12,center_include=True,word_flag=True,text=entity_text)
         # print(_span)
-        _encode_span = encodeInput(_span, word_len=50, word_flag=True,userFool=False)
+        _encode_span = encodeInput(_span, word_len=20, word_flag=True,userFool=False)
         # print(_encode_span)
         return _encode_span
     
@@ -109,6 +109,7 @@ class Model_money_classify():
         with self.sess_money.as_default() as sess:
           with sess.graph.as_default():
             meta_graph_def = tf.saved_model.loader.load(sess,tags=["serve"],export_dir=os.path.dirname(__file__)+"/money_savedmodel")
+            # meta_graph_def = tf.saved_model.loader.load(sess,tags=["serve"],export_dir=os.path.dirname(__file__)+"/money_savedmodel_bilstmonly")
             signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
             signature_def = meta_graph_def.signature_def
             
@@ -135,7 +136,7 @@ class Model_money_classify():
     def encode(self,tokens,begin_index,end_index,**kwargs):
         _span = spanWindow(tokens=tokens, begin_index=begin_index, end_index=end_index, size=10, center_include=True, word_flag=True)
         # print(_span)
-        return encodeInput(_span, word_len=50, word_flag=True,userFool=False)
+        return encodeInput(_span, word_len=30, word_flag=True,userFool=False)
         return embedding_word(_span,shape=(3,100,60))
     
     def predict(self,x):
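The word_len changes above (50 → 20 for the role classifier, 50 → 30 for the money classifier) shrink the fixed-length id sequences fed to the models. A sketch of the truncate-or-pad contract that word_len controls; pad_to is a hypothetical stand-in for the repo's encodeInput, shown only to make the shape behaviour concrete:

def pad_to(ids, word_len):
    # Truncate long spans and zero-pad short ones to exactly word_len ids.
    return ids[:word_len] + [0] * max(0, word_len - len(ids))

assert pad_to([3, 1, 4, 1, 5], 3) == [3, 1, 4]
assert pad_to([3, 1], 4) == [3, 1, 0, 0]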

BIN
BiddingKG/dl/interface/money_savedmodel/saved_model.pb


BIN
BiddingKG/dl/interface/money_savedmodel/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/interface/money_savedmodel/variables/variables.index


+ 219 - 31
BiddingKG/dl/interface/predictor.py

@@ -6,6 +6,7 @@ Created on 2018年12月26日
 
 import os
 import sys
+import fool
 sys.path.append(os.path.abspath("../.."))
 # from keras.engine import topology
 # from keras import models
@@ -28,7 +29,8 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
                   "form":{"predictor":None,"Lock":RLock()},
                   "form":{"predictor":None,"Lock":RLock()},
                   "time":{"predictor":None,"Lock":RLock()},
                   "time":{"predictor":None,"Lock":RLock()},
                   "punish":{"predictor":None,"Lock":RLock()},
                   "punish":{"predictor":None,"Lock":RLock()},
-                  "product":{"predictor":None,"Lock":RLock()}}
+                  "product":{"predictor":None,"Lock":RLock()},
+                  "channel": {"predictor": None, "Lock": RLock()}}
 
 
 def getPredictor(_type):
@@ -51,6 +53,8 @@ def getPredictor(_type):
                     dict_predictor[_type]["predictor"] = Punish_Extract()
                     dict_predictor[_type]["predictor"] = Punish_Extract()
                 if _type=="product":
                 if _type=="product":
                     dict_predictor[_type]["predictor"] = ProductPredictor()
                     dict_predictor[_type]["predictor"] = ProductPredictor()
+                if _type == "channel":
+                    dict_predictor[_type]["predictor"] = DocChannel()
             return dict_predictor[_type]["predictor"]
     raise NameError("no this type of predictor")
 
@@ -1286,13 +1290,189 @@ class ProductPredictor():
                                                  wordOffset_end=end)
                                 list_entity.append(_entity)
                                 temp_list.append(sentence.sentence_text[start:end])
-                        item["product"] = list(set(temp_list))
-                        result.append(item)
+                        # item["product"] = list(set(temp_list))
+                        # result.append(item)
                         if _begin_index+_LEN >= len(list_sentence):
                             break
                         _begin_index += _LEN
+                    item["product"] = list(set(temp_list))
+                    result.append(item) # bug fix: append once per item, after the sentence loop
                 return result
 
+# docchannel category extraction
+class DocChannel():
+  def __init__(self, life_model='/channel_savedmodel/channel.pb', type_model='/channel_savedmodel/doctype.pb'):
+    self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
+    self.mask, self.mask_title = self.load_life(life_model)
+    self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\
+    self.type_mask, self.type_mask_title = self.load_type(type_model)
+    self.sequen_len = 200  # 150 200
+    self.title_len = 30
+    self.sentence_num = 10
+    self.kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
+
+    lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
+    lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+    self.id2type = {k: v for k, v in enumerate(lb_type)}
+    self.id2life = {k: v for k, v in enumerate(lb_life)}
+
+  def load_life(self,life_model):
+    with tf.Graph().as_default() as graph:
+      output_graph_def = graph.as_graph_def()
+      with open(os.path.dirname(__file__)+life_model, 'rb') as f:
+        output_graph_def.ParseFromString(f.read())
+        tf.import_graph_def(output_graph_def, name='')
+        print("%d ops in the final graph" % len(output_graph_def.node))
+        del output_graph_def
+        sess = tf.Session(graph=graph)
+        sess.run(tf.global_variables_initializer())
+        inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+        prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+        title = sess.graph.get_tensor_by_name('inputs/title:0')
+        mask = sess.graph.get_tensor_by_name('inputs/mask:0')
+        mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
+        # logit = sess.graph.get_tensor_by_name('output/logit:0')
+        softmax = sess.graph.get_tensor_by_name('output/softmax:0')
+        return sess, title, inputs, prob, softmax, mask, mask_title
+
+  def load_type(self,type_model):
+    with tf.Graph().as_default() as graph:
+      output_graph_def = graph.as_graph_def()
+      with open(os.path.dirname(__file__)+type_model, 'rb') as f:
+        output_graph_def.ParseFromString(f.read())
+        tf.import_graph_def(output_graph_def, name='')
+        print("%d ops in the final graph" % len(output_graph_def.node))
+        del output_graph_def
+        sess = tf.Session(graph=graph)
+        sess.run(tf.global_variables_initializer())
+        inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+        prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+        title = sess.graph.get_tensor_by_name('inputs/title:0')
+        mask = sess.graph.get_tensor_by_name('inputs/mask:0')
+        mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
+        # logit = sess.graph.get_tensor_by_name('output/logit:0')
+        softmax = sess.graph.get_tensor_by_name('output/softmax:0')
+        return sess, title, inputs, prob, softmax, mask, mask_title
+
+  def predict_process(self, docid='', doctitle='', dochtmlcon=''):
+    # print('准备预处理')
+    def get_kw_senten(s, span=10):
+      doc_sens = []
+      tmp = 0
+      num = 0
+      end_idx = 0
+      for it in re.finditer(self.kws, s):  # '|'.join(keywordset)
+        left = s[end_idx:it.end()].split()
+        right = s[it.end():].split()
+        tmp_seg = s[tmp:it.start()].split()
+        if len(tmp_seg) > span or tmp == 0:
+          doc_sens.append(' '.join(left[-span:] + right[:span]))
+          end_idx = it.end() + 1 + len(' '.join(right[:span]))
+          tmp = it.end()
+          num += 1
+          if num >= self.sentence_num:
+            break
+      if doc_sens == []:
+        doc_sens.append(s)
+      return doc_sens
+
+    def word2id(wordlist, max_len=self.sequen_len):
+      ids = [getIndexOfWords(w) for w in wordlist]
+      ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
+      assert len(ids) == max_len
+      return ids
+
+    cost_time = dict()
+    datas = []
+    datas_title = []
+    try:
+      segword_title = ' '.join(fool.cut(doctitle)[0])
+      segword_content = dochtmlcon
+    except:
+      segword_content = ''
+      segword_title = ''
+    if isinstance(segword_content, float):
+      segword_content = ''
+    if isinstance(segword_title, float):
+      segword_title = ''
+    segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
+      replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
+      replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
+    segword_title = re.sub('[^\s\u4e00-\u9fa5]', '', segword_title)
+    segword_content = re.sub('[^\s\u4e00-\u9fa5]', '', segword_content)
+    doc_word_list = segword_content.split()
+    if len(doc_word_list) > self.sequen_len / 2:
+      doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
+      doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
+    else:
+      doc_sens = ' '.join(doc_word_list[:self.sequen_len])
+    datas.append(doc_sens.split())
+    datas_title.append(segword_title.split())
+    # print('完成预处理')
+    return datas, datas_title
+
+  def is_houxuan(self, title, content):
+    '''
+    Judge from the title and the Chinese body text whether the notice belongs to the winning-candidate publicity (候选人公示) category
+    :param title: notice title
+    :param content: notice body text
+    :return: 1 if it is a winning-candidate publicity; 0 otherwise
+    '''
+    if re.search('候选人的?公示|评标结果|评审结果|中标公示', title):  # (中标|成交|中选|入围)
+      if re.search('变更公告|更正公告|废标|终止|答疑|澄清', title):
+        return 0
+      return 1
+    if re.search('候选人的?公示', content[:100]):
+      if re.search('公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清', content[:100]):
+        return 0
+      return 1
+    else:
+      return 0
+
+  def predict(self, title='', content=''):
+    # print('准备预测')
+    if isinstance(content, list):
+      token_l = [it.tokens for it in content]
+      tokens = [it for l in token_l for it in l]
+      content = ' '.join(tokens)
+
+    data_content, data_title = self.predict_process(docid='', doctitle=title[:50], dochtmlcon=content) # title capped at 50 characters
+    text_len = len(data_content[0]) if len(data_content[0])<self.sequen_len else self.sequen_len
+    title_len = len(data_title[0]) if len(data_title[0])<self.title_len else self.title_len
+
+    array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128))
+    array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128))
+    pred = self.type_sess.run(self.type_softmax,
+                                    feed_dict={
+                                              self.type_title: array_title,
+                                              self.type_content: array_content,
+                                              self.type_mask:[[0]*text_len+[1]*(self.sequen_len-text_len)],
+                                              self.type_mask_title:[[0]*title_len+[1]*(self.title_len-title_len)],
+                                              self.type_prob:1}
+                            )
+    id = np.argmax(pred, axis=1)[0]
+    prob = pred[0][id]
+    if id == 0:
+      pred = self.lift_sess.run(self.lift_softmax,
+                                      feed_dict={
+                                                self.lift_title: array_title,
+                                                self.lift_content: array_content,
+                                                self.mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
+                                                self.mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
+                                                self.lift_prob:1}
+                              )
+      id = np.argmax(pred, axis=1)[0]
+      prob = pred[0][id]
+      if id == 6:
+        if self.is_houxuan(''.join([it for it in title if it.isalpha()]), ''.join([it for it in content if it.isalpha()])):
+          # return '候选人公示', prob
+          return [{'docchannel': '候选人公示'}]
+      # return self.id2life[id], prob
+      return [{'docchannel':self.id2life[id]}]
+    else:
+      # return self.id2type[id], prob
+      return [{'docchannel':self.id2type[id]}]
+
 def getSavedModel():
     #predictor = FormPredictor()
     graph = tf.Graph()
@@ -1493,20 +1673,28 @@ def save_role_model():
                                    )
     
 def save_money_model():
-    model_money = PREMPredict().model_money
-    with model_money.graph.as_default():
-        model = model_money.getModel()
-        sess = tf.Session(graph=model_money.graph)
-        model.summary()
-        sess.run(tf.global_variables_initializer())
-        h5_to_graph(sess, model_money.graph, model_money.model_money_file)
-        tf.saved_model.simple_save(sess,
-                                   "./money_savedmodel/",
-                                   inputs = {"input0":model.input[0],
-                                             "input1":model.input[1],
-                                             "input2":model.input[2]},
-                                   outputs = {"outputs":model.output}
-                                   )
+    model_file = os.path.dirname(__file__)+"/../money/models/model_money_word.h5"
+    graph = tf.Graph()
+    with graph.as_default():
+
+        sess = tf.Session(graph=graph)
+
+        with sess.as_default():
+            # model = model_money.getModel()
+            # model.summary()
+            # sess.run(tf.global_variables_initializer())
+            # h5_to_graph(sess, model_money.graph, model_money.model_money_file)
+
+            model = models.load_model(model_file,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
+            model.summary()
+            print(model.weights)
+            # tf.saved_model.simple_save(sess,
+            #                            "./money_savedmodel2/",
+            #                            inputs = {"input0":model.input[0],
+            #                                      "input1":model.input[1],
+            #                                      "input2":model.input[2]},
+            #                            outputs = {"outputs":model.output}
+            #                            )
     
 
 def save_person_model():
@@ -1582,23 +1770,23 @@ def save_timesplit_model():
 if __name__=="__main__":
 if __name__=="__main__":
     #save_role_model()
     #save_role_model()
     # save_codename_model()
     # save_codename_model()
-    #save_money_model()
+    save_money_model()
     #save_person_model()
     #save_form_model()
     #save_codesplit_model()
     # save_timesplit_model()
     '''
-    with tf.Session(graph=tf.Graph()) as sess:
-        from tensorflow.python.saved_model import tag_constants
-        meta_graph_def = tf.saved_model.loader.load(sess, [tag_constants.SERVING], "./person_savedModel")
-        graph = tf.get_default_graph()
-        signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-        signature = meta_graph_def.signature_def
-        input0 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input0"].name)
-        input1 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input1"].name)
-        outputs = sess.graph.get_tensor_by_name(signature[signature_key].outputs["outputs"].name)
-        x = load("person_x.pk")
-        _data = np.transpose(x,[1,0,2,3])
-        y = sess.run(outputs,feed_dict={input0:_data[0],input1:_data[1]})
-        print(np.argmax(y,-1))
+    # with tf.Session(graph=tf.Graph()) as sess:
+    #     from tensorflow.python.saved_model import tag_constants
+    #     meta_graph_def = tf.saved_model.loader.load(sess, [tag_constants.SERVING], "./person_savedModel")
+    #     graph = tf.get_default_graph()
+    #     signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+    #     signature = meta_graph_def.signature_def
+    #     input0 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input0"].name)
+    #     input1 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input1"].name)
+    #     outputs = sess.graph.get_tensor_by_name(signature[signature_key].outputs["outputs"].name)
+    #     x = load("person_x.pk")
+    #     _data = np.transpose(x,[1,0,2,3])
+    #     y = sess.run(outputs,feed_dict={input0:_data[0],input1:_data[1]})
+    #     print(np.argmax(y,-1))
     '''
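A hedged usage sketch for the DocChannel predictor added to predictor.py above, assuming BiddingKG is importable and the two .pb files exist under channel_savedmodel; the sample strings are illustrative only:

from BiddingKG.dl.interface.predictor import getPredictor

channel = getPredictor("channel")  # built lazily under the type's RLock
result = channel.predict(title='某医院设备采购中标公告', content='...公告正文...')
print(result)                      # e.g. [{'docchannel': '中标信息'}]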

BIN
BiddingKG/dl/interface/role_savedmodel/saved_model.pb


BIN
BiddingKG/dl/interface/role_savedmodel/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/interface/role_savedmodel/variables/variables.index


+ 230 - 0
BiddingKG/dl/money/money_keras.py

@@ -0,0 +1,230 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/7/27 0027 15:05 
+
+import os
+import sys
+import h5py
+from keras import models,layers,losses,optimizers
+sys.path.append(os.path.abspath("../../.."))
+import pandas as pd
+import math
+from keras.callbacks import ModelCheckpoint
+from BiddingKG.dl.common.Utils import *
+import tensorflow as tf
+from keras.models import load_model
+
+lb = ['招标金额','中标金额','其他金额']
+id2lb = {k:v for k,v in enumerate(lb)}
+lb2id = {v:k for k,v in id2lb.items()}
+seq_len = 30
+
+def labeling(label, out_len=3):
+    out = np.zeros((out_len))
+    out[label] = 1
+    return out
+
+
+def getTrainData(percent=0.9):
+    df = pd.read_excel('traindata/2兼职标注数据_test22.xlsx')
+    df2 = pd.read_excel('traindata/原金额模型标注数据.xls')
+    df = df.append(df2, ignore_index=True)
+    df.dropna(subset=['left'], inplace=True)
+    df.fillna('', inplace=True)
+    if 'relabel' in df.columns:
+        df['label'] = df.apply(lambda x:x['relabel'] if x['relabel']!="" else x['label'], axis=1)
+        print('更新标注完成')
+    for i in df.index:
+        if df.loc[i, 'label'] not in lb:
+            print('标签错误:',df.loc[i, 'label'])
+    df['label'] = df['label'].apply(lambda x:lb2id.get(x, 0))
+
+    print('总样本:', len(df))
+    train_x = []
+    train_y = []
+    test_x = []
+    test_y = []
+
+    for before, text, after, label in zip(df["left"], df["center"], df["right"], df["label"]):
+        before = str(before) if str(before) != "nan" else ""
+        text = str(text)
+        after = str(after) if str(after) != "nan" else ""
+
+        x = encodeInput([before, text, after], word_len=seq_len, word_flag=True, userFool=False)
+        y = labeling(label)
+        if np.random.random() < percent:
+            train_x.append(x)
+            train_y.append(y)
+        else:
+            test_x.append(x)
+            test_y.append(y)
+    return np.transpose(np.array(train_x), (1, 0, 2)), np.array(train_y), np.transpose(np.array(test_x),
+                                                                                       (1, 0, 2)), np.array(test_y)
+def word2id(df):
+    train_x = []
+    train_y = []
+    test_x = []
+    test_y = []
+
+    for before, text, after, label in zip(df["left"], df["center"], df["right"], df["label"]):
+        before = str(before) if str(before) != "nan" else ""
+        text = str(text)
+        after = str(after) if str(after) != "nan" else ""
+
+        x = encodeInput([before, text, after], word_len=seq_len, word_flag=True, userFool=False)
+        y = labeling(label)
+        train_x.append(x)
+        train_y.append(y)
+    return np.transpose(np.array(train_x), (1, 0, 2)), np.array(train_y)
+
+
+def train():
+    # pk_file = "traindata/all_data.pk"
+    # if os.path.exists(pk_file):
+    #     train_x, train_y, test_x, test_y = load(pk_file)
+    # else:
+    #     train_x, train_y, test_x, test_y = getTrainData()
+    #     save([train_x, train_y, test_x, test_y], pk_file)
+
+    df_train = pd.read_excel('traindata/df_train.xlsx')
+    df_test = pd.read_excel('traindata/df_test.xlsx')
+    train_x, train_y = word2id(df_train)
+    test_x, test_y = word2id(df_test)
+    with tf.Session() as sess:
+        vocab, matrix = getVocabAndMatrix(getModel_word())
+        model = getBiLSTMModel(input_shape=(3, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=3)
+        print("loading weights")
+        # model.load_weights("log/ep378-loss0.178-val_loss0.117-f1_score0.965.h5",by_name=True, skip_mismatch=True)
+
+        callback = ModelCheckpoint(
+            filepath="log/" + "ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1_score{val_f1_score:.3f}.h5",
+            monitor="val_loss", save_best_only=True, save_weights_only=True, mode="min")
+        model.fit(x=[train_x[0], train_x[1], train_x[2]], y=train_y, batch_size=128, epochs=600,callbacks=[callback],
+                  validation_data=[[test_x[0], test_x[1], test_x[2]], test_y]) #
+
+
+def test(_span = [':预算金额1000000元,中标金额', '1151元', ';']):
+    input = encodeInput(_span, word_len=seq_len, word_flag=True, userFool=False)
+    print(input)
+    graph = tf.get_default_graph()
+    with graph.as_default():
+        sess = tf.Session(graph=graph)
+        with sess.as_default():
+            vocab, matrix = getVocabAndMatrix(getModel_word())
+            model = getBiLSTMModel(input_shape=(3, seq_len, 60), vocab=vocab, embedding_weights=matrix,
+                                   classes=3)
+            model.load_weights("log/ep007-loss0.079-val_loss0.099-f1_score0.966.h5", by_name=True, skip_mismatch=True)
+            logit = model.predict([np.array([input[0]]), np.array([input[1]]), np.array([input[2]])])
+            print(logit)
+            return logit
+
+def get_savedModel():
+    sess = tf.Session(graph=tf.Graph())
+    with sess.as_default():
+        with sess.graph.as_default():
+            vocab, matrix = getVocabAndMatrix(getModel_word())
+            model = getBiLSTMModel(input_shape=(3, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=3)
+            sess.run(tf.global_variables_initializer())
+            # model.load_weights(filepath="log/ep009-loss0.057-val_loss0.076-f1_score0.978.h5")
+            # model.load_weights(filepath="log/ep007-loss0.079-val_loss0.099-f1_score0.966.h5") # 2021/7/27调整模型30字最优模型
+            model.load_weights(filepath="log/ep029-loss0.081-val_loss0.094-f1_score0.971.h5") # 2021/08/06 调整模型30字最优模型
+            tf.saved_model.simple_save(session=sess,
+                                       # export_dir="money_savedmodel20210727_3",
+                                       export_dir="money_savedmodel20210806",
+                                       inputs={"input0": model.input[0],
+                                               "input1": model.input[1],
+                                               "input2": model.input[2]},
+                                       outputs={"outputs": model.output})
+
+def tensorboard_model():
+    with tf.Session(graph=tf.Graph()).as_default() as sess:
+        with sess.graph.as_default():
+            tf.saved_model.loader.load(sess, tags=["serve"], export_dir="money_savedmodel1")
+            tf.summary.FileWriter(graph=sess.graph, logdir="log2")
+
+def getBiLSTMModel(input_shape,vocab,embedding_weights,classes,use_am=False):
+    assert len(input_shape)==3
+    list_input = []
+    for i in range(input_shape[0]):
+        list_input.append(layers.Input(shape=(input_shape[1],),dtype=tf.int32,name="input%d"%(i)))
+    print("list_input",list_input)
+    list_embedding = []
+
+    embedding_input = list_input
+    embedding = layers.Embedding(len(vocab),input_shape[2],
+                                 weights=[embedding_weights] if embedding_weights is not None else None,
+                                 trainable=True,name="char_embeding")
+    for i in range(len(embedding_input)):
+        print(i)
+        list_embedding.append(embedding(embedding_input[i]))
+        print(list_embedding)
+
+    list_w2v = list_embedding
+    list_lstm = []
+
+    list_lstm.append(layers.Bidirectional(layers.LSTM(32, dropout=0.5, recurrent_dropout=0.5))(list_w2v[0]))
+    list_lstm.append(layers.Bidirectional(layers.LSTM(8, dropout=0.5, recurrent_dropout=0.5))(list_w2v[1]))
+    list_lstm.append(layers.Bidirectional(layers.LSTM(16, dropout=0.5, recurrent_dropout=0.5))(list_w2v[2]))
+
+    concat = layers.concatenate(list_lstm)
+    dropout = layers.Dropout(0.5)(concat)
+
+    out = layers.Dense(classes,activation="softmax")(dropout)
+
+    model = models.Model(list_input,out)
+    model.compile(optimizer=optimizers.Adam(lr=0.001),loss=losses.categorical_crossentropy,metrics=[precision,recall,f1_score])
+    model.summary()
+
+    return model
+def verification():
+    graph = tf.get_default_graph()
+    with graph.as_default():
+        sess = tf.Session(graph=graph)
+        with sess.as_default():
+            vocab, matrix = getVocabAndMatrix(getModel_word())
+            model = getBiLSTMModel(input_shape=(3, seq_len, 60), vocab=vocab, embedding_weights=matrix,
+                                   classes=3)
+            model.load_weights("log/ep029-loss0.081-val_loss0.094-f1_score0.971.h5", by_name=True, skip_mismatch=True)
+
+            df_val = pd.read_excel('traindata/df_val_predict.xlsx')
+            val_x, val_y = word2id(df_val)
+            logit = model.predict([val_x[0], val_x[1], val_x[2]])
+            lg = np.argmax(logit, axis=-1)
+            df_val['pred_kera'] = pd.DataFrame(lg)
+            df_val['prob_kera'] = pd.DataFrame(np.amax(logit, axis=1))
+            df_val['tf=kera'] = df_val.apply(lambda x:1 if x['pred_kera']==x['pred_tf'] else 0, axis=1)
+            df_val['tf=lb'] = df_val.apply(lambda x:1 if x['label']==x['pred_tf'] else 0, axis=1)
+            df_val['kera=lb'] = df_val.apply(lambda x:1 if x['pred_kera']==x['label'] else 0, axis=1)
+            df_val.to_excel('traindata/df_val_predict2.xlsx')
+
+
+
+            df = pd.read_excel('traindata/2兼职标注数据_test22.xlsx')
+            df.fillna('', inplace=True)
+            df.reset_index(drop=True, inplace=True)
+            preds = []
+            if 'relabel' in df.columns:
+                df['label'] = df.apply(lambda x:x['relabel'] if x['relabel']!="" else x['label'], axis=1)
+                print('更新标注完成')
+            for left, center, right, label in zip(df['left'], df['center'], df['right'], df['label']):
+                _span=[left, center, right]
+                input = encodeInput(_span, word_len=seq_len, word_flag=True, userFool=False)
+                logit = model.predict([np.array([input[0]]), np.array([input[1]]), np.array([input[2]])])
+                lg = np.argmax(logit, axis=-1)[0]
+                prob = logit[0][lg]
+                lg = id2lb.get(lg, '')
+                preds.append(lg)
+                # if lg != label:
+                #     print(left, '###', center, '###', right)
+                #     print('预测类别:%s, 预测:%.4f, 标签:%s'%(lg, prob, label))
+                #     print()
+            df['pred'] = pd.DataFrame(preds)
+            df.to_excel('traindata/2兼职标注数据_test22_predict.xlsx')
+
+if __name__ == "__main__":
+    # train()
+    verification()
+    # test(_span=['预算金额:50万,中标金额:','100.600万','元,'])
+    # get_savedModel()
+    # tensorboard_model()
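money_keras.py encodes each sample as three fixed-length spans (left context, amount, right context) and one-hot labels over the three money classes; train_x is transposed to (3, n, seq_len) so that train_x[0..2] line up with the model's three inputs. A self-contained sketch of those two conventions:

import numpy as np

def labeling(label, out_len=3):
    # One-hot row per label id, matching the lb2id table above.
    out = np.zeros(out_len)
    out[label] = 1
    return out

assert list(labeling(1)) == [0.0, 1.0, 0.0]   # '中标金额' -> index 1
batch = np.zeros((5, 3, 30))                  # (samples, spans, seq_len=30)
assert np.transpose(batch, (1, 0, 2)).shape == (3, 5, 30)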

+ 185 - 0
BiddingKG/dl/role/role_train.py

@@ -0,0 +1,185 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/7/28 0028 11:32 
+
+import os
+import sys
+import h5py
+from keras import models,layers,losses,optimizers
+sys.path.append(os.path.abspath("../../.."))
+import pandas as pd
+import math
+from keras.callbacks import ModelCheckpoint
+from BiddingKG.dl.interface.modelFactory import Model_role_classify_word
+from BiddingKG.dl.common.Utils import *
+import tensorflow as tf
+
+seq_len = 20
+lb2id = {'招标人':0,
+         '代理人':1,
+         '中标人':2,
+         '第二候选人':3,
+         '第三候选人':4,
+         '其他角色':5}
+def getBiLSTMModel(input_shape,vocab,embedding_weights,classes,use_am=False):
+    # assert len(input_shape)==3
+    list_input = []
+    for i in range(input_shape[0]):
+        list_input.append(layers.Input(shape=(input_shape[1],),dtype=tf.int32,name="input%d"%(i)))
+    list_embedding = []
+
+    embedding_input = list_input
+    embedding = layers.Embedding(len(vocab),input_shape[2],
+                                 weights=[embedding_weights] if embedding_weights is not None else None,
+                                 mask_zero=True,trainable=True,name="char_embeding")
+    for i in range(len(embedding_input)):
+        list_embedding.append(embedding(embedding_input[i]))
+
+    list_w2v = list_embedding
+    list_lstm = []
+
+    list_lstm.append(layers.Bidirectional(layers.LSTM(64, dropout=0.2, recurrent_dropout=0.5))(list_w2v[0])) #dropout=0.2, recurrent_dropout=0.5
+    list_lstm.append(layers.Bidirectional(layers.LSTM(8, dropout=0.2, recurrent_dropout=0.5))(list_w2v[1]))
+    list_lstm.append(layers.Bidirectional(layers.LSTM(16, dropout=0.2, recurrent_dropout=0.5))(list_w2v[2]))
+    concat = layers.concatenate(list_lstm)
+
+    concat = layers.Dropout(0.5)(concat)
+    out = layers.Dense(classes,activation="softmax")(concat)
+    model = models.Model(list_input,out)
+    model.compile(optimizer=optimizers.Adam(lr=0.001),loss=losses.categorical_crossentropy,metrics=[precision,recall,f1_score])
+    model.summary()
+
+    return model
+
+def labeling(label, out_len=6):
+    out = np.zeros((out_len))
+    out[label] = 1
+    return out
+
+def word2id(df, seq_len=seq_len):
+    train_x = []
+    train_y = []
+    test_x = []
+    test_y = []
+    # print(set(df['label']))
+    # print(set(lb2id))
+    if set(df['label']) == set(lb2id):
+        df['label'] = df['label'].apply(lambda x:lb2id[x])
+
+    for before, text, after, label in zip(df["left"], df["center"], df["right"], df["label"]):
+        before = str(before) if str(before) != "nan" else ""
+        text = str(text)
+        after = str(after) if str(after) != "nan" else ""
+
+        x = encodeInput([before, text, after], word_len=seq_len, word_flag=True, userFool=False)
+        y = labeling(label)
+        train_x.append(x)
+        train_y.append(y)
+    return np.transpose(np.array(train_x), (1, 0, 2)), np.array(train_y)
+    # return train_x, np.array(train_y)
+
+def train():
+    df_train = pd.read_excel('traindata/df_train.xlsx')
+    df_test = pd.read_excel('traindata/df_test.xlsx')
+    train_x, train_y = word2id(df_train)
+    test_x, test_y = word2id(df_test)
+    with tf.Session() as sess:
+        vocab, matrix = getVocabAndMatrix(getModel_word())
+        model = getBiLSTMModel(input_shape=(3, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=6)
+        print("loading weights")
+        # model.load_weights("log/ep378-loss0.178-val_loss0.117-f1_score0.965.h5",by_name=True, skip_mismatch=True)
+
+        callback = ModelCheckpoint(
+            filepath="log/" + "ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1_score{val_f1_score:.3f}.h5",
+            monitor="val_loss", save_best_only=True, save_weights_only=True, mode="min")
+        model.fit(x=[train_x[0], train_x[1], train_x[2]], y=train_y, batch_size=128, epochs=600, callbacks=[callback],
+                  validation_data=[[test_x[0], test_x[1], test_x[2]], test_y])
+def test():
+    # df_val = pd.read_excel('traindata/df_val.xlsx')
+    # df_val = pd.read_excel('traindata/兼职标注数据_test29.xlsx')
+    # df_val = pd.read_excel('traindata/兼职标注数据_test3_predict.xlsx')
+    df_val = pd.read_excel('traindata/兼职标注数据_test22_待测试数据.xlsx')
+    df_val.reset_index(drop=True, inplace=True)
+    val_x, val_y = word2id(df_val, seq_len=seq_len)
+    # val_x = np.transpose(np.array(train_x), (1, 0, 2))
+
+    old_x, old_y = word2id(df_val, seq_len=50)
+    old_x = np.transpose(np.array(old_x), (1, 0, 2))
+    role_old = Model_role_classify_word()
+
+    with tf.Session() as sess:
+        vocab, matrix = getVocabAndMatrix(getModel_word())
+        model = getBiLSTMModel(input_shape=(3, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=6)
+        print("loading weights")
+        # model.load_weights("log/ep378-loss0.178-val_loss0.117-f1_score0.965.h5",by_name=True, skip_mismatch=True)
+        # model.load_weights("log/ep006-loss0.174-val_loss0.234-f1_score0.917.h5",by_name=True, skip_mismatch=True)
+        # model.load_weights("log/ep010-loss0.107-val_loss0.114-f1_score0.966.h5",by_name=True, skip_mismatch=True)
+        model.load_weights("log/ep014-loss0.091-val_loss0.110-f1_score0.968.h5",by_name=True, skip_mismatch=True)
+
+        lg_old = role_old.predict(old_x)
+        df_val['pred_old'] = pd.DataFrame(np.argmax(lg_old, axis=1))
+        df_val['prob_old'] = pd.DataFrame(np.amax(lg_old, axis=1))
+
+        logit = model.predict([val_x[0], val_x[1], val_x[2]])
+        print('新模型预测结果',logit[:3])
+        print('旧模型预测结果:',lg_old[:3])
+        df_val['pred_new'] = pd.DataFrame(np.argmax(logit, axis=-1))
+        df_val['prob_new'] = pd.DataFrame(np.amax(logit, axis=1))
+        # df_val['new=new3'] = df_val.apply(lambda x: 1 if x['pred_new3'] == x['pred_new2'] else 0, axis=1)
+        df_val['new=old'] = df_val.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
+        df_val['old=lb'] = df_val.apply(lambda x: 1 if x['label'] == x['pred_old'] else 0, axis=1)
+        df_val['new=lb'] = df_val.apply(lambda x: 1 if x['pred_new'] == x['label'] else 0, axis=1)
+        # df_val.to_excel('traindata/df_val_predict.xlsx')
+        # df_val.to_excel('traindata/兼职标注数据_test29_predict.xlsx')
+        # df_val.to_excel('traindata/兼职标注数据_test3_predict.xlsx')
+        df_val.to_excel('traindata/兼职标注数据_test22_待测试数据_predict.xlsx')
+        print('')
+
+def get_savedModel():
+    sess = tf.Session(graph=tf.Graph())
+    with sess.as_default():
+        with sess.graph.as_default():
+            vocab, matrix = getVocabAndMatrix(getModel_word())
+            model = getBiLSTMModel(input_shape=(3, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=6)
+            sess.run(tf.global_variables_initializer())
+            # model.load_weights(filepath="log/ep009-loss0.057-val_loss0.076-f1_score0.978.h5")
+            # model.load_weights(filepath="log/ep010-loss0.107-val_loss0.114-f1_score0.966.h5")  #7月30日训练最优模型20字
+            model.load_weights(filepath="log/ep015-loss0.090-val_loss0.113-f1_score0.967.h5") #8月5日调整部分招标人标注后重新训练结果20字
+            tf.saved_model.simple_save(session=sess,
+                                       export_dir="role_savedmodel2021-8-5",
+                                       inputs={"input0": model.input[0],
+                                               "input1": model.input[1],
+                                               "input2": model.input[2]},
+                                       outputs={"outputs": model.output})
+
+def predict_pb():
+    df_val = pd.read_excel('traindata/df_val.xlsx')
+    old_x, old_y = word2id(df_val, seq_len=20)
+    # old_x = np.transpose(np.array(old_x), (1, 0, 2))
+
+    sess_role = tf.Session()
+    with sess_role.as_default() as sess:
+        with sess_role.graph.as_default():
+            meta_graph_def = tf.saved_model.loader.load(sess=sess_role, tags=["serve"],
+                                                        export_dir="role_savedmodel2021-8-5")
+            signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+            signature_def = meta_graph_def.signature_def
+
+            input0 = sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name)
+            input1 = sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name)
+            input2 = sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name)
+            output = sess_role.graph.get_tensor_by_name(
+                signature_def[signature_key].outputs["outputs"].name)
+            model_role = [[input0, input1, input2], output]
+            lg_old = sess_role.run(output, feed_dict={input0:old_x[0],
+                                                      input1:old_x[1],
+                                                      input2:old_x[2]})
+            print(lg_old[:3])
+
+
+if __name__ == "__main__":
+    # train()
+    # test()
+    get_savedModel()
+    # predict_pb()
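Decoding the role classifier's softmax output back to labels mirrors the lb2id table defined at the top of role_train.py; a small sketch with a fabricated logits row:

import numpy as np

lb2id = {'招标人': 0, '代理人': 1, '中标人': 2, '第二候选人': 3, '第三候选人': 4, '其他角色': 5}
id2lb = {v: k for k, v in lb2id.items()}

logits = np.array([[0.05, 0.10, 0.70, 0.05, 0.05, 0.05]])  # fabricated scores
assert id2lb[int(np.argmax(logits, axis=-1)[0])] == '中标人'
assert float(np.amax(logits, axis=1)[0]) == 0.70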

+ 103 - 77
BiddingKG/dl/test/测试所有提取信息.py

@@ -30,6 +30,7 @@ import BiddingKG.dl.interface.getAttributes as getAttributes
 import BiddingKG.dl.entityLink.entityLink as entityLink
 import BiddingKG.dl.complaint.punish_predictor as punish_predictor
 # import BiddingKG.dl.complaint.punish_rule as punish_predictor
+import BiddingKG.dl.channel.channel_predictor as channel_predictor
 import json
 
 '''
@@ -53,11 +54,12 @@ row = cursor.fetchall()[0]
 codeNamePredict = predictor.CodeNamePredict()
 premPredict = predictor.PREMPredict()
 epcPredict = predictor.EPCPredict()
-roleRulePredict = predictor.RoleRulePredictor()
+# roleRulePredict = predictor.RoleRulePredictor()
 timePredict = predictor.TimePredictor()
 # punish = punish_rule.Punish_Extract()
 punish = punish_predictor.Punish_Extract()
 productPredict = predictor.ProductPredictor()
+channelPredict = channel_predictor.DocChannel()
 
 # 自定义jsonEncoder
 class MyEncoder(json.JSONEncoder):
@@ -97,14 +99,30 @@ def predict(doc_id, text, title=""):
     print("getPREMs")
     print("getPREMs")
     list_punish_dic = punish.get_punish_extracts(list_articles, list_sentences, list_entitys)
     list_punish_dic = punish.get_punish_extracts(list_articles, list_sentences, list_entitys)
     product = productPredict.predict(list_sentences,list_entitys)
     product = productPredict.predict(list_sentences,list_entitys)
+    channel = channelPredict.predict(title, list_sentences[0])
+
+    total_tendereeMoney_list = []
+    for entity in list_entitys[0]:
+        if entity.notes == '总投资':
+            total_tendereeMoney_list.append(entity.entity_text)
+    total_tendereeMoney = max([float(money) for money in total_tendereeMoney_list]) if len(total_tendereeMoney_list)>=1 else 0  # bug fix: take the max of the amounts themselves, not of a one-element list containing the list
 
     for entitys in list_entitys:
         for entity in entitys:
             print(entity.entity_text, entity.entity_type, entity.label, entity.values, entity.sentence_index,
                   entity.begin_index, entity.end_index, entity.wordOffset_begin, entity.wordOffset_end,entity.sentence_index)
     # print(prem)
-    return json.dumps(Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic),product)[0],
-                      cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)   # list_punish_dic
+    # return json.dumps(Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic),product)[0],
+    #                   cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)   # list_punish_dic
+    # return json.dumps(Preprocessing.union_result(
+    #     Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic),product), channel)[0],
+    #                   cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)   # list_punish_dic
+    return json.dumps(Preprocessing.union_result(Preprocessing.union_result(
+        Preprocessing.union_result(
+            Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic), product), [{'total_tendereeMoney':total_tendereeMoney}]
+    ),
+        channel),
+            cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)   # list_punish_dic
 
 
 def predict_back(doc_id, html):
@@ -189,84 +207,92 @@ def test(name, content):
 
 
 if __name__ == "__main__":
-    from tablestore import *
-    endpoint = 'https://bxkc-ots.cn-hangzhou.ots.aliyuncs.com'
-    access_key_id = 'LTAI4GJxbioV1y2WM3XcZTmP'
-    access_key_secret = 'B3VITMoCnKtTQE6eAkDkat6UNFHped'
-    instance_name = 'bxkc-ots'
-    ots_client = OTSClient(endpoint, access_key_id, access_key_secret, instance_name)
-
-    def get_data(query, max_rows, table_name='document',
-                 index_name='document_index',
-                 column_names=['docid', 'dochtmlcon','doctitle', 'info_type', 'page_time'],
-                 sorters=[FieldSort("page_time", SortOrder.DESC), FieldSort("docid", SortOrder.DESC)]):
-        '''
-        从阿里云ots查询数据
-        :param query: 查询命令
-        :param max_rows: 最大返回多少数据
-        :param table_name: 表名
-        :param index_name: 表索引名
-        :param column_names: 返回字段名
-        :param sorters: 排序规则列表
-        :return: 处理后的数据列表
-        '''
-        next_token = None
-        data = []
-        all_rows = []
-        rows, next_token, total_count, is_all_succeed = \
-            ots_client.search(table_name,
-                              index_name,
-                              SearchQuery(query,
-                                          next_token=next_token,
-                                          sort=Sort(sorters=sorters),  # ASC升序
-                                          limit=100,
-                                          get_total_count=True),
-                              ColumnsToGet(column_names=column_names,
-                                           return_type=ColumnReturnType.SPECIFIED))
-        all_rows.extend(rows)
-        while next_token:
-            rows, next_token, total_count, is_all_succeed = \
-                ots_client.search(table_name,
-                                  index_name,
-                                  SearchQuery(query,
-                                              next_token=next_token,
-                                              sort=None,
-                                              limit=100,
-                                              get_total_count=True),
-                                  ColumnsToGet(column_names=column_names,
-                                               return_type=ColumnReturnType.SPECIFIED))
-            all_rows.extend(rows)
-            if len(all_rows) > max_rows:
-                print('已获取%d条数据' % len(all_rows))
-                break
-
-        if all_rows:
-            for row in all_rows:
-                tmp = []
-                tmp.append(row[0][1][1])
-                for tup in row[1]:
-                    tmp.append(tup[1])
-                data.append(tmp)
-        return data
-
-
-    bool_query = TermQuery('docid','124113339')
-    # bool_query = BoolQuery(
-    #     must_queries=[TermsQuery(field_name='info_type', column_values=['办公设备', '计算机设备']),
-    #                   RangeQuery('page_time', range_from='2020-11-01', range_to='2020-11-31')]
-    # )
-
-    data = get_data(bool_query, 1)
-    print(data)
-    docid = str(data[0][0])
-    html = data[0][1]
-    title = data[0][2]
+    # from tablestore import *
+    # endpoint = 'https://bxkc-ots.cn-hangzhou.ots.aliyuncs.com'
+    # access_key_id = 'LTAI4GJxbioV1y2WM3XcZTmP'
+    # access_key_secret = 'B3VITMoCnKtTQE6eAkDkat6UNFHped'
+    # instance_name = 'bxkc-ots'
+    # ots_client = OTSClient(endpoint, access_key_id, access_key_secret, instance_name)
+    #
+    # def get_data(query, max_rows, table_name='document',
+    #              index_name='document_index',
+    #              column_names=['docid', 'dochtmlcon','doctitle', 'info_type', 'page_time'],
+    #              sorters=[FieldSort("page_time", SortOrder.DESC), FieldSort("docid", SortOrder.DESC)]):
+    #     '''
+    #     从阿里云ots查询数据
+    #     :param query: 查询命令
+    #     :param max_rows: 最大返回多少数据
+    #     :param table_name: 表名
+    #     :param index_name: 表索引名
+    #     :param column_names: 返回字段名
+    #     :param sorters: 排序规则列表
+    #     :return: 处理后的数据列表
+    #     '''
+    #     next_token = None
+    #     data = []
+    #     all_rows = []
+    #     rows, next_token, total_count, is_all_succeed = \
+    #         ots_client.search(table_name,
+    #                           index_name,
+    #                           SearchQuery(query,
+    #                                       next_token=next_token,
+    #                                       sort=Sort(sorters=sorters),  # ASC升序
+    #                                       limit=100,
+    #                                       get_total_count=True),
+    #                           ColumnsToGet(column_names=column_names,
+    #                                        return_type=ColumnReturnType.SPECIFIED))
+    #     all_rows.extend(rows)
+    #     while next_token:
+    #         rows, next_token, total_count, is_all_succeed = \
+    #             ots_client.search(table_name,
+    #                               index_name,
+    #                               SearchQuery(query,
+    #                                           next_token=next_token,
+    #                                           sort=None,
+    #                                           limit=100,
+    #                                           get_total_count=True),
+    #                               ColumnsToGet(column_names=column_names,
+    #                                            return_type=ColumnReturnType.SPECIFIED))
+    #         all_rows.extend(rows)
+    #         if len(all_rows) > max_rows:
+    #             print('已获取%d条数据' % len(all_rows))
+    #             break
+    #
+    #     if all_rows:
+    #         for row in all_rows:
+    #             tmp = []
+    #             tmp.append(row[0][1][1])
+    #             for tup in row[1]:
+    #                 tmp.append(tup[1])
+    #             data.append(tmp)
+    #     return data
+    #
+    #
+    # bool_query = TermQuery('docid','124113339')
+    # # bool_query = BoolQuery(
+    # #     must_queries=[TermsQuery(field_name='info_type', column_values=['办公设备', '计算机设备']),
+    # #                   RangeQuery('page_time', range_from='2020-11-01', range_to='2020-11-31')]
+    # # )
+    #
+    # data = get_data(bool_query, 1)
+    # print(data)
+    # docid = str(data[0][0])
+    # html = data[0][1]
+    # title = data[0][2]
     # text = '中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天。\
     # 投诉处理公告,投诉人:张三。文章编号:京财采投字(2018)第42号。政府采购项目招标方式:公开招标,联系人:黎明。\
     # 建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,\
     # 二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,'
-    # docid = ""
-    # html = '首都医科大学附属北京地坛医院1.5T核磁共振、16排CT和血管造影机维保服务医疗设备维修和保养服务采购项目政府采购合同公告'
+    docid = ""
+    # title = '招标公告'
+    # html = '招标人:广州市人民医院。代理人:广州医疗代理服务公司。招标金额:3000元,总投资:5万元。中标人:比地科技有限公司,中标金额:1万元。'
+    html = """, [ 正在 公告 ] 公司 2020 - 2021 年度 打印 制作 服务 ( 重新 招标 ) ( 变更 采购 方式 ) , 公司 2020 - 2021 年度 打印 制作 服务 ( 重新 招标 ) ( 变更 采购 方式 ) 采购 结果 公告 , 项目 名称 , 公司 2020 - 2021 年度 打印 制作 服务 项目 编号 , 20200803030110070001 采购 组织 人 , 中 节能 建筑 节能 有限公司 河南 分公司 采购 方式 , 谈判 采购 成交 信息 , 序号 , 标段 ( 包 ) 编号 , 标段 ( 包 ) 名称 , 成交 供应商 , 成交 金额 20200803030110070001001 公司 2020 - 2021 年度 打印 制作 服务 ( 重新 招标 ) ( 变更 采购 方式 ) 郑州市 上街区 永达 文印部 null 元 公告 起 止 时间 2021年 04月 14日 - 2021年 04月 17日 ,
+"""
+    title = """[ 正在 公告 ] 公司 2020 - 2021 年度 打印 制作 服务 ( 重新 招标 ) ( 变更 采购 方式 ) ,
+"""
+    html = html.replace(' ', '')
+    title = title.replace(' ', '')
+    # html = '首都医科大学附属北京地坛医院1.5T核磁共振、16排CT和血管造影机维保服务医疗设备维修和保养服务采购项目政府采购中标候选人公示,中标人:广州比地科技有限公司,中标金额:6000万元'
     # html = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
 
     a = time.time()