Эх сурвалжийг харах

总价单价作为金额属性,比率作为新实体

fangjiasheng 3 жил өмнө
parent
commit
6fc64525d1

+ 2 - 2
BiddingKG/dl/entityLink/entityLink.py

@@ -65,7 +65,7 @@ def link_entitys(list_entitys,on_value=0.8):
         # 2021/12/21 替换通过字典识别到的取长度最大的相似实体
         for _entity in range_entity:
             for _ent in _entity.linked_entitys:
-                print("_entity, _ent", _entity.entity_text, _ent.if_dict_match, _ent.entity_text)
+                # print("_entity, _ent", _entity.entity_text, _ent.if_dict_match, _ent.entity_text)
                 if re.search("公司$", _ent.entity_text) is not None \
                         and _ent.if_dict_match == 1:
                     if len(_ent.entity_text) > len(_entity.entity_text):
@@ -155,7 +155,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
         for p_sentence in list_sentence:
             sentence = p_sentence.sentence_text
             list_match = match_enterprise_max_first(sentence)
-            print("list_match", list_match)
+            # print("list_match", list_match)
 
             doc_id = p_sentence.doc_id
             sentence_index = p_sentence.sentence_index

+ 4 - 2
BiddingKG/dl/interface/Entitys.py

@@ -168,8 +168,10 @@ class Entity():
         self.pointer_email = None
         self.is_tail = False
         self.notes = ''  # 2021/7/20 新增,保存金额大小写,单位等备注
-        self.money_unit = '' #2021/8/17 新增,保存金额单位 元、万元 、亿元
-        self.if_dict_match = 0 # 2021/12/21 新增,判断公司实体是否由字典识别得到
+        self.money_unit = ''  # 2021/8/17 新增,保存金额单位 元、万元 、亿元
+        self.if_dict_match = 0  # 2021/12/21 新增,判断公司实体是否由字典识别得到
+        self.is_total_money = 0  # 2021/12/29 新增,判断金额是否总价
+        self.is_unit_money = 0  # 2021/12/29 新增,判断金额是否单价
 
     def set_Role(self,role_label,role_values):
         self.label = int(role_label)

+ 28 - 0
BiddingKG/dl/interface/Preprocessing.py

@@ -6,6 +6,9 @@ import sys
 import os
 import time
 import codecs
+
+from BiddingKG.dl.ratio.re_ratio import extract_ratio
+
 sys.setrecursionlimit(1000000)
 sys.path.append(os.path.abspath("../.."))
 sys.path.append(os.path.abspath(".."))
@@ -2095,6 +2098,31 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
             #         Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
             #                begin_index_temp, end_index_temp))
 
+            # 2021/12/29 新增比率提取
+            list_ratio = extract_ratio(sentence_text)
+            entity_type = "ratio"
+            for ratio in list_ratio:
+                # print("ratio", ratio)
+                begin_index_temp = ratio['begin_index']
+                for j in range(len(list_tokenbegin)):
+                    if list_tokenbegin[j] == begin_index_temp:
+                        begin_index = j
+                        break
+                    elif list_tokenbegin[j] > begin_index_temp:
+                        begin_index = j - 1
+                        break
+                index = ratio['end_index']
+                end_index_temp = index
+                for j in range(begin_index, len(list_tokenbegin)):
+                    if list_tokenbegin[j] >= index:
+                        end_index = j - 1
+                        break
+                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
+                entity_text = ratio['body']
+                list_sentence_entitys.append(
+                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
+                           begin_index_temp, end_index_temp))
+
             list_sentence_entitys.sort(key=lambda x:x.begin_index)
             list_entitys_temp = list_entitys_temp+list_sentence_entitys
         list_entitys.append(list_entitys_temp)

+ 16 - 50
BiddingKG/dl/interface/extract.py

@@ -53,45 +53,45 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
     cost_time.update(_cost_time)
 
     # 依赖句子顺序
-    start_time = time.time() # 公告类型/生命周期提取
+    start_time = time.time()  # 公告类型/生命周期提取
     list_channel_dic = predictor.getPredictor("channel").predict(title=title, content=list_sentences[0])
     cost_time["channel"] = round(time.time()-start_time,2)
 
-    start_time = time.time() # 项目编号、名称提取
+    start_time = time.time()  # 项目编号、名称提取
     codeName = predictor.getPredictor("codeName").predict(list_sentences,MAX_AREA=5000,list_entitys=list_entitys)
     log("get codename done of doc_id%s"%(doc_id))
     cost_time["codename"] = round(time.time()-start_time,2)
 
-    start_time = time.time() # 角色金额模型提取
+    start_time = time.time()  # 角色金额模型提取
     predictor.getPredictor("prem").predict(list_sentences,list_entitys)
     log("get prem done of doc_id%s"%(doc_id))
     cost_time["prem"] = round(time.time()-start_time,2)
 
-    start_time = time.time() # 产品名称及废标原因提取
+    start_time = time.time()  # 产品名称及废标原因提取
     predictor.getPredictor("product").predict(list_sentences,list_entitys)
     log("get product done of doc_id%s"%(doc_id))
     cost_time["product"] = round(time.time()-start_time,2)
 
-    start_time = time.time() # 产品相关要素正则提取 单价、数量、品牌规格 ; 项目、需求、预算、时间
+    start_time = time.time()  # 产品相关要素正则提取 单价、数量、品牌规格 ; 项目、需求、预算、时间
     product_attrs = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
     log("get product attributes done of doc_id%s"%(doc_id))
     cost_time["product_attrs"] = round(time.time()-start_time,2)
 
-    start_time = time.time() # 正则角色提取
+    start_time = time.time()  # 正则角色提取
     predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName)
     cost_time["rule"] = round(time.time()-start_time,2)
 
-    start_time = time.time() # 联系人模型提取
+    start_time = time.time()  # 联系人模型提取
     predictor.getPredictor("epc").predict(list_sentences,list_entitys)
     log("get epc done of doc_id%s"%(doc_id))
     cost_time["person"] = round(time.time()-start_time,2)
 
-    start_time = time.time() # 时间类别提取
+    start_time = time.time()  # 时间类别提取
     predictor.getPredictor("time").predict(list_sentences, list_entitys)
     log("get time done of doc_id%s"%(doc_id))
     cost_time["time"] = round(time.time()-start_time,2)
 
-    start_time = time.time() # 保证金支付方式
+    start_time = time.time()  # 保证金支付方式
     payment_way_dic = predictor.getPredictor("deposit_payment_way").predict(content=list_articles[0].content)
     cost_time["deposit"] = round(time.time()-start_time,2)
 
@@ -109,49 +109,19 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
                         _entity.values[1] = 0.51
                         _entity.set_Money(1, _entity.values)
 
-    # 2021-12-08新增:提取:总价,单价,比率
-    total_money_list = []
-    unit_money_list = []
-    ratio_list = []
-    for i in range(len(list_entitys)):
-        list_entity = list_entitys[i]
-
-        # 总价单价
-        for _entity in list_entity:
-            if _entity.entity_type == 'money':
-                word_of_sentence = list_sentences[i][_entity.sentence_index].sentence_text
-                # 总价在中投标金额中
-                if _entity.label == 1:
-                    result = extract_total_money(word_of_sentence,
-                                                 _entity.entity_text,
-                                                 [_entity.wordOffset_begin, _entity.wordOffset_end])
-                    if result:
-                        total_money_list.append(result)
-
-                # 单价在普通金额中
-                else:
-                    result = extract_unit_money(word_of_sentence,
-                                                _entity.entity_text,
-                                                [_entity.wordOffset_begin, _entity.wordOffset_end])
-                    if result:
-                        unit_money_list.append(result)
-
-        # 比率
-        all_sentence = ""
-        for sentence in list_sentences[i]:
-            all_sentence += sentence.sentence_text + ","
-        result = extract_ratio(all_sentence)
-        if result:
-            ratio_list.append(result)
+    # 2021-12-29新增:提取:总价,单价
+    start_time = time.time()  # 总价单价提取
+    predictor.getPredictor("total_unit_money").predict(list_sentences, list_entitys)
+    cost_time["total_unit_money"] = round(time.time()-start_time, 2)
 
     # 依赖句子顺序
-    start_time = time.time() # 实体链接
+    start_time = time.time()  # 实体链接
     entityLink.link_entitys(list_entitys)
     prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
     log("get attributes done of doc_id%s"%(doc_id))
     cost_time["attrs"] = round(time.time()-start_time,2)
 
-    start_time = time.time() # 失信数据要素提取
+    start_time = time.time()  # 失信数据要素提取
     list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
     cost_time["punish"] = round(time.time()-start_time,2)
 
@@ -159,7 +129,7 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
         for d in product_attrs[1]['demand_info']['data']:
             for product in set(prem[0]['product']):
                 if product in d['project_name']:
-                    d['product'].append(product)  #把产品在项目名称中的添加进需求要素中
+                    d['product'].append(product)  # 把产品在项目名称中的添加进需求要素中
 
     # print(prem)
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
@@ -168,10 +138,6 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
     data_res["cost_time"] = cost_time
     data_res["success"] = True
 
-    data_res["total_money"] = total_money_list
-    data_res["unit_money"] = unit_money_list
-    data_res["ratio"] = ratio_list
-
     # for _article in list_articles:
     #     log(_article.content)
     #

+ 79 - 30
BiddingKG/dl/interface/predictor.py

@@ -20,55 +20,61 @@ import tensorflow as tf
 from BiddingKG.dl.product.data_util import decode, process_data
 from BiddingKG.dl.interface.Entitys import Entity
 from BiddingKG.dl.complaint.punish_predictor import Punish_Extract
+from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
 from bs4 import BeautifulSoup
 import copy
 import calendar
 import datetime
 
 from threading import RLock
-dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
-              "prem":{"predictor":None,"Lock":RLock()},
-              "epc":{"predictor":None,"Lock":RLock()},
-              "roleRule":{"predictor":None,"Lock":RLock()},
-                  "form":{"predictor":None,"Lock":RLock()},
-                  "time":{"predictor":None,"Lock":RLock()},
-                  "punish":{"predictor":None,"Lock":RLock()},
-                  "product":{"predictor":None,"Lock":RLock()},
-                "product_attrs":{"predictor":None,"Lock":RLock()},
+dict_predictor = {"codeName": {"predictor": None, "Lock": RLock()},
+                  "prem": {"predictor": None, "Lock": RLock()},
+                  "epc": {"predictor": None, "Lock": RLock()},
+                  "roleRule": {"predictor": None, "Lock": RLock()},
+                  "form": {"predictor": None, "Lock": RLock()},
+                  "time": {"predictor": None, "Lock": RLock()},
+                  "punish": {"predictor": None, "Lock": RLock()},
+                  "product": {"predictor": None, "Lock": RLock()},
+                  "product_attrs": {"predictor": None, "Lock": RLock()},
                   "channel": {"predictor": None, "Lock": RLock()},
-                  "deposit_payment_way": {"predictor": None, "Lock": RLock()}}
+                  "deposit_payment_way": {"predictor": None, "Lock": RLock()},
+                  "total_unit_money": {"predictor": None, "Lock": RLock()}
+                  }
 
 
 def getPredictor(_type):  # return the cached predictor for _type, constructing it on first request
     if _type in dict_predictor:
         with dict_predictor[_type]["Lock"]:  # the per-type RLock guards the lazy construction below
             if dict_predictor[_type]["predictor"] is None:  # nothing cached yet for this type
-                if _type=="codeName":
+                if _type == "codeName":
                     dict_predictor[_type]["predictor"] = CodeNamePredict()
-                if _type=="prem":
+                if _type == "prem":
                     dict_predictor[_type]["predictor"] = PREMPredict()
-                if _type=="epc":
+                if _type == "epc":
                     dict_predictor[_type]["predictor"] = EPCPredict()
-                if _type=="roleRule":
+                if _type == "roleRule":
                     dict_predictor[_type]["predictor"] = RoleRulePredictor()
-                if _type=="form":
+                if _type == "form":
                     dict_predictor[_type]["predictor"] = FormPredictor()
-                if _type=="time":
+                if _type == "time":
                     dict_predictor[_type]["predictor"] = TimePredictor()
-                if _type=="punish":
+                if _type == "punish":
                     dict_predictor[_type]["predictor"] = Punish_Extract()
-                if _type=="product":
+                if _type == "product":
                     dict_predictor[_type]["predictor"] = ProductPredictor()
-                if _type=="product_attrs":
+                if _type == "product_attrs":
                     dict_predictor[_type]["predictor"] = ProductAttributesPredictor()
                 if _type == "channel":
                     dict_predictor[_type]["predictor"] = DocChannel()
                 if _type == 'deposit_payment_way':
                     dict_predictor[_type]["predictor"] = DepositPaymentWay()
+                if _type == 'total_unit_money':
+                    dict_predictor[_type]["predictor"] = TotalUnitMoney()
             return dict_predictor[_type]["predictor"]
     raise NameError("no this type of predictor")  # reached only when _type is not a key of dict_predictor
 
-#编号名称模型
+
+# 编号名称模型
 class CodeNamePredict():
     
     def __init__(self,EMBED_DIM=None,BiRNN_UNITS=None,lazyLoad=getLazyLoad()):
@@ -525,7 +531,7 @@ class CodeNamePredict():
         return result
     '''
         
-#角色金额模型        
+# 角色金额模型
 class PREMPredict():
 
     
@@ -704,7 +710,7 @@ class PREMPredict():
         self.predict_money(list_sentences,list_entitys)
         
         
-#联系人模型    
+# 联系人模型
 class EPCPredict():
     
     def __init__(self):
@@ -1044,7 +1050,7 @@ class EPCPredict():
     def predict(self,list_sentences,list_entitys):
         self.predict_person(list_sentences,list_entitys)
             
-#表格预测
+# 表格预测
 class FormPredictor():
     
     def __init__(self,lazyLoad=getLazyLoad()):
@@ -1076,10 +1082,9 @@ class FormPredictor():
         else:
             return self.getModel(type).predict(form_datas)
 
-    
 
-#角色规则
-#依据正则给所有无角色的实体赋予角色,给予等于阈值的最低概率
+# 角色规则
+# 依据正则给所有无角色的实体赋予角色,给予等于阈值的最低概率
 class RoleRulePredictor():
     
     def __init__(self):
@@ -1371,6 +1376,7 @@ class RoleRulePredictor():
                 if p_entity.entity_text in self.SET_NOT_TENDERER:
                     p_entity.label=5
 
+
 # 时间类别
 class TimePredictor():
     def __init__(self):
@@ -1476,6 +1482,7 @@ class TimePredictor():
                         values[0] = 0.5
                 entity.set_Role(label, values)
 
+
 # 产品字段提取
 class ProductPredictor():
     def __init__(self):
@@ -1552,6 +1559,7 @@ class ProductPredictor():
                     result.append(item) # 修正bug
                 return result
 
+
 # 产品数量单价品牌规格提取 #2021/11/10 添加表格中的项目、需求、预算、时间要素提取
 class ProductAttributesPredictor():
     def __init__(self,):
@@ -2023,6 +2031,7 @@ class ProductAttributesPredictor():
             demand_dic = {'demand_info':{'data':[], 'header':[], 'header_col':[]}}
         return [attr_dic, demand_dic]
 
+
 # docchannel类型提取
 class DocChannel():
   def __init__(self, life_model='/channel_savedmodel/channel.pb', type_model='/channel_savedmodel/doctype.pb'):
@@ -2204,6 +2213,7 @@ class DocChannel():
       # return self.id2type[id], prob
       return [{'docchannel':self.id2type[id]}]
 
+
 # 保证金支付方式提取
 class DepositPaymentWay():
     def __init__(self,):
@@ -2237,6 +2247,39 @@ class DepositPaymentWay():
         else:
             return pay_way
 
+
+# Total-price / unit-price extraction: flags money entities in place via is_total_money / is_unit_money
+class TotalUnitMoney:
+    def __init__(self):
+        pass
+
+    def predict(self, list_sentences, list_entitys):  # mutates the entities in place; nothing is returned
+        for i in range(len(list_entitys)):  # i indexes list_entitys and list_sentences in parallel
+            list_entity = list_entitys[i]
+
+            # Total price / unit price: only entities of type 'money' are examined
+            for _entity in list_entity:
+                if _entity.entity_type == 'money':
+                    word_of_sentence = list_sentences[i][_entity.sentence_index].sentence_text  # text of the sentence holding the entity
+                    # Total price is checked only on label == 1 entities (bid / winning-bid amounts, per the original note)
+                    if _entity.label == 1:
+                        result = extract_total_money(word_of_sentence,
+                                                     _entity.entity_text,
+                                                     [_entity.wordOffset_begin, _entity.wordOffset_end])
+                        if result:
+                            _entity.is_total_money = 1
+
+                    # Unit price is checked on every other money entity ("ordinary" amounts, per the original note)
+                    else:
+                        result = extract_unit_money(word_of_sentence,
+                                                    _entity.entity_text,
+                                                    [_entity.wordOffset_begin, _entity.wordOffset_end])
+                        if result:
+                            _entity.is_unit_money = 1
+                # print("total_unit_money", _entity.entity_text,
+                #       _entity.is_total_money, _entity.is_unit_money)
+
+
 def getSavedModel():
     #predictor = FormPredictor()
     graph = tf.Graph()
@@ -2250,7 +2293,8 @@ def getSavedModel():
           inputs={"image": model.input},
           outputs={"scores": model.output}
         )
-        
+
+
 def getBiLSTMCRFModel(MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights):
     '''
     model = models.Sequential()
@@ -2354,6 +2398,7 @@ def h5_to_graph(sess,graph,h5file):
         print(name,graph.get_tensor_by_name(name),np.shape(value))
         sess.run(tf.assign(graph.get_tensor_by_name(name),value))
 
+
 def initialize_uninitialized(sess):
     global_vars          = tf.global_variables()
     is_not_initialized   = sess.run([tf.is_variable_initialized(var) for var in global_vars])
@@ -2435,7 +2480,8 @@ def save_role_model():
                                            "input2":model.input[2]},
                                    outputs={"outputs":model.output}
                                    )
-    
+
+
 def save_money_model():
     model_file = os.path.dirname(__file__)+"/../money/models/model_money_word.h5"
     graph = tf.Graph()
@@ -2487,7 +2533,8 @@ def save_person_model():
                                    inputs={"input0":model.input[0],
                                            "input1":model.input[1]},
                                    outputs = {"outputs":model.output})
-    
+
+
 def save_form_model():
     model_form = FormPredictor()
     with model_form.graph.as_default():
@@ -2499,7 +2546,8 @@ def save_form_model():
                                    "./form_savedmodel/",
                                    inputs={"inputs":model.input},
                                    outputs = {"outputs":model.output})
-    
+
+
 def save_codesplit_model():
     filepath_code = "../projectCode/models/model_code.hdf5"
     
@@ -2517,6 +2565,7 @@ def save_codesplit_model():
                                            "input2":model_code.input[2]},
                                    outputs={"outputs":model_code.output})
 
+
 def save_timesplit_model():
     filepath = '../time/model_label_time_classify.model.hdf5'
     with tf.Graph().as_default() as graph:

+ 16 - 4
BiddingKG/dl/money/re_money_total_unit.py

@@ -46,8 +46,8 @@ def re_standard_unit(_str):
 
 def re_total(text, money, index):
     # 对已提取的中投标金额的前面文字进行正则
-    prefix_threshold = 10
-    suffix_threshold = 10
+    prefix_threshold = 7
+    suffix_threshold = 0
     # if index_threshold < index[0]:
     #     money_text = text[index[0]-index_threshold:index[0]]
     #     print("total", money, text[index[0]-index_threshold:index[1]], money_text)
@@ -66,9 +66,21 @@ def re_total(text, money, index):
 
 
 def re_unit(text, money, index):
+    # 根据逗号隔开
+    # texts = text.split(",")
+    # for t in texts:
+    #     match = re.search(money, t)
+    #     if match:
+    #         text = t
+    #         index = match.span()
+    #         break
+    #     else:
+    #         text = ""
+    #         index = (0, 0)
+
     # 对已提取的中投标金额的前面文字进行正则
-    prefix_threshold = 10
-    suffix_threshold = 10
+    prefix_threshold = 7
+    suffix_threshold = 3
     # if prefix_threshold < index[0]:
     #     money_text = text[index[0]-prefix_threshold:index[0]]
     #     print("unit", money, text[index[0]-prefix_threshold:index[1]], money_text)